In [1]:
import os
import glob
import pandas as pd

def read_imdb_data(data_dir='data/aclImdb'):
    """Load the raw IMDb reviews stored under ``data_dir`` into nested dicts.

    The directory is expected to be laid out as
    ``<data_dir>/<train|test>/<pos|neg>/*.txt`` with one review per file.
    A missing or empty sub-directory simply yields empty lists.

    Args:
        data_dir: Root folder of the extracted dataset.

    Returns:
        A ``(data, labels)`` tuple. Both are dictionaries keyed first by
        split (``'train'`` / ``'test'``) and then by sentiment (``'pos'`` /
        ``'neg'``). ``data[split][sentiment]`` is the list of review texts and
        ``labels[split][sentiment]`` is the parallel list of integer labels
        (1 for a positive review, 0 for a negative one).

    Raises:
        AssertionError: If a text list and its label list differ in length.
    """
    data = {}
    labels = {}

    for data_type in ['train', 'test']:
        data[data_type] = {}
        labels[data_type] = {}

        for sentiment in ['pos', 'neg']:
            # Here we represent a positive review by '1' and a negative review by '0'
            label = 1 if sentiment == 'pos' else 0

            pattern = os.path.join(data_dir, data_type, sentiment, '*.txt')
            # glob returns matches in arbitrary, filesystem-dependent order;
            # sorting makes the load order identical on every machine.
            files = sorted(glob.glob(pattern))

            reviews = []
            for file_path in files:
                # Decode explicitly as UTF-8 instead of relying on the
                # platform's default encoding, which is not UTF-8 everywhere.
                with open(file_path, encoding='utf-8') as review:
                    reviews.append(review.read())

            data[data_type][sentiment] = reviews
            labels[data_type][sentiment] = [label] * len(reviews)

            assert len(data[data_type][sentiment]) == len(labels[data_type][sentiment]), \
                    "{}/{} data size does not match labels size".format(data_type, sentiment)

    return data, labels

In [2]:
# Read the whole corpus from disk, then report how many reviews landed in
# each split / sentiment bucket.
data, labels = read_imdb_data()
print("IMDB reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(
    *(len(data[split][sentiment])
      for split in ('train', 'test')
      for sentiment in ('pos', 'neg'))))

IMDB reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


In [3]:
from sklearn.utils import shuffle

def prepare_imdb_data(data, labels, random_state=None):
    """Prepare training and test sets from IMDb movie reviews.

    Positive and negative reviews of each split are concatenated and then
    shuffled, with texts and labels shuffled together so each review stays
    paired with its label.

    Args:
        data: Nested dict ``data[split][sentiment]`` holding lists of review
            texts, with splits ``'train'`` / ``'test'`` and sentiments
            ``'pos'`` / ``'neg'``.
        labels: Nested dict with the same keys holding the matching lists of
            labels.
        random_state: Forwarded to ``sklearn.utils.shuffle`` for both splits.
            The default ``None`` keeps the original unseeded behaviour; pass
            an integer to get the same ordering on every run.

    Returns:
        A tuple ``(data_train, data_test, labels_train, labels_test)``.
    """

    # Combine positive and negative reviews and labels
    data_train = data['train']['pos'] + data['train']['neg']
    data_test = data['test']['pos'] + data['test']['neg']
    labels_train = labels['train']['pos'] + labels['train']['neg']
    labels_test = labels['test']['pos'] + labels['test']['neg']

    # Shuffle reviews and corresponding labels within training and test sets
    data_train, labels_train = shuffle(data_train, labels_train, random_state=random_state)
    data_test, labels_test = shuffle(data_test, labels_test, random_state=random_state)

    # Return unified training data, test data, training labels, test labels
    return data_train, data_test, labels_train, labels_test

In [4]:
# Flatten the nested dicts into shuffled train/test lists and report sizes.
train_X, test_X, train_y, test_y = prepare_imdb_data(data, labels)
print("IMDb reviews (combined): train = {}, test = {}".format(
    *map(len, (train_X, test_X))))

IMDb reviews (combined): train = 25000, test = 25000


In [5]:
# Display one review from the shuffled training texts as a quick sanity check.
train_X[1]

"Maybe I'm a sap but this is the sweetest movies ever! I saw it for the first time when I was around 4 or 5, and I cried my eyes out. Between then and now (embarrassed at age 15) I have seen it over 25 times and have sobbed each and every one of them. Don't worry they're tears of happiness! And it's not all sap! There's a lot of humor and comedy in it too. Usually the whole talking animal thing can be a huge drag but in this movie it's not the case. My only word of advice: Even if you love this-Don't see the sequal...cornyness! I suggest everyone checks this out...you won't be sorry, no matter how old or young you are!"

In [6]:
# Pair every review with its label and store each split as a two-column
# frame ('text', 'target').
train_raw = pd.DataFrame(
    [(text, target) for text, target in zip(train_X, train_y)],
    columns=['text', 'target'],
)
test_raw = pd.DataFrame(
    [(text, target) for text, target in zip(test_X, test_y)],
    columns=['text', 'target'],
)

In [7]:
# Preview the first rows of the training frame (columns: text, target).
train_raw.head()

Unnamed: 0,text,target
0,"Last night I finished re-watching ""Jane Eyre"" ...",1
1,Maybe I'm a sap but this is the sweetest movie...,1
2,In keeping with Disney's well-known practice o...,0
3,John Waters has given us a genuinely enjoyable...,1
4,I'm starting to write this review during a bre...,0


In [8]:
# Preview the first rows of the test frame (columns: text, target).
test_raw.head()

Unnamed: 0,text,target
0,From the late Sydney Pollack comes a grown up ...,1
1,A fairly interesting look at some characters f...,1
2,I had the opportunity to see this last evening...,1
3,Another Raquel Welch Classic! This Picture hit...,1
4,Rocketship X-M should be viewed by any serious...,0


In [9]:
# Persist both frames as CSV in the working directory; index=False keeps the
# row index out of the files.
train_raw.to_csv(path_or_buf="train_raw.csv", index=False)
test_raw.to_csv(path_or_buf="test_raw.csv", index=False)