In [197]:
import pandas as pd
import numpy as np
import re
import collections

In [198]:
# load the raw IMDB and yelp training sets (tab-separated files, no header row)
ITrain, YTrain = (
    pd.read_csv(source, sep='\t', header=None)
    for source in ('IMDB-train.txt', 'yelp-train.txt')
)

def prep(df, vocab=False, max_vocab=10000):
    """Clean the reviews in the first column of ``df`` and encode them as vocab ids.

    Each review is stripped of every character that is not an ASCII letter or
    a space, lower-cased, and split into words.  Every word is then replaced by
    its position in the vocabulary; words that are not in the vocabulary are
    dropped.  The encoded review is stored back as a space-separated string of
    ids.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds the review text.  The frame is modified
        in place (its first column is overwritten) and also returned.
    vocab : False, None, or iterable of (word, frequency) pairs
        ``False``/``None`` builds the vocabulary from ``df`` itself, keeping the
        ``max_vocab`` most frequent words.  Otherwise the given pairs are used
        as the vocabulary; a word's id is its position in the iterable.  A
        one-shot iterator such as ``zip(words, freqs)`` is accepted and consumed.
    max_vocab : int or None
        Maximum vocabulary size when the vocabulary is built here (``None``
        keeps every word).  Ignored when ``vocab`` is supplied.

    Returns
    -------
    (df, w, f)
        The encoded frame, the tuple of vocabulary words, and the tuple of
        their frequencies (``w[i]`` has id ``i`` and frequency ``f[i]``).
    """
    non_letters = re.compile(r'[^ a-zA-Z]+')

    # After removing non-letters only letters and spaces remain, so a plain
    # split() both trims and collapses runs of spaces.  It also yields no
    # tokens for an empty review, so '' never enters the vocabulary.
    # Non-string cells (e.g. NaN from a blank field) are treated as empty.
    tokenised = [
        non_letters.sub("", text).lower().split() if isinstance(text, str) else []
        for text in df.iloc[:, 0]
    ]

    if vocab is None or vocab is False:
        cnt = collections.Counter()
        for words in tokenised:
            cnt.update(words)
        # word and corresponding frequency for the max_vocab most common words
        entries = cnt.most_common(max_vocab)
    else:
        entries = list(vocab)

    # an empty vocabulary cannot be unzipped, so fall back to two empty tuples
    w, f = zip(*entries) if entries else ((), ())

    # constant-time word -> id lookup; setdefault keeps the first position of
    # a duplicated word, matching what tuple.index() would return
    word_to_id = {}
    for idx, word in enumerate(w):
        word_to_id.setdefault(word, idx)

    # substitute each review with its matching vocab ids, dropping unknown words
    encoded = [
        " ".join(str(word_to_id[word]) for word in words if word in word_to_id)
        for words in tokenised
    ]
    df[df.columns[0]] = encoded

    return df, w, f

In [182]:
# build a vocabulary from each training set and encode its reviews with it
# NOTE(review): this cell's execution count (182) is lower than that of the
# cell defining prep (198) — re-run the notebook top to bottom to confirm
IT, Iw, If = prep(ITrain, vocab=False)
YT, Yw, Yf = prep(YTrain, vocab=False)

In [194]:
# save the encoded training sets as tab-separated files without header or index
for frame, target in ((IT, 'IMDB-train-prep.txt'), (YT, 'yelp-train-prep.txt')):
    frame.to_csv(target, sep='\t', header=None, index=False)

In [201]:
# read validating data for IMDB and yelp
IValid = pd.read_csv('IMDB-valid.txt',sep = '\t',header = None)
YValid = pd.read_csv('yelp-valid.txt',sep = '\t',header = None)

# get preprocessed validating data
IV,Iw,If = prep(IValid,vocab = zip(Iw,If))
YV,Yw,Yf = prep(YValid,vocab = zip(Yw,Yf))

In [205]:
# save the encoded validation sets as tab-separated files without header or index
for frame, target in ((IV, 'IMDB-valid-prep.txt'), (YV, 'yelp-valid-prep.txt')):
    frame.to_csv(target, sep='\t', header=None, index=False)

In [206]:
# read test data for IMDB and yelp
ITest = pd.read_csv('IMDB-test.txt',sep = '\t',header = None)
YTest = pd.read_csv('yelp-test.txt',sep = '\t',header = None)

# get preprocessed test data
ITe,Iw,If = prep(ITest,vocab = zip(Iw,If))
YTe,Yw,Yf = prep(YTest,vocab = zip(Yw,Yf))

ITe.to_csv('IMDB-test-prep.txt',sep='\t',header=None,index=False)
YTe.to_csv('yelp-test-prep.txt',sep='\t',header=None,index=False)

In [212]:
# export each vocabulary as tab-separated rows of (word, id, frequency);
# the id is the word's position in the vocabulary tuple, taken from
# enumerate so no word is dropped if the vocabulary size ever differs
# from a hard-coded count
Iv = [(word, idx, freq) for idx, (word, freq) in enumerate(zip(Iw, If))]
pd.DataFrame(Iv).to_csv('IMDB-vocab.txt', sep='\t', header=None, index=False)
Yv = [(word, idx, freq) for idx, (word, freq) in enumerate(zip(Yw, Yf))]
pd.DataFrame(Yv).to_csv('yelp-vocab.txt', sep='\t', header=None, index=False)