In [12]:
import os
import pandas as pd

# directories are:
# './data/train/pos'
# './data/train/neg'
# './data/test/pos'
# './data/test/neg'

In [62]:
def getRating(name):
    name = name.split('_')[1]
    name = name.split('.')[0]
    return int(name)

def clean_file_as_str(content):
    ret = ''
    for line in content:
        t = line.replace('<br />', ' ')
        t.strip('.#)(*&^!@,?')
        ret += ' ' + t.replace('\n', ' ')
    return ret

In [65]:
# format as DataFrame, [content, rating, positiveness]
def format_folder(path):
    table = list()
    names = os.listdir(path)
    positiveness = path.split('/')[-1]
    for name in names:
        rating = getRating(name)
        with open(path + '/' + name) as file:
            content = file.readlines()
            content = clean_file_as_str(content)
            file.close()
        table.append([content, rating, positiveness])
    df = pd.DataFrame(data=table, columns=['content', 'rating', 'positiveness'])
    return df
    

## Excract data from files

In [68]:
df1 = format_folder('./data/test/neg')
df1.head()

Unnamed: 0,content,rating,positiveness
0,Alan Rickman & Emma Thompson give good perfor...,4,neg
1,I have seen this movie and I did not care for...,1,neg
2,"In Los Angeles, the alcoholic and lazy Hank C...",4,neg
3,"This film is bundled along with ""Gli fumavano...",2,neg
4,I only comment on really very good films and ...,1,neg


In [70]:
df1.describe()

Unnamed: 0,rating
count,12500.0
mean,2.22312
std,1.182611
min,1.0
25%,1.0
50%,2.0
75%,3.0
max,4.0


In [71]:
df2 = format_folder('./data/train/neg')
df2.head()

Unnamed: 0,content,rating,positiveness
0,Working with one of the best Shakespeare sour...,4,neg
1,"Well...tremors I, the original started off in...",1,neg
2,Ouch! This one was a bit painful to sit throu...,4,neg
3,"I've seen some crappy movies in my life, but ...",1,neg
4,"""Carriers"" follows the exploits of two guys a...",3,neg


In [72]:
df2.describe()

Unnamed: 0,rating
count,12500.0
mean,2.21696
std,1.19053
min,1.0
25%,1.0
50%,2.0
75%,3.0
max,4.0


In [73]:
df3 = format_folder('./data/test/pos')
df3.head()

Unnamed: 0,content,rating,positiveness
0,"Based on an actual story, John Boorman shows ...",9,pos
1,This is a gem. As a Film Four production - th...,9,pos
2,"I really like this show. It has drama, romanc...",9,pos
3,This is the best 3-D experience Disney has at...,10,pos
4,"Of the Korean movies I've seen, only three ha...",10,pos


In [74]:
df3.describe()

Unnamed: 0,rating
count,12500.0
mean,8.8028
std,1.152119
min,7.0
25%,8.0
50%,9.0
75%,10.0
max,10.0


In [75]:
df4 = format_folder('./data/train/pos')
df4.head()

Unnamed: 0,content,rating,positiveness
0,For a movie that gets no respect there sure a...,9,pos
1,Bizarre horror movie filled with famous faces...,8,pos
2,"A solid, if unremarkable film. Matthau, as Ei...",7,pos
3,It's a strange feeling to sit alone in a thea...,8,pos
4,"You probably all already know this by now, bu...",10,pos


In [76]:
df4.describe()

Unnamed: 0,rating
count,12500.0
mean,8.73848
std,1.161772
min,7.0
25%,8.0
50%,9.0
75%,10.0
max,10.0


## Form a unified dataframe for neg and pos

In [77]:
df_neg = pd.concat(objs=[df1,df2], axis=0)
df_neg.describe()

Unnamed: 0,rating
count,25000.0
mean,2.22004
std,1.186558
min,1.0
25%,1.0
50%,2.0
75%,3.0
max,4.0


In [78]:
df_pos = pd.concat(objs=[df3,df4], axis=0)
df_pos.describe()

Unnamed: 0,rating
count,25000.0
mean,8.77064
std,1.15738
min,7.0
25%,8.0
50%,9.0
75%,10.0
max,10.0


In [80]:
df_pos.iloc[0][0]

' Based on an actual story, John Boorman shows the struggle of an American doctor, whose husband and son were murdered and she was continually plagued with her loss. A holiday to Burma with her sister seemed like a good idea to get away from it all, but when her passport was stolen in Rangoon, she could not leave the country with her sister, and was forced to stay back until she could get I.D. papers from the American embassy. To fill in a day before she could fly out, she took a trip into the countryside with a tour guide. "I tried finding something in those stone statues, but nothing stirred in me. I was stone myself."   Suddenly all hell broke loose and she was caught in a political revolt. Just when it looked like she had escaped and safely boarded a train, she saw her tour guide get beaten and shot. In a split second she decided to jump from the moving train and try to rescue him, with no thought of herself. Continually her life was in danger.   Here is a woman who demonstrated sp

## Pickle all the data

In [82]:
import pickle
pickle.dump(obj=df_neg, file=open('./data/pickle/neg.pickle', 'wb'))
pickle.dump(obj=df_pos, file=open('./data/pickle/pos.pickle', 'wb'))

## Equality check

In [86]:
df_pos_load = pickle.load(open('./data/pickle/pos.pickle', 'rb'))
df_pos.equals(df_pos_load)

True

In [90]:
df_neg_load = pickle.load(open('./data/pickle/neg.pickle', 'rb'))
df_neg.equals(df_neg_load)

True