In [3]:
import pandas as pd
import numpy as np
from glob import glob

In [7]:
train_pos_list = []
train_neg_list = []

pos_list = ['train/pos/*.txt', 'test/pos/*.txt']
neg_list = ['train/neg/*.txt', 'test/neg/*.txt']

for pos in pos_list:
    for f in glob(pos):
        with open(f) as temp_f:
            train_pos_list.append(temp_f.read().replace('<br />',''))
series_pos = pd.Series(train_pos_list)


for neg in neg_list:
    for f in glob(neg):
        with open(f) as temp_f:
            train_neg_list.append(temp_f.read().replace('<br />',''))
series_neg = pd.Series(train_neg_list)

In [8]:
df_pos = pd.DataFrame(columns=['review'],data=series_pos) # all positive reviews
df_neg = pd.DataFrame(columns=['review'],data=series_neg) # all negative reviews

In [9]:
df_pos['label'] = [1 for i in range(0, 25000)] # add labels
df_neg['label'] = [0 for i in range(0, 25000)] # add labels
df_all_reviews = pd.concat([df_pos, df_neg]).reset_index(drop=True) # merge pos reviews & neg reviews

In [10]:
df_pos

Unnamed: 0,review,label
0,For a movie that gets no respect there sure ar...,1
1,Bizarre horror movie filled with famous faces ...,1
2,"A solid, if unremarkable film. Matthau, as Ein...",1
3,It's a strange feeling to sit alone in a theat...,1
4,"You probably all already know this by now, but...",1
5,I saw the movie with two grown children. Altho...,1
6,You're using the IMDb.You've given some hefty ...,1
7,This was a good film with a powerful message o...,1
8,"Made after QUARTET was, TRIO continued the qua...",1
9,"For a mature man, to admit that he shed a tear...",1


In [11]:
df_neg

Unnamed: 0,review,label
0,Working with one of the best Shakespeare sourc...,0
1,"Well...tremors I, the original started off in ...",0
2,Ouch! This one was a bit painful to sit throug...,0
3,"I've seen some crappy movies in my life, but t...",0
4,"""Carriers"" follows the exploits of two guys an...",0
5,I had been looking forward to seeing this film...,0
6,Effect(s) without cause is generally not possi...,0
7,"This picture started out with good intentions,...",0
8,I chose to see this movie because it got a goo...,0
9,This film has to be the worst I have ever seen...,0


In [12]:
df_all_reviews

Unnamed: 0,review,label
0,For a movie that gets no respect there sure ar...,1
1,Bizarre horror movie filled with famous faces ...,1
2,"A solid, if unremarkable film. Matthau, as Ein...",1
3,It's a strange feeling to sit alone in a theat...,1
4,"You probably all already know this by now, but...",1
5,I saw the movie with two grown children. Altho...,1
6,You're using the IMDb.You've given some hefty ...,1
7,This was a good film with a powerful message o...,1
8,"Made after QUARTET was, TRIO continued the qua...",1
9,"For a mature man, to admit that he shed a tear...",1


In [4]:
from sklearn.model_selection import train_test_split

In [13]:
X = df_all_reviews['review']
y = df_all_reviews['label']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [15]:
print(y_train.mean(), y_test.mean()) #  the ratio of positive to negative reviews is 1:1

0.5 0.5


# Using sklearn


In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [170]:
# Vectorizer: CountVectorizer
# Model: MultinomialNB
# Grams: 1

clf_count = Pipeline([                 
    ('vec', CountVectorizer()),  
    ('nb', MultinomialNB())      
])
clf_count.fit(X_train, y_train)   
y_pred = clf_count.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy: ',accuracy_score(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.83      0.88      0.85      7500
          1       0.87      0.82      0.84      7500

avg / total       0.85      0.85      0.85     15000

Accuracy:  0.8468666666666667


In [171]:
# Vectorizer: TfidfVectorizer
# Model: MultinomialNB
# Grams: 1

clf_tf = Pipeline([                 
    ('vec', TfidfVectorizer()),  
    ('nb', MultinomialNB())      
])
clf_tf.fit(X_train, y_train)   
y_pred = clf_tf.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy: ',accuracy_score(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.85      0.89      0.87      7500
          1       0.88      0.84      0.86      7500

avg / total       0.87      0.87      0.87     15000

Accuracy:  0.8654666666666667


In [173]:
# Vectorizer: CountVectorizer
# Model: LogisticRegression
# Grams: 1

clf_lr_count = Pipeline([                 
    ('vec', CountVectorizer()),  
    ('lr', LogisticRegression())      
])
clf_lr_count.fit(X_train, y_train)   
y_pred = clf_lr_count.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy: ',accuracy_score(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.90      0.88      0.89      7500
          1       0.88      0.90      0.89      7500

avg / total       0.89      0.89      0.89     15000

Accuracy:  0.8914


In [174]:
# Vectorizer: TfidfVectorizer
# Model: LogisticRegression
# Grams: 1

clf_lr_tf = Pipeline([                 
    ('vec', TfidfVectorizer()),  
    ('lr', LogisticRegression())      
])
clf_lr_tf.fit(X_train, y_train)   
y_pred = clf_lr_tf.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy: ',accuracy_score(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.91      0.89      0.90      7500
          1       0.89      0.91      0.90      7500

avg / total       0.90      0.90      0.90     15000

Accuracy:  0.8978666666666667


In [176]:
# Vectorizer: CountVectorizer
# Model: MultinomialNB
# Grams: 2

clf_count_bigram = Pipeline([                 
    ('vec', CountVectorizer(ngram_range=(1, 2))),  
    ('nb', MultinomialNB())
])
clf_count_bigram.fit(X_train, y_train)   
y_pred = clf_count_bigram.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy: ',accuracy_score(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.87      0.90      0.88      7500
          1       0.89      0.87      0.88      7500

avg / total       0.88      0.88      0.88     15000

Accuracy:  0.8816


In [171]:
# Vectorizer: TfidfVectorizer
# Model: MultinomialNB
# Grams: 2

clf_tf_bigram = Pipeline([                 
    ('vec', TfidfVectorizer(ngram_range=(1, 2))),  
    ('nb', MultinomialNB())      
])
clf_tf_bigram.fit(X_train, y_train)   
y_pred = clf_tf_bigram.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy: ',accuracy_score(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.85      0.89      0.87      7500
          1       0.88      0.84      0.86      7500

avg / total       0.87      0.87      0.87     15000

Accuracy:  0.8654666666666667


In [168]:
# Vectorizer: CountVectorizer
# Model: LogisticRegression
# Grams: 2

clf_lr_count_bigram = Pipeline([                 
    ('vec', CountVectorizer(ngram_range=(1, 2))),  
    ('lr', LogisticRegression())      
])
clf_lr_count_bigram.fit(X_train, y_train)   
y_pred = clf_lr_count_bigram.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy: ',accuracy_score(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.92      0.90      0.91      7500
          1       0.90      0.92      0.91      7500

avg / total       0.91      0.91      0.91     15000

Accuracy:  0.9102666666666667


In [174]:
# Vectorizer: TfidfVectorizer
# Model: LogisticRegression
# Grams: 2

clf_lr_tf_bigram = Pipeline([                 
    ('vec', TfidfVectorizer(ngram_range=(1, 2))),  
    ('lr', LogisticRegression())      
])
clf_lr_tf_bigram.fit(X_train, y_train)   
y_pred = clf_lr_tf_bigram.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy: ',accuracy_score(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.91      0.89      0.90      7500
          1       0.89      0.91      0.90      7500

avg / total       0.90      0.90      0.90     15000

Accuracy:  0.8978666666666667


In [315]:
import pickle

# The model useing following parameters has the highest score.
# Vectorizer: CountVectorizer
# Model: LogisticRegression
# Grams: 2

with open('model.pkl','wb') as f:
    pickle.dump(clf_lr_count_bigram, f)

In [316]:
with open('model.pkl','rb') as f:
    model_sklearn = pickle.load(f)

In [317]:
model_sklearn.predict_proba(['This is a good movie.'])

array([[0.22564186, 0.77435814]])

# Using fastText 


In [16]:
df_fast_train = pd.DataFrame({'label':y_train, 'review':X_train})
df_fast_train = df_fast_train.replace(1,'__label__positive')
df_fast_train = df_fast_train.replace(0,'__label__negative')

In [17]:
df_fast_train

Unnamed: 0,label,review
11978,__label__positive,"Before there was Crash, there was this interes..."
1532,__label__positive,In the mid-1930s Hollywood was regaining its c...
25683,__label__negative,I have a piece of advice for the people who ma...
45025,__label__negative,First the premise stinks...little boy likes to...
2700,__label__positive,"This is a so called 'feel-good' movies, howeve..."
23118,__label__positive,The best so-bad-it's-good movie ever made. Rud...
40091,__label__negative,"Acting 10, Script 1. ""Hurlyburly"" is from that..."
9550,__label__positive,This film is more about how children make sens...
11581,__label__positive,"Wow, it's been years since I last saw this mov..."
229,__label__positive,We fans of Ed Wood tend to be an obsessive bun...


In [18]:
df_fast_test = pd.DataFrame({'label':y_test, 'review':X_test})
df_fast_test = df_fast_test.replace(1,'__label__positive')
df_fast_test = df_fast_test.replace(0,'__label__negative')

In [19]:
df_fast_test

Unnamed: 0,label,review
40518,__label__negative,That's how I was when I walked (staggered) out...
43509,__label__negative,A true yawner and a bad film even for the Chan...
23419,__label__positive,A touching movie. It is full of emotions and w...
47043,__label__negative,A feminist tract in which if you the viewer be...
49474,__label__negative,I picked out this DVD out of the cheepo bin at...
4219,__label__positive,people claim its edited funny but they had to ...
92,__label__positive,"I happened upon this film by accident, and rea..."
21136,__label__positive,The first bottom movie was an absolute laugh f...
38411,__label__negative,We screened this movie in a club as an example...
31400,__label__negative,I remember watching this movie when I was youn...


In [52]:
df_fast_train.to_csv('train.txt',sep=' ',index=False,header=False)
df_fast_test.to_csv('test.txt',sep=' ',index=False,header=False)

In [250]:
X_test.to_csv('test_fasttext.txt',sep=' ',index=False,header=False) # dump reviews without labels, used to test

In [23]:
from fastText import train_supervised
from fastText import load_model

In [245]:
model = train_supervised(
     input="train.txt",
     epoch=50,
     minCount=1,
     wordNgrams=2,
)

model.save_model("model.bin")  

In [24]:
model = load_model("model.bin")

In [314]:
result = model.test("test.txt")
result

(15000, 0.8989333333333334, 0.8989333333333334)

In [25]:
y_true_fasttext = df_fast_test['label']

In [26]:
y_pred_fasttext = []

with open('test_fasttext.txt') as reviews:
    for review in reviews:
        y_pred_label, score = model.predict(review.strip('\n'))   
        y_pred_fasttext.append(y_pred_label[0])

In [27]:
# computer the metrics of fastText
print(classification_report(y_true_fasttext, y_pred_fasttext))
print('Accuracy: ',accuracy_score(y_true_fasttext, y_pred_fasttext))

                   precision    recall  f1-score   support

__label__negative       0.91      0.89      0.90      7500
__label__positive       0.89      0.91      0.90      7500

      avg / total       0.90      0.90      0.90     15000

Accuracy:  0.8989333333333334
