In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import utils
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

import multiprocessing
cores = multiprocessing.cpu_count()
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import nltk
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [2]:
data = pd.read_csv('labeled_data.csv', engine='python')
data.head()

Unnamed: 0,review,cleaned_review,pos_tagged,lemma,lemma_words,polarity
0,It is now past 1 PM and I just finished watchi...,It is now past PM and I just finished watching...,"[('past', 'a'), ('PM', 'n'), ('finished', 'v')...",past PM finish watch Francis Ford Coppola Go...,"['past', 'PM', 'finish', 'watch', 'Francis', '...",negative
1,I should probably go to bed.,I should probably go to bed,"[('probably', 'r'), ('go', 'v'), ('bed', 'v')]",probably go bed,"['probably', 'go', 'bed']",neutral
2,It's late and tomorrow I have to wake up a bit...,It s late and tomorrow I have to wake up a bit...,"[('late', 'a'), ('tomorrow', 'n'), ('wake', 'v...",late tomorrow wake bit early,"['late', 'tomorrow', 'wake', 'bit', 'early']",neutral
3,But not early enough to postpone writing these...,But not early enough to postpone writing these...,"[('early', 'r'), ('enough', 'r'), ('postpone',...",early enough postpone write line,"['early', 'enough', 'postpone', 'write', 'line']",positive
4,"Now that I have seen it three times, the oppor...",Now that I have seen it three times the opport...,"[('seen', 'v'), ('three', None), ('times', 'v'...",see three time opportunity share thought ref...,"['see', 'three', 'time', 'opportunity', 'share...",positive


In [3]:
data = data[['lemma','lemma_words','polarity']]
data

Unnamed: 0,lemma,lemma_words,polarity
0,past PM finish watch Francis Ford Coppola Go...,"['past', 'PM', 'finish', 'watch', 'Francis', '...",negative
1,probably go bed,"['probably', 'go', 'bed']",neutral
2,late tomorrow wake bit early,"['late', 'tomorrow', 'wake', 'bit', 'early']",neutral
3,early enough postpone write line,"['early', 'enough', 'postpone', 'write', 'line']",positive
4,see three time opportunity share thought ref...,"['see', 'three', 'time', 'opportunity', 'share...",positive
...,...,...,...
181911,Yes,['Yes'],
181912,best way possible,"['best', 'way', 'possible']",positive
181913,Troll II classic,"['Troll', 'II', 'classic']",positive
181914,Joshua family take trip Nilbog unbeknown kin...,"['Joshua', 'family', 'take', 'trip', 'Nilbog',...",negative


In [4]:
data_each_polarity = 40000

In [5]:
pos_data = data.loc[data['polarity'] == 'positive']
pos_data = pos_data.sample(n=data_each_polarity)
pos_data

Unnamed: 0,lemma,lemma_words,polarity
34272,classic Scorsese moment favorite scene DiCap...,"['classic', 'Scorsese', 'moment', 'favorite', ...",positive
126018,gore zombie look fake zombie suppose smart e...,"['gore', 'zombie', 'look', 'fake', 'zombie', '...",positive
88491,movie make stop think think go buy ticket wa...,"['movie', 'make', 'stop', 'think', 'think', 'g...",positive
105346,review helpful,"['review', 'helpful']",positive
52657,watch tonight,"['watch', 'tonight']",positive
...,...,...,...
91247,even see trailer yet,"['even', 'see', 'trailer', 'yet']",positive
158882,guess character suppose persuasive,"['guess', 'character', 'suppose', 'persuasive']",positive
24519,review helpful,"['review', 'helpful']",positive
86399,realize drug end even beautiful dream charac...,"['realize', 'drug', 'end', 'even', 'beautiful'...",positive


In [6]:
neg_data = data.loc[data['polarity'] == 'negative']
neg_data = neg_data.sample(n=data_each_polarity)
neg_data

Unnamed: 0,lemma,lemma_words,polarity
97991,many people quite either prologue leave Scot...,"['many', 'people', 'quite', 'either', 'prologu...",negative
51356,end Matrix however leave think great,"['end', 'Matrix', 'however', 'leave', 'think',...",negative
46911,One team sacrifice attain one Infinity Stones,"['One', 'team', 'sacrifice', 'attain', 'one', ...",negative
180975,Everything wrong right bad toothed vegetaria...,"['Everything', 'wrong', 'right', 'bad', 'tooth...",negative
97626,complex plot tell retired cop terror height ...,"['complex', 'plot', 'tell', 'retired', 'cop', ...",negative
...,...,...,...
84406,film stay Aronofsky give last impression gre...,"['film', 'stay', 'Aronofsky', 'give', 'last', ...",negative
174964,Mario Van Peebles annoy way Shark kill ridic...,"['Mario', 'Van', 'Peebles', 'annoy', 'way', 'S...",negative
9556,people talk piece crap usually bring distoib...,"['people', 'talk', 'piece', 'crap', 'usually',...",negative
166119,Wow probably one ugly movie ever see,"['Wow', 'probably', 'one', 'ugly', 'movie', 'e...",negative


In [7]:
data = pd.concat([pos_data, neg_data], ignore_index=True)
data = data.replace({'polarity': {'negative': 0, 'positive': 1}})
data

Unnamed: 0,lemma,lemma_words,polarity
0,classic Scorsese moment favorite scene DiCap...,"['classic', 'Scorsese', 'moment', 'favorite', ...",1
1,gore zombie look fake zombie suppose smart e...,"['gore', 'zombie', 'look', 'fake', 'zombie', '...",1
2,movie make stop think think go buy ticket wa...,"['movie', 'make', 'stop', 'think', 'think', 'g...",1
3,review helpful,"['review', 'helpful']",1
4,watch tonight,"['watch', 'tonight']",1
...,...,...,...
79995,film stay Aronofsky give last impression gre...,"['film', 'stay', 'Aronofsky', 'give', 'last', ...",0
79996,Mario Van Peebles annoy way Shark kill ridic...,"['Mario', 'Van', 'Peebles', 'annoy', 'way', 'S...",0
79997,people talk piece crap usually bring distoib...,"['people', 'talk', 'piece', 'crap', 'usually',...",0
79998,Wow probably one ugly movie ever see,"['Wow', 'probably', 'one', 'ugly', 'movie', 'e...",0


In [8]:
def fit_and_benchmark(model, X_train,X_test, y_train, y_test):
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  print(classification_report(y_pred,y_test))
  print("Accuracy:",accuracy_score(y_pred, y_test))

In [9]:
rfclf = RandomForestClassifier()
xgb_clf= GradientBoostingClassifier()
lgr = LogisticRegression(max_iter=1000)

In [10]:
target = data['polarity']
lemma_data = data['lemma']
lemma_words_data = data['lemma_words'].apply(lambda x: x[1:-1].split(','))

### TF-IDF

In [11]:
def transform_and_split(model, data, target):
  X = model.fit_transform(data)
  X_train, X_test, y_train, y_test = train_test_split(
    X, target, test_size=0.2, random_state=42
  )
  return X_train, X_test, y_train, y_test

In [12]:
vectorizer = TfidfVectorizer()

In [13]:
X_train, X_test, y_train, y_test = transform_and_split(vectorizer ,lemma_data, target)

In [14]:
fit_and_benchmark(rfclf, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.86      0.85      0.85      8103
           1       0.84      0.85      0.85      7897

    accuracy                           0.85     16000
   macro avg       0.85      0.85      0.85     16000
weighted avg       0.85      0.85      0.85     16000

Accuracy: 0.85025


In [15]:
fit_and_benchmark(xgb_clf, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.90      0.67      0.76     10806
           1       0.55      0.84      0.66      5194

    accuracy                           0.72     16000
   macro avg       0.72      0.75      0.71     16000
weighted avg       0.78      0.72      0.73     16000

Accuracy: 0.7215625


In [16]:
fit_and_benchmark(lgr, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.90      0.89      0.90      8123
           1       0.89      0.90      0.89      7877

    accuracy                           0.90     16000
   macro avg       0.90      0.90      0.90     16000
weighted avg       0.90      0.90      0.90     16000

Accuracy: 0.895625


### doc2vec

In [41]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [42]:
train_tagged = train.apply(
    lambda r: TaggedDocument(words=r.lemma_words, tags=[r.polarity]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=r.lemma_words, tags=[r.polarity]), axis=1)

In [43]:
d2v = Doc2Vec(dm=0, vector_size=300, min_count=2, workers=cores)
d2v.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 64000/64000 [00:00<00:00, 1527849.15it/s]


In [44]:
%%time
for epoch in range(30):
    d2v.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    d2v.alpha -= 0.002
    d2v.min_alpha = d2v.alpha

100%|██████████| 64000/64000 [00:00<00:00, 1833475.33it/s]
100%|██████████| 64000/64000 [00:00<00:00, 3055611.34it/s]
100%|██████████| 64000/64000 [00:00<00:00, 4007516.18it/s]
100%|██████████| 64000/64000 [00:00<00:00, 3558358.60it/s]
100%|██████████| 64000/64000 [00:00<00:00, 2916920.64it/s]
100%|██████████| 64000/64000 [00:00<00:00, 2059517.54it/s]
100%|██████████| 64000/64000 [00:00<00:00, 2002636.92it/s]
100%|██████████| 64000/64000 [00:00<00:00, 3430616.59it/s]
100%|██████████| 64000/64000 [00:00<00:00, 1688676.89it/s]
100%|██████████| 64000/64000 [00:00<00:00, 1837252.53it/s]
100%|██████████| 64000/64000 [00:00<00:00, 2911447.46it/s]
100%|██████████| 64000/64000 [00:00<00:00, 2790157.33it/s]
100%|██████████| 64000/64000 [00:00<00:00, 3371288.25it/s]
100%|██████████| 64000/64000 [00:00<00:00, 2069983.47it/s]
100%|██████████| 64000/64000 [00:00<00:00, 3557274.04it/s]
100%|██████████| 64000/64000 [00:00<00:00, 3523468.61it/s]
100%|██████████| 64000/64000 [00:00<00:00, 3775304.22it/

CPU times: total: 12.3 s
Wall time: 1min 44s


In [45]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(list(doc.words))) for doc in sents])
    return targets, regressors

In [46]:
y_train, X_train = vec_for_learning(d2v, train_tagged)
y_test, X_test = vec_for_learning(d2v, test_tagged)

In [47]:
fit_and_benchmark(rfclf, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.63      0.53      0.57      9502
           1       0.44      0.54      0.48      6498

    accuracy                           0.53     16000
   macro avg       0.53      0.53      0.53     16000
weighted avg       0.55      0.53      0.54     16000

Accuracy: 0.5333125


In [48]:
fit_and_benchmark(xgb_clf, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.81      0.52      0.64     12429
           1       0.26      0.58      0.36      3571

    accuracy                           0.54     16000
   macro avg       0.53      0.55      0.50     16000
weighted avg       0.69      0.54      0.57     16000

Accuracy: 0.535625


In [49]:
fit_and_benchmark(lgr, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.65      0.52      0.58     10053
           1       0.40      0.53      0.45      5947

    accuracy                           0.52     16000
   macro avg       0.52      0.53      0.52     16000
weighted avg       0.56      0.52      0.53     16000

Accuracy: 0.52425
