In [73]:
!pip install "gensim==3.8.3"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
from gensim.test.utils import common_texts
from gensim.sklearn_api import D2VTransformer

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

In [75]:
data = pd.read_csv('labeled_data.csv', engine='python')
data.head()

Unnamed: 0,review,cleaned_review,pos_tagged,lemma,lemma_words,polarity
0,It is now past 1 PM and I just finished watchi...,It is now past PM and I just finished watching...,"[('past', 'a'), ('PM', 'n'), ('finished', 'v')...",past PM finish watch Francis Ford Coppola Go...,"['past', 'PM', 'finish', 'watch', 'Francis', '...",negative
1,I should probably go to bed.,I should probably go to bed,"[('probably', 'r'), ('go', 'v'), ('bed', 'v')]",probably go bed,"['probably', 'go', 'bed']",neutral
2,It's late and tomorrow I have to wake up a bit...,It s late and tomorrow I have to wake up a bit...,"[('late', 'a'), ('tomorrow', 'n'), ('wake', 'v...",late tomorrow wake bit early,"['late', 'tomorrow', 'wake', 'bit', 'early']",neutral
3,But not early enough to postpone writing these...,But not early enough to postpone writing these...,"[('early', 'r'), ('enough', 'r'), ('postpone',...",early enough postpone write line,"['early', 'enough', 'postpone', 'write', 'line']",positive
4,"Now that I have seen it three times, the oppor...",Now that I have seen it three times the opport...,"[('seen', 'v'), ('three', None), ('times', 'v'...",see three time opportunity share thought ref...,"['see', 'three', 'time', 'opportunity', 'share...",positive


In [76]:
data = data[['lemma','lemma_words','polarity']]
data

Unnamed: 0,lemma,lemma_words,polarity
0,past PM finish watch Francis Ford Coppola Go...,"['past', 'PM', 'finish', 'watch', 'Francis', '...",negative
1,probably go bed,"['probably', 'go', 'bed']",neutral
2,late tomorrow wake bit early,"['late', 'tomorrow', 'wake', 'bit', 'early']",neutral
3,early enough postpone write line,"['early', 'enough', 'postpone', 'write', 'line']",positive
4,see three time opportunity share thought ref...,"['see', 'three', 'time', 'opportunity', 'share...",positive
...,...,...,...
181911,Yes,['Yes'],
181912,best way possible,"['best', 'way', 'possible']",positive
181913,Troll II classic,"['Troll', 'II', 'classic']",positive
181914,Joshua family take trip Nilbog unbeknown kin...,"['Joshua', 'family', 'take', 'trip', 'Nilbog',...",negative


In [77]:
data_each_polarity = 40000

In [78]:
pos_data = data.loc[data['polarity'] == 'positive']
pos_data = pos_data.sample(n=data_each_polarity)
pos_data

Unnamed: 0,lemma,lemma_words,polarity
8124,seem people love movie act talk like movie w...,"['seem', 'people', 'love', 'movie', 'act', 'ta...",positive
69232,Strong movie definitely first film opinion s...,"['Strong', 'movie', 'definitely', 'first', 'fi...",positive
139867,truly script make laugh cry come back,"['truly', 'script', 'make', 'laugh', 'cry', 'c...",positive
71456,review helpful,"['review', 'helpful']",positive
45071,side Nebula open Quantum gap thanos army,"['side', 'Nebula', 'open', 'Quantum', 'gap', '...",positive
...,...,...,...
78215,jarring viewer transport completely differen...,"['jarring', 'viewer', 'transport', 'completely...",positive
144681,review helpful,"['review', 'helpful']",positive
107453,Davies Living Daylights play Indiana friend ...,"['Davies', 'Living', 'Daylights', 'play', 'Ind...",positive
49457,define real,"['define', 'real']",positive


In [79]:
neg_data = data.loc[data['polarity'] == 'negative']
neg_data = neg_data.sample(n=data_each_polarity)
neg_data

Unnamed: 0,lemma,lemma_words,polarity
27424,Blowing stuff,"['Blowing', 'stuff']",negative
135055,little know Scientology find picture think s...,"['little', 'know', 'Scientology', 'find', 'pic...",negative
92729,intense scene movie opinion Jodie Foster sta...,"['intense', 'scene', 'movie', 'opinion', 'Jodi...",negative
13601,Henry become like thug bis life crumbles sur...,"['Henry', 'become', 'like', 'thug', 'bis', 'li...",negative
142864,personal history way hind sight know lot hea...,"['personal', 'history', 'way', 'hind', 'sight'...",negative
...,...,...,...
61833,face Nazi charnel house statement make sink ...,"['face', 'Nazi', 'charnel', 'house', 'statemen...",negative
20046,Given role narrator mirror cum doppelg nger ...,"['Given', 'role', 'narrator', 'mirror', 'cum',...",negative
89768,suicide never react appropriate depth,"['suicide', 'never', 'react', 'appropriate', '...",negative
138699,sure stop Jonnie embrace within first minute...,"['sure', 'stop', 'Jonnie', 'embrace', 'within'...",negative


In [80]:
data = pd.concat([pos_data, neg_data], ignore_index=True)
data = data.replace({'polarity': {'negative': 0, 'positive': 1}})
data

Unnamed: 0,lemma,lemma_words,polarity
0,seem people love movie act talk like movie w...,"['seem', 'people', 'love', 'movie', 'act', 'ta...",1
1,Strong movie definitely first film opinion s...,"['Strong', 'movie', 'definitely', 'first', 'fi...",1
2,truly script make laugh cry come back,"['truly', 'script', 'make', 'laugh', 'cry', 'c...",1
3,review helpful,"['review', 'helpful']",1
4,side Nebula open Quantum gap thanos army,"['side', 'Nebula', 'open', 'Quantum', 'gap', '...",1
...,...,...,...
79995,face Nazi charnel house statement make sink ...,"['face', 'Nazi', 'charnel', 'house', 'statemen...",0
79996,Given role narrator mirror cum doppelg nger ...,"['Given', 'role', 'narrator', 'mirror', 'cum',...",0
79997,suicide never react appropriate depth,"['suicide', 'never', 'react', 'appropriate', '...",0
79998,sure stop Jonnie embrace within first minute...,"['sure', 'stop', 'Jonnie', 'embrace', 'within'...",0


In [81]:
def transform_and_split(model, data, target):
  X = model.fit_transform(data)
  X_train, X_test, y_train, y_test = train_test_split(
    X, target, test_size=0.2, random_state=42
  )
  return X_train, X_test, y_train, y_test

In [82]:
def fit_and_benchmark(model, X_train,X_test, y_train, y_test):
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  print(classification_report(y_pred,y_test))
  print("Accuracy:",accuracy_score(y_pred, y_test))

In [83]:
vectorizer = TfidfVectorizer()
doc2vec = D2VTransformer(min_count=1, size=300)

In [84]:
rfclf = RandomForestClassifier()
xgb_clf= GradientBoostingClassifier()
lgr = LogisticRegression(max_iter=1000)

In [96]:
target = data['polarity']
lemma_data = data['lemma']
lemma_words_data = data['lemma_words'].apply(lambda x: x[1:-1].split(','))

In [86]:
X_train, X_test, y_train, y_test = transform_and_split(vectorizer ,lemma_data, target)

In [87]:
fit_and_benchmark(rfclf, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.84      0.85      0.84      7970
           1       0.85      0.84      0.84      8030

    accuracy                           0.84     16000
   macro avg       0.84      0.84      0.84     16000
weighted avg       0.84      0.84      0.84     16000

Accuracy: 0.8433125


In [88]:
fit_and_benchmark(xgb_clf, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.90      0.67      0.77     10753
           1       0.55      0.84      0.67      5247

    accuracy                           0.72     16000
   macro avg       0.72      0.75      0.72     16000
weighted avg       0.78      0.72      0.73     16000

Accuracy: 0.72475


In [89]:
fit_and_benchmark(lgr, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.90      0.89      0.90      8090
           1       0.89      0.90      0.89      7910

    accuracy                           0.89     16000
   macro avg       0.89      0.89      0.89     16000
weighted avg       0.89      0.89      0.89     16000

Accuracy: 0.8946875


In [99]:
X_train, X_test, y_train, y_test = transform_and_split(doc2vec ,lemma_words_data, target)

In [100]:
fit_and_benchmark(rfclf, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.71      0.61      0.66      9376
           1       0.54      0.65      0.59      6624

    accuracy                           0.63     16000
   macro avg       0.63      0.63      0.62     16000
weighted avg       0.64      0.63      0.63     16000

Accuracy: 0.6265625


In [101]:
fit_and_benchmark(xgb_clf, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.73      0.61      0.67      9676
           1       0.53      0.66      0.59      6324

    accuracy                           0.63     16000
   macro avg       0.63      0.64      0.63     16000
weighted avg       0.65      0.63      0.63     16000

Accuracy: 0.6306875


In [102]:
fit_and_benchmark(lgr, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.64      0.61      0.63      8437
           1       0.59      0.62      0.61      7563

    accuracy                           0.62     16000
   macro avg       0.62      0.62      0.62     16000
weighted avg       0.62      0.62      0.62     16000

Accuracy: 0.617
