In [96]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

In [97]:
data = pd.read_csv('labeled_data.csv', engine='python')

In [98]:
data.head()

Unnamed: 0,review,cleaned_review,pos_tagged,lemma,polarity
0,It is now past 1 PM and I just finished watchi...,It is now past PM and I just finished watching...,"[('past', 'a'), ('PM', 'n'), ('finished', 'v')...",past PM finish watch Francis Ford Coppola Go...,negative
1,I should probably go to bed.,I should probably go to bed,"[('probably', 'r'), ('go', 'v'), ('bed', 'v')]",probably go bed,neutral
2,It's late and tomorrow I have to wake up a bit...,It s late and tomorrow I have to wake up a bit...,"[('late', 'a'), ('tomorrow', 'n'), ('wake', 'v...",late tomorrow wake bit early,neutral
3,But not early enough to postpone writing these...,But not early enough to postpone writing these...,"[('early', 'r'), ('enough', 'r'), ('postpone',...",early enough postpone write line,positive
4,"Now that I have seen it three times, the oppor...",Now that I have seen it three times the opport...,"[('seen', 'v'), ('three', None), ('times', 'v'...",see three time opportunity share thought ref...,positive


In [99]:
data = data[['lemma', 'polarity']]
data

Unnamed: 0,lemma,polarity
0,past PM finish watch Francis Ford Coppola Go...,negative
1,probably go bed,neutral
2,late tomorrow wake bit early,neutral
3,early enough postpone write line,positive
4,see three time opportunity share thought ref...,positive
...,...,...
183651,way make movie purpose unless crazy,negative
183652,found helpful,neutral
183653,review helpful,positive
183654,Sign vote,neutral


In [100]:
data_each_polarity = 40000

In [101]:
pos_data = data.loc[data['polarity'] == 'positive']
pos_data = pos_data.sample(n=data_each_polarity)
pos_data

Unnamed: 0,lemma,polarity
39038,start comment let assure Q Tarantino fan lov...,positive
109565,fun watch Disaster Movie,positive
45998,may much actor sheer awfulness performance B...,positive
89039,Jack Nicholsons performance best actor right...,positive
61705,Whiplash also feature one intense epic endin...,positive
...,...,...
25508,dream reality importance truly live,positive
28893,big screen far away best place view masterpi...,positive
64540,final sequence cheer Teller drum slap Simmons,positive
48521,story family watch,positive


In [102]:
neg_data = data.loc[data['polarity'] == 'negative']
neg_data = neg_data.sample(n=data_each_polarity)
neg_data

Unnamed: 0,lemma,polarity
85683,Silence Lambs also become bona fide cinemati...,negative
143049,film adaptation Antigone English class view ...,negative
65077,Arthur Fleck Joaquin Phoenix mentally unstab...,negative
113879,quite remarkable though crew yes crew manage...,negative
137243,Travolta vanity also blame movie undo,negative
...,...,...
164776,suppose believe one extraordinarily trashy m...,negative
29380,Shore score perfect pure cinematic fantasy S...,negative
86238,OK put mentally ill people kind prison anymo...,negative
112083,Fans extremely cheesy cinema ala Ed Wood mis...,negative


In [103]:
data = pd.concat([pos_data, neg_data], ignore_index=True)
data = data.replace({'polarity': {'negative': 0, 'positive': 1}})
data

Unnamed: 0,lemma,polarity
0,start comment let assure Q Tarantino fan lov...,1
1,fun watch Disaster Movie,1
2,may much actor sheer awfulness performance B...,1
3,Jack Nicholsons performance best actor right...,1
4,Whiplash also feature one intense epic endin...,1
...,...,...
79995,suppose believe one extraordinarily trashy m...,0
79996,Shore score perfect pure cinematic fantasy S...,0
79997,OK put mentally ill people kind prison anymo...,0
79998,Fans extremely cheesy cinema ala Ed Wood mis...,0


In [104]:
target = data['polarity']
lemma_data = data['lemma']

In [105]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(lemma_data)
X_train, X_test, y_train, y_test = train_test_split(
    X, target, test_size=0.2, random_state=42
)
X_train

<64000x29742 sparse matrix of type '<class 'numpy.float64'>'
	with 692137 stored elements in Compressed Sparse Row format>

In [106]:
rfclf = RandomForestClassifier()

In [107]:
rfclf.fit(X_train, y_train)

In [108]:
y_pred = rfclf.predict(X_test)
print(classification_report(y_pred,y_test))
print("Accuracy:",accuracy_score(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      8041
           1       0.85      0.85      0.85      7959

    accuracy                           0.85     16000
   macro avg       0.85      0.85      0.85     16000
weighted avg       0.85      0.85      0.85     16000

Accuracy: 0.85025


In [109]:
xgb_clf= GradientBoostingClassifier()

In [110]:
xgb_clf.fit(X_train,y_train)

In [111]:
y_pred_2 = xgb_clf.predict(X_test)
print(classification_report(y_pred_2,y_test))
print("Accuracy:",accuracy_score(y_pred_2, y_test))

              precision    recall  f1-score   support

           0       0.91      0.67      0.77     10788
           1       0.56      0.86      0.68      5212

    accuracy                           0.73     16000
   macro avg       0.73      0.77      0.73     16000
weighted avg       0.79      0.73      0.74     16000

Accuracy: 0.7344375


In [112]:
gnb = GaussianNB()

In [None]:
gnb.fit(X_train.toarray(),y_train)

In [None]:
y_pred_3 = xgb_clf.predict(X_test)
print(classification_report(y_pred_3,y_test))
print("Accuracy:",accuracy_score(y_pred_3, y_test))