In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

In [21]:
data = pd.read_csv('labeled_data.csv', engine='python')

In [22]:
data.head()

Unnamed: 0,review,cleaned_review,pos_tagged,lemma,polarity
0,It is now past 1 PM and I just finished watchi...,It is now past PM and I just finished watching...,"[('past', 'a'), ('PM', 'n'), ('finished', 'v')...",past PM finish watch Francis Ford Coppola Go...,negative
1,I should probably go to bed.,I should probably go to bed,"[('probably', 'r'), ('go', 'v'), ('bed', 'v')]",probably go bed,neutral
2,It's late and tomorrow I have to wake up a bit...,It s late and tomorrow I have to wake up a bit...,"[('late', 'a'), ('tomorrow', 'n'), ('wake', 'v...",late tomorrow wake bit early,neutral
3,But not early enough to postpone writing these...,But not early enough to postpone writing these...,"[('early', 'r'), ('enough', 'r'), ('postpone',...",early enough postpone write line,positive
4,"Now that I have seen it three times, the oppor...",Now that I have seen it three times the opport...,"[('seen', 'v'), ('three', None), ('times', 'v'...",see three time opportunity share thought ref...,positive


In [23]:
data = data[['lemma', 'polarity']]
data

Unnamed: 0,lemma,polarity
0,past PM finish watch Francis Ford Coppola Go...,negative
1,probably go bed,neutral
2,late tomorrow wake bit early,neutral
3,early enough postpone write line,positive
4,see three time opportunity share thought ref...,positive
...,...,...
183651,way make movie purpose unless crazy,negative
183652,found helpful,neutral
183653,review helpful,positive
183654,Sign vote,neutral


In [24]:
data_each_polarity = 40000

In [25]:
pos_data = data.loc[data['polarity'] == 'positive']
pos_data = pos_data.sample(n=data_each_polarity)
pos_data

Unnamed: 0,lemma,polarity
140205,even case talent like anti talent enough dir...,positive
73635,Widow Sara Goldfarb spend day watch self hel...,positive
36119,certainly Rolling Stones Gim Shelter use won...,positive
43842,joke go go watch,positive
124586,Somebody really need stand something movie like,positive
...,...,...
132330,would well settle main theme like fantasy li...,positive
69089,contrary believe lack discussion horror slav...,positive
69435,motif simple enough love vengeance former sl...,positive
27965,review helpful,positive


In [26]:
neg_data = data.loc[data['polarity'] == 'negative']
neg_data = neg_data.sample(n=data_each_polarity)
neg_data

Unnamed: 0,lemma,polarity
66715,want sequel know could never live one favori...,negative
57778,think toward pathetic character tragic one d...,negative
66258,lie pool wool eye misconstrue make false sca...,negative
115611,notice seem pretty relax picnic open bird tr...,negative
146344,possibly bad movie ever see,negative
...,...,...
172828,think amazingly bad become comedic least fir...,negative
120154,accord bad one,negative
125410,Well bad way could ever conceive allegedly v...,negative
22951,quite tour de force scoop prize award season...,negative


In [27]:
data = pd.concat([pos_data, neg_data], ignore_index=True)
data = data.replace({'polarity': {'negative': 0, 'positive': 1}})
data

Unnamed: 0,lemma,polarity
0,even case talent like anti talent enough dir...,1
1,Widow Sara Goldfarb spend day watch self hel...,1
2,certainly Rolling Stones Gim Shelter use won...,1
3,joke go go watch,1
4,Somebody really need stand something movie like,1
...,...,...
79995,think amazingly bad become comedic least fir...,0
79996,accord bad one,0
79997,Well bad way could ever conceive allegedly v...,0
79998,quite tour de force scoop prize award season...,0


In [28]:
target = data['polarity']
lemma_data = data['lemma']

In [29]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(lemma_data)
X_train, X_test, y_train, y_test = train_test_split(
    X, target, test_size=0.2, random_state=42
)
X_train.shape

(64000, 29679)

In [30]:
rfclf = RandomForestClassifier()

In [31]:
rfclf.fit(X_train, y_train)

In [32]:
y_pred = rfclf.predict(X_test)
print(classification_report(y_pred,y_test))
print("Accuracy:",accuracy_score(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84      7973
           1       0.84      0.84      0.84      8027

    accuracy                           0.84     16000
   macro avg       0.84      0.84      0.84     16000
weighted avg       0.84      0.84      0.84     16000

Accuracy: 0.841


In [33]:
xgb_clf= GradientBoostingClassifier()

In [34]:
xgb_clf.fit(X_train,y_train)

In [35]:
y_pred_2 = xgb_clf.predict(X_test)
print(classification_report(y_pred_2,y_test))
print("Accuracy:",accuracy_score(y_pred_2, y_test))

              precision    recall  f1-score   support

           0       0.90      0.67      0.77     10806
           1       0.55      0.84      0.67      5194

    accuracy                           0.72     16000
   macro avg       0.72      0.76      0.72     16000
weighted avg       0.79      0.72      0.73     16000

Accuracy: 0.7244375


In [36]:
lgr = LogisticRegression(max_iter=1000)

In [42]:
lgr.fit(X_train,y_train)

In [43]:
y_pred_3 = lgr.predict(X_test)
print(classification_report(y_pred_3,y_test))
print("Accuracy:",accuracy_score(y_pred_3, y_test))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89      8142
           1       0.88      0.90      0.89      7858

    accuracy                           0.89     16000
   macro avg       0.89      0.89      0.89     16000
weighted avg       0.89      0.89      0.89     16000

Accuracy: 0.8926875
