In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

In [3]:
data = pd.read_csv('labeled_data.csv', engine='python')

In [4]:
data.head()

Unnamed: 0,review,cleaned_review,pos_tagged,lemma,polarity
0,It is now past 1 PM and I just finished watchi...,It is now past PM and I just finished watching...,"[('past', 'a'), ('PM', 'n'), ('finished', 'v')...",past PM finish watch Francis Ford Coppola Go...,negative
1,I should probably go to bed.,I should probably go to bed,"[('probably', 'r'), ('go', 'v'), ('bed', 'v')]",probably go bed,neutral
2,It's late and tomorrow I have to wake up a bit...,It s late and tomorrow I have to wake up a bit...,"[('late', 'a'), ('tomorrow', 'n'), ('wake', 'v...",late tomorrow wake bit early,neutral
3,But not early enough to postpone writing these...,But not early enough to postpone writing these...,"[('early', 'r'), ('enough', 'r'), ('postpone',...",early enough postpone write line,positive
4,"Now that I have seen it three times, the oppor...",Now that I have seen it three times the opport...,"[('seen', 'v'), ('three', None), ('times', 'v'...",see three time opportunity share thought ref...,positive


In [5]:
data = data[['lemma', 'polarity']]
data

Unnamed: 0,lemma,polarity
0,past PM finish watch Francis Ford Coppola Go...,negative
1,probably go bed,neutral
2,late tomorrow wake bit early,neutral
3,early enough postpone write line,positive
4,see three time opportunity share thought ref...,positive
...,...,...
183651,way make movie purpose unless crazy,negative
183652,found helpful,neutral
183653,review helpful,positive
183654,Sign vote,neutral


In [6]:
data_each_polarity = 40000

In [7]:
pos_data = data.loc[data['polarity'] == 'positive']
pos_data = pos_data.sample(n=data_each_polarity)
pos_data

Unnamed: 0,lemma,polarity
35212,already long movie see whole give envelope s...,positive
77852,Prestige favorite movie Nolan,positive
67326,love way bring Franco original Django intera...,positive
71427,shock anymore repetitious,positive
166425,like part Scary Movie love Another Teen movie,positive
...,...,...
8468,De Niro cast Jimmy Conway also merit perfect...,positive
133301,Please understand opinion sure day belligere...,positive
150706,Anway movie mostly love story,positive
154024,vacation time plenty time devote fun movie,positive


In [8]:
neg_data = data.loc[data['polarity'] == 'negative']
neg_data = neg_data.sample(n=data_each_polarity)
neg_data

Unnamed: 0,lemma,polarity
148326,imply may play form basketball ocean laundry...,negative
163383,seem assume role character take soul heart c...,negative
82434,house Illinois empty Starling find house Jac...,negative
153681,favor see store hide put everyone danger buy,negative
12917,undergraduate student take discrete math gra...,negative
...,...,...
22339,energy ambition put scheme could use start s...,negative
138599,wrong,negative
133617,kids version scary movie film even par,negative
115274,typically zone watch funny bad movie plot lu...,negative


In [9]:
data = pd.concat([pos_data, neg_data], ignore_index=True)
data = data.replace({'polarity': {'negative': 0, 'positive': 1}})
data

Unnamed: 0,lemma,polarity
0,already long movie see whole give envelope s...,1
1,Prestige favorite movie Nolan,1
2,love way bring Franco original Django intera...,1
3,shock anymore repetitious,1
4,like part Scary Movie love Another Teen movie,1
...,...,...
79995,energy ambition put scheme could use start s...,0
79996,wrong,0
79997,kids version scary movie film even par,0
79998,typically zone watch funny bad movie plot lu...,0


In [10]:
target = data['polarity']
lemma_data = data['lemma']

In [12]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(lemma_data)
X_train, X_test, y_train, y_test = train_test_split(
    X, target, test_size=0.2, random_state=42
)
X_train.shape

(64000, 29645)

In [None]:
rfclf = RandomForestClassifier()

In [None]:
rfclf.fit(X_train, y_train)

In [None]:
y_pred = rfclf.predict(X_test)
print(classification_report(y_pred,y_test))
print("Accuracy:",accuracy_score(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.85      0.84      0.85      8074
           1       0.84      0.85      0.85      7926

    accuracy                           0.85     16000
   macro avg       0.85      0.85      0.85     16000
weighted avg       0.85      0.85      0.85     16000

Accuracy: 0.8459375


In [None]:
xgb_clf= GradientBoostingClassifier()

In [None]:
xgb_clf.fit(X_train,y_train)

In [None]:
y_pred_2 = xgb_clf.predict(X_test)
print(classification_report(y_pred_2,y_test))
print("Accuracy:",accuracy_score(y_pred_2, y_test))

              precision    recall  f1-score   support

           0       0.90      0.67      0.77     10812
           1       0.55      0.85      0.67      5188

    accuracy                           0.73     16000
   macro avg       0.73      0.76      0.72     16000
weighted avg       0.79      0.73      0.74     16000

Accuracy: 0.7278125


In [17]:
lgr = LogisticRegression(max_iter=1000)

In [18]:
lgr.fit(X_train.toarray(),y_train)

In [19]:
y_pred_3 = lgr.predict(X_test)
print(classification_report(y_pred_3,y_test))
print("Accuracy:",accuracy_score(y_pred_3, y_test))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89      8158
           1       0.88      0.90      0.89      7842

    accuracy                           0.89     16000
   macro avg       0.89      0.89      0.89     16000
weighted avg       0.89      0.89      0.89     16000

Accuracy: 0.8913125
