In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

In [13]:
data = pd.read_csv('labeled_data.csv', engine='python')

In [None]:
data.head()

In [None]:
data = data[['lemma', 'polarity']]
data

Unnamed: 0,lemma,polarity
0,past PM finish watch Francis Ford Coppola Go...,negative
1,probably go bed,neutral
2,late tomorrow wake bit early,neutral
3,early enough postpone write line,positive
4,see three time opportunity share thought ref...,positive
...,...,...
183651,way make movie purpose unless crazy,negative
183652,found helpful,neutral
183653,review helpful,positive
183654,Sign vote,neutral


In [None]:
data_each_polarity = 40000

In [None]:
pos_data = data.loc[data['polarity'] == 'positive']
pos_data = pos_data.sample(n=data_each_polarity)
pos_data

Unnamed: 0,lemma,polarity
140205,even case talent like anti talent enough dir...,positive
73635,Widow Sara Goldfarb spend day watch self hel...,positive
36119,certainly Rolling Stones Gim Shelter use won...,positive
43842,joke go go watch,positive
124586,Somebody really need stand something movie like,positive
...,...,...
132330,would well settle main theme like fantasy li...,positive
69089,contrary believe lack discussion horror slav...,positive
69435,motif simple enough love vengeance former sl...,positive
27965,review helpful,positive


In [None]:
neg_data = data.loc[data['polarity'] == 'negative']
neg_data = neg_data.sample(n=data_each_polarity)
neg_data

Unnamed: 0,lemma,polarity
66715,want sequel know could never live one favori...,negative
57778,think toward pathetic character tragic one d...,negative
66258,lie pool wool eye misconstrue make false sca...,negative
115611,notice seem pretty relax picnic open bird tr...,negative
146344,possibly bad movie ever see,negative
...,...,...
172828,think amazingly bad become comedic least fir...,negative
120154,accord bad one,negative
125410,Well bad way could ever conceive allegedly v...,negative
22951,quite tour de force scoop prize award season...,negative


In [None]:
data = pd.concat([pos_data, neg_data], ignore_index=True)
data = data.replace({'polarity': {'negative': 0, 'positive': 1}})
data

Unnamed: 0,lemma,polarity
0,even case talent like anti talent enough dir...,1
1,Widow Sara Goldfarb spend day watch self hel...,1
2,certainly Rolling Stones Gim Shelter use won...,1
3,joke go go watch,1
4,Somebody really need stand something movie like,1
...,...,...
79995,think amazingly bad become comedic least fir...,0
79996,accord bad one,0
79997,Well bad way could ever conceive allegedly v...,0
79998,quite tour de force scoop prize award season...,0


In [None]:
target = data['polarity']
lemma_data = data['lemma']

In [None]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(lemma_data)
X_train, X_test, y_train, y_test = train_test_split(
    X, target, test_size=0.2, random_state=42
)
X_train.shape

(64000, 29679)

In [None]:
rfclf = RandomForestClassifier()

In [None]:
rfclf.fit(X_train, y_train)

In [None]:
y_pred = rfclf.predict(X_test)
print(classification_report(y_pred,y_test))
print("Accuracy:",accuracy_score(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84      7973
           1       0.84      0.84      0.84      8027

    accuracy                           0.84     16000
   macro avg       0.84      0.84      0.84     16000
weighted avg       0.84      0.84      0.84     16000

Accuracy: 0.841


In [None]:
xgb_clf= GradientBoostingClassifier()

In [None]:
xgb_clf.fit(X_train,y_train)

In [None]:
y_pred_2 = xgb_clf.predict(X_test)
print(classification_report(y_pred_2,y_test))
print("Accuracy:",accuracy_score(y_pred_2, y_test))

              precision    recall  f1-score   support

           0       0.90      0.67      0.77     10806
           1       0.55      0.84      0.67      5194

    accuracy                           0.72     16000
   macro avg       0.72      0.76      0.72     16000
weighted avg       0.79      0.72      0.73     16000

Accuracy: 0.7244375


In [None]:
lgr = LogisticRegression(max_iter=1000)

In [None]:
lgr.fit(X_train,y_train)

In [None]:
y_pred_3 = lgr.predict(X_test)
print(classification_report(y_pred_3,y_test))
print("Accuracy:",accuracy_score(y_pred_3, y_test))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89      8142
           1       0.88      0.90      0.89      7858

    accuracy                           0.89     16000
   macro avg       0.89      0.89      0.89     16000
weighted avg       0.89      0.89      0.89     16000

Accuracy: 0.8926875


In [3]:
!pip install "gensim==3.8.3"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==3.8.3
  Downloading gensim-3.8.3.tar.gz (23.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.4/23.4 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gensim
  Building wheel for gensim (setup.py) ... [?25l[?25hdone
  Created wheel for gensim: filename=gensim-3.8.3-cp39-cp39-linux_x86_64.whl size=26528065 sha256=b217e804d4295cc5b30db479d7bedea32799ee01a5d1df986bee2503852c8bce
  Stored in directory: /root/.cache/pip/wheels/ca/5d/af/618594ec2f28608c1d6ee7d2b7e95a3e9b06551e3b80a491d6
Successfully built gensim
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 4.3.1
    Uninstalling gensim-4.3.1:
      Successfully uninstalled gensim-4.3.1
Successfully installed gensim-3.8.3


In [14]:
from gensim.test.utils import common_texts
from gensim.sklearn_api import D2VTransformer
import nltk
nltk.download('popular')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

In [15]:
def tokenize(text):
  return word_tokenize(text)

In [16]:
data['word_tokenize'] = data['review'].apply(tokenize)
data

In [18]:

model = D2VTransformer(min_count=1, size=300)
docvecs = model.fit_transform(data['word_tokenize'])  

In [20]:
docvecs.shape

(183656, 300)