In [5]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


In [7]:
df = pd.read_csv('../csv/IMDB Dataset.csv')
df.sample(5)

Unnamed: 0,review,sentiment
4573,"I enjoyed watching Cliffhanger, at the beginni...",positive
5964,"Contrary to most other commentators, I deeply ...",negative
41651,The ghost of the Vietnam war has haunted the A...,negative
39920,Spoken like a true hard-boiled u'an gangsta. T...,negative
18321,The only reason I even gave it a 1 out of 10 i...,negative


In [8]:
df['reaction'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
df.sample(5)

Unnamed: 0,review,sentiment,reaction
49814,The gates of Hell opened up and spit out this ...,negative,0
41831,"A fantastic movie, and very overlooked. Gary h...",positive,1
14431,Those who love Elivra as I did in her late nig...,positive,1
4418,There are no spoilers here... Because there is...,negative,0
22244,Hello all! I went to this movie without any ex...,positive,1


In [9]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['reaction'], test_size=0.3)

In [12]:
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('random_forest', RandomForestClassifier(n_estimators=50, criterion='entropy'))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

1
2
3
              precision    recall  f1-score   support

           0       0.84      0.86      0.85      7473
           1       0.85      0.83      0.84      7527

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000



In [14]:
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('knn', KNeighborsClassifier(n_neighbors=5, metric='euclidean'))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.57      0.61      7473
           1       0.62      0.70      0.66      7527

    accuracy                           0.63     15000
   macro avg       0.64      0.63      0.63     15000
weighted avg       0.64      0.63      0.63     15000



In [15]:
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('naive_bayes', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86      7473
           1       0.88      0.82      0.85      7527

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000

