In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [2]:
# Download the dataset file from Google Drive by its file id (uses the `gdown` command-line tool).
!gdown 1027oN3DtvUGk_f26LGbm1WkMQF36_N5A

Downloading...
From: https://drive.google.com/uc?id=1027oN3DtvUGk_f26LGbm1WkMQF36_N5A
To: /content/IMDB Dataset.csv
100% 66.2M/66.2M [00:01<00:00, 35.1MB/s]


In [3]:
# Read the downloaded CSV into a DataFrame, then preview its first rows.
csv_path = "/content/IMDB Dataset.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Count how many reviews carry each sentiment label.
df["sentiment"].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [5]:
# Binary target: 1 where the sentiment label is 'positive', 0 for anything else.
df['positive'] = df['sentiment'].eq('positive').astype('int64')

In [6]:
# Preview the frame again to check the newly added 'positive' column.
df.head()

Unnamed: 0,review,sentiment,positive
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


# Train/test split

In [7]:
# train_test_split is already imported in the notebook's first cell.
# Hold out 20% of the reviews for evaluation. The fixed random_state makes the
# shuffle repeatable, so a Restart & Run All yields the same split and the
# same metrics for every model trained below.
X_train, X_test, y_train, y_test = train_test_split(
    df.review,
    df.positive,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Sanity check: training features and labels should have the same length.
print(X_train.shape,y_train.shape)

(40000,) (40000,)


In [9]:
# Sanity check: test features and labels should have the same length.
print(X_test.shape,y_test.shape)

(10000,) (10000,)


In [10]:
# Dimensions of the full frame as (rows, columns).
df.shape

(50000, 3)

In [11]:
# Another look at the first rows of the full frame.
df.head()

Unnamed: 0,review,sentiment,positive
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1



# Train the Random Forest model

In [12]:
# Pipeline is already imported in the notebook's first cell.
# Bag-of-words random forest: CountVectorizer turns each review into token
# counts, and a 50-tree forest with entropy splits classifies them.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', RandomForestClassifier(
        n_estimators=50,
        criterion='entropy',
        random_state=42,  # fixed seed so repeated runs build the same forest
        n_jobs=-1,        # trees are independent, so fit them on all cores
    )),
])


In [13]:
# Fit the whole pipeline (vectorizer, then random forest) on the training reviews.
clf.fit(X_train, y_train)

In [14]:
# Predict on the held-out reviews and print per-class precision, recall and F1.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.84      0.84      0.84      4969
           1       0.84      0.84      0.84      5031

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000




# Train the KNN model

In [15]:
from sklearn.pipeline import Pipeline

# k-nearest-neighbours on token counts: 10 neighbours, Euclidean distance.
knn_steps = [
    ('vectorizer', CountVectorizer()),
    ('classifier', KNeighborsClassifier(metric='euclidean', n_neighbors=10)),
]
clf = Pipeline(steps=knn_steps)


In [16]:
# Fit the whole pipeline (vectorizer, then KNN classifier) on the training reviews.
clf.fit(X_train, y_train)

In [17]:
# Predict on the held-out reviews and print per-class precision, recall and F1.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.65      0.66      0.66      4969
           1       0.66      0.65      0.66      5031

    accuracy                           0.66     10000
   macro avg       0.66      0.66      0.66     10000
weighted avg       0.66      0.66      0.66     10000




# Train the Naive Bayes model

In [18]:
from sklearn.pipeline import Pipeline

# Multinomial naive Bayes over token counts, using the estimator's default settings.
nb_steps = [
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
]
clf = Pipeline(steps=nb_steps)

In [19]:
# Fit the whole pipeline (vectorizer, then naive Bayes) on the training reviews.
clf.fit(X_train, y_train)

In [20]:
# Predict on the held-out reviews and print per-class precision, recall and F1.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.82      0.87      0.85      4969
           1       0.86      0.82      0.84      5031

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



Link: https://colab.research.google.com/drive/1sUcjPmxAVnwG2HVFoKyztwp_V1gaTots?usp=sharing