In [60]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

## Loading the Dataset

In [15]:
from pathlib import Path
import zipfile

source = Path("/content/imdb_reviews.zip")
dest = Path("/content/imdb_reviews")

if not dest.is_dir():
    dest.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(source, "r") as zip_ref:
        print(f"[INFO] Unzipping dataset `{source}` to `{dest}`...")
        zip_ref.extractall(dest)

print(f"[INFO] Dataset succesfully downloaded to `{dest}`..")

[INFO] Dataset succesfully downloaded to `/content/imdb_reviews`..


## Understanding the Dataset

In [16]:
df = pd.read_csv("/content/imdb_reviews/IMDB Dataset.csv")

df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Modifying the Dataset

In [17]:
df["Sentiment"] = df["sentiment"].apply(lambda x: 1 if x == "positive" else 0)
df["Review"] = df["review"]

df = df[["Review", "Sentiment"]]

df.head(5)

Unnamed: 0,Review,Sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [18]:
# Getting the number of positive and negative reviews
df.value_counts("Sentiment")

Sentiment
0    25000
1    25000
dtype: int64

## Splitting the Dataset into Training and Testing Sets

In [21]:
test_prop = 0.2

x_train, x_test, y_train, y_test = train_test_split(df.Review, df.Sentiment, test_size=test_prop)

print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

40000 40000
10000 10000


In [26]:
print(x_test[:3])
print(y_test[:3])

16202    I though that it was hard, if not impossible, ...
21192    When I go to see a movie about zombie's, I'm n...
429      What can I say? Not as bad as many here have m...
Name: Review, dtype: object
16202    0
21192    0
429      1
Name: Sentiment, dtype: int64


### Initialize the BOW object

In [30]:
bow_v = CountVectorizer()

### Creating the Train and Test Compres Vectors

In [32]:
x_train_cv = bow_v.fit_transform(x_train.values)
x_test_cv = bow_v.transform(x_test.values)

In [36]:
# Getting the vocabulary
list(bow_v.vocabulary_)[:10]

['this',
 'is',
 'one',
 'of',
 'the',
 'classic',
 'twilight',
 'zone',
 'episodes',
 'where']

## Initializing the Model (R-Forest)

In [45]:
model = RandomForestClassifier(n_estimators=50, criterion="entropy")

### Training the Model

In [46]:
model.fit(x_train_cv, y_train.values)

### Evaluating the Model

In [53]:
y_preds = model.predict(x_test_cv)

print(classification_report(y_test.values, y_preds))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84      5034
           1       0.84      0.83      0.84      4966

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



## Initializing the Model (KNN)

In [56]:
model1 = KNeighborsClassifier(n_neighbors=10, metric="euclidean")

### Training the Model

In [57]:
model1.fit(x_train_cv, y_train.values)

### Evaluating the Model

In [58]:
y_preds1 = model1.predict(x_test_cv)

print(classification_report(y_test.values, y_preds1))

              precision    recall  f1-score   support

           0       0.66      0.67      0.67      5034
           1       0.66      0.65      0.66      4966

    accuracy                           0.66     10000
   macro avg       0.66      0.66      0.66     10000
weighted avg       0.66      0.66      0.66     10000



## Initializng the Model (N-Bayes)

In [61]:
model2 = MultinomialNB()

### Training the Model

In [62]:
model2.fit(x_train_cv, y_train.values)

### Evaluating the Model

In [64]:
y_preds2 = model2.predict(x_test_cv)

print(classification_report(y_test.values, y_preds2))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85      5034
           1       0.87      0.81      0.84      4966

    accuracy                           0.84     10000
   macro avg       0.85      0.84      0.84     10000
weighted avg       0.85      0.84      0.84     10000

