## Connect with Google Drive

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
drive.mount("/content/drive")

Mounted at /content/drive


# Import Relevant Libraries, Functions and Classes

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pickle

# Import the Dataset

In [None]:
# Location of the review dataset inside the mounted Google Drive.
DATASET_PATH = "/content/drive/MyDrive/IRWA Assignment-02/Review Classification/New IMDB Dataset.csv"

# Load the reviews and their sentiment labels into a DataFrame.
data = pd.read_csv(DATASET_PATH)

# Initial Analysis of Dataset

In [None]:
# Preview the first rows of the freshly loaded dataset.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Randomly select (at most) 5000 rows from the dataframe.
# The outputs recorded further down (shape (5000, 2), shuffled index labels)
# were produced with this sampling active, so it has to run for a fresh
# "Restart & Run All" to match them. It also keeps the dense bag-of-words
# array built later to a manageable size.
# random_state pins the draw so every run selects the same rows; min() avoids
# a ValueError if the file holds fewer than 5000 rows.
data = data.sample(n=min(5000, len(data)), random_state=0)

In [None]:
# Preview the first rows of the dataframe at this point in the notebook.
data.head()

Unnamed: 0,review,sentiment
23546,Just PPV'd this. I don't want to waste too muc...,negative
4707,This movie makes me want to throw up every tim...,negative
7065,Almost missed it. While visiting friends in Ph...,positive
9593,If I had never seen an episode of the original...,negative
37596,a friend gave it to me saying it was another c...,negative


In [None]:
# Count the missing values in every column.
data.isna().sum()

review       0
sentiment    0
dtype: int64

In [None]:
# (rows, columns) of the dataframe.
data.shape

(5000, 2)

# Label Encoding

In [None]:
# Encoder used to turn the sentiment strings into integer class labels.
le = LabelEncoder()

In [None]:
# Learn the label classes and replace the sentiment column with their
# integer codes in a single step.
data['sentiment'] = le.fit_transform(data['sentiment'])

# Split Input and Output Data

In [None]:
# Model input (review text) and target (encoded sentiment).
x, y = data["review"], data["sentiment"]

In [None]:
# Preview the first review texts.
x.head()

23546    Just PPV'd this. I don't want to waste too muc...
4707     This movie makes me want to throw up every tim...
7065     Almost missed it. While visiting friends in Ph...
9593     If I had never seen an episode of the original...
37596    a friend gave it to me saying it was another c...
Name: review, dtype: object

In [None]:
# Preview the first encoded sentiment labels.
y.head()

23546    0
4707     0
7065     1
9593     0
37596    0
Name: sentiment, dtype: int64

# Create Bag of Words (CountVectorizer) Model

In [None]:
# Learn the vocabulary from the reviews and encode every review as a row of
# token counts (bag of words).
# NOTE(review): the vocabulary is fitted on all reviews, including those that
# later end up in the test split -- consider fitting on the training reviews
# only.
count_vectorizer = CountVectorizer()
cv = count_vectorizer.fit_transform(raw_documents=x)

In [None]:
# Show the summary of the sparse document-term matrix.
cv

<5000x38688 sparse matrix of type '<class 'numpy.int64'>'
	with 678228 stored elements in Compressed Sparse Row format>

In [None]:
# Convert the sparse document-term matrix to a dense NumPy array; every zero
# count is then stored explicitly, so memory use grows with rows x vocabulary.
# NOTE(review): the estimator is later given sparse input at prediction time,
# so this dense copy may be avoidable -- confirm before removing.
cv_array=cv.toarray()
cv_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Check Class Imbalance

In [None]:
# Number of reviews per encoded sentiment label.
y.value_counts()

1    2524
0    2476
Name: sentiment, dtype: int64

#### The two classes have nearly equal counts, so there is no need to balance them artificially

# Split Train and Test Data

In [None]:
# Hold out 30% of the samples for testing; random_state fixes the shuffle so
# the split is the same on every run.
split_parts = train_test_split(cv_array, y, test_size=0.3, random_state=0)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Dimensions of the training feature matrix.
x_train.shape

(3500, 38688)

In [None]:
# Dimensions of the test feature matrix.
x_test.shape

(1500, 38688)

In [None]:
# Number of training labels.
y_train.shape

(3500,)

In [None]:
# Number of test labels.
y_test.shape

(1500,)

# The Model

In [None]:
# Random forest with 100 trees. random_state fixes the bootstrap samples and
# feature subsets so the reported accuracy and the pickled model are the same
# on every run of the notebook.
model = RandomForestClassifier(n_estimators=100, random_state=0)

# Train the Model

In [None]:
# Fit the forest on the training features and labels.
model.fit(X=x_train, y=y_train)

# Prediction and Accuracy of the Model

In [None]:
# Predicted sentiment labels for the held-out test features.
pred = model.predict(X=x_test)

In [None]:
# Fraction of test reviews whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# Report the test-set accuracy.
accuracy_message = f"Accuracy of the Model: {accuracy}"
print(accuracy_message)

Accuracy of the Model: 0.828


# Save count_vectorizer to Pickle File

In [None]:
# Persist the fitted vectorizer so the same vocabulary can be reused when
# classifying new reviews. The context manager makes sure the file handle is
# flushed and closed, even if pickling raises.
with open('Count Vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)

# Checking the Model with Sample Reviews

In [None]:
# Sanity check: vectorize one short review and show its predicted label.
sample_features = count_vectorizer.transform(["This is a very good movie"])
model.predict(sample_features)[0]

1

In [None]:
# Sanity check: vectorize one short review and show its predicted label.
sample_features = count_vectorizer.transform(["This is a bad movie"])
model.predict(sample_features)[0]

0

In [None]:
# Sanity check: vectorize one short review and show its predicted label.
sample_features = count_vectorizer.transform(["This is a incredible movie"])
model.predict(sample_features)[0]

1

In [None]:
# Sanity check: vectorize one short review and show its predicted label.
sample_features = count_vectorizer.transform(["This is a damn bad movie"])
model.predict(sample_features)[0]

0

In [None]:
# Sanity check: vectorize one short review and show its predicted label.
sample_features = count_vectorizer.transform(["What is this bad"])
model.predict(sample_features)[0]

0

# Save Model to Pickle File

In [None]:
# Persist the trained classifier to disk. The context manager makes sure the
# file handle is flushed and closed, even if pickling raises.
with open('Review Classification Model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)