In [1]:
# =========== EXERCISES =============
# In this Exercise, you are going to classify whether a given movie review is positive or negative.
# you are going to use Bag of words for pre-processing the text and apply different classification algorithms.
# Sklearn CountVectorizer has the inbuilt implementations for Bag of Words.
# ===================================


#Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [2]:
# About Data: IMDB Dataset

# Credits: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?resource=download

# - This data consists of two columns. - review - sentiment
# - Reviews are the statements given by users after watching the movie.
# - sentiment feature tells whether the given review is positive or negative.

# ===============================================


#1. read the data provided in the same directory with name 'movies_sentiment_data.csv' and store it in df variable
df = pd.read_csv("movies_sentiment_data.csv")



#2. print the shape of the data
print(df.shape)

#3. print top 5 datapoints
df.head()

(19000, 2)


Unnamed: 0,review,sentiment
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive
1,I enjoyed the movie and the story immensely! I...,positive
2,I had a hard time sitting through this. Every ...,negative
3,It's hard to imagine that anyone could find th...,negative
4,This is one military drama I like a lot! Tom B...,positive


In [3]:
#creating a new column "Category" which represent 1 if the sentiment is positive or 0 if it is negative
df['Category'] = df['sentiment'].apply(lambda x:1 if x=='positive' else 0)

In [4]:
#check the distribution of 'Category' and see whether the Target labels are balanced or not.
df['Category'].value_counts()

1    9500
0    9500
Name: Category, dtype: int64

In [5]:
#Do the 'train-test' splitting with test size of 20%
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.review, df.Category, test_size=0.2)

In [7]:
# Exercise-1

# using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative.
# Note:

# use CountVectorizer for pre-processing the text.

# use Random Forest as the classifier with estimators as 50 and criterion as entropy.

# print the classification report.

# References:

# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

# =====================================================

#1. create a pipeline object
clf = Pipeline([
    ('vectorizer', CountVectorizer()), #initializing the vectorizer using the RandomForest classifier
    ('random_forest', (RandomForestClassifier(n_estimators=50, criterion='entropy')))
])


#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))


# ==========================
# As you can see above, for both the classes (positive and negative sentiment) we got more than 80% precision, 
# recall and f1- score. This seems to be an acceptable performance.

              precision    recall  f1-score   support

           0       0.83      0.85      0.84      1882
           1       0.85      0.83      0.84      1918

    accuracy                           0.84      3800
   macro avg       0.84      0.84      0.84      3800
weighted avg       0.84      0.84      0.84      3800



In [8]:
# Exercise-2

# using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative..
# Note:

# use CountVectorizer for pre-processing the text.
# use KNN as the classifier with n_neighbors of 10 and metric as 'euclidean'.
# print the classification report.
# References:

# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
# =============================================

#1. create a pipeline object
clf = Pipeline([
    ('vectorizer', CountVectorizer()), #using the KNN classifier with 10 neighbors 
    ('KNN', (KNeighborsClassifier(n_neighbors=10, metric='euclidean')))
])


#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))


# ==========================
# Hmmm..here the performance of various metrics (precision, recall etc.) seem to be lower (~60 %). 
# Let's try one more classifier and then discuss why performance is varying so much

              precision    recall  f1-score   support

           0       0.64      0.62      0.63      1882
           1       0.64      0.66      0.65      1918

    accuracy                           0.64      3800
   macro avg       0.64      0.64      0.64      3800
weighted avg       0.64      0.64      0.64      3800



In [10]:
# Exercise-3

# using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative..
# Note:

# use CountVectorizer for pre-processing the text.
# use Multinomial Naive Bayes as the classifier.
# print the classification report.
# References:

# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
# ====================================


#1. create a pipeline object
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('Multi NB', MultinomialNB()) #using the Multinomial Naive Bayes classifier 
])


#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

# ========================================
# That's great! MultinomialNB model for both the classes (positive and negative sentiment) we got more than 80% precision, 
# recall and f1- score and performed equally good with Random Forest. This seems to be an acceptable performance.

              precision    recall  f1-score   support

           0       0.83      0.88      0.85      1882
           1       0.87      0.82      0.85      1918

    accuracy                           0.85      3800
   macro avg       0.85      0.85      0.85      3800
weighted avg       0.85      0.85      0.85      3800

