## Classify whether a given movie review is positive or negative.

In [1]:
#Import necessary libraries

import pandas as pd
import numpy as np



### About Data: IMDB Dataset
- This data consists of two columns. - review - sentiment
- Reviews are the statements given by users after watching the movie.
- sentiment feature tells whether the given review is positive or negative.

In [2]:
df = pd.read_csv('movies_sentiment_data.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive
1,I enjoyed the movie and the story immensely! I...,positive
2,I had a hard time sitting through this. Every ...,negative
3,It's hard to imagine that anyone could find th...,negative
4,This is one military drama I like a lot! Tom B...,positive


In [4]:
df.shape

(19000, 2)

In [5]:
df['Category'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [6]:
df.head()

Unnamed: 0,review,sentiment,Category
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive,1
1,I enjoyed the movie and the story immensely! I...,positive,1
2,I had a hard time sitting through this. Every ...,negative,0
3,It's hard to imagine that anyone could find th...,negative,0
4,This is one military drama I like a lot! Tom B...,positive,1


In [47]:
df['Category'].value_counts()

Category
1    9500
0    9500
Name: count, dtype: int64

### Excercise 1: using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative.

- use CountVectorizer for pre-processing the text.
- use Random Forest as the classifier with estimators as 50 and criterion as entropy.
- print the classification report.



In [9]:
from sklearn.model_selection import train_test_split

In [13]:
X_train,X_test,y_train,y_test= train_test_split(df['review'],df['Category'],test_size=0.2,random_state=0)

In [14]:
X_train.shape

(15200,)

In [16]:
X_test.shape

(3800,)

#### Count vectorizer 

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
v = CountVectorizer()

In [20]:
X_train_cv = v.fit_transform(X_train)

In [21]:
X_test_cv = v.transform(X_test)

In [22]:
from sklearn.ensemble import RandomForestClassifier

#### Random Forest Model

In [24]:
model = RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=42)

In [25]:
model.fit(X_train_cv,y_train)

In [26]:
y_pred = model.predict(X_test_cv)

#### Report

In [27]:
from sklearn.metrics import classification_report

In [28]:
report = classification_report(y_test,y_pred)

In [29]:
print(report)

              precision    recall  f1-score   support

           0       0.83      0.83      0.83      1908
           1       0.83      0.83      0.83      1892

    accuracy                           0.83      3800
   macro avg       0.83      0.83      0.83      3800
weighted avg       0.83      0.83      0.83      3800



In [49]:
clf = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('RF',RandomForestClassifier())
])

In [50]:
clf.fit(X_train,y_train)

In [51]:
report = classification_report(y_test,clf.predict(X_test))

In [52]:
print(report)

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      1908
           1       0.85      0.85      0.85      1892

    accuracy                           0.85      3800
   macro avg       0.85      0.85      0.85      3800
weighted avg       0.85      0.85      0.85      3800



### Exercise 2 : using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative.

- use CountVectorizer for pre-processing the text.
- use KNN as the classifier with n_neighbors of 10 and metric as 'euclidean'.
- print the classification report.


#### Using pipeline

In [31]:
from sklearn.pipeline import Pipeline


In [32]:
from  sklearn.neighbors import KNeighborsClassifier

In [36]:
clf = Pipeline([
    ('vectorizor',CountVectorizer()),
    ('KNN', KNeighborsClassifier())
])

In [37]:
clf.fit(X_train,y_train)

In [38]:
report = classification_report(y_test,clf.predict(X_test))

In [39]:
print(report)

              precision    recall  f1-score   support

           0       0.64      0.55      0.59      1908
           1       0.60      0.69      0.64      1892

    accuracy                           0.62      3800
   macro avg       0.62      0.62      0.62      3800
weighted avg       0.62      0.62      0.62      3800



### Exercise 3 : using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative.

- use CountVectorizer for pre-processing the text.
- use Multinomial Naive Bayes as the classifier.
- print the classification report.

In [43]:
clf = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('nb',MultinomialNB())
])

In [44]:
clf.fit(X_train,y_train)

In [45]:
report = classification_report(y_test,clf.predict(X_test))

In [46]:
print(report)

              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1908
           1       0.87      0.82      0.85      1892

    accuracy                           0.85      3800
   macro avg       0.85      0.85      0.85      3800
weighted avg       0.85      0.85      0.85      3800

