In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import classification_report


In [10]:
data_train = pd.read_csv(r'train.csv', encoding='unicode_escape')


In [11]:
data_train.head()

Unnamed: 0,ï»¿text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [12]:
data_test = pd.read_csv(r'test.csv', encoding='unicode_escape')


In [13]:
data_test.tail()

Unnamed: 0,ï»¿text,label
4995,This is the kind of picture John Lassiter woul...,1
4996,A MUST SEE! I saw WHIPPED at a press screening...,1
4997,NBC should be ashamed. I wouldn't allow my chi...,0
4998,This movie is a clumsy mishmash of various gho...,0
4999,Formula movie about the illegitimate son of a ...,0


In [14]:
#Renaming column name for readability
data_train.rename(columns = {'ï»¿text':'Movie_Review'}, inplace = True)
data_train.head()

Unnamed: 0,Movie_Review,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [15]:
#Renaming column name for readability
data_test.rename(columns = {'ï»¿text':'Movie_Review'}, inplace = True)
data_test.tail()

Unnamed: 0,Movie_Review,label
4995,This is the kind of picture John Lassiter woul...,1
4996,A MUST SEE! I saw WHIPPED at a press screening...,1
4997,NBC should be ashamed. I wouldn't allow my chi...,0
4998,This movie is a clumsy mishmash of various gho...,0
4999,Formula movie about the illegitimate son of a ...,0


In [16]:
data_test.head()

Unnamed: 0,Movie_Review,label
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0
3,The most interesting thing about Miryang (Secr...,1
4,"when i first read about ""berlin am meer"" i did...",0


In [17]:
#Concatenating train and test datasets into one dataframe
data = pd.concat([data_train, data_test], axis=0, ignore_index=True)

In [18]:
data

Unnamed: 0,Movie_Review,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
44995,This is the kind of picture John Lassiter woul...,1
44996,A MUST SEE! I saw WHIPPED at a press screening...,1
44997,NBC should be ashamed. I wouldn't allow my chi...,0
44998,This movie is a clumsy mishmash of various gho...,0


In [19]:
data.iloc[44999]

Movie_Review    Formula movie about the illegitimate son of a ...
label                                                           0
Name: 44999, dtype: object

In [22]:
data.tail()

Unnamed: 0,Movie_Review,label
44995,This is the kind of picture John Lassiter woul...,1
44996,A MUST SEE! I saw WHIPPED at a press screening...,1
44997,NBC should be ashamed. I wouldn't allow my chi...,0
44998,This movie is a clumsy mishmash of various gho...,0
44999,Formula movie about the illegitimate son of a ...,0


In [23]:
data.shape

(45000, 2)

In [24]:
#nltk required downloads
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [25]:
#Function to preprocess the data and return the clean copus
def get_corpus(df):
 corpus = []
 for i in range(0, 45000):
  #Word tokenization
  tokenizer = RegexpTokenizer(r'\w+')
  review=tokenizer.tokenize(data['Movie_Review'][i])
  #Lowercase conversion
  review =[word.lower() for word in review if word.isalpha()]

  # Instantiating WordNeLemmatizer class
  lemmatizer=WordNetLemmatizer()
 
  #Lemmatization
  review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords.words('english'))] 

  review = ' '.join(review) 
  corpus.append(review)
  
 return corpus

In [26]:
corpus=get_corpus(data)

In [27]:
#Feature Extraction
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()

y = data.iloc[:, 1].values

In [28]:
X.shape


(45000, 1500)

In [29]:
#Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [30]:
y.shape

(45000,)

In [31]:
y

array([0, 0, 0, ..., 0, 0, 0])

In [32]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [33]:
#Naive Bayes Classifier
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

In [34]:
y_pred_nb = classifier.predict(X_test)
y_pred_nb

array([1, 0, 0, ..., 0, 0, 1])

In [35]:
from sklearn.metrics import accuracy_score


In [36]:
as_nb = accuracy_score(y_test, y_pred_nb) 
as_nb

0.7826666666666666

In [37]:
cm_nb = confusion_matrix(y_test, y_pred_nb) 
cm_nb

array([[3884,  643],
       [1313, 3160]])

In [38]:
print(classification_report(y_test, y_pred_nb ))

              precision    recall  f1-score   support

           0       0.75      0.86      0.80      4527
           1       0.83      0.71      0.76      4473

    accuracy                           0.78      9000
   macro avg       0.79      0.78      0.78      9000
weighted avg       0.79      0.78      0.78      9000



In [39]:
#Random Forest Classifier
classifier = RandomForestClassifier(n_estimators = 300, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=300, random_state=0)

In [40]:
y_pred_rf = classifier.predict(X_test)
y_pred_rf

array([1, 0, 1, ..., 0, 0, 1])

In [41]:
cm_rf = confusion_matrix(y_test, y_pred_rf)
cm_rf

array([[3812,  715],
       [ 666, 3807]])

In [42]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.85      0.84      0.85      4527
           1       0.84      0.85      0.85      4473

    accuracy                           0.85      9000
   macro avg       0.85      0.85      0.85      9000
weighted avg       0.85      0.85      0.85      9000



In [43]:
as_rf = accuracy_score(y_test, y_pred_rf) 
as_rf

0.8465555555555555

In [44]:
#Support Vector Machine
from sklearn.svm import SVC  
clf = SVC(kernel='linear') 

In [45]:
clf.fit(X_train, y_train)

SVC(kernel='linear')

In [46]:
y_pred_svm = classifier.predict(X_test)
y_pred_svm

array([1, 0, 1, ..., 0, 0, 1])

In [47]:
cm_svm = confusion_matrix(y_test, y_pred_svm)
cm_svm

array([[3812,  715],
       [ 666, 3807]])

In [48]:
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       0.85      0.84      0.85      4527
           1       0.84      0.85      0.85      4473

    accuracy                           0.85      9000
   macro avg       0.85      0.85      0.85      9000
weighted avg       0.85      0.85      0.85      9000



In [49]:
as__svm = accuracy_score(y_test, y_pred_svm) 
as__svm

0.8465555555555555