# Importing Necessary Libraries and Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup as bs

# Data Importing

In [2]:
# Location of the IMDB 50k movie-review CSV (relative Kaggle input path).
DATASET_PATH = '../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'

# Read the whole file into a DataFrame and preview the first rows.
data = pd.read_csv(DATASET_PATH)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Data Cleaning and Preprocessing

In [11]:
# Only the first N_REVIEWS rows are cleaned and used for the experiment.
N_REVIEWS = 4000

# Build these once, outside the loop: reloading the stop-word list for every
# token (and re-creating the stemmer for every review) is needlessly slow.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
for raw_review in data['review'].iloc[:N_REVIEWS]:
    review = bs(raw_review, "lxml").text       # strip HTML markup such as <br />
    review = re.sub('[^a-zA-Z]', ' ', review)  # replace every non-letter with a space
    tokens = review.lower().split()
    # Drop stop words, then reduce each remaining token to its Porter stem.
    stemmed = [ps.stem(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(stemmed))

# Count Vectorizer algorithm for creating Bag of Words

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: keep at most the 4000 most frequent terms of the corpus.
cv = CountVectorizer(max_features = 4000)
# Dense array form of the document-term matrix, as consumed by the later cells.
X = cv.fit_transform(corpus).toarray()

# Select the labels by column name rather than by position, and size the slice
# from the corpus itself so X and y always describe the same reviews.
y = data['sentiment'].iloc[:len(corpus)].values

# Splitting the data into training (80%) and test (20%) sets

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Using the Gaussian Naive Bayes classifier built into the scikit-learn ML framework

In [15]:
from sklearn.naive_bayes import GaussianNB

# Create the Gaussian Naive Bayes model and fit it on the training split;
# fit() returns the estimator itself, so it can be bound directly.
classifier = GaussianNB().fit(X_train, y_train)

# Show the fitted estimator as the cell output.
classifier

GaussianNB(priors=None, var_smoothing=1e-09)

In [18]:
# Predict a label for every sample in the held-out test features.
ypred = classifier.predict(X_test)

In [21]:
# confusion_matrix, accuracy_score and classification_report are already
# imported in the first cell, so no additional import is needed here.

# Visual divider printed between the sections of the report.
separator = '**************************************************'

# Confusion matrix of true labels (y_test) against predicted labels (ypred).
cmat = confusion_matrix(y_test, ypred)
print("The Confusion martrix is :- \n")
print(cmat)
print(separator)
print('The accuracy score is:',accuracy_score(y_test,ypred))
print(separator)
print('The classification report: \n')
print(classification_report(y_test,ypred))

The Confusion martrix is :- 

[[333  90]
 [143 234]]
**************************************************
The accuracy score is: 0.70875
**************************************************
The classification report: 

              precision    recall  f1-score   support

    negative       0.70      0.79      0.74       423
    positive       0.72      0.62      0.67       377

    accuracy                           0.71       800
   macro avg       0.71      0.70      0.70       800
weighted avg       0.71      0.71      0.71       800

