In [1]:
import numpy as np
import pandas as pd
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('data/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Cleaning

1. Remove html tags
2. Remove special characters
3. Converting every thing to lower case
4. Removing stop words
5. Stemming

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [5]:
df.sentiment.replace({'positive': 1, 'negative': 0}, inplace=True)

In [6]:
def clean_html(text):
  clean = re.compile('<.*?>')
  return re.sub(clean, '', text)

In [7]:
df['review'] = df['review'].apply(clean_html)

In [8]:
def convert_lower(text):
  return text.lower()

In [9]:
df['review'] = df['review'].apply(clean_html)

In [10]:
def remove_special_characters(text):
  pattern = r'[^a-zA-z0-9\s]'
  text = re.sub(pattern, '', text)
  return text

In [11]:
df['review'] = df['review'].apply(remove_special_characters)

In [12]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [13]:
def remove_stopwords(text):
  words = [w for w in text.split() if w not in stopwords.words('english')]
  return ' '.join(words)

In [14]:
remove_stopwords('This is a sample sentence, showing off the stop words filtration.')

'This sample sentence, showing stop words filtration.'

In [15]:
df['review'] = df['review'].apply(remove_stopwords)

In [16]:
ps = PorterStemmer()

def stemming(text):
  text = ' '.join([ps.stem(word) for word in text.split()])
  return text

df['review'] = df['review'].apply(stemming)

In [17]:
df.head()

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,1
1,a wonder littl product the film techniqu unass...,1
2,i thought wonder way spend time hot summer wee...,1
3,basic there famili littl boy jake think there ...,0
4,petter mattei love time money visual stun film...,1


In [18]:
cv = CountVectorizer(max_features=1000)
X = cv.fit_transform(df['review']).toarray()
y = df.iloc[:, -1].values

X.shape, y.shape

((50000, 1000), (50000,))

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((40000, 1000), (10000, 1000), (40000,), (10000,))

In [20]:
clfGaussiannb = GaussianNB()
clfMultinomialnb = MultinomialNB()
clfBernoullinb = BernoulliNB()

In [21]:
clfGaussiannb.fit(X_train, y_train)
clfMultinomialnb.fit(X_train, y_train)
clfBernoullinb.fit(X_train, y_train)

BernoulliNB()

In [22]:
y_pred_gaussiannb = clfGaussiannb.predict(X_test)
y_pred_multinomialnb = clfMultinomialnb.predict(X_test)
y_pred_bernoullinb = clfBernoullinb.predict(X_test)

In [23]:
print('GaussianNB Accuracy: ', accuracy_score(y_test, y_pred_gaussiannb))
print('MultinomialNB Accuracy: ', accuracy_score(y_test, y_pred_multinomialnb))
print('BernoulliNB Accuracy: ', accuracy_score(y_test, y_pred_bernoullinb))

GaussianNB Accuracy:  0.7911
MultinomialNB Accuracy:  0.8202
BernoulliNB Accuracy:  0.8235
