In [139]:
import numpy as np
import pandas as pd

In [385]:
df = pd.read_csv('IMDB Dataset.csv')

In [386]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [341]:
#One review
df['review'].iloc[10]

'Phil the Alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines.<br /><br />At first it was very odd and pretty funny but as the movie progressed I didn\'t find the jokes or oddness funny anymore.<br /><br />Its a low budget film (thats never a problem in itself), there were some pretty interesting characters, but eventually I just lost interest.<br /><br />I imagine this film would appeal to a stoner who is currently partaking.<br /><br />For something similar but better try "Brother from another planet"'

# Text Cleaning

1. Remove HTML Tags
2. Remove Special Characters
3. Converting into lower case
4. Removing stop words
5. Stemming

In [387]:
df['sentiment'].replace({'positive':1, 'negative':0}, inplace=True)

In [388]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


## Removing HTML Tags

In [389]:
import re
def remove_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [390]:
df['review'] = df['review'].apply(remove_html)

In [391]:
df['review'].iloc[10]

'Phil the Alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines.At first it was very odd and pretty funny but as the movie progressed I didn\'t find the jokes or oddness funny anymore.Its a low budget film (thats never a problem in itself), there were some pretty interesting characters, but eventually I just lost interest.I imagine this film would appeal to a stoner who is currently partaking.For something similar but better try "Brother from another planet"'

In [392]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


## Converting text to lower

In [393]:
def lower_case(text):
    return text.lower()

In [394]:
df['review'] = df['review'].apply(lower_case)

In [395]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


## Removing Special Characters

In [396]:
def remove_special_characters(text):
    x = ''
    for i in text:
        if i.isalnum():
            x += i
        else:
            x += ' '
    return x
    

In [397]:
df['review'] = df['review'].apply(remove_special_characters)

In [398]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there s a family where a little boy ...,0
4,petter mattei s love in the time of money is...,1


## Remove Stop words

In [399]:
import nltk

In [77]:
import spacy

In [58]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Subhan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [160]:
from nltk.corpus import stopwords

In [161]:
stp_words = stopwords.words('english')

In [400]:
def removing_stop_words(text):
    x = []
    for i in text.split():
        if i not in stp_words:
            x.append(i)
            
    y = x[:]
    x.clear()
    return y

In [401]:
df['review'] = df['review'].apply(removing_stop_words)

## Stemming 

In [402]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [403]:
def stem_words(text):
    y=[]
    for i in text:
        y.append(ps.stem(i))
     
    return y

In [404]:
df['review'] = df['review'].apply(stem_words)

In [405]:
df.head()

Unnamed: 0,review,sentiment
0,"[one, review, mention, watch, 1, oz, episod, h...",1
1,"[wonder, littl, product, film, techniqu, unass...",1
2,"[thought, wonder, way, spend, time, hot, summe...",1
3,"[basic, famili, littl, boy, jake, think, zombi...",0
4,"[petter, mattei, love, time, money, visual, st...",1


### Joining the whole dataset back

In [406]:
def join_back(input_list):
    return " ".join(input_list)

In [407]:
df['review'] = df['review'].apply(join_back)

In [408]:
df

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod hook righ...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1
...,...,...
49995,thought movi right good job creativ origin fir...,1
49996,bad plot bad dialogu bad act idiot direct anno...,0
49997,cathol taught parochi elementari school nun ta...,0
49998,go disagre previou comment side maltin one sec...,0


# Training and testing the start

In [435]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
cv = CountVectorizer(max_features=5000)

In [436]:
X = cv.fit_transform(df['review']).toarray()

In [437]:
y = df['sentiment']

In [439]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

MemoryError: Unable to allocate 1.49 GiB for an array with shape (40000, 5000) and data type int64

In [415]:
X_train.shape

(40000, 5000)

In [416]:
y_test.shape

(10000,)

In [417]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

In [418]:
g_nb = GaussianNB()
m_nb = MultinomialNB()
b_nb = BernoulliNB()

In [419]:
g_nb.fit(X_train, y_train)

In [420]:
m_nb.fit(X_train, y_train)

In [421]:
b_nb.fit(X_train, y_train)

In [422]:
y_test = y_test[:,np.newaxis]

  y_test = y_test[:,np.newaxis]


In [423]:
y_test.shape

(10000, 1)

In [424]:
g_nb_prediction = g_nb.predict(X_test)

In [425]:
m_nb_prediction = m_nb.predict(X_test)

In [426]:
b_nb_prediction = b_nb.predict(X_test)

In [427]:
from sklearn.metrics import accuracy_score

In [428]:
print(f"Accuracy score from Gaussian Naive Bayes is {accuracy_score(y_test, g_nb_prediction )}")
print(f"Accuracy score from Multinomial Naive Bayes is {accuracy_score(y_test, m_nb_prediction)}")
print(f"Accuracy score from Burnaulli Naive Bayes is {accuracy_score(y_test, b_nb_prediction)}")

Accuracy score from Gaussian Naive Bayes is 0.7279
Accuracy score from Multinomial Naive Bayes is 0.847
Accuracy score from Burnaulli Naive Bayes is 0.8526


In [297]:
accuracy_score(y_test, y_pred3)

0.695

In [440]:
import os 
os.getcwd()

'C:\\Users\\Subhan\\Sentiment Analysis using Naive Bayes'