In [75]:
import numpy as np
import pandas as pd

In [76]:
reviews = pd.read_csv('50000_reviews.csv')

In [77]:
reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [78]:
# One review
reviews['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

# Text Cleaning


#### 1. Sample 50000 rows
#### 2. Remove HTML tags
#### 3. Remove special characters
#### 4. Conversion to lower case
#### 5. Remove stop words
#### 6. Stemming

In [79]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50004 entries, 0 to 50003
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50004 non-null  object
 1   sentiment  50004 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [80]:
reviews['sentiment'].replace({'positive': 1, 'negative': 0}, inplace=True)

In [81]:
reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [82]:
import re

# Remove html tags from reviews
def clean_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [83]:
reviews['review'] = reviews['review'].apply(clean_html)

In [84]:
# Convert text to lower case

def convert_lower(text):
    return text.lower()

In [85]:
reviews['review'] = reviews['review'].apply(convert_lower)

In [86]:
# Remove special characters

def remove_special_characters(text):
    result_string = ''
    
    for character in text:
        if character.isalnum():
            result_string += character
        else:
            result_string += ' '
            
    return result_string

In [87]:
reviews['review'] = reviews['review'].apply(remove_special_characters)

In [88]:
from nltk.corpus import stopwords

stopwords_set = set(stopwords.words('english'))

In [89]:
def remove_stopwords(text):
    no_stop_words = []
    for word in text.split():
        if word not in stopwords_set:
            no_stop_words.append(word)
    
    # Clear for next iteration of review
    final_list = no_stop_words[:]
    no_stop_words.clear()
    return final_list

In [90]:
reviews['review'] = reviews['review'].apply(remove_stopwords)

In [91]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

In [92]:
# Convert each word into it's root stem word

def stem_words(text):
    stemmed_words = []
    for word in text:
        stemmed_words.append(ps.stem(word))
    
    # Clear for next iteration of review
    final_list = stemmed_words[:]
    stemmed_words.clear()
    return final_list

In [93]:
reviews['review'] = reviews['review'].apply(stem_words)

In [94]:
def convert_split_list_to_string(text_list):
    return " ".join(text_list)

In [95]:
reviews['review'] = reviews['review'].apply(convert_split_list_to_string)

In [96]:
X = reviews.iloc[:, 0:1].values
y = reviews.iloc[:, -1].values

In [97]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1250)

In [98]:
import pickle

In [112]:
# Save the count vectorizer with 1250 features
pickle.dump(cv, open('cv.pkl', 'wb'))

# Save the vocabulary
# vocabulary_ ne aukaat dikha di
pickle.dump(cv.vocabulary_, open('vocab.pkl', 'wb'))

In [100]:
X = cv.fit_transform(reviews['review']).toarray()

In [101]:
X.shape

(50004, 1250)

In [102]:
# Make training and test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [103]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [104]:
gnb.fit(X_train, y_train)
mnb.fit(X_train, y_train)
bnb.fit(X_train, y_train)

In [105]:
# Save the fitted model (Bernoulli Naive Bayes since it has the highest accuracy)
pickle.dump(bnb, open('naive_bayes.pkl', 'wb'))

In [106]:
y_pred_g = gnb.predict(X_test)
y_pred_m = mnb.predict(X_test)
y_pred_b = bnb.predict(X_test)

In [107]:
from sklearn.metrics import accuracy_score

print("Accuracy of Gaussian Naive Bayes model ", accuracy_score(y_test, y_pred_g))
print("Accuracy of Multinomial Naive Bayes model ", accuracy_score(y_test, y_pred_m))
print("Accuracy of Bernoulli Naive Bayes model ", accuracy_score(y_test, y_pred_b))

Accuracy of Gaussian Naive Bayes model  0.775922407759224
Accuracy of Multinomial Naive Bayes model  0.8311168883111689
Accuracy of Bernoulli Naive Bayes model  0.8348165183481652


In [108]:
def text_preprocessing(text):
    text = clean_html(text)
    text = convert_lower(text)
    text = remove_special_characters(text)
    text = remove_stopwords(text)
    text = stem_words(text)
    text = convert_split_list_to_string(text)
    text = [text]
    text = cv.transform(text)
    
    return text

In [109]:
def get_sentiment(text):
    text = text_preprocessing(text)
    return 'Positive' if bnb.predict(text)[0] == 1 else 'Negative'

In [110]:
get_sentiment('Hey this is good')

'Positive'

In [111]:
get_sentiment('I totally hated this movie. The actors were not good and the movie\'s plot didn\'t make any sense. I wish I could put a rod up my ass after watching this movie. Totally gonna recommend to commit suicide after watching this. Amber Heard sucks.')

'Negative'