# Sentiment Analysis Using NLP
In this notebook, we will train an NLP model to predict the sentiment of a given text. 
The dataset we are going to use contains reviews given to a restaurant, each with a sentiment score: 0 for negative sentiment and 1 for positive sentiment.

# Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing the dataset

In [2]:
# Tab-separated file of restaurant reviews. quoting=3 is csv.QUOTE_NONE, so
# quote characters inside a review are read as ordinary text.
DATASET_PATH = '/content/Restaurant_Reviews.tsv'
df = pd.read_csv(DATASET_PATH, sep='\t', quoting=3)
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


# Cleaning the text

In [3]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Fetch the NLTK stopword lists used by the cleaning step.
nltk.download('stopwords')

# The Porter stemmer cuts a word down to its stem so that inflected forms
# share one token, e.g. "loved" -> "love" and "tasty" -> "tasti".
ps = PorterStemmer()

# Collects the cleaned review strings, one entry per review.
corpus = []

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Words that must NOT be treated as stopwords. They are mostly negation
# contractions ("isn", "wasn't", ...): dropping them from a review would erase
# the very signal that flips its sentiment.
removethese = ["aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn',
               "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', 
               "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
               "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
               "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won',
               "won't", 'wouldn', "wouldn't"]

# Filter instead of calling list.remove(): remove() raises ValueError as soon
# as one of these words is missing from the installed NLTK stopword list,
# which would abort the cell. Filtering simply skips words that are absent.
# NOTE(review): "not", "no" and "nor" are not in `removethese`, so they are
# still dropped as stopwords ("Crust is not good." is cleaned to 'crust good').
# Consider keeping them as well.
words_to_keep = set(removethese)
stopwords_list = [word for word in stopwords.words('english')
                  if word not in words_to_keep]

In [6]:
def clean_review(text, stop_words, stemmer):
    """Return `text` reduced to a space-joined string of stemmed keywords.

    Every character other than A-Z / a-z is replaced by a space, the text is
    lower-cased and split on whitespace, words found in `stop_words` are
    dropped, and each remaining word is passed through `stemmer.stem`.

    Apostrophes are replaced too, so "didn't" is seen as the two tokens
    "didn" and "t".
    """
    words = re.sub('[^A-Za-z]', ' ', text).lower().split()
    return ' '.join(stemmer.stem(word) for word in words
                    if word not in stop_words)


# A set gives fast membership tests inside the per-word filter.
stop_words = set(stopwords_list)

# Rebuild the corpus from scratch instead of appending to an existing list:
# re-running this cell then yields the same corpus rather than a second copy
# of every review (which would no longer line up with the labels).
corpus = [clean_review(text, stop_words, ps) for text in df['Review']]

In [7]:
# Preview the first ten cleaned reviews to check the stemming and stopword removal
corpus[:10]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti didn tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

# Creating the BOW model

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: one row per review, one column per vocabulary word, each cell
# holding how many times that word occurs in that review.
# max_features caps the vocabulary at 1500 words.
cv = CountVectorizer(max_features=1500)

# Learn the vocabulary and count the words in one pass, then convert the
# result to a regular array.
bow_counts = cv.fit_transform(corpus)
X = bow_counts.toarray()

# The labels are the values of the last column of the DataFrame.
y = df.iloc[:, -1].values

In [9]:
# Show the label array (the values of the last DataFrame column)
y

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,

# Splitting the dataset into Training and Testing sets

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable from run to run.
TEST_FRACTION = 0.2
SPLIT_SEED = 0
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED)

In [11]:
# Shapes of the training and test feature matrices
X_train.shape, X_test.shape

((800, 1500), (200, 1500))

# Training the Naive Bayes model on the Training Set

In [12]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the training rows. fit() stays the
# last expression so the notebook displays the fitted estimator.
clf = GaussianNB()
clf.fit(X=X_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

# Predicting the test results

In [13]:
# Predict labels for the held-out reviews and print them next to the true
# labels: column 0 is the prediction, column 1 the actual value.
y_hat = clf.predict(X_test)
print(np.column_stack((y_hat, y_test)))

[[1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]

# Making the confusion matrix

In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_hat)
accuracy = accuracy_score(y_test, y_hat)
print(cm)
print(accuracy)

[[56 41]
 [12 91]]
0.735


# Trying a different model to improve accuracy

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# A random forest is trained with randomness, so without a fixed random_state
# the confusion matrix and accuracy change on every run. Seeding the model
# makes this cell reproducible.
# NOTE: `clf`, `y_hat` and `cm` are rebound here, replacing the Naive Bayes
# model, its predictions and its confusion matrix from the earlier cells.
clf = RandomForestClassifier(n_estimators=120, max_depth=6, random_state=0)
clf.fit(X_train, y_train)

y_hat = clf.predict(X_test)

# Same evaluation as for the Naive Bayes model: confusion matrix (rows are the
# true classes, columns the predicted classes) followed by the accuracy.
cm = confusion_matrix(y_test, y_hat)
print(cm)
print(accuracy_score(y_test, y_hat))

[[91  6]
 [44 59]]
0.75
