# Using NLP predict whether the review is positive or negative for a given dataset 
https://drive.google.com/open?id=1-TJWzdxapGhp2aElncd6RH6zOpSAf69X

In [1]:
import pandas as pd
import numpy as np
import nltk
import re # regular expression
import string
# CountVectorizer used to count words and form matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Data Gathering

In [2]:
dataset = pd.read_csv("Restaurant_Reviews.tsv", sep='\t', encoding='utf-8')
dataset.Review

0                               Wow... Loved this place.
1                                     Crust is not good.
2              Not tasty and the texture was just nasty.
3      Stopped by during the late May bank holiday of...
4      The selection on the menu was great and so wer...
5         Now I am getting angry and I want my damn pho.
6                  Honeslty it didn't taste THAT fresh.)
7      The potatoes were like rubber and you could te...
8                              The fries were great too.
9                                         A great touch.
10                              Service was very prompt.
11                                    Would not go back.
12     The cashier had no care what so ever on what I...
13     I tried the Cape Cod ravoli, chicken, with cra...
14     I was disgusted because I was pretty sure that...
15     I was shocked because no signs indicate cash o...
16                                   Highly recommended.
17                Waitress was 

# Data cleaning

In [3]:
dataset.isnull().any()

Review    False
Liked     False
dtype: bool

In [4]:
dataset.duplicated().any()

True

In [5]:
dataset.drop_duplicates(keep='first', inplace=True)

In [6]:
dataset.duplicated().any()

False

In [7]:
dataset.describe()

Unnamed: 0,Liked
count,996.0
mean,0.501004
std,0.50025
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [8]:
# corpus is a collection of text
# data cleaning steps
# 1. Remove punctuation, 2. Remove numbers and 3. Lowercase letters
porter = nltk.PorterStemmer()

In [9]:
def cleaning_and_stemming(text):
    text = text.lower()  # converting text into lowercase
    text = re.sub('\[.*?\]', '', text)  # removing .,*,?,.... symbols
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # revoming punctuation
    text = re.sub('\w*\d\w*', '', text)  # removing numbers
    words = text.split()  # spliting sentence into words
    # stemming is the process of extracting root word from a word
    # stop words are those which express state of words like a, an, as, those, when etc.
    stem_words = [ porter.stem(word) for word in words if word not in set( nltk.corpus.stopwords.words('english') ) ]
    stemmed_sentence = ' '.join(stem_words)
    return stemmed_sentence

In [10]:
# re-aranging reviews with cleaned review
cleaned_review = lambda review: cleaning_and_stemming(review)
dataset.Review = dataset.Review.apply(cleaned_review)

In [11]:
dataset.head()

Unnamed: 0,Review,Liked
0,wow love place,1
1,crust good,0
2,tasti textur nasti,0
3,stop late may bank holiday rick steve recommen...,1
4,select menu great price,1


# Tokenization

In [12]:
# It consists three steps 
# Clean Text - remove excess, unnecessary parts of the text
# Tokenize Text - split the text into smaller pieces
# Document-Term Matrix - put into a matrix so a machine can read it

In [13]:
tokenizer = CountVectorizer( max_features = 10000 )
tokenized_data = tokenizer.fit_transform( dataset.Review )
document_term_matrix = pd.DataFrame( tokenized_data.toarray(), columns=tokenizer.get_feature_names() )
document_term_matrix.index = dataset.index

In [14]:
document_term_matrix.head()

Unnamed: 0,absolut,absolutley,accid,accommod,accomod,accordingli,account,ach,acknowledg,across,...,yelper,yet,youd,youll,your,yucki,yukon,yum,yummi,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Modeling 

In [15]:
X = document_term_matrix
y = dataset['Liked']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [16]:
random_forest_classifier = RandomForestClassifier(n_estimators = 300)
random_forest_classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [18]:
support_vector_classifier = SVC()
support_vector_classifier.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [19]:
decision_tree_classifier = DecisionTreeClassifier()
decision_tree_classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

# Prediction

In [20]:
rfc_prediction = random_forest_classifier.predict(X_test)
lr_prediction = logistic_regression.predict(X_test)
svc_prediction = support_vector_classifier.predict(X_test)
decision_tree_prediction = decision_tree_classifier.predict(X_test)

# Accuracy

In [21]:
print("Random forest:", accuracy_score(rfc_prediction, y_test)*100)
print("Logistic Regression:", accuracy_score(lr_prediction, y_test)*100)
print("Support Vector:", accuracy_score(svc_prediction, y_test)*100)
print("Decision Tree:", accuracy_score(decision_tree_prediction, y_test)*100)

Random forest: 76.58862876254182
Logistic Regression: 80.60200668896321
Support Vector: 48.49498327759198
Decision Tree: 72.90969899665552


# Confusion Matrix

In [22]:
print("Random forest:")
print(confusion_matrix(rfc_prediction, y_test))
print("Logistic Regression:")
print(confusion_matrix(lr_prediction, y_test))
print("Support Vector:")
print(confusion_matrix(svc_prediction, y_test))
print("Decision Tree:")
print(confusion_matrix(decision_tree_prediction, y_test))

Random forest:
[[121  46]
 [ 24 108]]
Logistic Regression:
[[121  34]
 [ 24 120]]
Support Vector:
[[145 154]
 [  0   0]]
Decision Tree:
[[108  44]
 [ 37 110]]


# Classification Report

In [23]:
print("Random forest:")
print(classification_report(rfc_prediction, y_test))
print("Logistic Regression:")
print(classification_report(lr_prediction, y_test))
print("Support Vector:")
print(classification_report(svc_prediction, y_test))
print("Decision Tree:")
print(classification_report(decision_tree_prediction, y_test))

Random forest:
              precision    recall  f1-score   support

           0       0.83      0.72      0.78       167
           1       0.70      0.82      0.76       132

   micro avg       0.77      0.77      0.77       299
   macro avg       0.77      0.77      0.77       299
weighted avg       0.78      0.77      0.77       299

Logistic Regression:
              precision    recall  f1-score   support

           0       0.83      0.78      0.81       155
           1       0.78      0.83      0.81       144

   micro avg       0.81      0.81      0.81       299
   macro avg       0.81      0.81      0.81       299
weighted avg       0.81      0.81      0.81       299

Support Vector:
              precision    recall  f1-score   support

           0       1.00      0.48      0.65       299
           1       0.00      0.00      0.00         0

   micro avg       0.48      0.48      0.48       299
   macro avg       0.50      0.24      0.33       299
weighted avg       1.0

  'recall', 'true', average, warn_for)
