# Natural Language Processing

## Importing the libraries

In [179]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


## Importing the dataset

In [180]:
# The reviews file is tab-separated. quoting=3 tells the parser not to treat
# double quotes as field delimiters, so quotes that appear inside a review
# are read as ordinary characters instead of causing parsing errors.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
print(dataset.head())

                                              Review  Liked
0                           Wow... Loved this place.      1
1                                 Crust is not good.      0
2          Not tasty and the texture was just nasty.      0
3  Stopped by during the late May bank holiday of...      1
4  The selection on the menu was great and so wer...      1


## Cleaning the texts

In [181]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Loop-invariant helpers: build the stemmer and the stop-word lookup once
# instead of once per review (or, for the set, once per word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# "not" carries sentiment ("not good"), so it must survive stop-word removal.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

# Build one cleaned string per review. Iterating over the first column itself
# (rather than a hard-coded range of 1000 rows) keeps `corpus` the same length
# as the dataset whatever its size, so it stays aligned with the labels.
corpus = []
for raw_text in dataset.iloc[:, 0]:
    # Replace everything that is not an ASCII letter with a space.
    review = re.sub('[^a-zA-Z]', ' ', raw_text)
    # Lower-case, then tokenize (convert from string to list of words).
    review = review.lower().split()
    # Drop stop words and reduce the remaining words to their stems.
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Creating the Bag of Words model

In [182]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Keep only the 1500 most frequent terms as features.
cv = CountVectorizer(max_features=1500)

# NOTE(review): the vocabulary is learned from the whole corpus, i.e. also from
# the reviews that are later held out as the test set. Consider fitting on the
# training portion only.
term_counts = cv.fit_transform(corpus)
x = term_counts.toarray()  # dense document-term matrix

# TF-IDF alternative to the raw counts above (disabled):
# tf_idf_vectorizer = TfidfVectorizer(max_features = 1500)
# x = tf_idf_vectorizer.fit_transform(corpus).toarray()

# Target vector: the last column of the dataset.
label_column = dataset.iloc[:, -1]
y = label_column.values

## Splitting the dataset into the Training set and Test set

In [183]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed random_state makes the
# partition identical on every run.
split_parts = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)
x_train, x_test, y_train, y_test = split_parts

## Training models on the Training set

In [184]:
# Fit five classifiers on the same training split so their test-set
# performance can be compared side by side.

# Logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression().fit(x_train, y_train)
# KNN (5 neighbours; Minkowski metric with p = 2, i.e. Euclidean distance)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2).fit(x_train, y_train)
# SVM
from sklearn.svm import SVC
svc = SVC(kernel='rbf').fit(x_train, y_train)
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB().fit(x_train, y_train)
# Random forest
# The forest is built with randomness (bootstrap samples, feature subsets), so
# the seed is pinned to keep the reported metrics reproducible across
# Restart & Run All. 0 matches the random_state used for the train/test split.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(criterion='entropy', random_state=0).fit(x_train, y_train)

## Making the Confusion Matrix

In [185]:
from sklearn.metrics import confusion_matrix, accuracy_score


def report_performance(title, model, features, labels):
    """Print a fitted model's confusion matrix and accuracy on held-out data.

    Parameters
    ----------
    title : str
        Heading printed before the metrics.
    model : fitted estimator
        Any object exposing ``predict(features)``.
    features : array-like
        Samples to predict on.
    labels : array-like
        True labels for ``features``.

    Returns
    -------
    The predictions returned by ``model.predict(features)``.
    """
    print(title)
    predictions = model.predict(features)
    # Rows of the matrix are the true classes, columns the predicted classes.
    print(confusion_matrix(labels, predictions))
    print(accuracy_score(labels, predictions))
    return predictions


# Evaluate every model on the same test split. The predictions are kept under
# their individual names so they remain available for further inspection.
y_pred_lr = report_performance("Logistic Regression performance", lr, x_test, y_test)
y_pred_knn = report_performance("KNN performance", knn, x_test, y_test)
y_pred_svc = report_performance("SVM performance", svc, x_test, y_test)
y_pred_nb = report_performance("Naive Bayes performance", nb, x_test, y_test)
y_pred_rf = report_performance("Random Forest performance", rf, x_test, y_test)

Logistic Regression performance
[[80 17]
 [28 75]]
0.775
KNN performance
[[74 23]
 [45 58]]
0.66
SVM performance
[[89  8]
 [36 67]]
0.78
Naive Bayes performance
[[55 42]
 [12 91]]
0.73
Random Forest performance
[[90  7]
 [41 62]]
0.76
