# Natural Language Processing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
import csv
from pathlib import Path

# Build the path from components so the separator is correct on every OS
# (a hard-coded backslash is only treated as a directory separator on Windows).
DATA_PATH = Path('Datasets') / 'Restaurant_Reviews.tsv'
# Tab-separated file; QUOTE_NONE (== 3) makes the parser treat quote characters
# inside reviews as ordinary text instead of field delimiters.
dataset = pd.read_csv(DATA_PATH, delimiter='\t', quoting=csv.QUOTE_NONE)

FileNotFoundError: [Errno 2] No such file or directory: 'Datasets\\Restaurant_Reviews.tsv'

## Cleaning the texts

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stop-word set once; they do not change between
# reviews, so recreating them per review (and the set per word) is wasted work.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# Keep 'not' so that negations such as "not good" survive stop-word removal.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

corpus = []
# Walk every review in the dataset instead of a hard-coded 1000 rows, so the
# corpus always has one entry per row of the 'Review' column.
for raw_review in dataset['Review']:
  # Replace everything that is not an ASCII letter with a space.
  review = re.sub('[^a-zA-Z]', ' ', raw_review)
  review = review.lower()
  review = review.split()
  # Drop stop words and reduce the remaining words to their stems.
  review = [ps.stem(word) for word in review if word not in stopword_set]
  review = ' '.join(review)
  corpus.append(review)

In [None]:
# Show only a short preview of the cleaned reviews; printing the whole corpus
# floods the cell output and buries the rest of the notebook.
preview_count = 10
print("Corpus\n\n",corpus[:preview_count])
print("\nTotal cleaned reviews : ",len(corpus))

## Creating the Bag of Words model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle
# Cap the vocabulary at 1500 features.
cv = CountVectorizer(max_features = 1500)
#Creating matrix X and y
# fit_transform learns the vocabulary and builds the count matrix in one pass,
# so the vectorizer is fitted exactly once.
X = cv.fit_transform(corpus).toarray()
# The label is taken from the last column of the dataset.
y = dataset.iloc[:, -1].values
#Pickle count vectorizer
# count_vect is the same fitted object as cv; the name is kept for later cells.
count_vect = cv
with open('cntvect.pickle', 'wb') as f:
    pickle.dump(count_vect, f)

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

## Training the prediction model on the training set

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score

# Candidate models keyed by the label used in the printed report.
# random_state is fixed on the random forest so its score is reproducible
# between runs, matching the seeded logistic regression.
candidate_models = {
    "NB": GaussianNB(),
    "RFC": RandomForestClassifier(n_estimators=100, random_state=0),
    "LR": LogisticRegression(max_iter=200, random_state=0),
    "SVM": svm.SVC(kernel='linear'),
}

# Fit each candidate on the training set and report its test-set accuracy.
for model_label, model in candidate_models.items():
    model.fit(X_train, y_train)
    model_pred = model.predict(X_test)
    print("\nAccuracy Score " + model_label + " : ",accuracy_score(y_test, model_pred))

# Later cells use `classifier` and `y_pred`; keep the linear SVM explicitly
# rather than relying on it being the last model trained.
classifier = candidate_models["SVM"]
y_pred = classifier.predict(X_test)

Since the SVM model performs best in terms of accuracy score (0.79), we will use it for our review interface.

## Predicting the Test set results

In [None]:
# Display the test-set feature matrix that the selected classifier will be evaluated on.
print(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Predict with the currently selected classifier and score it on the test set.
y_pred = classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print("Predicted vs Actual\n")
# Two-column view: predictions on the left, true labels on the right.
predicted_vs_actual = np.column_stack((y_pred, y_test))
print(predicted_vs_actual)
print("\nAccuracy Score : ",test_accuracy)

In [None]:
# Bundle the trained classifier with its test accuracy so both can be
# restored together by the prediction cell.
tuple_model = (classifier, accuracy_score(y_test, y_pred))
filename = 'tuple_model.pkl'
# Use a context manager so the file is flushed and closed even if dump fails.
with open(filename, 'wb') as model_file:
    pickle.dump(tuple_model, model_file)

## Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Compare the true test labels with the classifier's predictions.
cm = confusion_matrix(y_test, y_pred)
# Label spelling corrected ("Counfusion" -> "Confusion") in the printed output.
print("Confusion Matrix\n",cm)

## Predicting if a single review is positive or negative

In [None]:
#Model prediction
# NOTE(security): pickle.load can execute arbitrary code. Only load files that
# this notebook wrote itself; never point these paths at untrusted data.
#Load the model
filename = 'tuple_model.pkl'
with open(filename, 'rb') as model_file:
  pkmodel, pkscore = pickle.load(model_file)
#Load the count vectorizer
filename = 'cntvect.pickle'
with open(filename, 'rb') as vect_file:
  cntvect = pickle.load(vect_file)

# Build the stemmer and stop-word set once, outside the input loop; they are
# the same for every review the user types.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# Keep 'not' so negations are preserved, mirroring the training-time cleaning.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

# Keep prompting for reviews until the user enters the exit key 'T'.
while True:
  print("\nPress T to exit\n")
  new_review = input("Enter Review : ")
  if new_review == 'T':
    break
  # Apply the same cleaning steps that were used to build the training corpus.
  new_review = re.sub('[^a-zA-Z]', ' ', new_review)
  new_review = new_review.lower()
  new_review = new_review.split()
  new_review = [ps.stem(word) for word in new_review if word not in stopword_set]
  new_review = ' '.join(new_review)
  new_corpus = [new_review]
  # Vectorize with the pickled (already fitted) vectorizer; transform only.
  new_X_test = cntvect.transform(new_corpus).toarray()
  new_y_pred = pkmodel.predict(new_X_test)
  # predict returns an array with one entry per input row; read that entry
  # instead of comparing the whole array to a scalar.
  if new_y_pred[0] == 1:
    print("The review is Positive.")
  else:
    print("The review is Negative.")
  print("\nAccuracy Score : ",pkscore)