# Natural Language Processing

In [1]:
!pip install textblob
!pip install pyspellchecker



## Importing the libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [4]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

## Cleaning the texts

In [5]:
%%time

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from spellchecker import SpellChecker
from textblob import TextBlob
import textblob as tb
tb.en.spelling.update({'wow':1})

corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
#     spell = SpellChecker()
#     str = re.findall("[a-zA-Z,.]+",corpus[2])
#     misspelled = spell.unknown(str)
#     print(misspelled)
    review = str(TextBlob(review).correct())
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sreen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Wall time: 1min 24s


In [6]:
print(corpus)

['wow love place', 'crust not good', 'not taste texture nasty', 'stop late may bank holiday rich steve recommend love', 'select menu great price', 'get angry want damn who', 'honest last fresh', 'potato like rubber could tell made ahead time kept warmer', 'fro great', 'great touch', 'service prompt', 'would not go back', 'cashier care ever say still end way over', 'try cape cod revolt chicken cranberri mmm', 'disgust pretty sure human hair', 'shock sign india cash', 'highly recommend', 'witness little slow service', 'place not worth time let along vera', 'not like', 'burritto bah', 'food may', 'service also cut', 'could care less interior beauty', 'perform', 'right red velvet cake oh stuff good', 'never brought salad ask', 'hole wall great mexican street tact friendly staff', 'took hour get food table restful food luke warm never run around like total overwhelm', 'worst salmon sashimi', 'also comb like burgher fro beer decent deal', 'like final blow', 'found place acid could not happie

## Creating the Bag of Words model

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1400)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

### Feature Scaling

In [39]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Training the Logistic Regression model on the Training set

In [40]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

LogisticRegression(random_state=0)

## Predicting the Test set results

In [41]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 1]

## Making the Confusion Matrix

In [42]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[76 21]
 [22 81]]


0.785

## Predict if a Single review is positive or negative

In [50]:
new_review = 'I do not love this restaurant so much'
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_review = str(TextBlob(new_review).correct())
new_corpus = [new_review]
new_X_test = cv.transform(new_corpus).toarray()
new_X_test = sc.transform(new_X_test)
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

[0]


## Saving the Model with pickle

In [44]:
import pickle

In [51]:
model_filename = 'logistic_regression_finalized_model.sav'
sc_filename = 'sc.sav'
cv_filename = 'cv.sav'
pickle.dump(classifier, open(model_filename, 'wb'))
pickle.dump(sc, open(sc_filename, 'wb'))
pickle.dump(cv, open(cv_filename, 'wb'))