## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import os

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

## Load Dataset

In [3]:
# The file is tab-separated. quoting=3 is csv.QUOTE_NONE, so quote characters
# inside a review are read as ordinary text instead of field delimiters.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
dataset.shape

(1000, 2)

In [4]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## Clean the text in the dataframe's `Review` column (stopwords are kept)

In [5]:
# Load NLTK's English stopword list for inspection only: the cleaning loop in
# the next code cell keeps stopwords (its filtering line is commented out).
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to have been
# downloaded beforehand — confirm.
stopwords_list = stopwords.words('english')
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
def clean_review(text):
    """Normalize one review string before vectorization.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased, and runs of whitespace are collapsed into single
    spaces. Stopwords are kept and no stemming is applied.

    Note that apostrophes are replaced as well, so contractions are split
    into two tokens (e.g. "hadn't" becomes "hadn t").

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned review, tokens separated by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    # split() followed by join() trims both ends and squeezes repeated spaces.
    return ' '.join(letters_only.lower().split())


# Iterate over the column values directly instead of looking each row up by
# integer label, so this keeps working when the index is not exactly 0..n-1
# (for example after filtering rows).
corpus = [clean_review(raw_review) for raw_review in dataset['Review']]

In [7]:
len(corpus)

1000

In [8]:
corpus[0:5]

['wow loved this place',
 'crust is not good',
 'not tasty and the texture was just nasty',
 'stopped by during the late may bank holiday off rick steve recommendation and loved it',
 'the selection on the menu was great and so were the prices']

In [9]:
corpus[-5:]

['i think food should have flavor and texture and both were lacking',
 'appetite instantly gone',
 'overall i was not impressed and would not go back',
 'the whole experience was underwhelming and i think we ll just go to ninja sushi next time',
 'then as if i hadn t wasted enough of my life there they poured salt in the wound by drawing out the time it took to bring the check']

## Create a TF-IDF vectorizer and fit_transform it on the list of cleaned strings

In [10]:
# Vectorizer created with the library's default settings; it is fitted on
# `corpus` in the next code cell and pickled later for reuse.
tfidf = TfidfVectorizer()
tfidf

In [11]:
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split performed further down, so the vocabulary and IDF weights
# are learned from the held-out reviews too. Fitting on the training texts
# only would give a cleaner estimate of test performance.
# .toarray() converts the sparse TF-IDF matrix into a dense ndarray.
X = tfidf.fit_transform(corpus).toarray()
# Select the label column by name rather than by position so the code does
# not silently pick the wrong column if the column order ever changes.
y = dataset['Liked'].values

In [12]:
X.shape

(1000, 2016)

In [13]:
y.shape

(1000,)

In [14]:
X[0:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
y[0:5]

array([1, 0, 0, 1, 1], dtype=int64)

## Split into train and test set

In [16]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=0,
)

In [17]:
# Print the shape of each split: feature matrices first, then label vectors.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(850, 2016)
(150, 2016)
(850,)
(150,)


In [18]:
X_train[0:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
X_test[0:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
y_train[0:5]

array([1, 1, 1, 1, 0], dtype=int64)

In [21]:
y_test[0:5]

array([0, 0, 0, 0, 0], dtype=int64)

## Create and fit Logistic Classifier

In [22]:
# Logistic regression with the library's default hyper-parameters; trained in
# the next code cell.
log_classifier = LogisticRegression()
log_classifier

In [23]:
# Train on the training split only; the test split is kept for evaluation.
log_classifier.fit(X_train, y_train)

## Predict X_test

In [24]:
# Predicted labels for the held-out rows; the shape should match y_test.
y_pred = log_classifier.predict(X_test)
y_pred.shape

(150,)

In [25]:
print(y_pred)

[0 0 0 0 1 0 1 0 0 1 1 1 0 1 1 1 0 0 0 1 0 1 1 0 0 1 0 1 1 0 1 0 0 0 1 0 0
 0 0 1 1 0 1 0 0 0 0 1 1 0 1 1 1 1 0 0 1 1 0 1 1 0 1 1 0 1 1 0 1 0 1 1 1 0
 0 0 0 1 0 0 0 1 1 0 1 1 1 0 1 0 0 1 0 0 0 1 1 0 1 0 0 1 1 0 1 0 1 1 0 0 0
 1 0 1 0 0 1 1 1 1 1 0 1 0 0 0 1 0 0 0 0 1 0 0 1 1 1 1 1 0 0 1 1 0 0 0 0 0
 0 1]


## Confusion Matrix

In [26]:
# With (y_true, y_pred) argument order, rows are the true labels and columns
# are the predicted labels, so the off-diagonal cells are the misclassified
# reviews.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[61 10]
 [18 61]]


## Accuracy Score

In [27]:
# Fraction of held-out reviews whose predicted label equals the true label.
ac = accuracy_score(y_test, y_pred)
print(ac)

0.8133333333333334


## Bias and Variance (training accuracy vs. test accuracy)

In [28]:
# score() returns the mean accuracy on the given data, so this value is the
# accuracy on the training set, not a statistical bias estimate. A high
# training accuracy only suggests the model is not under-fitting.
train_accuracy = log_classifier.score(X_train, y_train)
# Original variable name, kept so any existing reference to `bias` still works.
bias = train_accuracy
bias

0.9576470588235294

In [29]:
# score() returns the mean accuracy on the given data, so this value is the
# accuracy on the held-out test set, not a statistical variance. A large drop
# from the training-set score computed in the previous cell is what hints at
# over-fitting.
test_accuracy = log_classifier.score(X_test, y_test)
# Original variable name, kept so any existing reference to `variance` still works.
variance = test_accuracy
variance

0.8133333333333334

## Export Logistic Classifier and Tfidf Vectorizer

In [30]:
# Persist the trained classifier so it can be reloaded without retraining.
# The `with` block closes the file even if pickling fails.
classifier_filename = 'logistic_classifier.pkl'
with open(classifier_filename, 'wb') as file_logistic:
    pickle.dump(log_classifier, file_logistic)

In [31]:
# Persist the fitted vectorizer alongside the classifier: new text has to be
# transformed with this same fitted object before it can be classified.
vectorizer_filename = 'tfidf_vectorizer.pkl'
with open(vectorizer_filename, 'wb') as file_tfidf:
    pickle.dump(tfidf, file_tfidf)

## Test Review

In [32]:
# Hand-written sample review used to smoke-test the trained pipeline.
test_string = "restaurant was not cleaned. sitting chairs and table was dirty"

In [33]:
# Apply the same normalization used for the training corpus: non-letters
# become spaces, text is lower-cased, and whitespace is collapsed.
test_str_clean = ' '.join(
    re.sub('[^a-zA-Z]', ' ', test_string).lower().split()
)
# The vectorizer works on a collection of documents, so wrap the single
# review in a list.
cleaned_string = [test_str_clean]

In [34]:
cleaned_string

['restaurant was not cleaned sitting chairs and table was dirty']

In [35]:
# Wrap the single cleaned review in a NumPy array of strings.
# NOTE(review): this rebinds `cleaned_string` from a list to an ndarray; the
# conversion is probably unnecessary if the vectorizer accepts a plain list
# of strings — confirm before removing.
cleaned_string = np.array(cleaned_string)
cleaned_string

array(['restaurant was not cleaned sitting chairs and table was dirty'],
      dtype='<U61')

In [36]:
# Use transform (not fit_transform) so the review is mapped onto the
# vocabulary already learned from the corpus; the column count must match
# the feature matrix the classifier was trained on.
test_string_vector = tfidf.transform(cleaned_string)
test_string_vector.shape

(1, 2016)

In [37]:
# tfidf.vocabulary_

In [38]:
# tfidf.idf_

In [39]:
# Predicted label for the sample review, returned as a one-element array.
# NOTE(review): presumably 1 means "liked" and 0 means "not liked", following
# the dataset's `Liked` column — confirm.
review_predict = log_classifier.predict(test_string_vector)
review_predict

array([0], dtype=int64)

In [40]:
review_predict[0]

0