In [135]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
import re
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

ps = PorterStemmer()
df = pd.read_csv('./Restaurant_Reviews.tsv', sep = '\t', quoting = 3)
review = 'I do love this restaurant so much'
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\townsend\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [136]:
len(stopwords), np.array(stopwords)

(179,
 array(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
        'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
        'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
        'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
        'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
        'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
        'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
        'by', 'for', 'with', 'about', 'against', 'between', 'into',
        'through', 'during', 'before', 'after', 'above', 'below', 'to',
        'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
        'again', 'further', 'then', 'once', 'here', 'there', 'when',
        'where', 'why', 'how', 'all', 'any', 'both

In [137]:
stopwords_exceptions = ('no', 'not', 'don', 'don\'t', 'aren\'t', 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", "won't", 'wouldn', "wouldn't")

stopwords = [a for a in stopwords if a not in stopwords_exceptions]
len(stopwords), np.array(stopwords)

(144,
 array(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
        'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
        'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
        'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
        'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
        'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
        'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
        'by', 'for', 'with', 'about', 'against', 'between', 'into',
        'through', 'during', 'before', 'after', 'above', 'below', 'to',
        'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
        'again', 'further', 'then', 'once', 'here', 'there', 'when',
        'where', 'why', 'how', 'all', 'any', 'both

In [138]:
df['Review'].head()

0                             Wow... Loved this place.
1                                   Crust is not good.
2            Not tasty and the texture was just nasty.
3    Stopped by during the late May bank holiday of...
4    The selection on the menu was great and so wer...
Name: Review, dtype: object

In [139]:
review = re.sub('[^A-Za-z]', ' ', review)
fun = lambda x: ' '.join(ps.stem(a) for a in re.sub('[^a-z]', ' ', x.lower()).split() if a not in stopwords)
df['Review'] = df['Review'].apply(fun)

df['Review'].head()

0                                       wow love place
1                                       crust not good
2                               not tasti textur nasti
3    stop late may bank holiday rick steve recommen...
4                              select menu great price
Name: Review, dtype: object

In [140]:
CountVectorizer().fit_transform(df['Review'].values).size

5573

In [141]:
cv = CountVectorizer(max_features = 5500)
X = cv.fit_transform(df['Review'].values).toarray()
y = df['Liked'].values

In [142]:
review = ' '.join(ps.stem(a) for a in re.sub('^a-z', ' ', review).split() if a not in stopwords)
review = cv.transform([review]).toarray()

In [143]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

models = {'Logistic Regression': LogisticRegression(), 'K-NN': KNeighborsClassifier,
'SVC (Linear)': SVC(kernel = 'linear'), 'SVC (rbf)': SVC(kernel = 'rbf'), 'SVC (poly)': SVC(kernel = 'poly'),
'SVC (sigmoid)': SVC(kernel = 'sigmoid'), 'GauusianNB': GaussianNB(),
'Random Forest': RandomForestClassifier(), 'Decision Tree': DecisionTreeClassifier(criterion = 'entropy')} 

results = pd.DataFrame(columns = ['Score', 'Predict', 'Correct'])

for modelName in models:
    Score = 0
    
    if modelName == 'K-NN':
        
        for a in range(1, 50):
            model = models[modelName](a)
            model.fit(X_train, y_train)
            
            if model.score(X_test, y_test) > Score:
                Score = model.fit(X_train, y_train).score(X_test, y_test)
                Predict = model.fit(X, y).predict(review)[0]
                Correct = Predict == 1
                
    else:
        Score = models[modelName].fit(X_train, y_train).score(X_test, y_test)
        Predict = models[modelName].fit(X, y).predict(review)[0]
        Correct = Predict == 1
    
    results.loc[modelName] = dict(Score = f'{round(Score * 100, 2)}%', Predict = Predict, Correct = Correct)
    
results.sort_values('Score', ascending = False)

Unnamed: 0,Score,Predict,Correct
SVC (sigmoid),80.4%,1,True
Logistic Regression,80.0%,1,True
SVC (rbf),80.0%,1,True
SVC (Linear),79.2%,1,True
Random Forest,78.0%,1,True
Decision Tree,73.6%,0,False
K-NN,70.8%,1,True
GauusianNB,68.8%,1,True
SVC (poly),55.6%,1,True
