In [66]:
%matplotlib inline
from ipywidgets import interactive
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import nltk
import string
import re

In [67]:
# Load the tab-separated restaurant reviews and preview the first rows.
data = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [68]:
# Working copy of the review text column; later cells rebind `review` at each cleaning step.
review = data['Review']

In [69]:
# Target column (1 = liked, 0 = not liked, per the preview above).
# NOTE(review): the visible modelling cells pass data['Liked'] directly rather than `label`.
label = data['Liked']

In [70]:
# Change to lower case using the vectorised string accessor. Unlike calling
# x.lower() on each element, missing values pass through as NaN instead of
# raising AttributeError.
review = review.str.lower()

In [71]:
# Fetch the NLTK stop-word corpus (the recorded output shows it is a no-op when already up to date).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CompuTek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [72]:
# Tokens to discard: NLTK's English stop words followed by every
# character of string.punctuation, as one flat list.
useless_words = [
    *nltk.corpus.stopwords.words("english"),
    *string.punctuation,
]

In [73]:
useless_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [74]:
def remove_useless(words):
    """Return the distinct tokens of ``words`` that are not in ``useless_words``.

    Parameters
    ----------
    words : iterable
        Hashable tokens (e.g. strings) to filter.

    Returns
    -------
    set
        The unique tokens that survive filtering. Because the result is a
        set, the input order and duplicate counts are not preserved.
    """
    # Build the lookup set once so each membership test is O(1) instead of
    # scanning the module-level ``useless_words`` list for every token.
    blocked = set(useless_words)
    return {word for word in words if word not in blocked}

In [75]:
# word tokenization
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # tokenizer data; downloaded before word_tokenize is first called
# Each review string becomes a list of tokens; the function is passed
# directly rather than wrapped in a lambda.
review = review.apply(word_tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\CompuTek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [76]:
review.head()

0                    [wow, ..., loved, this, place, .]
1                            [crust, is, not, good, .]
2    [not, tasty, and, the, texture, was, just, nas...
3    [stopped, by, during, the, late, may, bank, ho...
4    [the, selection, on, the, menu, was, great, an...
Name: Review, dtype: object

In [77]:
# review = review.apply(lambda words: [w for w in words if w not in useless_words])

In [78]:
review.head()

0                    [wow, ..., loved, this, place, .]
1                            [crust, is, not, good, .]
2    [not, tasty, and, the, texture, was, just, nas...
3    [stopped, by, during, the, late, may, bank, ho...
4    [the, selection, on, the, menu, was, great, an...
Name: Review, dtype: object

In [79]:
# WordNetLemmatizer is provided by nltk.stem (nltk.stem.wordnet); nltk.stem.isri
# is the ISRI Arabic stemmer module and does not export it.
from nltk.stem import WordNetLemmatizer

In [80]:
nltk.download('wordnet')
# Build the lemmatizer once and reuse it for every token, instead of
# constructing a new WordNetLemmatizer for each word of each review.
lemmatizer = WordNetLemmatizer()
review = review.apply(lambda words: [lemmatizer.lemmatize(w) for w in words])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CompuTek\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [81]:
review.head()

0                    [wow, ..., loved, this, place, .]
1                            [crust, is, not, good, .]
2    [not, tasty, and, the, texture, wa, just, nast...
3    [stopped, by, during, the, late, may, bank, ho...
4    [the, selection, on, the, menu, wa, great, and...
Name: Review, dtype: object

In [82]:
from nltk.stem.porter import PorterStemmer

In [83]:
# Build the stemmer once and reuse it for every token, instead of
# constructing a new PorterStemmer for each word of each review.
stemmer = PorterStemmer()
review = review.apply(lambda words: [stemmer.stem(w) for w in words])

In [84]:
review.head()

0                      [wow, ..., love, thi, place, .]
1                            [crust, is, not, good, .]
2    [not, tasti, and, the, textur, wa, just, nasti...
3    [stop, by, dure, the, late, may, bank, holiday...
4    [the, select, on, the, menu, wa, great, and, s...
Name: Review, dtype: object

# Training without preprocessing (raw review text)

In [85]:
# review = review.apply(lambda x: " ".join(x))

In [86]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score

In [87]:
# TF-IDF features over word 1- to 3-grams of the raw review text,
# keeping only terms that occur in at least two reviews (min_df=2).
# Despite its name, `train_df_vectorized` holds the fitted vectorizer; X is the feature matrix.
train_df_vectorized = TfidfVectorizer(ngram_range=(1, 3), min_df=2)
X = train_df_vectorized.fit_transform(data['Review'])

In [88]:
# Multinomial Naive Bayes with smoothing parameter alpha=0.1, fitted on all rows of X.
clfrNB = MultinomialNB(alpha=0.1)
clfrNB.fit(X, y=data['Liked'])

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [89]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation scores of the classifier on the vectorized raw reviews.
# NOTE(review): X comes from a vectorizer fitted on every review, so the
# held-out folds already contributed to its vocabulary/IDF statistics.
cv = cross_val_score(clfrNB, X, y=data['Liked'], cv=10)

In [90]:
cv

array([0.81, 0.81, 0.81, 0.77, 0.76, 0.83, 0.86, 0.84, 0.79, 0.77])

In [91]:
cv.mean()

0.805

In [92]:
# Vectorize one hand-written sentence with the fitted vectorizer and classify it.
test = train_df_vectorized.transform(['I do not love them'])
clfrNB.predict(test)

array([0], dtype=int64)

In [101]:
!pip install PyArabic

Collecting PyArabic
  Downloading https://files.pythonhosted.org/packages/c6/e2/64c8b4605286b477c2f85963d58da8f97f24b0d0766e893e6817816fae63/PyArabic-0.6.5-py3-none-any.whl (110kB)
Installing collected packages: PyArabic
Successfully installed PyArabic-0.6.5


# Training with cleaning (lower-cased, tokenized, lemmatized and stemmed reviews)

In [93]:
# Re-assemble each token list into one space-separated string for the vectorizer.
review = review.apply(" ".join)

In [94]:
# TF-IDF features over word 1- to 3-grams of the cleaned reviews,
# keeping only terms that occur in at least two reviews (min_df=2).
# This rebinds both the vectorizer and X used by the earlier raw-text cells.
train_df_vectorized = TfidfVectorizer(ngram_range=(1, 3), min_df=2)
X = train_df_vectorized.fit_transform(review)

In [95]:
# Fresh Multinomial Naive Bayes (alpha=0.1) fitted on the cleaned-text features.
clfrNB = MultinomialNB(alpha=0.1)
clfrNB.fit(X, y=data['Liked'])

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [96]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation scores of the classifier on the cleaned-text features.
# NOTE(review): X comes from a vectorizer fitted on every review, so the
# held-out folds already contributed to its vocabulary/IDF statistics.
cv = cross_val_score(clfrNB, X, y=data['Liked'], cv=10)

In [97]:
cv

array([0.79, 0.84, 0.79, 0.8 , 0.77, 0.85, 0.85, 0.84, 0.81, 0.79])

In [98]:
cv.mean()

0.813

In [100]:
# The vectorizer in this section was fitted on cleaned text, so the query must
# go through the same steps as the training reviews (lower-case -> tokenize ->
# lemmatize -> stem -> re-join) before it is vectorized. Otherwise inflected
# words would not match the stemmed vocabulary.
test = ['I do not love them']
query_lemmatizer = WordNetLemmatizer()
query_stemmer = PorterStemmer()
test = [
    " ".join(
        query_stemmer.stem(query_lemmatizer.lemmatize(token))
        for token in word_tokenize(text.lower())
    )
    for text in test
]
test = train_df_vectorized.transform(test)
clfrNB.predict(test)

array([0], dtype=int64)