## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

## Import Dataset

In [3]:
dataset = pd.read_csv(r'Restaurant_Reviews.tsv', delimiter='\t', quoting=3)
dataset.shape

(1000, 2)

In [4]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## Remove stopwords from sentences and make a list of 1000 fresh sentences

In [5]:
corpus = []

for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [6]:
print(len(corpus))

1000


In [7]:
print(corpus[0:5])

['wow love place', 'crust good', 'tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price']


In [8]:
print(corpus[-5:])

['think food flavor textur lack', 'appetit instantli gone', 'overal impress would go back', 'whole experi underwhelm think go ninja sushi next time', 'wast enough life pour salt wound draw time took bring check']


## Create Tfidf Vectorizer

In [9]:
cv = TfidfVectorizer()
cv

## Fit_Transform 1000 fresh sentences to Tfidf Vectorizer

* X = Independent Variable (`Review`)
* y = Dependent Variable (`Liked`)

In [10]:
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

In [11]:
X.shape

(1000, 1565)

In [12]:
y.shape

(1000,)

In [18]:
X[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [19]:
X[-1]

array([0., 0., 0., ..., 0., 0., 0.])

In [21]:
y[0:100]

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1], dtype=int64)

## Export Dataset

In [30]:
# for i in cv.idf_:
#     print(i)

In [32]:
# cv.vocabulary_

In [41]:
# sorted_vocabulary_by_index = cv.vocabulary_
# sorted_vocabulary_by_index = sorted(sorted_vocabulary_by_index.items(), key=lambda item: item[1])
# print(sorted_vocabulary_by_index)

In [38]:
# sorted_vocabulary_by_keys = sorted(cv.vocabulary_.items())
# sorted_vocabulary_by_keys

In [26]:
# X_ft_trnsfrm = cv.fit_transform(corpus)
# print(X_ft_trnsfrm)

In [28]:
# print(X_ft_trnsfrm.toarray())

In [48]:
# for vocab, idfscore in zip(cv.vocabulary_, cv.idf_):
#     print(cv.vocabulary_[vocab], ':', vocab, ':', idfscore)

In [53]:
# zp = zip(cv.vocabulary_, cv.idf_)
# list(zp)

In [74]:
# tfidf_dict =  {"Word_Index": [], "Unique_Word": [], "Idf_Score": []}
# for vocab, idfscore in zip(cv.vocabulary_, cv.idf_):
#     tfidf_dict["Word_Index"].append(cv.vocabulary_[vocab])
#     tfidf_dict["Unique_Word"].append(vocab)
#     tfidf_dict["Idf_Score"].append(idfscore)

# tfidf_df = pd.DataFrame(tfidf_dict)
# tfidf_df.shape

In [75]:
# tfidf_df_sorted = tfidf_df.sort_values(by=['Word_Index'], ascending=True)
# tfidf_df_sorted.head()
# tfidf_df_sorted.to_csv('Unique_words_idf_score.csv')

In [13]:
dataset.to_csv('Restaurant_Reviews_Csv.csv', index=False)

In [14]:
dict = {'Cleaned_Reviews': corpus}
df_corpus = pd.DataFrame(dict)
df_corpus.to_csv('Corpus.csv', index=False)

In [15]:
df_x = pd.DataFrame(X)
df_x.to_csv('X.csv', index=False)

## Split Independent and Dependent Variables into Train and Test Dataset

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [23]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(800, 1565)
(200, 1565)
(800,)
(200,)


In [24]:
X_train[0:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
X_test[0:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [26]:
y_train[0:5]

array([1, 1, 1, 0, 1], dtype=int64)

In [27]:
y_test[0:5]

array([0, 0, 0, 0, 0], dtype=int64)

## Create DecisionTreeClassifier

In [28]:
dt_classifier = DecisionTreeClassifier()
dt_classifier

In [29]:
dt_classifier.fit(X_train, y_train)

## Predict X_test

In [31]:
y_pred = dt_classifier.predict(X_test)
y_pred.shape

(200,)

In [32]:
y_pred

array([0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0], dtype=int64)

## Confusion Matrix

In [33]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[76 21]
 [40 63]]


## Accuracy Score

In [34]:
ac = accuracy_score(y_test, y_pred)
print(ac)

0.695


## Bias

In [36]:
bias = dt_classifier.score(X_train,y_train)
bias

0.99625

## Variance

In [None]:
variance = dt_classifier.score(X_test,y_test)
variance