In [1]:
# Import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv

In [2]:
# Read the data
df = pd.read_csv('Restaurant_Reviews.tsv', delimiter ='\t')

In [3]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place. 25 times,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## Text Processing

In [4]:
# Clean the text by removing any extra characters 
import re # tools for cleaning text
import nltk # library to perform NLP
nltk.download('stopwords') # tool to remove the non-essential words. It is a list of words
from nltk.corpus import stopwords
# Stemming means taking the root of the word such as love from loved or loving
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\patel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [5]:
# Steps for cleaning the first review and apply for loop on all the reviews
df['Review'][0]

review = re.sub('[^a-zA-Z]', ' ', df['Review'][0]) # only keep the letters so what we dont want to remove

review = review.lower() # make all letters as lowercase 

review = review.split()  # split the review into separate words to create a list of words

ps =PorterStemmer()

review = [ps.stem(word) for word in review if not  word in set(stopwords.words('english'))] # use set for larger text

review = ' '.join(review) # take the list of words back into the string

In [6]:
review

'wow love place time'

In [8]:
# Do the same steps for all the reviews in the dataset using the for loop
corpus = [] # Build a new list for all the words from all the reviews
# corpus is a collection of text and a common term used in NLP
for i in range (0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', df['Review'][i] )
    review = review.lower()
    review = review.split()
    ps =PorterStemmer()
    review = [ps.stem(word) for word in review if not  word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [9]:
corpus

['wow love place time',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch',
 'servic prompt',
 'would go back',
 'cashier care ever say still end wayyy overpr',
 'tri cape cod ravoli chicken cranberri mmmm',
 'disgust pretti sure human hair',
 'shock sign indic cash',
 'highli recommend',
 'waitress littl slow servic',
 'place worth time let alon vega',
 'like',
 'burritto blah',
 'food amaz',
 'servic also cute',
 'could care less interior beauti',
 'perform',
 'right red velvet cake ohhh stuff good',
 'name',
 'hole wall great mexican street taco friendli staff',
 'took hour get food tabl restaur food luke warm sever run around like total overwhelm',
 'worst salmon sashimi',
 'also combo like burger fri beer decent deal',
 'like final blow',
 'found place accid could happi

## Feature Extraction
### Bag of Words Model

In [10]:
# Create the bag of words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features= 1500)# We restrict the words to 1500 which keeps the most frequent words


In [11]:
X = cv.fit_transform(corpus).toarray()# independent variables

In [12]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
print(cv.get_feature_names())

['absolut', 'absolutley', 'accid', 'accommod', 'accomod', 'account', 'ach', 'acknowledg', 'across', 'actual', 'ad', 'afford', 'afternoon', 'ago', 'ahead', 'airlin', 'airport', 'ala', 'albondiga', 'allergi', 'almond', 'almost', 'alon', 'also', 'although', 'alway', 'amaz', 'ambianc', 'ambienc', 'amount', 'ampl', 'andddd', 'angri', 'annoy', 'anoth', 'anticip', 'anymor', 'anyon', 'anyth', 'anytim', 'anyway', 'apart', 'apolog', 'app', 'appal', 'appar', 'appeal', 'appet', 'appetit', 'appl', 'approv', 'area', 'arepa', 'aria', 'around', 'array', 'arriv', 'articl', 'ask', 'assur', 'ate', 'atmospher', 'atroci', 'attach', 'attack', 'attent', 'attitud', 'auju', 'authent', 'averag', 'avocado', 'avoid', 'aw', 'away', 'awesom', 'awkward', 'awkwardli', 'ayc', 'az', 'baba', 'babi', 'bachi', 'back', 'bacon', 'bad', 'bagel', 'bakeri', 'baklava', 'ball', 'bamboo', 'banana', 'bank', 'bar', 'bare', 'bargain', 'bartend', 'base', 'basebal', 'basic', 'batch', 'bathroom', 'batter', 'bay', 'bbq', 'bean', 'beat',



In [14]:
X.shape

(1000, 1500)

In [14]:
#df = pd.DataFrame(X.toarray(), columns=cv.get_feature_names())

In [15]:
y = df.iloc[:, 1].values # dependent variable (These are your Liked values from the dataset)

 ## Modeling 
 #### Naive Bayes Model

In [16]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Fitting classifier to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)



GaussianNB()

In [17]:
# Predicting the Test set results 
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score
cm = confusion_matrix(y_test, y_pred)

In [18]:
# Evaluating the results
cm
accuracy_score(y_test, y_pred)
f1_score(y_test, y_pred)
recall_score(y_test, y_pred) # tp / (tp + fn)
precision_score(y_test, y_pred) # tp / (tp + fp)

0.6791044776119403

### KNN Model

In [19]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)


# Fitting classifier to the Training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [20]:
cm
accuracy_score(y_test, y_pred)
f1_score(y_test, y_pred)
recall_score(y_test, y_pred) # tp / (tp + fn)
precision_score(y_test, y_pred) # tp / (tp + fp)

0.6756756756756757

### Support Vector Machine SVM Model

In [21]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)


# Fitting classifier to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)


# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

In [22]:
cm
accuracy_score(y_test, y_pred)
f1_score(y_test, y_pred)
recall_score(y_test, y_pred) # tp / (tp + fn)
precision_score(y_test, y_pred) # tp / (tp + fp)

0.7777777777777778

### Decision Tree Classification Model

In [23]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)


# Fitting classifier to the Training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [24]:
cm


array([[83, 34],
       [49, 84]], dtype=int64)

In [25]:
accuracy_score(y_test, y_pred)
f1_score(y_test, y_pred)
recall_score(y_test, y_pred) # tp / (tp + fn)
precision_score(y_test, y_pred) # tp / (tp + fp)

0.711864406779661

### Random Forest Model

In [26]:
# Splitting the dataset into the Training set and Test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)



# Fitting classifier to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)


# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [27]:
cm

array([[100,  17],
       [ 58,  75]], dtype=int64)

In [28]:
accuracy_score(y_test, y_pred)
f1_score(y_test, y_pred)
recall_score(y_test, y_pred) # tp / (tp + fn)
precision_score(y_test, y_pred) # tp / (tp + fp)

0.8152173913043478

## Feature Extraction
### TF IDF 

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))


In [30]:
corpus = vectorizer.fit_transform(corpus).toarray()

In [31]:
X =corpus

In [32]:
X.shape

(1000, 179)

### Random Forest 

In [33]:
# Splitting the dataset into the Training set and Test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)



# Fitting classifier to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)


# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [34]:
cm

array([[88, 29],
       [54, 79]], dtype=int64)

In [35]:
accuracy_score(y_test, y_pred)
f1_score(y_test, y_pred)
recall_score(y_test, y_pred) # tp / (tp + fn)
precision_score(y_test, y_pred) # tp / (tp + fp)

0.7314814814814815

### Decision Tree 

In [36]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)


# Fitting classifier to the Training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [37]:
cm

array([[85, 32],
       [56, 77]], dtype=int64)

In [38]:
accuracy_score(y_test, y_pred)
f1_score(y_test, y_pred)
recall_score(y_test, y_pred) # tp / (tp + fn)
precision_score(y_test, y_pred) # tp / (tp + fp)

0.7064220183486238

### Support Vector Machine SVM Model

In [39]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)


# Fitting classifier to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)


# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

In [40]:
cm

array([[95, 22],
       [55, 78]], dtype=int64)

In [41]:
accuracy_score(y_test, y_pred)
f1_score(y_test, y_pred)
recall_score(y_test, y_pred) # tp / (tp + fn)
precision_score(y_test, y_pred) # tp / (tp + fp)

0.78

### KNN Model

In [42]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)


# Fitting classifier to the Training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [43]:
cm

array([[76, 41],
       [64, 69]], dtype=int64)

In [44]:
accuracy_score(y_test, y_pred)
f1_score(y_test, y_pred)
recall_score(y_test, y_pred) # tp / (tp + fn)
precision_score(y_test, y_pred) # tp / (tp + fp)

0.6272727272727273

### Naive Bayes Algorithm

In [45]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)



# Fitting classifier to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score
cm = confusion_matrix(y_test, y_pred)

In [46]:
cm


array([[57, 40],
       [26, 77]], dtype=int64)

In [47]:
accuracy_score(y_test, y_pred)
f1_score(y_test, y_pred)
recall_score(y_test, y_pred) # tp / (tp + fn)
precision_score(y_test, y_pred) # tp / (tp + fp)

0.6581196581196581

### Questions

#### 1. What is Natural Language processing and why is it important in the context of data analysis in today’s world? 
Sentiment analysis is a natural language processing (NLP) technique used to automatically identify and extract subjective information from text, such as opinions, attitudes, emotions, and feelings. It involves analyzing a piece of text, such as a sentence, paragraph, or entire document, and determining whether its overall sentiment is positive, negative, or neutral.

With the ever-growing enormous amount of unstructured data available in natural language formats, such as text, audio, and social media, NLP is becoming more and more significant in the context of data analysis in the modern world. With help of NLP organizations can gain useful insights from this data by utilising NLP techniques,such as sentiment analysis, topic modelling, entity recognition, and relationship extraction.

#### 2. What is the purpose of stopwords in NLP?

Stopwords in NLP are used to filter out these frequent words so that the emphasis can be placed on the terms that have higher significance and relevance in a particular context. Stopwords are frequently eliminated from texts to leave behind a smaller, more meaningful group of words, which can increase analysis's precision.

#### 3. What is the purpose of toarray() in the following code : X = cv.fit_transform(corpus).toarray()
The output of the fit transform() method is converted to a NumPy array in the provided code using the toarray() function.Using a CountVectorizer object called cv, the scikit-learn library's cv.fit transform(corpus) method converts the corpus's text input into a matrix of numerical features.The outcome is a sparse matrix, a kind of matrix in which the majority of the entries are 0. The sparse matrix given by fit transform() is converted to a dense NumPy array using the toarray() method since most machine learning methods require data to be in a dense matrix format.

#### 4. Evaluate the performance of each of these models. Create a table to highlight the differences across both the approaches

In [5]:
from tabulate import tabulate

# Sample data in the form of a list of lists
data = [["KNN",67.56 ], ["Naive Bayes Algorithm",67.91], ["Decision Tree",71.18], ["SVM", 77.77 ], ["Random Forest",81.52]]

# Table headers
headers = ["Model", "Precision in percentage"]

# Tabulate the data with headers
table = tabulate(data, headers=headers)

# Print the table
print(table)

Model                    Precision in percentage
---------------------  -------------------------
KNN                                        67.56
Naive Bayes Algorithm                      67.91
Decision Tree                              71.18
SVM                                        77.77
Random Forest                              81.52


#### 5. Which model predicts the best ?
Random Forest model is the best among all as per the above table