# Natural Language Processing
---

**Importing libraries:**

In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import re
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

**Import dataset**

In [2]:
dataset = pd.read_csv('Data/Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
print(dataset.shape)
dataset.head()

(1000, 2)


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## Cleanig the texts

In [3]:
print(dataset['Review'][0])

Wow... Loved this place.


### We remove all signs, expect a-z letters and space

In [4]:
review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][0])
review

'Wow    Loved this place '

### Made all words in lowercase and split

In [5]:
review = review.lower()
review = review.split()
review

['wow', 'loved', 'this', 'place']

### Words change in one form (loved = love)

In [6]:
ps = PorterStemmer()

### Remove non-significant word(and, in, all, the, this) that word don't help ML algorithm to predict

In [7]:
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review

['wow', 'love', 'place']

### Back join words

In [8]:
review = " ".join(review)
review

'wow love place'

### All dataset in cleaning

In [10]:
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = " ".join(review)
    corpus.append(review)   

In [12]:
print(corpus[0:10])
dataset[0:10]

['wow love place', 'crust good', 'tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch']


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


### Create the bag of words model

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
#1500 most frequent word 
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
X.shape

(1000, 1500)

### Application to used Naive Base Model

In [53]:
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size = 0.2,
                                                    random_state = 1)

# Fitting Naive Bayes to the Training set
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print('Accuracy:')
Accuracy = (cm[0, 0] + cm[1, 1]) / 200 * 100
print(Accuracy, '%')

print('Precision:')
Precision = cm[0, 0] / (cm[0, 0] + cm[0, 1]) * 100
print(Precision, '%')

print('Recall:')
Recall = cm[0, 0] / (cm[0, 0] + cm[1, 1]) * 100
print(Recall, '%')

print('F1 Score')
F1_Score = 2 * Precision * Recall / (Precision + Recall)
print(F1_Score, '%')

Accuracy:
68.5 %
Precision:
55.55555555555556 %
Recall:
43.79562043795621 %
F1 Score
48.9795918367347 %


### Application to used Random Forest Classification Model

In [52]:
from sklearn.ensemble import RandomForestClassifier

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size = 0.2,
                                                    random_state=1)
X_train.shape

# Fitting Naive Bayes to the Training set
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print('Accuracy:')
Accuracy = (cm[0, 0] + cm[1, 1]) / 200 * 100
print(Accuracy, '%')

print('Precision:')
Precision = cm[0, 0] / (cm[0, 0] + cm[0, 1]) * 100
print(Precision, '%')

print('Recall:')
Recall = cm[0, 0] / (cm[0, 0] + cm[1, 1]) * 100
print(Recall, '%')

print('F1 Score')
F1_Score = 2 * Precision * Recall / (Precision + Recall)
print(F1_Score, '%')

Accuracy:
73.5 %
Precision:
91.66666666666666 %
Recall:
67.3469387755102 %
F1 Score
77.64705882352939 %


### Application to used Decision Tree Classification Model

In [51]:
from sklearn.tree import DecisionTreeClassifier

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size = 0.2,
                                                    random_state=1)
X_train.shape

# Fitting Naive Bayes to the Training set
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print('Accuracy:')
Accuracy = (cm[0, 0] + cm[1, 1]) / 200 * 100
print(Accuracy, '%')

print('Precision:')
Precision = cm[0, 0] / (cm[0, 0] + cm[0, 1]) * 100
print(Precision, '%')

print('Recall:')
Recall = cm[0, 0] / (cm[0, 0] + cm[1, 1]) * 100
print(Recall, '%')

print('F1 Score')
F1_Score = 2 * Precision * Recall / (Precision + Recall)
print(F1_Score, '%')

Accuracy:
72.0 %
Precision:
75.92592592592592 %
Recall:
56.94444444444444 %
F1 Score
65.07936507936508 %
