<a href="https://colab.research.google.com/github/Rydzio/Machine-Learning/blob/main/Natural_language_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bag of Words

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

## Importing the dataset

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset file can be read.
from google.colab import drive

mount_point = '/content/gdrive/'
drive.mount(mount_point, force_remount=True)

Mounted at /content/gdrive/


In [None]:
# Path of the tab-separated reviews file on the mounted Drive.
data = '/content/gdrive/MyDrive/Machine Learning A-Z (Codes and Datasets)/Part 7 - Natural Language Processing/Section 36 - Natural Language Processing/Python/Restaurant_Reviews.tsv'
# quoting=3 is csv.QUOTE_NONE: quote characters inside a review are treated as ordinary text.
dataset = pd.read_csv(
    data,
    sep='\t',
    quoting=3,
)

## Cleaning the texts

In [None]:
import re
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Build the cleaned corpus: one space-joined string of stemmed, non-stopword
# tokens per review, in the same order as the rows of `dataset`.

# The stemmer and the stopword collection do not depend on the review, so they
# are created once here instead of once per review.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# Keep 'not' as a token so that negations such as "not good" stay in the text.
all_stopwords.remove('not')
# Build the lookup set once; the previous version rebuilt it for every word.
stopword_set = set(all_stopwords)

corpus = []
# Iterate over the column itself rather than a hard-coded range(0, 1000), so
# that the corpus always has exactly one entry per dataset row.
for raw_review in dataset['Review']:
  # Replace everything that is not an ASCII letter with a space.
  review = re.sub('[^a-zA-Z]', ' ', raw_review)
  review = review.lower().split()
  review = [ps.stem(word) for word in review if word not in stopword_set]
  review = ' '.join(review)
  corpus.append(review)

In [None]:
print(corpus)

['wow love place', 'crust not good', 'not tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would not go back', 'cashier care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair', 'shock sign indic cash', 'highli recommend', 'waitress littl slow servic', 'place not worth time let alon vega', 'not like', 'burritto blah', 'food amaz', 'servic also cute', 'could care less interior beauti', 'perform', 'right red velvet cake ohhh stuff good', 'never brought salad ask', 'hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashimi', 'also combo like burger fri beer decent deal', 'like final blow', 'found place accid could not

## Creating the Bag of Words model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features limited to the 1500 most frequent tokens of the corpus.
# NOTE(review): the vocabulary is fitted on all reviews, i.e. before the
# train/test split performed further down.
cv = CountVectorizer(max_features=1500)
bow_matrix = cv.fit_transform(corpus)
X = bow_matrix.toarray()

# Target labels: the last column of the dataset.
y = dataset.iloc[:, -1].values

In [None]:
len(X[0])

1500

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count every token over the whole corpus (no max_features cap here).
vec = CountVectorizer()
vec.fit(corpus)
bag_of_words = vec.transform(corpus)
sum_words = bag_of_words.sum(axis=0)

# Pair each vocabulary word with its total count, most frequent first.
words_freq = []
for word, idx in vec.vocabulary_.items():
  words_freq.append((word, sum_words[0, idx]))
words_freq.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
print(words_freq)

[('food', 127), ('not', 116), ('place', 112), ('good', 95), ('servic', 87), ('great', 70), ('go', 62), ('back', 61), ('time', 55), ('like', 51), ('realli', 36), ('love', 33), ('disappoint', 30), ('best', 30), ('would', 29), ('wait', 29), ('get', 28), ('ever', 28), ('restaur', 28), ('order', 28), ('also', 27), ('friendli', 27), ('eat', 27), ('one', 27), ('never', 26), ('nice', 25), ('amaz', 24), ('delici', 24), ('price', 22), ('tast', 22), ('vega', 22), ('come', 21), ('even', 21), ('pretti', 20), ('came', 20), ('experi', 20), ('staff', 19), ('us', 19), ('minut', 19), ('definit', 19), ('recommend', 18), ('say', 18), ('chicken', 18), ('server', 18), ('bad', 18), ('much', 18), ('star', 18), ('got', 18), ('steak', 18), ('flavor', 18), ('pizza', 18), ('want', 17), ('made', 17), ('tri', 17), ('salad', 17), ('first', 17), ('think', 17), ('dish', 17), ('menu', 16), ('could', 16), ('fri', 16), ('burger', 16), ('way', 16), ('better', 16), ('worst', 15), ('feel', 15), ('alway', 15), ('well', 15), 

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
print(X_train)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
print(X_test)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
print(y_train)

[1 1 1 0 1 0 1 0 0 0 1 0 0 1 1 0 1 1 1 1 0 1 0 0 1 1 1 0 0 1 1 1 1 0 1 0 1
 1 1 0 1 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 0 1 1
 0 1 1 1 0 1 0 0 1 1 1 0 1 0 1 0 0 0 1 0 0 0 0 0 1 1 1 0 1 1 1 1 0 0 1 0 0
 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 0 1 0 1 0 1 0 0 1 0 1 0 0 1 0 1 0
 0 0 1 1 1 0 0 0 1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 1 1 0
 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 1 1 0 1 1 0 1 1 1 0 1 1 0 0 0 1 1 0 1 0 1
 0 0 0 0 0 1 0 1 0 0 1 0 1 0 1 1 0 1 1 1 0 0 1 1 1 1 1 0 1 1 0 0 0 1 0 0 1
 0 0 0 1 0 0 0 0 1 1 1 0 0 1 0 1 1 0 0 1 0 1 0 0 1 0 1 1 0 1 0 1 1 0 1 1 0
 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 0 1 0 1 0 1 1 1 0 1 0 1 1 1 1 1 1 1 0 0
 1 1 1 0 0 1 1 0 1 0 1 0 0 0 1 1 1 0 1 1 1 1 1 1 0 0 1 0 1 0 0 1 0 0 0 1 0
 0 1 1 1 1 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1
 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 0 1 1 1 0 0 0 1 1 1
 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 1 1 0 0 1 1 0 0 1 0 0 0 1 0 1 0
 0 1 0 1 0 1 1 0 0 1 0 1 

In [None]:
print(y_test)

[0 0 0 0 0 0 1 0 0 1 1 1 0 1 1 1 0 0 0 1 0 1 1 0 0 1 1 1 1 0 1 1 1 1 1 0 0
 0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 1 1 1 1
 0 0 1 1 0 1 0 1 1 0 1 1 0 0 1 0 0 1 0 0 0 1 0 1 1 0 1 1 1 0 1 0 1 1 0 1 1
 1 0 0 1 0 1 1 1 1 1 0 1 0 0 0 1 0 0 1 0 1 0 0 1 1 1 1 1 0 1 1 1 0 0 0 0 1
 1 1 1 1 1 1 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0
 1 0 1 0 1 1 0 0 0 1 0 1 1 0 1]


# Training Models

In [None]:
# Results table kept as a 2-D string array: the first row is the header and
# each model section appends one row of metrics.
header = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
performance = np.array([header])

## Training the Logistic Regression model on the Training set

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the bag-of-words training features.
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
y_pred = classifier.predict(X_test)
# Two columns side by side: predicted label, true label.
prediction = np.column_stack((y_pred, y_test))

In [None]:
print(prediction)

[[0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [0 1]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]

### Evaluating Performance

In [None]:
model = 'Logistic Regression'
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred)
# Accuracy, precision, recall and F1 on the test set, each rounded to 3 decimals.
# NOTE: `ps` rebinds the name that the text-cleaning cell uses for the PorterStemmer.
ac, ps, rs, fs = (
    round(score(y_test, y_pred), 3)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
print(cm)

[[80 17]
 [28 75]]


In [None]:
print(ac)

0.775


In [None]:
print(ps)

0.815


In [None]:
print(rs)

0.728


In [None]:
print(fs)

0.769


In [None]:
# Add this model's metrics as a new row. Any existing row with the same model
# name is dropped first, so re-running the cell replaces the entry instead of
# appending a duplicate.
performance = np.concatenate((performance[performance[:, 0] != model], [[model, ac, ps, rs, fs]]))

In [None]:
print(performance)

[['Model' 'Accuracy' 'Precision' 'Recall' 'F1 Score']
 ['Logistic Regression' '0.775' '0.815' '0.728' '0.769']]


## Training the K-Nearest Neighbors model on the Training set

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5 nearest neighbours; the Minkowski metric with p=2 is the Euclidean distance.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
classifier.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
y_pred = classifier.predict(X_test)
# Two columns side by side: predicted label, true label.
prediction = np.column_stack((y_pred, y_test))

In [None]:
print(prediction)

[[0 0]
 [1 0]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [1 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [0 1]
 [0 1]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 1]
 [1 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [1 0]
 [1 0]
 [0 1]
 [1 0]
 [0 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [0 0]
 [0 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 0]
 [1 1]
 [0 1]
 [1 1]
 [1 0]
 [0 1]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [1 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]

### Evaluating Performance

In [None]:
model = 'K-NN'
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred)
# Accuracy, precision, recall and F1 on the test set, each rounded to 3 decimals.
ac, ps, rs, fs = (
    round(score(y_test, y_pred), 3)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
print(cm)

[[68 29]
 [42 61]]


In [None]:
print(ac)

0.645


In [None]:
print(ps)

0.678


In [None]:
print(rs)

0.592


In [None]:
print(fs)

0.632


In [None]:
# Add this model's metrics as a new row. Any existing row with the same model
# name is dropped first, so re-running the cell replaces the entry instead of
# appending a duplicate.
performance = np.concatenate((performance[performance[:, 0] != model], [[model, ac, ps, rs, fs]]))

In [None]:
print(performance)

[['Model' 'Accuracy' 'Precision' 'Recall' 'F1 Score']
 ['Logistic Regression' '0.775' '0.815' '0.728' '0.769']
 ['K-NN' '0.645' '0.678' '0.592' '0.632']]


## Training the Support Vector Machine model on the Training set

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel.
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
y_pred = classifier.predict(X_test)
# Two columns side by side: predicted label, true label.
prediction = np.column_stack((y_pred, y_test))

In [None]:
print(prediction)

[[0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 1]

### Evaluating Performance

In [None]:
model = 'SVM'
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred)
# Accuracy, precision, recall and F1 on the test set, each rounded to 3 decimals.
ac, ps, rs, fs = (
    round(score(y_test, y_pred), 3)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
print(cm)

[[79 18]
 [24 79]]


In [None]:
print(ac)

0.79


In [None]:
print(ps)

0.814


In [None]:
print(rs)

0.767


In [None]:
print(fs)

0.79


In [None]:
# Add this model's metrics as a new row. Any existing row with the same model
# name is dropped first, so re-running the cell replaces the entry instead of
# appending a duplicate.
performance = np.concatenate((performance[performance[:, 0] != model], [[model, ac, ps, rs, fs]]))

In [None]:
print(performance)

[['Model' 'Accuracy' 'Precision' 'Recall' 'F1 Score']
 ['Logistic Regression' '0.775' '0.815' '0.728' '0.769']
 ['K-NN' '0.645' '0.678' '0.592' '0.632']
 ['SVM' '0.79' '0.814' '0.767' '0.79']]


## Training the Kernel SVM model on the Training set

In [None]:
from sklearn.svm import SVC

# Support vector classifier with the RBF (Gaussian) kernel.
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
y_pred = classifier.predict(X_test)
# Two columns side by side: predicted label, true label.
prediction = np.column_stack((y_pred, y_test))

In [None]:
print(prediction)

[[0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 1]
 [0 0]
 [1 1]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [1 0]
 [0 1]
 [0 1]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]

### Evaluating Performance

In [None]:
model = 'Kernel SVM'
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred)
# Accuracy, precision, recall and F1 on the test set, each rounded to 3 decimals.
ac, ps, rs, fs = (
    round(score(y_test, y_pred), 3)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
print(cm)

[[89  8]
 [36 67]]


In [None]:
print(ac)

0.78


In [None]:
print(ps)

0.893


In [None]:
print(rs)

0.65


In [None]:
print(fs)

0.753


In [None]:
# Add this model's metrics as a new row. Any existing row with the same model
# name is dropped first, so re-running the cell replaces the entry instead of
# appending a duplicate.
performance = np.concatenate((performance[performance[:, 0] != model], [[model, ac, ps, rs, fs]]))

In [None]:
print(performance)

[['Model' 'Accuracy' 'Precision' 'Recall' 'F1 Score']
 ['Logistic Regression' '0.775' '0.815' '0.728' '0.769']
 ['K-NN' '0.645' '0.678' '0.592' '0.632']
 ['SVM' '0.79' '0.814' '0.767' '0.79']
 ['Kernel SVM' '0.78' '0.893' '0.65' '0.753']]


## Training the Naive Bayes model on the Training set

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes on the dense count features.
# NOTE(review): for word-count features MultinomialNB is the more usual
# variant; worth comparing.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
y_pred = classifier.predict(X_test)
# Two columns side by side: predicted label, true label.
prediction = np.column_stack((y_pred, y_test))

In [None]:
print(prediction)

[[1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]

### Evaluating Performance

In [None]:
model = 'Naive Bayes'
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred)
# Accuracy, precision, recall and F1 on the test set, each rounded to 3 decimals.
ac, ps, rs, fs = (
    round(score(y_test, y_pred), 3)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
print(cm)

[[55 42]
 [12 91]]


In [None]:
print(ac)

0.73


In [None]:
print(ps)

0.684


In [None]:
print(rs)

0.883


In [None]:
print(fs)

0.771


In [None]:
# Add this model's metrics as a new row. Any existing row with the same model
# name is dropped first, so re-running the cell replaces the entry instead of
# appending a duplicate.
performance = np.concatenate((performance[performance[:, 0] != model], [[model, ac, ps, rs, fs]]))

In [None]:
print(performance)

[['Model' 'Accuracy' 'Precision' 'Recall' 'F1 Score']
 ['Logistic Regression' '0.775' '0.815' '0.728' '0.769']
 ['K-NN' '0.645' '0.678' '0.592' '0.632']
 ['SVM' '0.79' '0.814' '0.767' '0.79']
 ['Kernel SVM' '0.78' '0.893' '0.65' '0.753']
 ['Naive Bayes' '0.73' '0.684' '0.883' '0.771']]


## Training the Decision Tree Classification model on the Training set

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree that chooses splits by the entropy criterion.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
y_pred = classifier.predict(X_test)
# Two columns side by side: predicted label, true label.
prediction = np.column_stack((y_pred, y_test))

In [None]:
print(prediction)

[[0 0]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [0 0]
 [1 1]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]

### Evaluating Performance

In [None]:
model = 'Decision Tree'
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred)
# Accuracy, precision, recall and F1 on the test set, each rounded to 3 decimals.
ac, ps, rs, fs = (
    round(score(y_test, y_pred), 3)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
print(cm)

[[78 19]
 [31 72]]


In [None]:
print(ac)

0.75


In [None]:
print(ps)

0.791


In [None]:
print(rs)

0.699


In [None]:
print(fs)

0.742


In [None]:
# Add this model's metrics as a new row. Any existing row with the same model
# name is dropped first, so re-running the cell replaces the entry instead of
# appending a duplicate.
performance = np.concatenate((performance[performance[:, 0] != model], [[model, ac, ps, rs, fs]]))

In [None]:
print(performance)

[['Model' 'Accuracy' 'Precision' 'Recall' 'F1 Score']
 ['Logistic Regression' '0.775' '0.815' '0.728' '0.769']
 ['K-NN' '0.645' '0.678' '0.592' '0.632']
 ['SVM' '0.79' '0.814' '0.767' '0.79']
 ['Kernel SVM' '0.78' '0.893' '0.65' '0.753']
 ['Naive Bayes' '0.73' '0.684' '0.883' '0.771']
 ['Decision Tree' '0.75' '0.791' '0.699' '0.742']]


## Training the Random Forest Classification model on the Training set

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 trees, each split chosen by the entropy criterion.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
y_pred = classifier.predict(X_test)
# Two columns side by side: predicted label, true label.
prediction = np.column_stack((y_pred, y_test))

In [None]:
print(prediction)

[[0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [1 0]
 [0 0]
 [1 1]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [0 1]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 1]
 [1 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 1]

### Evaluating Performance

In [None]:
model = 'Random Forest'
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred)
# Accuracy, precision, recall and F1 on the test set, each rounded to 3 decimals.
ac, ps, rs, fs = (
    round(score(y_test, y_pred), 3)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
print(cm)

[[87 10]
 [45 58]]


In [None]:
print(ac)

0.725


In [None]:
print(ps)

0.853


In [None]:
print(rs)

0.563


In [None]:
print(fs)

0.678


In [None]:
# Add this model's metrics as a new row. Any existing row with the same model
# name is dropped first, so re-running the cell replaces the entry instead of
# appending a duplicate.
performance = np.concatenate((performance[performance[:, 0] != model], [[model, ac, ps, rs, fs]]))

In [None]:
print(performance)

[['Model' 'Accuracy' 'Precision' 'Recall' 'F1 Score']
 ['Logistic Regression' '0.775' '0.815' '0.728' '0.769']
 ['K-NN' '0.645' '0.678' '0.592' '0.632']
 ['SVM' '0.79' '0.814' '0.767' '0.79']
 ['Kernel SVM' '0.78' '0.893' '0.65' '0.753']
 ['Naive Bayes' '0.73' '0.684' '0.883' '0.771']
 ['Decision Tree' '0.75' '0.791' '0.699' '0.742']
 ['Random Forest' '0.725' '0.853' '0.563' '0.678']]


## Training the CART model on the Training set

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree that chooses splits by Gini impurity (the "CART" row of the results).
classifier = DecisionTreeClassifier(criterion='gini', random_state=0)
classifier.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
y_pred = classifier.predict(X_test)
# Two columns side by side: predicted label, true label.
prediction = np.column_stack((y_pred, y_test))

In [None]:
print(prediction)

[[0 0]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [1 0]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [1 0]
 [0 0]
 [1 1]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]

### Evaluating Performance

In [None]:
model = 'CART'
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred)
# Accuracy, precision, recall and F1 on the test set, each rounded to 3 decimals.
ac, ps, rs, fs = (
    round(score(y_test, y_pred), 3)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
print(cm)

[[77 20]
 [33 70]]


In [None]:
print(ac)

0.735


In [None]:
print(ps)

0.778


In [None]:
print(rs)

0.68


In [None]:
print(fs)

0.725


In [None]:
# Add this model's metrics as a new row. Any existing row with the same model
# name is dropped first, so re-running the cell replaces the entry instead of
# appending a duplicate.
performance = np.concatenate((performance[performance[:, 0] != model], [[model, ac, ps, rs, fs]]))

In [None]:
print(performance)

[['Model' 'Accuracy' 'Precision' 'Recall' 'F1 Score']
 ['Logistic Regression' '0.775' '0.815' '0.728' '0.769']
 ['K-NN' '0.645' '0.678' '0.592' '0.632']
 ['SVM' '0.79' '0.814' '0.767' '0.79']
 ['Kernel SVM' '0.78' '0.893' '0.65' '0.753']
 ['Naive Bayes' '0.73' '0.684' '0.883' '0.771']
 ['Decision Tree' '0.75' '0.791' '0.699' '0.742']
 ['Random Forest' '0.725' '0.853' '0.563' '0.678']
 ['CART' '0.735' '0.778' '0.68' '0.725']]


## Training the Maximum Entropy model on the Training set

In [None]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression ("Maximum Entropy" in the results).
# NOTE(review): penalty='l2' and C=1.0 are scikit-learn's defaults, so apart
# from random_state this matches the Logistic Regression section, and the
# reported metrics of the two sections are identical.
classifier = LogisticRegression(
    penalty='l2',
    C=1.0,
)
classifier.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
y_pred = classifier.predict(X_test)
# Two columns side by side: predicted label, true label.
prediction = np.column_stack((y_pred, y_test))

In [None]:
print(prediction)

[[0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [0 1]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]

### Evaluating Performance

In [None]:
model = 'Maximum Entropy'
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred)
# Accuracy, precision, recall and F1 on the test set, each rounded to 3 decimals.
ac, ps, rs, fs = (
    round(score(y_test, y_pred), 3)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
print(cm)

[[80 17]
 [28 75]]


In [None]:
print(ac)

0.775


In [None]:
print(ps)

0.815


In [None]:
print(rs)

0.728


In [None]:
print(fs)

0.769


In [None]:
# Add this model's metrics as a new row. Any existing row with the same model
# name is dropped first, so re-running the cell replaces the entry instead of
# appending a duplicate.
performance = np.concatenate((performance[performance[:, 0] != model], [[model, ac, ps, rs, fs]]))

In [None]:
print(performance)

[['Model' 'Accuracy' 'Precision' 'Recall' 'F1 Score']
 ['Logistic Regression' '0.775' '0.815' '0.728' '0.769']
 ['K-NN' '0.645' '0.678' '0.592' '0.632']
 ['SVM' '0.79' '0.814' '0.767' '0.79']
 ['Kernel SVM' '0.78' '0.893' '0.65' '0.753']
 ['Naive Bayes' '0.73' '0.684' '0.883' '0.771']
 ['Decision Tree' '0.75' '0.791' '0.699' '0.742']
 ['Random Forest' '0.725' '0.853' '0.563' '0.678']
 ['CART' '0.735' '0.778' '0.68' '0.725']
 ['Maximum Entropy' '0.775' '0.815' '0.728' '0.769']]


## Comparing the Model Performances

TP = # True Positives, TN = # True Negatives, FP = # False Positives, FN = # False Negatives

- Accuracy measures the fraction of all predictions that match the real labels.

Accuracy = (TP + TN) / (TP + TN + FP + FN)


- Precision measures exactness: of the reviews predicted as positive, the fraction that really are positive.

Precision = TP / (TP + FP)

- Recall measures completeness: of the reviews that really are positive, the fraction the model predicted as positive.

Recall = TP / (TP + FN)

- F1 Score is a compromise between Precision and Recall (their harmonic mean).

F1 Score = 2 * Precision * Recall / (Precision + Recall)

In [None]:
# The first row of `performance` holds the column titles. Use it as the
# DataFrame header instead of printing it as data row 0 under integer labels.
print(pd.DataFrame(performance[1:], columns=performance[0]))

                     0         1          2       3         4
0                Model  Accuracy  Precision  Recall  F1 Score
1  Logistic Regression     0.775      0.815   0.728     0.769
2                 K-NN     0.645      0.678   0.592     0.632
3                  SVM      0.79      0.814   0.767      0.79
4           Kernel SVM      0.78      0.893    0.65     0.753
5          Naive Bayes      0.73      0.684   0.883     0.771
6        Decision Tree      0.75      0.791   0.699     0.742
7        Random Forest     0.725      0.853   0.563     0.678
8                 CART     0.735      0.778    0.68     0.725
9      Maximum Entropy     0.775      0.815   0.728     0.769


# Predicting if a single review is positive or negative

## Positive review

Use our model to predict if the following review:

"I love this restaurant so much"

is positive or negative.

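**Solution:** We repeat the same text preprocessing steps as before, this time on a single review. The prediction uses `classifier`, i.e. the model that was trained last (Maximum Entropy when the notebook is run top to bottom).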

In [None]:
new_review = 'I love this restaurant so much'

# Apply the same cleaning steps used to build the training corpus:
# letters only, lower-case, drop stopwords (keeping 'not'), stem.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
# Build the lookup set once instead of once per word.
stopword_set = set(all_stopwords)
tokens = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
new_review = ' '.join(ps.stem(word) for word in tokens if word not in stopword_set)
new_corpus = [new_review]

# Use transform (not fit_transform) on the already fitted vectorizer, so the
# columns match the features the classifier was trained on.
new_X_test = cv.transform(new_corpus).toarray()
# `classifier` is whichever model was fitted most recently.
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

[1]


The review was correctly predicted as positive by our model.

## Negative review

Use our model to predict if the following review:

"I hate this restaurant so much"

is positive or negative.

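**Solution:** We repeat the same text preprocessing steps as before, this time on a single review. The prediction uses `classifier`, i.e. the model that was trained last (Maximum Entropy when the notebook is run top to bottom).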

In [None]:
new_review = 'I hate this restaurant so much'

# Apply the same cleaning steps used to build the training corpus:
# letters only, lower-case, drop stopwords (keeping 'not'), stem.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
# Build the lookup set once instead of once per word.
stopword_set = set(all_stopwords)
tokens = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
new_review = ' '.join(ps.stem(word) for word in tokens if word not in stopword_set)
new_corpus = [new_review]

# Use transform (not fit_transform) on the already fitted vectorizer, so the
# columns match the features the classifier was trained on.
new_X_test = cv.transform(new_corpus).toarray()
# `classifier` is whichever model was fitted most recently.
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

[0]


The review was correctly predicted as negative by our model.