In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [3]:
# Load the tab-separated reviews file.
# quoting=3 is csv.QUOTE_NONE: quote characters inside a review are kept as ordinary text.
dataset = pd.read_csv(
    "Restaurant_Reviews.tsv",
    sep='\t',
    quoting=3,
)

In [28]:
# Cleaning the texts
import re
import nltk  # supplies the English stop-word list used below
nltk.download("stopwords")  # make sure the stop-word corpus is available locally
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer  # reduces words to their stem, so "love" and "loved" become one term

# Everything that does not depend on an individual review is built once, before the loop.
ps = PorterStemmer()
# "not" is taken out of the stop-word list so that negations survive the filtering.
# "isn't" is taken out as well, although it cannot influence the result: non-letters
# are replaced by spaces before filtering, so no token ever contains an apostrophe.
all_stopwords = [word for word in stopwords.words('english') if word not in ("not", "isn't")]
stopword_set = set(all_stopwords)  # set lookup; created once instead of once per word

corpus = []  # cleaned reviews, in the same order as the rows of `dataset`
for raw_review in dataset["Review"]:
    review = re.sub('[^a-zA-Z]', " ", raw_review)  # replace everything that is not a letter with a space
    review = review.lower()
    review = review.split()  # list of words, ready for filtering and stemming
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = " ".join(review)  # back to a single space-separated string
    corpus.append(review)

[nltk_data] Downloading package stopwords to C:\Users\Veljko
[nltk_data]     Stojanovic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
# Bag of Words model: represent every cleaned review as a vector of term counts
from sklearn.feature_extraction.text import CountVectorizer
# max_features=1500 limits the vocabulary to the 1500 most frequent terms of the corpus
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)  # learn the vocabulary and count the terms
x = term_counts.toarray()  # dense feature array built from the counts
y = dataset.iloc[:, -1].values  # target values: the last column of the dataset

In [30]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [31]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier, trained on the training part of the split
classificator = GaussianNB()
classificator.fit(X=x_train, y=y_train)

GaussianNB()

In [32]:
# Predicted labels for the held-out test samples
y_pred = classificator.predict(X=x_test)

In [33]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix: rows are the true labels, columns the predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
# Fraction of test samples whose predicted label matches the true label
ac = accuracy_score(y_true=y_test, y_pred=y_pred)
print(cm)
print(ac)

[[55 42]
 [12 91]]
0.73


In [35]:
# Single positive prediction
new_review = 'I love this restaurant so much'
# The review goes through the same cleaning steps as the training corpus:
# letters only, lower case, stop-word removal, stemming.
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
# Same exclusions as in the training cell: "not" and "isn't" are not treated as stop words
all_stopwords = [word for word in stopwords.words('english') if word not in ("not", "isn't")]
stopword_set = set(all_stopwords)  # created once instead of once per word
new_review = [ps.stem(word) for word in new_review if word not in stopword_set]
new_review = ' '.join(new_review)
new_corpus = [new_review]
# transform (not fit_transform): reuse the vocabulary that `cv` was already fitted with
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classificator.predict(new_X_test)
print(new_y_pred)

[1]


In [37]:
# Single negative prediction
new_review = 'I hate this restaurant so much'
# The review goes through the same cleaning steps as the training corpus:
# letters only, lower case, stop-word removal, stemming.
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
# Same exclusions as in the training cell: "not" and "isn't" are not treated as stop words
all_stopwords = [word for word in stopwords.words('english') if word not in ("not", "isn't")]
stopword_set = set(all_stopwords)  # created once instead of once per word
new_review = [ps.stem(word) for word in new_review if word not in stopword_set]
new_review = ' '.join(new_review)
new_corpus = [new_review]
# transform (not fit_transform): reuse the vocabulary that `cv` was already fitted with
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classificator.predict(new_X_test)
print(new_y_pred)

[0]
