# DataPreprocessing 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [16]:
df=pd.read_csv('Restaurant_Reviews.tsv',delimiter='\t',quoting=3) # By quoting=3 the{ " } will be ignored in the text since it can cause error because these quotes are within texts

In [17]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [18]:
df.tail()

Unnamed: 0,Review,Liked
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


# Cleaning the data

### remove puncuations,special characters,capital and lower cases this is an essential step of cleaning data as we need to clean the data as much as possible

stop words - The words which are not related to the prediction. for example words like 'the','a','and' etc


Stemming - Changing the words like loved to love these means the same but it makes easy for our prediction making the easy to understand by the model

In [19]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\upend\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
corpus=[] #all the cleaned data will be present in corpus
for i in range(0,len(df)):
    #review will the cleaned data so that it can be appended to the corpus
    #re.sub() this fun will replace the specail chars with a space 
    review=re.sub('^a-zA-Z',' ',df['Review'][i])
    #all capital to lower case
    review=review.lower()
    #splitting is done for the stemming process
    review=review.split()
    #applying stemming
    ps=PorterStemmer()
    #creating an object for stemming
    #applying stemming to all the words in a sentence
    all_stopwords=stopwords.words('english')
    all_stopwords.remove('not')
    review=[ps.stem(word) for word in review if not word in set(all_stopwords)]
    review=' '.join(review)
    corpus.append(review)

# Bag of words

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=1500)
X=cv.fit_transform(corpus).toarray()
y=df.iloc[:,-1].values

In [34]:
len(X[0])

1500

# Splitting and training data

In [35]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

# Implementing Naive Bayes model

In [38]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train,y_train)

In [39]:
y_pred=clf.predict(X_test)

In [40]:
from sklearn.metrics import accuracy_score,confusion_matrix
print(confusion_matrix(y_pred,y_test))
accuracy_score(y_pred,y_test)

[[53 11]
 [44 92]]


0.725

In [43]:
new_review = 'I love you'
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = clf.predict(new_X_test)
print(new_y_pred)

[1]


In [47]:
new_review = 'I am not sure'
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = clf.predict(new_X_test)
print(new_y_pred)

[1]


In [49]:
len(X)

1000

In [50]:
len(y)

1000