# Natural Language Processing

In [None]:
!pip install nltk

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Tab-separated reviews file; quoting=3 (csv.QUOTE_NONE) makes the parser
# treat quote characters inside reviews as ordinary text.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

In [5]:
# Preview the first ten rows of the loaded reviews
dataset.head(10)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


## Cleaning the texts

In [3]:
# importing libraries
# re: regular expressions, used below to replace non-letter characters in each review
import re
import nltk
# downloads the NLTK 'stopwords' package (only prints a notice when it is already up to date)
nltk.download('stopwords')
# stopwords: corpus reader that supplies the English stop-word list
from nltk.corpus import stopwords
# PorterStemmer: used below to reduce each remaining word to its stem
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /home/vpsr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Cleaning
# Loop-invariant helpers are built once up front: the stemmer, the stop-word
# list and its set form do not change between reviews, so there is no reason
# to rebuild them for every review (or, for the set, for every word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is removed from the stop-word list so it is kept in the cleaned text
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

# list to contain cleaned reviews
corpus = []
# looping over every review in the dataset; iterating the column directly
# (instead of a hard-coded range(0, 1000)) keeps the corpus aligned with the
# labels whatever the number of rows is
for raw_review in dataset['Review']:
    # replacing everything that is not a letter with a space
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    # converting to lowercase
    review = review.lower()
    # splitting into words
    review = review.split()
    # dropping stop words and stemming the remaining words
    review = [ps.stem(word) for word in review if word not in stopword_set]
    # joining the stems back into one space-separated string
    review = ' '.join(review)
    corpus.append(review)



## Creating the Bag of Words model

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

# Vocabulary is capped at 1500 features
cv = CountVectorizer(max_features=1500)
# Learn the vocabulary from the cleaned reviews and count term occurrences
term_counts = cv.fit_transform(corpus)
# Dense feature matrix: one row per review
X = term_counts.toarray()
# Labels are taken from the last column of the dataset
y = dataset.iloc[:, -1].values

In [33]:
# Sanity check: feature matrix and label vector must have the same number of rows.
# (The labels were previously swapped: "X: " printed len(y) and "y: " printed len(X).)
print("X: ",len(X),"y: ",len(y))

X:  1000 y:  1000


## Splitting the dataset into the Training set and Test set

In [34]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Naive Bayes model on the Training set

In [35]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes fitted on the training portion of the bag-of-words matrix
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

## Predicting the Test set results

In [36]:
# Predicted labels for the held-out test reviews
y_pred = classifier.predict(X=X_test)

## Making the Confusion Matrix

In [37]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Rows of the matrix are true labels, columns are predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Fraction of test reviews classified correctly (displayed as the cell result)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[55 42]
 [12 91]]


0.73