# **NLP analysis of Restaurant reviews**

**Step 1 : Import dataset**

In [1]:
# importing libraries
import numpy as np
import pandas as pd

# Load the tab-separated restaurant reviews file into a DataFrame
dataset = pd.read_csv(
    "/content/Restaurant_Reviews.tsv",
    sep="\t",
)

**Step 2 : Text Cleaning or Preprocessing**

In [2]:
# library to clean data
import re

# natural language tool kit
import nltk

nltk.download('stopwords')

# to remove stopword
from nltk.corpus import stopwords

# for stemming purpose
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stop-word set once, up front: both are
# loop-invariant, and rebuilding the set for every single word is
# needlessly slow.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

# Cleaned reviews, one string per row of `dataset`, in row order
corpus = []

# Clean every review in the "Review" column. Iterating over the column
# itself (instead of a hard-coded range(0, 1000)) keeps the corpus the
# same length as the dataset whatever its row count.
for raw_review in dataset['Review']:

  # Replace every non-letter with a SPACE. Substituting the empty string
  # would also delete the spaces between words and fuse the whole review
  # into a single token, defeating the split / stop-word / stemming steps.
  review = re.sub('[^a-zA-Z]', ' ', raw_review)

  # lower-case, then split on runs of whitespace into a list of words
  review = review.lower().split()

  # drop stop words, then reduce each remaining word to its stem
  review = [ps.stem(word) for word in review
            if word not in english_stopwords]

  # rejoin the stems into one space-separated string
  review = ' '.join(review)

  # collect the cleaned review
  corpus.append(review)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**Step 3 : Tokenization** (done by `CountVectorizer` as part of Step 4)

**Step 4 : Making the bag of words via sparse matrix**

In [3]:
# Creating the bag of words model
from sklearn.feature_extraction.text import CountVectorizer

# Keep at most 1500 vocabulary terms as features.
# "max_features" is a knob worth experimenting
# with to get better results
cv = CountVectorizer(max_features=1500)

# Learn the vocabulary from the cleaned corpus and count term
# occurrences per review (returned as a sparse matrix)
sparse_counts = cv.fit_transform(corpus)

# X is the feature matrix (the independent variables):
# the sparse counts expanded into a dense array
X = sparse_counts.toarray()

# y holds the target label of each review, read
# from the second column of the dataset
label_column = dataset.iloc[:, 1]
y = label_column.values

**Step 5 : Splitting Corpus into Training and Test set**

In [7]:
# Splitting the dataset into
# the training set and test set
from sklearn.model_selection import train_test_split

# experiment with "test_size"
# to get better results.
# random_state pins the shuffle so that the same rows land in the
# training and test sets on every run, making the scores reproducible
X_train,X_test,y_train,y_test = train_test_split(X, y,
                                                 test_size = 0.25,
                                                 random_state = 42)

**Step 6 : Fitting a predictive model (Random Forest)**

In [8]:
# Fitting random forest classification
# to the training set
from sklearn.ensemble import RandomForestClassifier
# n_estimators is the number of trees in the forest;
# experiment with it to get better results.
# random_state fixes the randomness used while building the trees so
# that refitting on the same training data yields the same forest
model = RandomForestClassifier(n_estimators = 501,
                               criterion = 'entropy',
                               random_state = 42)

model.fit(X_train,y_train)

**Step 7 : Predicting Final Results**

In [None]:
# Predict a label for every row of the held-out test features
# using the model fitted in the previous step
y_pred = model.predict(X_test)

# Bare expression: shows the predicted labels as the cell output
y_pred

**Step 8 : Confusion Matrix**

In [None]:
# Making the confusion matrix
from sklearn.metrics import confusion_matrix

# Tabulate true labels against predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Show the matrix as the cell output
cm

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test reviews whose predicted label matches the true label
ac = accuracy_score(y_true=y_test, y_pred=y_pred)

# Show the score as the cell output
ac