# **Natural Language Processing of Restaurant Reviews in R**

## **Importing the dataset**

In [1]:
# Read the tab-separated review file. Quoting is switched off (quote = '')
# and strings are kept as plain character vectors rather than factors.
ds_original <- read.delim(
  file = '/content/Restaurant_Reviews.tsv',
  quote = '',
  stringsAsFactors = FALSE
)

# Quick look at the first rows of the loaded data.
first_rows <- head(ds_original)
print(first_rows)

                                                                                   Review
1                                                                Wow... Loved this place.
2                                                                      Crust is not good.
3                                               Not tasty and the texture was just nasty.
4 Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.
5                             The selection on the menu was great and so were the prices.
6                                          Now I am getting angry and I want my damn pho.
  Liked
1     1
2     0
3     0
4     1
5     1
6     0


## **Cleaning the texts**

In [2]:
# Install the text-mining dependencies only when they are missing, so that
# re-running this cell does not download and reinstall them every time.
for (pkg in c('tm', 'SnowballC')) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(tm)
library(SnowballC)

# Build a corpus with one document per review, then normalise the text.
corpus <- VCorpus(VectorSource(ds_original$Review))
# tolower is an ordinary function, hence the content_transformer() wrapper.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
# NOTE(review): punctuation is stripped before stop words are removed, so any
# stop-word entries that contain apostrophes may no longer match here.
# Confirm this ordering is intended; it is kept unchanged because later cells
# depend on the resulting number of terms.
corpus <- tm_map(corpus, removeWords, stopwords())
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Loading required package: NLP

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



## **Creating the Bag of Words model**

In [3]:
# Turn the cleaned corpus into a document-term matrix and prune it with
# removeSparseTerms() using a sparsity threshold of 0.999.
dtm <- DocumentTermMatrix(corpus)
dtm <- removeSparseTerms(dtm, sparse = 0.999)

# Dense data frame of the term counts, with the review label appended as the
# `Liked` column.
ds <- as.data.frame(as.matrix(dtm))
ds[['Liked']] <- ds_original[['Liked']]

## **Encoding the target feature as a factor**

In [4]:
# Recode the label as a two-level factor (levels 0 and 1) so the models below
# treat the task as classification.
ds[['Liked']] <- factor(ds[['Liked']], levels = c(0, 1))

## **Splitting the dataset into the Training set and Test set**

In [5]:
# Install caTools only when it is missing, so re-running the cell is cheap.
if (!requireNamespace('caTools', quietly = TRUE)) {
  install.packages('caTools')
}
library(caTools)

# Fixed seed so the split is the same on every run.
set.seed(123)
# Logical mask over the rows of `ds`: TRUE rows go to the training set
# (SplitRatio = 0.8), FALSE rows to the test set.
split <- sample.split(ds$Liked, SplitRatio = 0.8)

# Index with the mask directly instead of subset(ds, split == TRUE).
# subset() evaluates its condition inside `ds` first, so a term column that
# happened to be called "split" would silently replace the mask.
train_set <- ds[split, , drop = FALSE]
test_set <- ds[!split, , drop = FALSE]

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



## **Fitting Random Forest Classification to the Training set**

In [6]:
# Install randomForest only when it is missing.
if (!requireNamespace('randomForest', quietly = TRUE)) {
  install.packages('randomForest')
}
library(randomForest)

# Pick the predictor columns by name rather than by a hard-coded position:
# the number of term columns depends on the corpus and the sparsity
# threshold, so a fixed index stops pointing at `Liked` when either changes.
feature_cols <- setdiff(names(train_set), 'Liked')

# Fitting a 10-tree random forest on the training set
classifier <- randomForest(x = train_set[feature_cols],
                           y = train_set[['Liked']],
                           ntree = 10)

# Predicting the Test set results
y_pred <- predict(classifier, newdata = test_set[feature_cols])

# Making the Confusion Matrix (rows: actual class, columns: predicted class)
cm <- table(test_set[['Liked']], y_pred)
print(cm)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

randomForest 4.7-1

Type rfNews() to see new features/changes/bug fixes.



   y_pred
     0  1
  0 82 18
  1 23 77


## **Evaluation Metrics**

In [7]:
# Summary statistics derived from the confusion matrix `cm`
# (rows: actual class, columns: predicted class).
n <- sum(cm)                                  # total number of instances
nc <- nrow(cm)                                # number of classes
# NOTE: the variable `diag` shares its name with base::diag(); the call still
# resolves to the function because R skips non-function objects when looking
# up a name that is being called.
diag <- diag(cm)                              # correct predictions per class
rowsums <- apply(cm, MARGIN = 1, FUN = sum)   # actual instances per class
colsums <- apply(cm, MARGIN = 2, FUN = sum)   # predictions per class
p <- rowsums / n                              # share of each actual class
q <- colsums / n                              # share of each predicted class

# Overall accuracy: correctly classified instances over all instances.
accuracy <- sum(diag) / n
cat("Accuracy of NLP Random Forest Model is:", accuracy)

# Per-class precision, recall and their harmonic mean (F1).
precision <- diag / colsums
recall <- diag / rowsums
f1 <- (2 * precision * recall) / (precision + recall)
data.frame(precision, recall, f1)

Accuracy of NLP Random Forest Model is: 0.795

Unnamed: 0_level_0,precision,recall,f1
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
0,0.7809524,0.82,0.8
1,0.8105263,0.77,0.7897436


In [8]:
# Support Vector Classifier
# Install e1071 only when it is missing.
if (!requireNamespace('e1071', quietly = TRUE)) {
  install.packages('e1071')
}
library(e1071)

# Pick the predictor columns by name rather than by a hard-coded position,
# so the cell keeps working when the number of term columns changes.
feature_cols <- setdiff(names(train_set), 'Liked')

# NOTE(review): the recorded run warns that some term columns are constant in
# the training set and cannot be scaled, and the radial kernel predicts class
# 0 for almost every review. Consider dropping constant columns or trying
# kernel = 'linear'; the model settings are left unchanged here.
classifier <- svm(formula = Liked ~ .,
                  data = train_set,
                  type = 'C-classification',
                  kernel = 'radial')


# Predicting the Test set results
y_pred <- predict(classifier, newdata = test_set[feature_cols])

# Making the Confusion Matrix (rows: actual class, columns: predicted class)
cm <- table(test_set[['Liked']], y_pred)
print(cm)
n <- sum(cm) # number of instances
nc <- nrow(cm) # number of classes
diag <- diag(cm) # number of correctly classified instances per class
rowsums <- apply(cm, 1, sum) # number of instances per class
colsums <- apply(cm, 2, sum) # number of predictions per class
p <- rowsums / n # distribution of instances over the actual classes
q <- colsums / n # distribution of instances over the predicted classes
accuracy <- sum(diag) / n
cat("Accuracy of NLP SVM is:", accuracy)
precision <- diag / colsums
recall <- diag / rowsums
f1 <- 2 * precision * recall / (precision + recall)
data.frame(precision, recall, f1)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependency ‘proxy’


“Variable(s) ‘boot’ and ‘brick’ and ‘eye’ and ‘given’ and ‘legit’ and ‘mall’ and ‘oven’ and ‘peanut’ and ‘pure’ and ‘scallop’ and ‘show’ and ‘tap’ constant. Cannot scale data.”


   y_pred
      0   1
  0 100   0
  1  94   6
Accuracy of NLP SVM is: 0.53

Unnamed: 0_level_0,precision,recall,f1
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
0,0.5154639,1.0,0.6802721
1,1.0,0.06,0.1132075


In [9]:
# Naive Bayes Classifier
library(e1071)

# Pick the predictor columns by name rather than by a hard-coded position,
# so the cell keeps working when the number of term columns changes.
feature_cols <- setdiff(names(train_set), 'Liked')

# NOTE(review): the predictors are numeric term counts; presumably
# naiveBayes() models numeric columns with a per-class normal distribution,
# which would fit sparse counts poorly -- verify against the e1071 docs
# before relying on this model.
classifier <- naiveBayes(x = train_set[feature_cols], y = train_set[['Liked']])

# Predicting the Test set results
y_pred <- predict(classifier, newdata = test_set[feature_cols])

# Making the Confusion Matrix (rows: actual class, columns: predicted class)
cm <- table(test_set[['Liked']], y_pred)
print(cm)
n <- sum(cm) # number of instances
nc <- nrow(cm) # number of classes
diag <- diag(cm) # number of correctly classified instances per class
rowsums <- apply(cm, 1, sum) # number of instances per class
colsums <- apply(cm, 2, sum) # number of predictions per class
p <- rowsums / n # distribution of instances over the actual classes
q <- colsums / n # distribution of instances over the predicted classes
accuracy <- sum(diag) / n
# The message previously said "SVM" (copied from the cell above).
cat("Accuracy of NLP Naive Bayes is:", accuracy)
precision <- diag / colsums
recall <- diag / rowsums
f1 <- 2 * precision * recall / (precision + recall)
data.frame(precision, recall, f1)


   y_pred
     0  1
  0  5 95
  1  4 96
Accuracy of NLP SVM is: 0.505

Unnamed: 0_level_0,precision,recall,f1
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
0,0.5555556,0.05,0.09174312
1,0.5026178,0.96,0.65979381


In [10]:
# Decision Tree Classifier
# Install rpart only when it is missing.
if (!requireNamespace('rpart', quietly = TRUE)) {
  install.packages('rpart')
}
library(rpart)

# Pick the predictor columns by name rather than by a hard-coded position,
# so the cell keeps working when the number of term columns changes.
feature_cols <- setdiff(names(train_set), 'Liked')

classifier <- rpart(formula = Liked ~ ., data = train_set)
# Predicting the Test set results (type = 'class' asks for class labels)
y_pred <- predict(classifier, newdata = test_set[feature_cols], type = 'class')
# Making the Confusion Matrix (rows: actual class, columns: predicted class)
cm <- table(test_set[['Liked']], y_pred)
print(cm)
n <- sum(cm) # number of instances
nc <- nrow(cm) # number of classes
diag <- diag(cm) # number of correctly classified instances per class
rowsums <- apply(cm, 1, sum) # number of instances per class
colsums <- apply(cm, 2, sum) # number of predictions per class
p <- rowsums / n # distribution of instances over the actual classes
q <- colsums / n # distribution of instances over the predicted classes
accuracy <- sum(diag) / n
cat("Accuracy of NLP Decision Tree is:", accuracy)
precision <- diag / colsums
recall <- diag / rowsums
f1 <- 2 * precision * recall / (precision + recall)
data.frame(precision, recall, f1)


Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



   y_pred
     0  1
  0 85 15
  1 43 57
Accuracy of NLP Decision Tree is: 0.71

Unnamed: 0_level_0,precision,recall,f1
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
0,0.6640625,0.85,0.745614
1,0.7916667,0.57,0.6627907
