In [0]:
#help from nltk.org, scikit-learn.org, geeks for geeks.
#Step 1: Setup Environment
# Importing libraries and dataset with setting delimiter as ‘\t’ as columns are separated
import numpy as np
import pandas as pd

dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t')

In [4]:
#Step 2: Text Cleaning or Preprocessing
import re      #library to clean data
import nltk    #Natural Language Tool kit
nltk.download('stopwords')
from nltk.corpus import stopwords  #remove stopwords
#for Stemming propose
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
#Initialize empty array to append clean text
corpus =[]

#1000(reviews) rows to clean 
for i in range(0,1000):
  review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])  ## column : "Review", row ith
  #convert all cases to lower cases
  review = review.lower()
  #split to array(default delimiter is " ")
  review = review.split()
  # creating PorterStemmer object to  take main stem of each word 
  ps = PorterStemmer()
  review = [ps.stem(word) for word in review
            if not word in set(stopwords.words('english'))]
  #rejoin all string array elements to create back into a string     
  review = ' '.join(review)
  #append each string to create array of clean text    
  corpus.append(review)

#print(review)


In [0]:
#Step 3: Tokenization, involves splitting sentences and words from the body of the text.
#Step 4.Create Bag of Words model
from sklearn.feature_extraction.text import  CountVectorizer
#extract max 1500 feature
#"max_features" is attribute to experiment with to get better results 
cv = CountVectorizer(max_features = 1500)

# X contains corpus (dependent variable) 
X = cv.fit_transform(corpus).toarray()
#Y contains answer if review is positive or negative
y = dataset.iloc[:,1].values

Description of the dataset:

*   Columns seperated by \t (tab space)
*   First column is about reviews of people
*   In second column, 0 is for negative review and 1 is for positive review

In [0]:
# Step 5: Spliting Corpus into Training and Test set.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [0]:
#Step 6: Fitting a Predictive Model(random forest)
Since Random fored is ensemble model (made of many trees) from sklearn.ensemble, import RandomForestClassifier class
With 501 tree or “n_estimators” and criterion as ‘entropy’
Fit the model via .fit() method with attributes X_train and y_train

#Step 6: Fitting a Predictive Model(random forest)

*  Since Random fored is ensemble model(made of many trees) from sklearn.ensemble, import RandomForestClassifier class
*   With 501 tree or “n_estimators” and criterion as ‘entropy’
*   Fit the model via .fit() method with attributes X_train and y_train

In [43]:
from sklearn.ensemble import RandomForestClassifier
# n_estimators can be said as number of trees, experiment with n_estimators to get better results
model = RandomForestClassifier(n_estimators = 501, criterion = 'entropy') 
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=501,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [47]:
#Step 7: Predicting the test set results
y_pred = model.predict(X_test)
y_pred

array([1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0])

In [57]:
#Step 8:performance of a classification model using Confusion Matrix
from sklearn.metrics import confusion_matrix
results = confusion_matrix(y_test, y_pred) 
print('Confusion Matrix :')
results

Confusion Matrix :


array([[107,   9],
       [ 53,  81]])

In [60]:
from sklearn.metrics import  accuracy_score, classification_report
print('Accuracy Score :',accuracy_score(y_test, y_pred))
print('Report : ')
print(classification_report(y_test, y_pred))

Accuracy Score : 0.752
Report : 
              precision    recall  f1-score   support

           0       0.67      0.92      0.78       116
           1       0.90      0.60      0.72       134

    accuracy                           0.75       250
   macro avg       0.78      0.76      0.75       250
weighted avg       0.79      0.75      0.75       250

