# step1.importing the data ####

In [1]:

import pandas as pd
import numpy as np
df=pd.read_csv("Restaurant_Reviews.tsv",delimiter="\t")  ###data contains restaurent review ,we will do some natuaral language processing in it

In [2]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


# step2.text cleaning and preprocessing####

In [3]:
#library to clean the data ,reguler expressions
import re

In [4]:
#Natural LAnguage tool kit,nltk
import nltk

In [11]:
#to remove stopwords
from nltk.corpus import stopwords

In [6]:
#to get the root of words,stemming
from nltk.stem.porter import PorterStemmer

In [13]:
corpus=[]
stop=set(stopwords.words("english"))
#we will do batch processing 
for i in range(0,1000):
    review=re.sub('[^a-zA-Z]',' ',df.Review[i])
    review=review.lower()
    review=review.split()
    ps=PorterStemmer()
    review=[ps.stem(word) for word in review if word not in stop]
    review=" ".join(review)
    corpus.append(review)

In [14]:
corpus[0:3]


['wow love place', 'crust good', 'tasti textur nasti']

# step3.Tokenization####

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
cv = CountVectorizer(max_features = 1500)  

In [17]:
X = cv.fit_transform(corpus).toarray()

In [23]:
y = df.iloc[:, 1].values

In [26]:
import sklearn

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25) 

In [31]:
from sklearn.ensemble import RandomForestClassifier

# step4.prediction using Random Forest Classifier ####

In [32]:
model = RandomForestClassifier(n_estimators = 501, 
                            criterion = 'entropy')

In [33]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=501,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [34]:
y_pred = model.predict(X_test) 

In [36]:
y_pred

array([1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0], dtype=int64)

In [35]:
from sklearn.metrics import confusion_matrix 
  
cm = confusion_matrix(y_test, y_pred) 
  
cm 

array([[ 93,  22],
       [ 34, 101]], dtype=int64)