<a href="https://colab.research.google.com/github/Rushinalawade/Credit-Card-Fraud-Detection-Model/blob/main/Classification_of_Restaurant_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import classification_report

In [None]:
df_review = pd.read_csv('/content/drive/MyDrive/Almabetter/Notebook/Zomato Restaurant reviews.csv')

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
df_review.head()

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5,"3 Reviews , 2 Followers",5/25/2019 14:20,0
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5,"2 Reviews , 3 Followers",5/24/2019 22:54,0
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5,"1 Review , 1 Follower",5/24/2019 22:11,0
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5,"3 Reviews , 2 Followers",5/24/2019 21:37,0


In [None]:
df_review = df_review[['Review','Rating']]

In [None]:
df_review.head()

Unnamed: 0,Review,Rating
0,"The ambience was good, food was quite good . h...",5
1,Ambience is too good for a pleasant evening. S...,5
2,A must try.. great food great ambience. Thnx f...,5
3,Soumen das and Arun was a great guy. Only beca...,5
4,Food is good.we ordered Kodi drumsticks and ba...,5


In [None]:
df_review.drop(df_review[df_review['Rating'] == 'Like'].index,axis=0,inplace=True)

In [None]:
df_review['Rating'] = df_review['Rating'].astype(float)

In [None]:
#Lets rate the review:
def get_sentiment(rating):
  if(rating>=4):
    return 'Positive'
  elif(rating>2):
    return 'Neutral'
  else:
    return 'Negative'

df_review['Sentiment'] = df_review['Rating'].apply(get_sentiment)

In [None]:
df_review.head()

Unnamed: 0,Review,Rating,Sentiment
0,"The ambience was good, food was quite good . h...",5.0,Positive
1,Ambience is too good for a pleasant evening. S...,5.0,Positive
2,A must try.. great food great ambience. Thnx f...,5.0,Positive
3,Soumen das and Arun was a great guy. Only beca...,5.0,Positive
4,Food is good.we ordered Kodi drumsticks and ba...,5.0,Positive


In [None]:
df_review['Sentiment'].value_counts()

Positive    6274
Negative    2466
Neutral     1259
Name: Sentiment, dtype: int64

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import string
from nltk.corpus import stopwords
stopwords.words('english')
from nltk.stem import SnowballStemmer
stemming = SnowballStemmer('english')

In [None]:
def get_tokenized_text(text):
  text = str(text)
  nopunc = [char for char in text if char not in string.punctuation]
  nopunc = ''.join(nopunc)
  stem_msg = stemming.stem(nopunc)
  message = [char for char in stem_msg.split() if char not in stopwords.words('english')]
  return ' '.join(message)

In [None]:
df_review['tokenized_review'] = df_review['Review'].apply(get_tokenized_text)
df_review.head()

Unnamed: 0,Review,Rating,Sentiment,tokenized_review
0,"The ambience was good, food was quite good . h...",5.0,Positive,ambience good food quite good saturday lunch c...
1,Ambience is too good for a pleasant evening. S...,5.0,Positive,ambience good pleasant evening service prompt ...
2,A must try.. great food great ambience. Thnx f...,5.0,Positive,must try great food great ambience thnx servic...
3,Soumen das and Arun was a great guy. Only beca...,5.0,Positive,soumen das arun great guy behavior sincerety g...
4,Food is good.we ordered Kodi drumsticks and ba...,5.0,Positive,food goodwe ordered kodi drumsticks basket mut...


In [None]:
df_review[df_review['Sentiment']=='Neutral']

Unnamed: 0,Review,Rating,Sentiment,tokenized_review
30,"Short review: Decent breads and starters, bad ...",3.0,Neutral,short review decent breads starters bad shaam ...
45,The reason for giving only a 3 star is because...,3.0,Neutral,reason giving 3 star longlong time wait tables...
56,We went for lunch buffet yesterday and took a ...,3.0,Neutral,went lunch buffet yesterday took chance seeing...
73,We happened to go to this place on last sunday...,3.0,Neutral,happened go place last sunday mothers day flat...
76,I have been to this place twice and had 2 diff...,3.0,Neutral,place twice 2 different experiences 1st time c...
...,...,...,...,...
9985,Searched for Chinese cuisine nearby and I foun...,3.5,Neutral,searched chinese cuisine nearby found cuisine ...
9987,Initially thought of going to Wonton (Gachibow...,3.5,Neutral,initially thought going wonton gachibowli reco...
9988,Had a mixed experience... Cant say it is bad c...,3.0,Neutral,mixed experience cant say bad cant say good ok...
9995,Madhumathi Mahajan Well to start with nice cou...,3.0,Neutral,madhumathi mahajan well start nice courteous s...


In [None]:
X = df_review['tokenized_review']
y = df_review['Sentiment']

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=123)
print(X_train.shape)
print(X_test.shape)

(7499,)
(2500,)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_df=0.90,min_df=10,max_features=300)
X_trainv = vectorizer.fit_transform(X_train)
X_testv = vectorizer.transform(X_test)

In [None]:
X_trainv.toarray()

array([[0.        , 0.25993294, 0.8595225 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.0414975 , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.39914827, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [None]:
from sklearn.linear_model import LogisticRegression
lgr = LogisticRegression(class_weight={'Positive':1,'Negative':2.544,'Neutral':4.983})
lgr.fit(X_trainv,y_train)
lgr.score(X_trainv,y_train)

0.7646352847046273

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn import metrics

In [None]:
def evalution_matrix(model,X,y):
  # Predict model:
  y_pred = model.predict(X)

  #Predict Probability:
  y_pred_prob = model.predict_proba(X)

  #Scores
  print('Accuracy: ',accuracy_score(y,y_pred))
  print('ROC_AUC_score: ',roc_auc_score(y,y_pred_prob,multi_class='ovr'))
  print(classification_report(y,y_pred))

In [None]:
evalution_matrix(lgr,X_trainv,y_train)

Accuracy:  0.7646352847046273
ROC_AUC_score:  0.908732778362495
              precision    recall  f1-score   support

    Negative       0.74      0.82      0.78      1877
     Neutral       0.39      0.66      0.49       942
    Positive       0.94      0.76      0.84      4680

    accuracy                           0.76      7499
   macro avg       0.69      0.75      0.70      7499
weighted avg       0.82      0.76      0.78      7499



In [None]:
evalution_matrix(lgr,X_testv,y_test)

Accuracy:  0.7352
ROC_AUC_score:  0.8752509232236338
              precision    recall  f1-score   support

    Negative       0.69      0.79      0.74       589
     Neutral       0.34      0.55      0.42       317
    Positive       0.91      0.75      0.82      1594

    accuracy                           0.74      2500
   macro avg       0.65      0.70      0.66      2500
weighted avg       0.79      0.74      0.75      2500



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
rfc = RandomForestClassifier(max_depth= 6,min_samples_leaf= 10,min_samples_split=10,class_weight={'Positive':1,'Negative':2.544,'Neutral':4.983}).fit(X_trainv,y_train)

In [None]:
evalution_matrix(rfc,X_trainv,y_train)

Accuracy:  0.6736898253100413
ROC_AUC_score:  0.863137085706993
              precision    recall  f1-score   support

    Negative       0.61      0.82      0.70      1877
     Neutral       0.28      0.53      0.37       942
    Positive       0.93      0.64      0.76      4680

    accuracy                           0.67      7499
   macro avg       0.61      0.66      0.61      7499
weighted avg       0.77      0.67      0.70      7499



In [None]:
evalution_matrix(rfc,X_testv,y_test)

Accuracy:  0.6408
ROC_AUC_score:  0.835445018851695
              precision    recall  f1-score   support

    Negative       0.58      0.80      0.67       589
     Neutral       0.27      0.52      0.35       317
    Positive       0.90      0.60      0.72      1594

    accuracy                           0.64      2500
   macro avg       0.58      0.64      0.58      2500
weighted avg       0.75      0.64      0.67      2500



In [None]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
rfc = RandomForestClassifier(class_weight={'Positive':1,'Negative':2.544,'Neutral':4.983})
param = {'n_estimators':[100,200,300],
          'max_depth':[5,6,8,10],
          'min_samples_leaf':[5,10,20,40],
          'min_samples_split':[6,10,15,20],
          'max_features':['sqrt','log2']}

rfc_cv = RandomizedSearchCV(estimator=rfc,param_distributions=param,n_iter=100,cv=5,verbose=2)
rfc_cv.fit(X_trainv,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=40, min_samples_split=15, n_estimators=100; total time=   0.6s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=40, min_samples_split=15, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=40, min_samples_split=15, n_estimators=100; total time=   0.6s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=40, min_samples_split=15, n_estimators=100; total time=   0.6s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=40, min_samples_split=15, n_estimators=100; total time=   0.5s
[CV] END max_depth=6, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=200; total time=   1.0s
[CV] END max_depth=6, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=200; total time=   1.0s
[CV] END max_depth=6, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_es

In [None]:
rfc_cv.best_params_

{'n_estimators': 300,
 'min_samples_split': 6,
 'min_samples_leaf': 5,
 'max_features': 'log2',
 'max_depth': 10}

In [None]:
evalution_matrix(rfc_cv,X_trainv,y_train)

Accuracy:  0.7815708761168155
ROC_AUC_score:  0.9094741474576266
              precision    recall  f1-score   support

    Negative       0.67      0.86      0.75      1877
     Neutral       0.48      0.58      0.53       942
    Positive       0.94      0.79      0.86      4680

    accuracy                           0.78      7499
   macro avg       0.70      0.74      0.71      7499
weighted avg       0.81      0.78      0.79      7499



In [None]:
evalution_matrix(rfc_cv,X_testv,y_test)

Accuracy:  0.7392
ROC_AUC_score:  0.8577105929151334
              precision    recall  f1-score   support

    Negative       0.62      0.84      0.71       589
     Neutral       0.41      0.43      0.42       317
    Positive       0.89      0.76      0.82      1594

    accuracy                           0.74      2500
   macro avg       0.64      0.68      0.65      2500
weighted avg       0.77      0.74      0.75      2500



In [None]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,RepeatedKFold
rfc = RandomForestClassifier(class_weight={'Positive':1,'Negative':2.544,'Neutral':4.983},max_features='sqrt')
params = {'n_estimators':[200,300,400],
          'max_depth':[8,10],
          'min_samples_leaf':[4,5,6],
          'min_samples_split':[15,17,20]}
rfc_gcv = GridSearchCV(estimator=rfc,param_grid=params,cv=5,verbose=2)
rfc_gcv.fit(X_trainv,y_train)

In [None]:
rfc_gcv.best_params_

{'max_depth': 10,
 'min_samples_leaf': 5,
 'min_samples_split': 15,
 'n_estimators': 300}

In [None]:
evalution_matrix(rfc_gcv,X_trainv,y_train)

Accuracy:  0.7841045472729697
ROC_AUC_score:  0.9067790170755661
              precision    recall  f1-score   support

    Negative       0.67      0.86      0.75      1877
     Neutral       0.49      0.58      0.53       942
    Positive       0.94      0.80      0.86      4680

    accuracy                           0.78      7499
   macro avg       0.70      0.74      0.71      7499
weighted avg       0.81      0.78      0.79      7499



In [None]:
evalution_matrix(rfc_gcv,X_testv,y_test)


Accuracy:  0.74
ROC_AUC_score:  0.8555837251535342
              precision    recall  f1-score   support

    Negative       0.62      0.83      0.71       589
     Neutral       0.41      0.43      0.42       317
    Positive       0.89      0.77      0.82      1594

    accuracy                           0.74      2500
   macro avg       0.64      0.68      0.65      2500
weighted avg       0.76      0.74      0.75      2500



In [None]:
sen = ['Food was delicious']
xtest = vectorizer.transform(sen)
lgr.predict(xtest)

array(['Positive'], dtype=object)