<a href="https://colab.research.google.com/github/NimmoUsman/test.12/blob/main/Sentiment_analysis_Nimmo_Usman.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis of Restaurant Reviews

In [None]:
# The aim of this project is to classify restaurant reviews as positive or negative based on their sentiment, comparing different classification models.

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the tab-separated review file into a DataFrame.
r_data = pd.read_csv("Restaurant_Reviews.tsv", sep="\t")

In [None]:

# Preview the first rows to confirm the file parsed into the Review and Liked columns.
r_data.head()



Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [None]:
# (rows, columns) of the loaded data.
r_data.shape

(1000, 2)

In [None]:
# Check column dtypes and non-null counts before cleaning.
r_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


# Data cleaning

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# The stopword corpus has to be on disk before it is read; on a fresh
# environment stopwords.words() raises LookupError otherwise.
nltk.download('stopwords', quiet=True)
stop=stopwords.words('english')

In [None]:
import string
# Show the punctuation characters that the helpers below count and strip.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:

def count_punct(text):
    """Return the percentage of non-space characters in ``text`` that are punctuation.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    float
        Percentage in the range 0-100, rounded to one decimal place.
        Returns 0.0 when ``text`` has no non-space characters (empty or
        whitespace-only), instead of dividing by zero.
    """
    non_space_chars = len(text) - text.count(" ")
    if non_space_chars == 0:
        return 0.0
    punct_chars = sum(1 for char in text if char in string.punctuation)
    # Scale to a percentage before rounding so the result carries no
    # floating-point residue (e.g. 33.3 rather than 33.300000000000004).
    return round(100 * punct_chars / non_space_chars, 1)


In [None]:
def remove_punct(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed."""
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

# Punctuation-free copy of each review; the raw Review column is left untouched.
r_data['Review_text_clean'] = r_data['Review'].map(remove_punct)

In [None]:
# Review length with spaces excluded.
r_data['body_len'] = r_data['Review'].map(lambda review: len(review.replace(" ", "")))
# Share of the non-space characters that are punctuation, as a percentage.
r_data['punct%'] = r_data['Review'].map(count_punct)


In [None]:
# Inspect the new Review_text_clean, body_len and punct% columns.
r_data.head()

Unnamed: 0,Review,Liked,Review_text_clean,body_len,punct%
0,Wow... Loved this place.,1,Wow Loved this place,21,19.0
1,Crust is not good.,0,Crust is not good,15,6.7
2,Not tasty and the texture was just nasty.,0,Not tasty and the texture was just nasty,34,2.9
3,Stopped by during the late May bank holiday of...,1,Stopped by during the late May bank holiday of...,73,1.4
4,The selection on the menu was great and so wer...,1,The selection on the menu was great and so wer...,48,2.1


In [None]:
def tokenize(text):
    """Split ``text`` into word tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        Text to split; callers lowercase it beforehand if needed.

    Returns
    -------
    list of str
        The tokens in order of appearance. Empty strings, which ``re.split``
        yields when the text starts or ends with a separator (or is empty),
        are dropped so they never reach later steps as tokens.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    return [token for token in re.split(r'\W+', text) if token]
# Lowercase the cleaned text first so tokens compare case-insensitively, then split into words.
r_data['Review_text_tokenized'] = r_data['Review_text_clean'].str.lower().map(tokenize)

r_data.head()

Unnamed: 0,Review,Liked,Review_text_clean,body_len,punct%,Review_text_tokenized
0,Wow... Loved this place.,1,Wow Loved this place,21,19.0,"[wow, loved, this, place]"
1,Crust is not good.,0,Crust is not good,15,6.7,"[crust, is, not, good]"
2,Not tasty and the texture was just nasty.,0,Not tasty and the texture was just nasty,34,2.9,"[not, tasty, and, the, texture, was, just, nasty]"
3,Stopped by during the late May bank holiday of...,1,Stopped by during the late May bank holiday of...,73,1.4,"[stopped, by, during, the, late, may, bank, ho..."
4,The selection on the menu was great and so wer...,1,The selection on the menu was great and so wer...,48,2.1,"[the, selection, on, the, menu, was, great, an..."


In [None]:
import nltk
nltk.download('stopwords')
stopword = nltk.corpus.stopwords.words('english')
# Set copy of the list for constant-time membership tests; checking against
# the list itself scans it once per token.
stopword_set = frozenset(stopword)

def remove_stopwords(tokenized_list):
    """Return the tokens of ``tokenized_list`` that are not English stopwords.

    Parameters
    ----------
    tokenized_list : list of str
        Lowercased tokens of one review.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # NOTE(review): the NLTK list contains negations such as "not", so
    # "crust is not good" becomes ["crust", "good"]. For sentiment features
    # that flips the meaning -- consider keeping negation words.
    return [word for word in tokenized_list if word not in stopword_set]

r_data['Review_text_nostop'] = r_data['Review_text_tokenized'].apply(remove_stopwords)

r_data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DPSYOPS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Review,Liked,Review_text_clean,body_len,punct%,Review_text_tokenized,Review_text_nostop
0,Wow... Loved this place.,1,Wow Loved this place,21,19.0,"[wow, loved, this, place]","[wow, loved, place]"
1,Crust is not good.,0,Crust is not good,15,6.7,"[crust, is, not, good]","[crust, good]"
2,Not tasty and the texture was just nasty.,0,Not tasty and the texture was just nasty,34,2.9,"[not, tasty, and, the, texture, was, just, nasty]","[tasty, texture, nasty]"
3,Stopped by during the late May bank holiday of...,1,Stopped by during the late May bank holiday of...,73,1.4,"[stopped, by, during, the, late, may, bank, ho...","[stopped, late, may, bank, holiday, rick, stev..."
4,The selection on the menu was great and so wer...,1,The selection on the menu was great and so wer...,48,2.1,"[the, selection, on, the, menu, was, great, an...","[selection, menu, great, prices]"


In [None]:
# One shared Porter stemmer instance, reused for every review.
ps = nltk.PorterStemmer()

def stemming(tokenized_text):
    """Return the Porter stem of each token in ``tokenized_text``, preserving order."""
    stems = []
    for word in tokenized_text:
        stems.append(ps.stem(word))
    return stems

r_data['Review_text_stemmed'] = r_data['Review_text_nostop'].map(stemming)

r_data.head()

Unnamed: 0,Review,Liked,Review_text_clean,body_len,punct%,Review_text_tokenized,Review_text_nostop,Review_text_stemmed
0,Wow... Loved this place.,1,Wow Loved this place,21,19.0,"[wow, loved, this, place]","[wow, loved, place]","[wow, love, place]"
1,Crust is not good.,0,Crust is not good,15,6.7,"[crust, is, not, good]","[crust, good]","[crust, good]"
2,Not tasty and the texture was just nasty.,0,Not tasty and the texture was just nasty,34,2.9,"[not, tasty, and, the, texture, was, just, nasty]","[tasty, texture, nasty]","[tasti, textur, nasti]"
3,Stopped by during the late May bank holiday of...,1,Stopped by during the late May bank holiday of...,73,1.4,"[stopped, by, during, the, late, may, bank, ho...","[stopped, late, may, bank, holiday, rick, stev...","[stop, late, may, bank, holiday, rick, steve, ..."
4,The selection on the menu was great and so wer...,1,The selection on the menu was great and so wer...,48,2.1,"[the, selection, on, the, menu, was, great, an...","[selection, menu, great, prices]","[select, menu, great, price]"


In [None]:
import nltk
# WordNet data, presumably required by the WordNetLemmatizer used in the next cells.
nltk.download('wordnet')

# Open Multilingual WordNet data; the return value of this last call is the cell's output.
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DPSYOPS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\DPSYOPS\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:

from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance shared by the lemmatizing() helper below.
lemmatizer = WordNetLemmatizer()


In [None]:
def lemmatizing(tokenized_text):
    """Return the WordNet lemma of each token in ``tokenized_text``, preserving order.

    The lemmatizer is called without a part-of-speech tag; in this data that
    leaves verb forms such as "loved" unchanged while plural nouns such as
    "prices" are reduced to "price".
    """
    return [lemmatizer.lemmatize(word) for word in tokenized_text]

# Column name corrected from 'Rewview_text_lemmatized' so it follows the
# Review_text_* convention of the other derived columns.
r_data['Review_text_lemmatized'] = r_data['Review_text_nostop'].apply(lemmatizing)

r_data.head()

Unnamed: 0,Review,Liked,Review_text_clean,body_len,punct%,Review_text_tokenized,Review_text_nostop,Review_text_stemmed,Rewview_text_lemmatized
0,Wow... Loved this place.,1,Wow Loved this place,21,19.0,"[wow, loved, this, place]","[wow, loved, place]","[wow, love, place]","[wow, loved, place]"
1,Crust is not good.,0,Crust is not good,15,6.7,"[crust, is, not, good]","[crust, good]","[crust, good]","[crust, good]"
2,Not tasty and the texture was just nasty.,0,Not tasty and the texture was just nasty,34,2.9,"[not, tasty, and, the, texture, was, just, nasty]","[tasty, texture, nasty]","[tasti, textur, nasti]","[tasty, texture, nasty]"
3,Stopped by during the late May bank holiday of...,1,Stopped by during the late May bank holiday of...,73,1.4,"[stopped, by, during, the, late, may, bank, ho...","[stopped, late, may, bank, holiday, rick, stev...","[stop, late, may, bank, holiday, rick, steve, ...","[stopped, late, may, bank, holiday, rick, stev..."
4,The selection on the menu was great and so wer...,1,The selection on the menu was great and so wer...,48,2.1,"[the, selection, on, the, menu, was, great, an...","[selection, menu, great, prices]","[select, menu, great, price]","[selection, menu, great, price]"


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the punctuation-free review text.
count_vect = CountVectorizer(analyzer='word')
X_counts = count_vect.fit_transform(r_data['Review_text_clean'])

print(X_counts.shape)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. Only a sample is printed because the vocabulary has
# roughly two thousand entries and would flood the cell output.
print(count_vect.get_feature_names_out()[:20])

(1000, 2067)




# Model Training

In [None]:
from sklearn.model_selection import train_test_split

# The punctuation-free review text is the feature; the Liked flag is the label.
X = r_data['Review_text_clean']
y = r_data['Liked']

# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=225
)

print(f'X_train: {len(X_train)}')
print(f'X_test: {len(X_test)}')
print(f'y_train: {len(y_train)}')
print(f'y_test: {len(y_test)}')

X_train: 800
X_test: 200
y_train: 800
y_test: 200


In [None]:
pip install scikit-learn-pipeline-utils

Collecting scikit-learn-pipeline-utils
  Downloading scikit_learn_pipeline_utils-0.0.7-py3-none-any.whl (4.7 kB)
Installing collected packages: scikit-learn-pipeline-utils
Successfully installed scikit-learn-pipeline-utils-0.0.7
Note: you may need to restart the kernel to use updated packages.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Classifier: logistic regression fitted with the L-BFGS solver.
logreg = LogisticRegression(solver="lbfgs")

# Feature extractor: TF-IDF vectorizer with its default settings.
tfidf_vect = TfidfVectorizer()



In [None]:
from sklearn.pipeline import Pipeline

# Chain vectorization and classification so raw text goes in and labels come out.
model = Pipeline(
    steps=[
        ('vectorizer', tfidf_vect),
        ('classifer', logreg),
    ]
)
model.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifer', LogisticRegression())])

In [None]:
from sklearn.metrics import confusion_matrix

y_predictions = model.predict(X_test)
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# actual classes and columns the predicted ones. Passing the arguments the
# other way round transposes the matrix.
confusion_matrix(y_test, y_predictions)

array([[82, 17],
       [17, 84]], dtype=int64)

In [None]:
# Accuracy, precision and recall for Logistic Regression on the held-out reviews
from sklearn.metrics import accuracy_score, precision_score, recall_score

score1 = accuracy_score(y_test, y_predictions)
score2 = precision_score(y_test, y_predictions)
score3 = recall_score(y_test, y_predictions)

print("\n")
# Accuracy is shown as a percentage; precision and recall stay as fractions.
print("Accuracy is ", round(score1 * 100, 2), "%")
print("Precision is ", round(score2, 2))
print("Recall is ", round(score3, 2))



Accuracy is  83.0 %
Precision is  0.83
Recall is  0.83


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so its metrics are reproducible from run to run; without a
# random_state every execution trains a different forest. The seed matches the
# one used for the train/test split.
Rd = RandomForestClassifier(random_state=225)
# Same two-step pipeline as before, with the random forest as the classifier.
model = Pipeline([('vectorizer', tfidf_vect),('classifer', Rd)])
model.fit(X_train,y_train)



Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifer', RandomForestClassifier())])

In [None]:
#Predicting Test Results
y_predictions = model.predict(X_test)
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# actual classes and columns the predicted ones. Passing the arguments the
# other way round transposes the matrix, swapping false positives and
# false negatives.
confusion_matrix(y_test, y_predictions)

array([[74, 20],
       [25, 81]], dtype=int64)

In [None]:
# Accuracy, precision and recall for the Random Forest Classifier on the held-out reviews
score1 = accuracy_score(y_test, y_predictions)
score2 = precision_score(y_test, y_predictions)
score3 = recall_score(y_test, y_predictions)

print("\n")
# Accuracy is shown as a percentage; precision and recall stay as fractions.
print("Accuracy is ", round(score1 * 100, 2), "%")
print("Precision is ", round(score2, 2))
print("Recall is ", round(score3, 2))



Accuracy is  77.5 %
Precision is  0.76
Recall is  0.8


# Analysis and Conclusion

In this study, we classify the sentiment of restaurant reviews using machine learning techniques. The two algorithms used are Logistic Regression and Random Forest Classifier.

Evaluation metrics used here are accuracy, precision and recall.

Using Logistic Regression,
Accuracy of prediction is 83.0%
Precision of prediction is 0.83
Recall of prediction is 0.83

Using Random Forest Classifier,
Accuracy of prediction is 77.5%
Precision of prediction is 0.76
Recall of prediction is 0.80

From the above results, Logistic Regression performs better than the Random Forest Classifier, with 83.0% accuracy versus 77.5%. In other words, the Logistic Regression model predicts the sentiment of a restaurant review correctly for 83.0% of the test reviews.
