In [1]:
# Importing the libraries
import numpy as np
import pandas as pd

In [2]:
# Read the CSV Dataset file
df = pd.read_csv('Restaurant_Reviews.csv')

In [3]:
df.shape

(1000, 2)

In [4]:
df.columns

Index(['Review', 'Liked'], dtype='object')

In [5]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [6]:
df.tail()

Unnamed: 0,Review,Liked
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [8]:
df.isnull().sum()

Review    0
Liked     0
dtype: int64

In [9]:
df['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

# Feature Engineering

In [10]:
df['Length'] = df['Review'].apply(len)
df.head(5)

Unnamed: 0,Review,Liked,Length
0,Wow... Loved this place.,1,24
1,Crust is not good.,0,18
2,Not tasty and the texture was just nasty.,0,41
3,Stopped by during the late May bank holiday of...,1,87
4,The selection on the menu was great and so wer...,1,59


# Data Preprocessing

In [11]:
# Importing the NLP Libraries
import nltk
import re
# Download NLTK stopwords data
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package stopwords to C:\Users\KARRA
[nltk_data]     TEJASWINIREDDY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
print(list(stopwords.words('english')))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [13]:

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, accuracy_score

In [14]:
stemmer = SnowballStemmer('english')
corpus = []

# Loop through each review
for i in range(len(df)):
    # Clean and preprocess the review
    review = re.sub('[^a-zA-Z]', ' ', df['Review'][i])  # Remove non-alphabetical characters
    review = review.lower()  # Convert text to lowercase
    review_words = review.split()  # Tokenization
    review_words = [word for word in review_words if word not in set(stopwords.words('english'))]  # Remove Stop Words
    review_words = [stemmer.stem(word) for word in review_words]  # Stemming
    review = ' '.join(review_words)  # Rejoin Tokens
    corpus.append(review)  # Append to Corpus

In [15]:
corpus[:10]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

# Creating a Bag of Words Model

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(max_features=1500, min_df=2)
x = tf.fit_transform(corpus).toarray() # Transform the text data into TF-IDF features
X = tf.fit_transform(corpus).toarray()
y = df['Liked'].values
# y = df.iloc[:, 1].values

# Data Splitting

In [17]:
from sklearn.model_selection import train_test_split, KFold
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [18]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 693), (200, 693), (800,), (200,))

# Model Training

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# from sklearn.model_selection import train_test_split, KFold
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
model = RandomForestClassifier(n_estimators=100, max_depth=6, min_samples_split=4, min_samples_leaf=2, random_state=42)
num_folds = 11
kf = KFold(n_splits=num_folds, shuffle=True, random_state=41)

# Lists to store accuracies
train_accuracies = []
test_accuracies = []

# Loop through k-fold cross-validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    #model
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    accuracy_diff = abs(train_accuracy - test_accuracy)

    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)
    
# # Split the data with the same random seed for consistency
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=41)

classifier = SVC(kernel='linear', C=0.1, random_state=42)
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)
y_train_pred = classifier.predict(x_train)

# Calculate the average training and testing accuracies
average_train_accuracy = np.mean(train_accuracies)
average_test_accuracy = np.mean(test_accuracies)


# Print the results
print("model-1 : Random Forest") #random forest
print(f"Training Accuracy: {average_train_accuracy * 100:.2f}%")
print(f"Testing Accuracy: {average_test_accuracy * 100:.2f}%")
print("Accuracy Difference: {:.2f}%".format(accuracy_diff * 100))



    # Evaluate the model
test_accuracy = accuracy_score(y_test, y_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
accuracy_diff = abs(train_accuracy - test_accuracy)

    # Print the results
print("model-2 : SVC")
print("Training Accuracy: {:.2f}%".format(train_accuracy * 100))
print("Testing Accuracy: {:.2f}%".format(test_accuracy * 100))
print("Accuracy Difference: {:.2f}%".format(accuracy_diff * 100))

model-1 : Random Forest
Training Accuracy: 79.98%
Testing Accuracy: 74.21%
Accuracy Difference: 2.65%
model-2 : SVC
Training Accuracy: 79.62%
Testing Accuracy: 78.00%
Accuracy Difference: 1.62%


In [20]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.73      0.92      0.81       103
           1       0.88      0.63      0.73        97

    accuracy                           0.78       200
   macro avg       0.80      0.78      0.77       200
weighted avg       0.80      0.78      0.77       200



In [21]:
def predict_sentiment(sample_review, classifier, tf):
    # Preprocess the sample review
    sample_review = re.sub(pattern='[^a-zA-Z]', repl=' ', string=sample_review)
    sample_review = sample_review.lower()
    sample_review_words = sample_review.split()
    sample_review_words = [word for word in sample_review_words if not word in set(stopwords.words('english'))]
    ps = PorterStemmer()
    final_review = [ps.stem(word) for word in sample_review_words]
    final_review = ' '.join(final_review)
    
    temp = tf.transform([final_review]).toarray()

    # Use the pre-trained classifier to predict sentiment
    sentiment = classifier.predict(temp)

    # Post-processing: Check for positive words
    positive_words = ["good", "excellent", "amazing", "delicious"]  # Add more positive words as needed
    for word in positive_words:
        if word in sample_review.lower():
            sentiment = [1]  # Override sentiment to positive

    return sentiment[0]

# Sample reviews as strings
reviews = [
    'The food is really bad.',
    'I love their delicious dishes!',
    'Terrible experience. Avoid this place.',
    'The service was excellent.',
    'Worst place ever, but nice food'
]

# Assuming you have already defined the 'predict_sentiment' function, classifier, and tf

for review in reviews:
    sentiment = predict_sentiment(review, classifier, tf)
    if sentiment:
        sentiment_label = 'POSITIVE'
    else:
        sentiment_label = 'NEGATIVE'

    print(f"Review: '{review}'")
    print(f"Sentiment: {sentiment_label}")
    print()

Review: 'The food is really bad.'
Sentiment: NEGATIVE

Review: 'I love their delicious dishes!'
Sentiment: POSITIVE

Review: 'Terrible experience. Avoid this place.'
Sentiment: NEGATIVE

Review: 'The service was excellent.'
Sentiment: POSITIVE

Review: 'Worst place ever, but nice food'
Sentiment: NEGATIVE

