In [29]:
# Importing necessary libraries for Data Analysis
import pandas as pd
import numpy as np

In [30]:
from nltk.corpus import stopwords
import string
import nltk
# Fetch the NLTK stopword corpus (reported as already up-to-date when present locally)
nltk.download('stopwords')
# Keep the English stopwords in a set so membership tests during preprocessing are cheap
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hastee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
# Load the tab-separated review dataset.
# on_bad_lines='warn' skips malformed rows with a warning instead of aborting.
try:
    df = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', on_bad_lines='warn')
except pd.errors.ParserError as e:
    print(f"Error parsing file: {e}")
    # Re-raise: without `df` every later cell fails with a NameError, or worse,
    # silently reuses a stale `df` left in the kernel by an earlier run.
    raise

In [32]:
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [33]:
#preprocessing the data to remove stopwords
def preprocess_text(text, stopword_set=None):
    """Normalize a review: lowercase it, strip punctuation and drop stopwords.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example None, or the float
        NaN pandas uses for a missing cell) is treated as an empty review
        instead of raising AttributeError.
    stopword_set : collection of str, optional
        Words to remove. When omitted, the notebook-level ``stop_words`` set
        is used, which matches the original behavior.

    Returns
    -------
    str
        The remaining lowercase words joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        stopword_set = stop_words
    # Convert to lowercase
    text = text.lower()
    # Remove every character listed in string.punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords.
    # NOTE(review): the default list drops negations such as "not" (the
    # notebook output shows "Crust is not good." becoming "crust good"),
    # which discards sentiment information; consider keeping negations.
    return ' '.join(word for word in text.split() if word not in stopword_set)

In [34]:
# Reviewing the data without stopwords.
# Build the cleaned frame as a new object instead of overwriting df['Review'],
# so the raw `df` displayed earlier stays intact and this cell can be re-run safely.
df1 = df.assign(Review=df['Review'].apply(preprocess_text))
df1.head()

Unnamed: 0,Review,Liked
0,wow loved place,1
1,crust good,0
2,tasty texture nasty,0
3,stopped late may bank holiday rick steve recom...,1
4,selection menu great prices,1


In [35]:
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
# binary=True records only whether a word occurs in a review (features for BernoulliNB)
vectorizer1 = CountVectorizer(binary = True)
# binary=False keeps the per-review word counts (features for MultinomialNB)
vectorizer2 = CountVectorizer(binary = False)

In [37]:
# Feature text (lowercased) taken from the cleaned frame
x = df1['Review'].str.lower()
# Target labels from the 'Liked' column
y = df1['Liked']

In [38]:
# Learn each vectorizer's vocabulary from x and build the two document-term matrices.
# NOTE(review): both vectorizers are fitted on the full corpus before the
# train/test split, so the vocabulary includes words that only occur in test
# reviews; fit on the training split alone for a stricter evaluation.
x1 = vectorizer1.fit_transform(x)
x2 = vectorizer2.fit_transform(x)

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
# Every call uses the same test_size and random_state on inputs with the same
# number of rows, so the three splits share one ytrain/ytest pair.
split_options = {'test_size': 0.25, 'random_state': 42}
xtrain1, xtest1, ytrain, ytest = train_test_split(x1, y, **split_options)  # BernoulliNB with CountVectorizer
xtrain2, xtest2, ytrain, ytest = train_test_split(x2, y, **split_options)  # MultinomialNB with CountVectorizer
xtrain3, xtest3, ytrain, ytest = train_test_split(x, y, **split_options)  # MultinomialNB with TfidfVectorizer (raw text)

In [41]:
from sklearn.naive_bayes import BernoulliNB,MultinomialNB

In [42]:
# Bernoulli Naive Bayes classifier with default hyperparameters
bnb = BernoulliNB()

In [43]:
# Fit Bernoulli Naive Bayes on the binary CountVectorizer training features
bnb.fit(xtrain1,ytrain)

In [44]:
# Predict test-set labels with Bernoulli Naive Bayes (binary CountVectorizer features)
predictions1 = bnb.predict(xtest1)

In [45]:
from sklearn.naive_bayes import MultinomialNB

In [46]:
 mnb = MultinomialNB()

In [47]:
# Fit Multinomial Naive Bayes on the CountVectorizer word-count training features
mnb.fit(xtrain2,ytrain)

In [48]:
# Predict test-set labels with Multinomial Naive Bayes (CountVectorizer word counts)
predictions2 = mnb.predict(xtest2)

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [50]:
# Create a pipeline with TfidfVectorizer and MultinomialNB.
# The pipeline is fed raw review text: the TF-IDF step vectorizes it before
# the classifier sees it.
from sklearn.pipeline import make_pipeline
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [51]:
# Fit the TF-IDF + Multinomial Naive Bayes pipeline on the training review text
model.fit(xtrain3,ytrain)

In [52]:
# Predict test-set labels with the TF-IDF + Multinomial Naive Bayes pipeline
predictions3 = model.predict(xtest3)

In [53]:
# Import accuracy_score from the sklearn.metrics module
from sklearn.metrics import accuracy_score

In [54]:
# Test-set accuracy of Bernoulli Naive Bayes with binary CountVectorizer features
accuracy_score(ytest,predictions1)

0.776

In [55]:
# Test-set accuracy of Multinomial Naive Bayes with CountVectorizer word counts
accuracy_score(ytest,predictions2)

0.788

In [56]:
# Test-set accuracy of the TF-IDF + Multinomial Naive Bayes pipeline
accuracy_score(ytest, predictions3)

0.78

Conclusion

1. Bernoulli Naive Bayes with Count Vectorizer: Accuracy 0.776.
2. Multinomial Naive Bayes with Count Vectorizer: Accuracy 0.788.
3. Multinomial Naive Bayes with TF-IDF Vectorizer: Accuracy 0.78.

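Multinomial Naive Bayes with Count Vectorizer has the highest accuracy at 0.788, so it is the model chosen for this text classification task. The TF-IDF variant is close behind at 0.78, and the Bernoulli model with binary features scores lowest at 0.776. Note that the three scores differ by at most 0.012 on a single 25% hold-out split, so the ranking should be confirmed with cross-validation before treating one model as clearly better.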

In [73]:
import joblib
# Save the Multinomial Naive Bayes model.
# The path gets its own name: assigning it to `model` would overwrite the
# TF-IDF pipeline stored in that variable and break any re-run of its cells.
model_path = 'Multinomial.joblib'
# The classifier only understands vectors produced by the fitted vectorizer2,
# so persist that vectorizer too; without it the saved model cannot score new text.
joblib.dump(vectorizer2, 'CountVectorizer.joblib')
joblib.dump(mnb, model_path)

['Multinomial.joblib']

In [64]:
#predicting the type of review with a dynamic input
def predict_rating(review):
    """Classify one raw review with the count-based Multinomial Naive Bayes model.

    The text is cleaned with ``preprocess_text``, encoded with the already
    fitted ``vectorizer2`` and scored by ``mnb``. The single prediction for
    that review is returned.
    """
    cleaned = preprocess_text(review)
    # transform() expects an iterable of documents, hence the one-item list
    features = vectorizer2.transform([cleaned])
    return mnb.predict(features)[0]

# Ask for a review on standard input
user_review = input("Enter your review: ")

# Classify the entered text
predicted_rating = predict_rating(user_review)

# Show the raw label first
print("Predicted Rating:", predicted_rating)

# Then the message: label 1 is reported as positive, anything else as negative
verdict = "This is a Positive Review." if predicted_rating == 1 else "This is a Negative Review."
print(verdict)

Enter your review: had a good time, great service
Predicted Rating: 1
This is a Positive Review.
