In [None]:
import pandas as pd

# Load the TSV file
df = pd.read_csv("/content/Restaurant_Reviews.tsv", sep="\t")

# Save as a CSV file
df.to_csv("sentiment_data.csv", index=False)
print(df.head())
class_counts = df['Liked'].value_counts()
print(class_counts)

                                              Review  Liked
0                           Wow... Loved this place.      1
1                                 Crust is not good.      0
2          Not tasty and the texture was just nasty.      0
3  Stopped by during the late May bank holiday of...      1
4  The selection on the menu was great and so wer...      1
Liked
1    500
0    500
Name: count, dtype: int64


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the Lemmatizer
lemmatizer = WordNetLemmatizer()

# Create a set of English stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
def preprocess_text(text):

    text = text.lower()  # Convert to lowercase

    text = re.sub(r'\W', ' ', text)  # Replace characters except letters and digits with a space

    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space

    words = text.split()  # tokenization

    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatize each word

    return ' '.join(words)# Return the cleaned


df['cleaned_review'] = df['Review'].apply(preprocess_text)
print(df[['Review', 'cleaned_review']].head())


                                              Review  \
0                           Wow... Loved this place.   
1                                 Crust is not good.   
2          Not tasty and the texture was just nasty.   
3  Stopped by during the late May bank holiday of...   
4  The selection on the menu was great and so wer...   

                                      cleaned_review  
0                                    wow loved place  
1                                         crust good  
2                                tasty texture nasty  
3  stopped late may bank holiday rick steve recom...  
4                         selection menu great price  


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=600, ngram_range=(1, 2), min_df=5)
X = vectorizer.fit_transform(df['cleaned_review']).toarray()
y = df['Liked']


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

# Train Naive Bayes with regularization
nb_model = MultinomialNB(alpha=2)
nb_model.fit(X_train, y_train)

train_accuracy = accuracy_score(y_train, nb_model.predict(X_train))

# Make predictions
y_pred = nb_model.predict(X_test)

# Evaluate performance
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Testing Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Training Accuracy: 81.86%
Testing Accuracy: 74.00%
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.71      0.73       152
           1       0.72      0.77      0.75       148

    accuracy                           0.74       300
   macro avg       0.74      0.74      0.74       300
weighted avg       0.74      0.74      0.74       300



In [None]:
import joblib

# Save the trained model
joblib.dump(nb_model, "sentiment_model.pkl")

# Save the vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
from google.colab import files

files.download("sentiment_model.pkl")
files.download("tfidf_vectorizer.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>