In [1]:
import pandas as pd
import nltk
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fetch the NLTK data used by the preprocessing cells below.
# `punkt_tab` is requested in addition to `punkt`: recent NLTK releases
# (3.8.2 and later) load the tokenizer model from `punkt_tab`, so without it
# `word_tokenize` raises a LookupError on a fresh environment. Older releases
# keep using `punkt`, so both are downloaded.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

True

## TASK 1: Load & Explore the Dataset

In [2]:
# Load the review dataset from a CSV located in the working directory.
df = pd.read_csv("output.csv")

In [3]:
# Preview the first ten rows to check that the file parsed as expected.
df.head(10)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [4]:
# Inspect the raw column names exactly as they were read from the file.
df.columns

Index([' Review', 'Liked'], dtype='object')

In [5]:
# The CSV header carries a stray leading space (' Review'). Strip whitespace
# from the names instead of overwriting them positionally, so a reordered or
# extended file cannot be silently mislabeled, then verify the schema the
# rest of the notebook relies on.
df.columns = df.columns.str.strip()
assert list(df.columns) == ["Review", "Liked"], f"unexpected columns: {list(df.columns)}"

In [6]:
# Confirm the column names after the rename.
df.columns

Index(['Review', 'Liked'], dtype='object')

In [7]:
# Class balance: number of reviews per label in the Liked column.
df["Liked"].value_counts()

Liked
1    500
0    500
Name: count, dtype: int64

In [8]:
# Count missing values per column.
df.isna().sum()

Review    0
Liked     0
dtype: int64

In [9]:
# Character length of each review. astype(str) makes sure .str.len() also
# works on any value that is not already a string.
df["Review_Length"] = df["Review"].astype(str).str.len()

In [10]:
# Preview the frame with the new Review_Length column.
df.head()

Unnamed: 0,Review,Liked,Review_Length
0,Wow... Loved this place.,1,24
1,Crust is not good.,0,18
2,Not tasty and the texture was just nasty.,0,41
3,Stopped by during the late May bank holiday of...,1,87
4,The selection on the menu was great and so wer...,1,59


## TASK 2: Clean & Preprocess the Text

In [11]:
# Start the cleaned-text column from a lower-cased copy of the raw review.
df["Cleaned_Review"] = df["Review"].str.lower()

In [12]:
# Check the lower-cased text next to the original review.
df.head()

Unnamed: 0,Review,Liked,Review_Length,Cleaned_Review
0,Wow... Loved this place.,1,24,wow... loved this place.
1,Crust is not good.,0,18,crust is not good.
2,Not tasty and the texture was just nasty.,0,41,not tasty and the texture was just nasty.
3,Stopped by during the late May bank holiday of...,1,87,stopped by during the late may bank holiday of...
4,The selection on the menu was great and so wer...,1,59,the selection on the menu was great and so wer...


In [13]:
# Normalise the lower-cased text in a single chain:
#   1. drop every character that is not a-z or whitespace (punctuation, digits),
#   2. collapse each run of whitespace into one space,
#   3. trim leading/trailing whitespace.
df["Cleaned_Review"] = (
    df["Cleaned_Review"]
    .str.replace(r"[^a-z\s]", "", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)


In [14]:
# Tokenization: split each cleaned review string into a list of word tokens.
# `nltk`, `word_tokenize` and the punkt data are already imported/downloaded
# in the first cell, so nothing is re-imported or re-downloaded here.
# NOTE: this replaces the string column with token lists, so the cell is not
# idempotent — re-running it alone would hand lists to word_tokenize. Re-run
# from the lower-casing cell instead.
df["Cleaned_Review"] = df["Cleaned_Review"].apply(word_tokenize)
df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Review,Liked,Review_Length,Cleaned_Review
0,Wow... Loved this place.,1,24,"[wow, loved, this, place]"
1,Crust is not good.,0,18,"[crust, is, not, good]"
2,Not tasty and the texture was just nasty.,0,41,"[not, tasty, and, the, texture, was, just, nasty]"
3,Stopped by during the late May bank holiday of...,1,87,"[stopped, by, during, the, late, may, bank, ho..."
4,The selection on the menu was great and so wer...,1,59,"[the, selection, on, the, menu, was, great, an..."


In [15]:
# Stop words: start from NLTK's English list but keep the negations
# "not", "no" and "nor" (presumably because negation changes the sentiment
# of a review, e.g. "not good").
# `stopwords` and its data are already imported/downloaded in the first cell.
stop_words = set(stopwords.words("english")) - {"not", "no", "nor"}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Drop every token that appears in the stop-word set, keeping token order.
df["Cleaned_Review"] = df["Cleaned_Review"].map(
    lambda review_tokens: [tok for tok in review_tokens if tok not in stop_words]
)

In [17]:
# Inspect the token lists after stop-word removal.
df.head()

Unnamed: 0,Review,Liked,Review_Length,Cleaned_Review
0,Wow... Loved this place.,1,24,"[wow, loved, place]"
1,Crust is not good.,0,18,"[crust, not, good]"
2,Not tasty and the texture was just nasty.,0,41,"[not, tasty, texture, nasty]"
3,Stopped by during the late May bank holiday of...,1,87,"[stopped, late, may, bank, holiday, rick, stev..."
4,The selection on the menu was great and so wer...,1,59,"[selection, menu, great, prices]"


In [18]:
# Lemmatization: reduce each token to its verb lemma (pos='v'), e.g.
# "loved" -> "love" and "stopped" -> "stop". Because every token is treated
# as a verb, noun plurals such as "potatoes" are left unchanged.
# `WordNetLemmatizer` is already imported in the first cell; `lemmatizer` is
# kept at notebook level because the prediction helper reuses it.
lemmatizer = WordNetLemmatizer()
df["Cleaned_Review"] = df["Cleaned_Review"].apply(
    lambda tokens: [lemmatizer.lemmatize(word, pos='v') for word in tokens]
)
df.head(10)

Unnamed: 0,Review,Liked,Review_Length,Cleaned_Review
0,Wow... Loved this place.,1,24,"[wow, love, place]"
1,Crust is not good.,0,18,"[crust, not, good]"
2,Not tasty and the texture was just nasty.,0,41,"[not, tasty, texture, nasty]"
3,Stopped by during the late May bank holiday of...,1,87,"[stop, late, may, bank, holiday, rick, steve, ..."
4,The selection on the menu was great and so wer...,1,59,"[selection, menu, great, price]"
5,Now I am getting angry and I want my damn pho.,0,46,"[get, angry, want, damn, pho]"
6,Honeslty it didn't taste THAT fresh.),0,37,"[honeslty, didnt, taste, fresh]"
7,The potatoes were like rubber and you could te...,0,111,"[potatoes, like, rubber, could, tell, make, ah..."
8,The fries were great too.,1,25,"[fry, great]"
9,A great touch.,1,14,"[great, touch]"


In [19]:
# Re-assemble each token list into one space-separated string, the input
# format the vectorizer expects.
df["Cleaned_Review"] = df["Cleaned_Review"].map(" ".join)

## TASK 3: Convert Text to Numerical Features

In [20]:
# Features are the cleaned review strings; the target is the Liked label.
X, y = df["Cleaned_Review"], df["Liked"]

In [21]:
# Hold out 20% of the reviews as a test set; random_state pins the shuffle so
# the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Bag-of-words features. binary=True records only whether a term occurs in a
# review (not how often), and max_features=2000 caps the vocabulary at the
# 2000 most frequent terms in the fitted text.
vectorizer = CountVectorizer(binary=True, max_features=2000)

# Learn the vocabulary from the training split only, then reuse it unchanged
# on the test split so no test-set information leaks into the features.
X_train_trans = vectorizer.fit_transform(X_train)
X_test_trans = vectorizer.transform(X_test)

## TASK 5: Train Naïve Bayes Models


In [23]:
# Bernoulli Naive Bayes with default hyperparameters, fitted on the
# vectorized training reviews.
model = BernoulliNB()
model.fit(X_train_trans, y_train)

## TASK 6: Evaluate Models


In [24]:
# Predict on both splits so training and test performance can be compared.
y_train_pred = model.predict(X_train_trans)
y_test_pred = model.predict(X_test_trans)

In [25]:
# Compare accuracy on the training and held-out sets; a large gap between the
# two suggests the model is overfitting the training vocabulary.
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))
print("Testing Accuracy:", accuracy_score(y_test, y_test_pred))

Training Accuracy: 0.9575
Testing Accuracy: 0.76


In [26]:
# Break the test-set performance down per class. In scikit-learn's confusion
# matrix, rows are the true labels and columns are the predicted labels.
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))


Confusion Matrix:
 [[75 21]
 [27 77]]

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.78      0.76        96
           1       0.79      0.74      0.76       104

    accuracy                           0.76       200
   macro avg       0.76      0.76      0.76       200
weighted avg       0.76      0.76      0.76       200



## TASK 7: Predict Sentiment of New Reviews

In [27]:
def preprocess_new_review(text):
    """Clean one raw review the same way the training text was cleaned.

    Steps: lower-case, remove every character other than a-z and whitespace,
    collapse whitespace, tokenize, drop stop words, and verb-lemmatize.

    Relies on the notebook-level ``stop_words`` set and ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    letters_only = re.sub(r"[^a-z\s]", "", text.lower())
    normalized = re.sub(r"\s+", " ", letters_only).strip()
    return " ".join(
        lemmatizer.lemmatize(token, pos='v')
        for token in word_tokenize(normalized)
        if token not in stop_words
    )

# Unseen example reviews to classify with the trained model.
new_reviews = [
    "The food was fantastic!",
    "Worst service ever."
]

# Apply the training-time cleaning, then vectorize with the already-fitted
# vocabulary (transform only — the vectorizer is not refitted here).
processed_reviews = list(map(preprocess_new_review, new_reviews))
new_trans = vectorizer.transform(processed_reviews)
predictions = model.predict(new_trans)

# Report each original review next to its predicted label (1 = Positive).
for review, sentiment in zip(new_reviews, predictions):
    if sentiment == 1:
        verdict = "Positive"
    else:
        verdict = "Negative"
    print("\nReview:", review)
    print("Predicted Sentiment:", verdict)


Review: The food was fantastic!
Predicted Sentiment: Positive

Review: Worst service ever.
Predicted Sentiment: Negative


## Save the Trained Model and Vectorizer (Pickle)

In [28]:
import pickle

In [29]:
# Persist the fitted model together with its vectorizer so inference code can
# reload both from a single file. The context manager closes (and flushes)
# the file handle deterministically instead of leaving it to garbage
# collection.
with open("sentiment_model.pkl", "wb") as model_file:
    pickle.dump((model, vectorizer), model_file)