# NAÏVE BAYES ON A REVIEW CLASSIFICATION DATASET

# TASK 1: Load & Explore the Dataset

In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('output.csv')

In [4]:
df.columns

Index([' Review', 'Liked'], dtype='object')

In [5]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [6]:
df.shape

(1000, 2)

In [7]:
df.isnull().sum()

 Review    0
Liked      0
dtype: int64

In [8]:
df.duplicated().sum()

4

In [10]:
len(df[' Review'])

1000

# TASK 2: Clean & Preprocess the Text

In [25]:
text= df[' Review']

In [26]:
# Convert to Lowercase
text= text.str.lower()

In [27]:
text

0                               wow... loved this place.
1                                     crust is not good.
2              not tasty and the texture was just nasty.
3      stopped by during the late may bank holiday of...
4      the selection on the menu was great and so wer...
                             ...                        
995    i think food should have flavor and texture an...
996                             appetite instantly gone.
997    overall i was not impressed and would not go b...
998    the whole experience was underwhelming, and i ...
999    then, as if i hadn't wasted enough of my life ...
Name:  Review, Length: 1000, dtype: object

In [30]:
# Remove Puntuations,Special Characters, Emojis and number.
import pandas as pd
import re
text = text.str.replace(r'[0-9]', '',regex=True)                  # remove numbers
text = text.str.replace(r'[^\w\s]', '',regex=True)               # remove punctuation & special chars
text = text.str.replace(r'[\U00010000-\U0010ffff]', '',regex=True)  # remove emojis

In [31]:
text

0                                   wow loved this place
1                                      crust is not good
2               not tasty and the texture was just nasty
3      stopped by during the late may bank holiday of...
4      the selection on the menu was great and so wer...
                             ...                        
995    i think food should have flavor and texture an...
996                              appetite instantly gone
997    overall i was not impressed and would not go back
998    the whole experience was underwhelming and i t...
999    then as if i hadnt wasted enough of my life th...
Name:  Review, Length: 1000, dtype: object

In [37]:
tokens = text.str.split()
print(tokens)

0                              [wow, loved, this, place]
1                                 [crust, is, not, good]
2      [not, tasty, and, the, texture, was, just, nasty]
3      [stopped, by, during, the, late, may, bank, ho...
4      [the, selection, on, the, menu, was, great, an...
                             ...                        
995    [i, think, food, should, have, flavor, and, te...
996                          [appetite, instantly, gone]
997    [overall, i, was, not, impressed, and, would, ...
998    [the, whole, experience, was, underwhelming, a...
999    [then, as, if, i, hadnt, wasted, enough, of, m...
Name:  Review, Length: 1000, dtype: object


In [43]:
len(tokens)

1000

In [32]:
# Removing stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\patan Nishath
[nltk_data]     khan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
filtered_words = [
    [word for word in sentence if word not in stop_words]
    for sentence in tokens
]


In [42]:
len(filtered_words)

1000

In [44]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\patan Nishath
[nltk_data]     khan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [46]:
lemmatizer = WordNetLemmatizer()

lemmatized_words = [
    [lemmatizer.lemmatize(word) for word in sentence]
    for sentence in filtered_words
]

print(lemmatized_words)




In [47]:
len(lemmatized_words)

1000

In [51]:
final_text = [" ".join(sentence) for sentence in lemmatized_words]

In [48]:
from sklearn.feature_extraction.text import CountVectorizer

In [49]:
vect = CountVectorizer(binary=True)

In [52]:
bag = vect.fit_transform(final_text)

In [53]:
print('Vocabulary',vect.get_feature_names_out())
print('BOW Matrix \n',bag.toarray())

Vocabulary ['absolute' 'absolutely' 'absolutley' ... 'yum' 'yummy' 'zero']
BOW Matrix 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# TASK 4: Split the Data

In [54]:
y = df['Liked']

In [80]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(final_text,y,test_size=0.2,random_state=42,stratify=y)

In [59]:
X_train_vec = vect.fit_transform(X_train)
X_test_vec = vect.transform(X_test)

In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,2),min_df=1)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


# TASK 5: Train Naïve Bayes Models

In [103]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB(alpha=0.5)

In [104]:
bnb.fit(X_train_vec,y_train)

In [105]:
# Training Accuracy
y_train_pred = bnb.predict(X_train_vec)
from sklearn.metrics import accuracy_score
score = accuracy_score(y_train_pred,y_train)
print("Training Prediction",score)

Training Prediction 0.99625


In [107]:
# Testing Prediction
y_pred =bnb.predict(X_test_vec)
score = accuracy_score(y_pred,y_test)
print("Testing Prediction",score)

Testing Prediction 0.775


In [100]:
# HyperParameter Tuning
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("nb", BernoulliNB())
])

param_grid = {
    "tfidf__ngram_range": [(1,1), (1,2)],
    "tfidf__min_df": [1, 2, 5],
    "nb__alpha": [0.1, 0.5, 1.0]
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)


Best parameters: {'nb__alpha': 0.5, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}
Best CV accuracy: 0.7700000000000001


# TASK 6: Evaluate Models

# Calculate: Accuracy,Precision,Recall,F1 Score

In [109]:
# Let us use accuracy score to validate this model
from sklearn.metrics import accuracy_score
score=accuracy_score(y_pred,y_test)
print("Accuracy Score of BernouliNB is :",score)

Accuracy Score of BernouliNB is : 0.775


In [115]:
# printing precision score
from sklearn.metrics import precision_score, recall_score
precision=precision_score(y_test,y_pred,average=None)
print(precision)

[0.87671233 0.71653543]


In [113]:
# printing recall score
recall=recall_score(y_test,y_pred,average=None)
print(recall)

[0.64 0.91]


In [116]:
# Printing F1 Score
from sklearn.metrics import f1_score
f1=f1_score(y_test,y_pred,average=None)
print(f1)

[0.73988439 0.80176211]


In [117]:
import pickle

# save model
with open("bernoulli_nb_model.pkl", "wb") as file:
    pickle.dump(bnb, file)

print("Model saved successfully!")


Model saved successfully!
