Naive Bayes - Playstore Reviews

In [29]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from pickle import dump
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV

1. DATA COLLECTION

In [30]:
# Read the raw reviews CSV (path is relative to the notebook's folder) and preview it
raw_csv_path = "../data/raw/df_raw.csv"
df = pd.read_csv(raw_csv_path)
df.head(5)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


2. EXPLORE & CLEAN DATA

In [31]:
# 2.1. DATASET SIZE as a (n_rows, n_columns) tuple
(len(df.index), len(df.columns))

(891, 3)

In [32]:
# 2.2. DATA TYPES
# Prints a per-column summary (non-null count and dtype) plus the frame's memory usage;
# used here to confirm there are no missing values before any cleaning is done.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [33]:
# 2.3. TEXT PROCESSING → TRIM SURROUNDING WHITESPACE & CONVERT REVIEW TEXT TO LOWER CASE
df["review"] = df["review"].str.strip().str.lower()

# `package_name` is not used as a feature, so it is dropped without being normalised first.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is already gone.
df = df.drop(columns="package_name", errors="ignore")
df.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


3. SPLITTING THE DATASET INTO TRAINING AND TEST SETS

In [34]:
# Features are the review texts; the target is the `polarity` column
X, y = df["review"], df["polarity"]

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

X_train.head()

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

4. TRANSFORMATION OF TEXT TO MATRIX OF TOKEN COUNTS

In [35]:
# Learn the vocabulary from the training reviews only, then reuse it on the test reviews
vec_model = CountVectorizer(stop_words="english")
train_counts = vec_model.fit_transform(X_train)
test_counts = vec_model.transform(X_test)

# Convert the count matrices to dense arrays and rebind the names used by the model cells
X_train, X_test = train_counts.toarray(), test_counts.toarray()

X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

5. NAIVE BAYES MODEL

In [36]:
# 5.1. MULTINOMIAL — default hyperparameters, scored on the held-out test split
model = MultinomialNB().fit(X_train, y_train)
y_test_pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.8156424581005587

In [37]:
# 5.2. GAUSSIAN — default hyperparameters, scored on the held-out test split
model_Gau = GaussianNB().fit(X_train, y_train)
y_test_pred = model_Gau.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.8044692737430168

In [38]:
# 5.3. BERNOULLI — default hyperparameters, scored on the held-out test split
model_Ber = BernoulliNB().fit(X_train, y_train)
y_test_pred = model_Ber.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.770949720670391

6. MODEL OPTIMIZATION

In [39]:
# Candidate values the randomized search samples from
hyperparams = dict(
    alpha=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    fit_prior=[True, False],
)

# Search configuration: 12 sampled combinations, each scored by 10-fold CV accuracy,
# using all available cores and a fixed seed for reproducible sampling
search_settings = {
    "n_iter": 12,
    "scoring": "accuracy",
    "cv": 10,
    "n_jobs": -1,
    "random_state": 42,
}

model = MultinomialNB()
random_search = RandomizedSearchCV(model, hyperparams, **search_settings)
random_search.fit(X_train, y_train)

print("Best hyperparameters:", random_search.best_params_)
print("Best accuracy:", random_search.best_score_)

Best hyperparameters: {'fit_prior': False, 'alpha': 1}
Best accuracy: 0.8229460093896714


Observations:
- 'fit_prior': False → Assume all classes are equally likely (uniform prior).
    Prior probabilities of the classes are not considered, which reflects that the data distribution does not benefit from these priors.
- 'alpha': 1 → Data aligns well with moderate Laplace smoothing (this is also scikit-learn's default value, so only 'fit_prior' differs from the baseline model).

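▷ Best Accuracy: 0.8229 (82.29%) → Slightly higher than the default configuration (81.56% in initial run). Note that 82.29% is the mean 10-fold cross-validation accuracy on the training set, while 81.56% was measured on the held-out test set, so the two figures are not directly comparable.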

In [40]:
# Persist the tuned classifier.
# RandomizedSearchCV fits clones of the estimator it receives, so `model` itself is still an
# unfitted, default-configured MultinomialNB. The winning configuration, refit on the full
# training set (refit=True is the default), is exposed as `random_search.best_estimator_`.
# The `with` block guarantees the file handle is flushed and closed after writing.
with open("../models/naive_bayes_alpha_1_fit_prior_False_random_42.sav", "wb") as model_file:
    dump(random_search.best_estimator_, model_file)