In [39]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
import joblib
from sklearn.feature_extraction.text import CountVectorizer



In [8]:
# Raw Play Store reviews published with the 4Geeks naive-bayes tutorial.
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
df = pd.read_csv(DATA_URL)

In [9]:
# Preview the first rows (DataFrame.head defaults to n=5).
df.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [10]:
# Show the frame size as a (rows, columns) tuple.
row_count, column_count = df.shape
(row_count, column_count)

(891, 3)

In [11]:
# Remove the app identifier column; the cells below only use `review` and
# `polarity`.
# `errors="ignore"` keeps this cell idempotent: `df` is rebound here, so
# re-running the cell without it would raise KeyError because the column has
# already been removed.
df = df.drop(columns="package_name", errors="ignore")

In [12]:
# Normalise the review text: trim surrounding whitespace, then lowercase.
review_text = df['review']
review_text = review_text.str.strip()
df['review'] = review_text.str.lower()

In [13]:
# Confirm which columns remain after the cleanup above.
remaining_columns = df.columns
print(remaining_columns)

Index(['review', 'polarity'], dtype='object')


In [14]:
# Features are the review text; the target is the polarity label.
X, y = df['review'], df['polarity']

In [15]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [16]:
# Bag-of-words encoder that drops scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(
    stop_words="english",
)

In [17]:
# Fit the vectorizer on the training reviews only, then reuse that fitted
# vectorizer to encode the test reviews. The order matters: `transform`
# requires the vectorizer to have been fitted by the line above.
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

In [18]:
# Convert both vectorized matrices to dense arrays via `.toarray()`; the
# model cells below are fed these dense versions.
X_train_transformed_dense, X_test_transformed_dense = (
    matrix.toarray() for matrix in (X_train_transformed, X_test_transformed)
)

In [33]:
# Baseline comparison: fit each naive Bayes variant with default settings and
# report accuracy on the training split and on the held-out test split.
models = [GaussianNB(), MultinomialNB(), BernoulliNB()]

for model in models:
    model.fit(X_train_transformed_dense, y_train)

    y_pred_train = model.predict(X_train_transformed_dense)
    y_pred = model.predict(X_test_transformed_dense)

    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred)

    print(f"Model: {model.__class__.__name__}")
    print(f"Train Accuracy: {train_accuracy}")
    print(f"Test Accuracy: {test_accuracy}")
    print("\n")


Model: GaussianNB
Train Accuracy: 0.9859550561797753
Test Accuracy: 0.8044692737430168


Model: MultinomialNB
Train Accuracy: 0.9606741573033708
Test Accuracy: 0.8156424581005587


Model: BernoulliNB
Train Accuracy: 0.9199438202247191
Test Accuracy: 0.770949720670391




#### Random search to tune the MultinomialNB hyperparameters

In [24]:
# Candidate smoothing strengths: 7 log-spaced values from 1e-3 to 1e3.
param = dict(
    alpha=np.logspace(start=-3, stop=3, num=7),
)

In [27]:
# Search the `alpha` candidates in `param` for MultinomialNB using 5-fold
# cross-validation on the training split.
nb_model = MultinomialNB()

# Cap n_iter at the number of distinct candidates. Asking for more iterations
# than the grid contains makes scikit-learn emit a UserWarning and fall back
# to the grid size anyway, so the evaluated candidates are unchanged.
# NOTE: assumes every value in `param` is a finite sequence (true for the
# logspace grid above); distributions with `.rvs` have no len().
n_candidates = int(np.prod([len(values) for values in param.values()]))
rnd_search = RandomizedSearchCV(
    nb_model,
    param,
    n_iter=min(10, n_candidates),
    cv=5,
    random_state=42,
)

rnd_search.fit(X_train_transformed_dense, y_train)



In [28]:
# Report the winning hyperparameters and their mean cross-validated score,
# one per line.
print(rnd_search.best_params_, rnd_search.best_score_, sep="\n")

{'alpha': 1.0}
0.8103910174332709


In [32]:
# Refit MultinomialNB with the hyperparameters selected by the random search.
# Reading them from `rnd_search.best_params_` (instead of hand-copying the
# printed alpha) keeps this cell correct if the data or the grid changes.
rscv_mnb_model = MultinomialNB(**rnd_search.best_params_)
rscv_mnb_model.fit(X_train_transformed_dense, y_train)

# Accuracy on the training split, then on the held-out test split.
y_pred = rscv_mnb_model.predict(X_test_transformed_dense)
y_pred_train = rscv_mnb_model.predict(X_train_transformed_dense)
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred))

0.9606741573033708
0.8156424581005587


In [40]:
# Persist the fitted vectorizer alongside the classifier: the model was trained
# on the vectorizer's output, so raw review text cannot be scored later
# without the same fitted vocabulary.
joblib.dump(vectorizer, "rscv_mnb_vectorizer.sav")

# Persist the tuned classifier (kept as the last expression so the cell still
# displays the written model path).
joblib.dump(rscv_mnb_model, "rscv_mnb_model.sav")

['rscv_mnb_model.sav']