# Explore here

In [12]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

import joblib


In [13]:
# One-time download of the raw dataset, kept commented out so the notebook
# reads the local snapshot below instead of fetching it on every run.
#df = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv")
#df.to_csv("../data/raw/raw_data.csv")

# Load the local copy and discard the 'Unnamed: 0' column
# (presumably the index written out by the to_csv call above).
raw_path = '../data/raw/raw_data.csv'
df = pd.read_csv(raw_path).drop(columns=['Unnamed: 0'])

In [14]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)


Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [15]:
# Normalise the review text: trim surrounding whitespace first,
# then convert everything to lowercase.
reviews_stripped = df["review"].str.strip()
df["review"] = reviews_stripped.str.lower()

In [16]:
# The review text is the single feature; the polarity column is the target.
X, y = df['review'], df['polarity']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Encoding data into numerical features
# Converting text into a keyword list and counting their occurrences to quantify the text
vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

In [18]:
# One default-configured estimator per Naive Bayes variant.
gauss, multi, bernoulli = GaussianNB(), MultinomialNB(), BernoulliNB()

# Fit every variant on the same training matrix and labels.
for nb_model in (gauss, multi, bernoulli):
    nb_model.fit(X_train, y_train)


In [19]:
# For each baseline model: predict on the held-out test set, then compute
# the accuracy of those predictions.
y_pred_gauss = gauss.predict(X_test)
acc_gauss = accuracy_score(y_test, y_pred_gauss)

y_pred_multi = multi.predict(X_test)
acc_multi = accuracy_score(y_test, y_pred_multi)

y_pred_bernoulli = bernoulli.predict(X_test)
acc_bernoulli = accuracy_score(y_test, y_pred_bernoulli)

# Report the three accuracies side by side for comparison.
print(f"GaussianNB Accuracy: {acc_gauss}")
print(f"MultinomialNB Accuracy: {acc_multi}")
print(f"BernoulliNB Accuracy: {acc_bernoulli}")

GaussianNB Accuracy: 0.8044692737430168
MultinomialNB Accuracy: 0.8156424581005587
BernoulliNB Accuracy: 0.770949720670391


In [20]:
# Hyperparameter grid for MultinomialNB:
#   alpha     - additive smoothing strength
#   fit_prior - learn class priors from the data (True) or use a uniform prior (False)
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],
    'fit_prior': [True, False]
}

# Use a dedicated, unfitted estimator for the search instead of rebinding
# `multi`. GridSearchCV only fits clones of the estimator it is handed, so
# reusing the name would replace the already-trained baseline model with an
# unfitted one and break any re-run of the baseline prediction cell.
multi_base = MultinomialNB()

# 5-fold cross-validated search over the grid, scored by accuracy,
# evaluated with 3 parallel jobs.
grid_search = GridSearchCV(estimator=multi_base, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=3)

# Run the search on the training data only; the test set stays untouched.
grid_search.fit(X_train, y_train)

# Parameter combination with the best mean cross-validated accuracy.
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

Best parameters: {'alpha': 2.0, 'fit_prior': False}


In [21]:
# Build a MultinomialNB with the tuned hyperparameters and fit it on the
# full training set (fit returns the estimator itself).
best_multi = MultinomialNB(**best_params).fit(X_train, y_train)

# Score the tuned model on the held-out test set.
y_pred_best_multi = best_multi.predict(X_test)
acc_best_multi = accuracy_score(y_true=y_test, y_pred=y_pred_best_multi)
print(f"Optimized MultinomialNB Accuracy: {acc_best_multi}")


Optimized MultinomialNB Accuracy: 0.8212290502793296


In [23]:
# Detailed evaluation of the tuned model's test-set predictions.
# Each metric is computed right before it is reported; the output order is
# accuracy, precision, recall, confusion matrix, classification report.
accuracy = accuracy_score(y_test, y_pred_best_multi)
print(f'Accuracy: {accuracy:.2f}')

precision = precision_score(y_test, y_pred_best_multi)
print(f'Precision: {precision:.2f}')

recall = recall_score(y_test, y_pred_best_multi)
print(f'Recall: {recall:.2f}')

# Rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred_best_multi)
print('Confusion Matrix:')
print(conf_matrix)

# Per-class precision / recall / f1 plus the averaged summaries.
class_report = classification_report(y_test, y_pred_best_multi)
print('Classification Report:')
print(class_report)

Accuracy: 0.82
Precision: 0.72
Recall: 0.64
Confusion Matrix:
[[113  13]
 [ 19  34]]
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       126
           1       0.72      0.64      0.68        53

    accuracy                           0.82       179
   macro avg       0.79      0.77      0.78       179
weighted avg       0.82      0.82      0.82       179



In [24]:
# Persist the fitted CountVectorizer alongside the classifier: the model was
# trained on the vectorizer's vocabulary columns, so new review text cannot
# be converted into matching features from the saved model alone.
joblib.dump(vec_model, '../models/count_vectorizer.pkl')

# Save the tuned MultinomialNB model. Kept as the last expression so the
# cell still displays the path list returned by joblib.dump.
joblib.dump(best_multi, '../models/multinomialnb.pkl')


['../models/multinomialnb.pkl']