In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from joblib import dump
from sklearn.feature_extraction.text import CountVectorizer



In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv")

In [4]:
df.head(5)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [5]:
df.shape

(891, 3)

In [6]:
df = df.drop('package_name', axis=1)

In [7]:
df['review'] = df['review'].str.strip().str.lower()

In [8]:
print(df.columns)

Index(['review', 'polarity'], dtype='object')


In [9]:
y = df['polarity'].values
X = df.select_dtypes(include=['object'])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['polarity'],
                                                    test_size=0.2,
                                                    random_state=1)

In [11]:
vectorizer = CountVectorizer(stop_words="english")

In [12]:
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

In [13]:
X_train_transformed_dense = X_train_transformed.toarray()
X_test_transformed_dense = X_test_transformed.toarray()

In [14]:
gnb_model = GaussianNB()
mnb_model = MultinomialNB()
bnb_model = BernoulliNB()

In [15]:
gnb_model.fit(X_train_transformed_dense, y_train)
y_pred = gnb_model.predict(X_test_transformed_dense)
y_pred_train = gnb_model.predict(X_train_transformed_dense)
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred))

0.9887640449438202
0.7374301675977654
mean squared error-in: 0.011235955056179775
mean squared error-out: 0.26256983240223464


In [16]:
mnb_model.fit(X_train_transformed, y_train)
y_pred = mnb_model.predict(X_test_transformed)
y_pred_train = mnb_model.predict(X_train_transformed)
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred))


0.9480337078651685
0.7932960893854749
mean squared error-in: 0.05196629213483146
mean squared error-out: 0.20670391061452514


In [17]:
bnb_model.fit(X_train_transformed, y_train)
y_pred = bnb_model.predict(X_test_transformed)
y_pred_train = bnb_model.predict(X_train_transformed)
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred))

0.9143258426966292
0.6983240223463687
mean squared error-in: 0.08567415730337079
mean squared error-out: 0.3016759776536313


In [20]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
# extract probabilities from Multinomial Naive Bayes model
proba_from_nb = mnb_model.predict_proba(X_train_transformed)

# initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# train Random Forest model using the probabilities from the Naive Bayes model as features
rf_model.fit(proba_from_nb, y_train)

# extract probabilities from the Multinomial Naive Bayes model for the test set
proba_from_nb_test = mnb_model.predict_proba(X_test_transformed)

# make predictions on the test set using the Random Forest model
y_pred_rf_test = rf_model.predict(proba_from_nb_test)
y_pred_rf_train = rf_model.predict(proba_from_nb)



In [28]:
accuracy_rf_train = accuracy_score(y_train, y_pred_rf_train)
print(f"Random Forest In-Sample Accuracy: {accuracy_rf_train}")

accuracy_rf_test = accuracy_score(y_test, y_pred_rf_test)
print(f"Random Forest Out-of-Sample Accuracy: {accuracy_rf_test}")

Random Forest In-Sample Accuracy: 0.9985955056179775
Random Forest Out-of-Sample Accuracy: 0.7821229050279329


In [None]:
dump(mnb_model, open("mnb_model.sav", "wb"))