# Explore here

In [9]:
import pandas as pd

total_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv")
total_data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


Remove irrelevant information

In [10]:
total_data.drop(["package_name"], axis = 1, inplace = True)
total_data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0


Transform the text into a word count matrix, to obtain numerical features from the text.

In [11]:
total_data["review"] = total_data["review"].str.strip().str.lower()

Split the Data

In [12]:
from sklearn.model_selection import train_test_split
X = total_data["review"]
y = total_data["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

X_train.head()

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

Naive Bayes Model

In [26]:
from sklearn.naive_bayes import GaussianNB

model_GNB = GaussianNB()
model_GNB.fit(X_train, y_train)

Model Prediction

In [28]:
y_pred_GNB = model_GNB.predict(X_test)
y_pred_GNB

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0])

In [29]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred_GNB)

0.8044692737430168

The accuracy of the gaussian model is 0.8044692737430168.

In [30]:
from sklearn.naive_bayes import MultinomialNB

model_MNB = MultinomialNB()
model_MNB.fit(X_train, y_train)

In [31]:
y_pred_MNB = model_MNB.predict(X_test)
y_pred_MNB

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [32]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred_MNB)

0.8156424581005587

The accuracy of the multinomial model is 0.8156424581005587, a little more accurate than the gaussian.

In [33]:
from sklearn.naive_bayes import BernoulliNB

model_BNB = BernoulliNB()
model_BNB.fit(X_train, y_train)

In [34]:
y_pred_BNB = model_BNB.predict(X_test)
y_pred_BNB

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0])

In [35]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred_BNB)

0.770949720670391

The accuracy of the Bernoulli model is 0.770949720670391, the least accurate model.

We Optimize the Multinominal model with random forest


In [40]:
from sklearn.ensemble import RandomForestClassifier

model_RFMNB = RandomForestClassifier(random_state = 42)
model_RFMNB.fit(X_train, y_train)

In [41]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred_MNB)

0.8156424581005587

It seems that the random forest accuracy is the same as the Multinomial model.

Saving the model

In [42]:
from pickle import dump

dump(model_RFMNB, open("naive_bayes_default.sav", "wb"))

We are going to try the Linear regression Model to see if it has a better accuracy.

In [43]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

In [44]:
print(f"Intercept (a): {model.intercept_}")
print(f"Coefficients (b): {model.coef_}")

Intercept (a): 0.6877569579062293
Coefficients (b): [ 0.04761977 -0.01486008 -0.01486008 ... -0.00585934  0.05974531
  0.18933158]


In [45]:
y_pred = model.predict(X_test)
y_pred

array([-0.14613715,  0.651007  ,  1.        ,  0.37379784,  0.1663495 ,
        0.02351888,  0.55645124, -0.04952061,  0.02665484,  0.16853849,
        0.48577064,  0.27382126,  0.64642419, -0.02305229,  0.45770321,
        1.28249873,  0.37542689,  1.1757392 ,  1.23655833, -0.15880631,
        1.28374134,  0.46351804,  1.10369639,  0.25899812,  0.26661971,
        0.90220184, -0.38142937,  0.72933566,  0.38112846,  0.1124909 ,
        1.07654807,  0.09874488,  0.71778505,  0.49129386,  0.45267919,
        0.96555693,  0.43453316,  0.44165758,  1.08266044, -1.24892874,
        0.47079263,  0.74080321,  0.94404287, -1.42404922,  0.23678854,
       -0.47282413,  0.37746497,  0.29037993, -0.12450077,  0.71914094,
        0.5334452 ,  1.0154879 ,  0.19356029, -0.43306041, -0.19027556,
        0.72427689,  0.57087196,  0.16339427, -0.00558989, -0.27632937,
       -0.18549637,  0.14697322,  0.29541549,  0.16127743,  0.57023333,
        0.48238051,  0.35143972,  0.47366712, -0.33286097,  0.70

In [46]:
from sklearn.metrics import mean_squared_error, r2_score

print(f"Mean squared error: {mean_squared_error(y_test, y_pred)}")
print(f"Coefficient of determination: {r2_score(y_test, y_pred)}")

Mean squared error: 0.26415156437160414
Coefficient of determination: -0.2673974654133824


We can see that this model is not quite fitted for this kind of data, meaning that at the moment the best model is the Multinominal Model.