# Explore here

In [234]:
# Naive Bayes

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [235]:
# Load the dataset
url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
df = pd.read_csv(url)
df.head(10)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
5,com.facebook.katana,idk i can't edit my posts? things such as my ...,0
6,com.facebook.katana,major flaws constant updates and always getti...,0
7,com.facebook.katana,video issues since i was forced into this upd...,0
8,com.facebook.katana,this update completely destroyed my facebook...,0
9,com.facebook.katana,"posting issues for the last week, there's bee...",0


In [236]:
df.tail(10)

Unnamed: 0,package_name,review,polarity
881,com.rovio.angrybirds,game ruined because of ads. i felt like re-do...,0
882,com.rovio.angrybirds,ads way to many ads can't even enjoy the game...,0
883,com.rovio.angrybirds,"great game, but too many ads almost not wort...",1
884,com.rovio.angrybirds,fun but hard angry birds is really fun video ...,1
885,com.rovio.angrybirds,too many ads far more adverts than any other ...,1
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1
890,com.rovio.angrybirds,they're everywhere i see angry birds everywhe...,1


In [237]:
# Inspect data structure
print("Data Structure")
print(df.info())

Data Structure
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB
None


In [238]:
# dropping package name per instructions
df = df.drop(['package_name'], axis=1)
df

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0
...,...,...
886,loved it i loooooooooooooovvved it because it...,1
887,all time legendary game the birthday party le...,1
888,ads are way to heavy listen to the bad review...,0
889,fun works perfectly well. ads aren't as annoy...,1


In [239]:
# Ensure there are no NaN values in the column
df["review"] = df["review"].fillna("").str.strip().str.lower()

# Display the updated DataFrame
print(df)

                                                review  polarity
0    privacy at least put some option appear offlin...         0
1    messenger issues ever since the last update, i...         0
2    profile any time my wife or anybody has more t...         0
3    the new features suck for those of us who don'...         0
4    forced reload on uploading pic on replying com...         0
..                                                 ...       ...
886  loved it i loooooooooooooovvved it because it ...         1
887  all time legendary game the birthday party lev...         1
888  ads are way to heavy listen to the bad reviews...         0
889  fun works perfectly well. ads aren't as annoyi...         1
890  they're everywhere i see angry birds everywher...         1

[891 rows x 2 columns]


In [240]:
# Split the data into x and y 
X = df.drop('polarity', axis=1)  # All columns except 'y'
y = df['polarity']  # The target variable (subscription outcome)   

In [241]:
X.shape

(891, 1)

In [242]:
y.shape

(891,)

In [243]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [244]:
X_train.head

<bound method NDFrame.head of                                                 review
331  just did the latest update on viber and yet ag...
733  keeps crashing it only works well in extreme d...
382  the fail boat has arrived the 6.0 version is t...
704  superfast, just as i remember it ! opera mini ...
813  installed and immediately deleted this crap i ...
..                                                 ...
106  why can't i share my achievements? recently di...
270  beta is the best version of the chrome browser...
860  great little game. this is a great little game...
435  keeps crashing ever since i started using it m...
102  even though i am loving the new update, but th...

[712 rows x 1 columns]>

In [245]:
# Define the CountVectorizer with additional options
vec_model = CountVectorizer(stop_words='english', lowercase=True, max_features=5000)

X_train = vec_model.fit_transform(X_train.squeeze())
X_test = vec_model.transform(X_test.squeeze())

In [246]:
X_train

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 12575 stored elements and shape (712, 3310)>

In [247]:
# Check the shape of the vectorized data
print(f"Shape of X_train_vectorized: {X_train.shape}")

Shape of X_train_vectorized: (712, 3310)


In [248]:
print(X_train)


<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 12575 stored elements and shape (712, 3310)>
  Coords	Values
  (0, 1550)	1
  (0, 822)	1
  (0, 1611)	1
  (0, 3070)	2
  (0, 3129)	2
  (0, 2208)	1
  (0, 2539)	1
  (0, 1801)	1
  (0, 2995)	1
  (0, 1741)	1
  (0, 2491)	1
  (0, 2431)	2
  (0, 2685)	1
  (0, 874)	1
  (0, 2915)	1
  (0, 105)	1
  (0, 187)	1
  (0, 454)	1
  (0, 2937)	1
  (0, 3074)	1
  (0, 869)	1
  (0, 3235)	1
  (0, 1113)	1
  (0, 236)	1
  (0, 971)	1
  :	:
  (711, 1990)	1
  (711, 663)	6
  (711, 1115)	2
  (711, 562)	1
  (711, 1105)	1
  (711, 92)	1
  (711, 1658)	2
  (711, 2876)	1
  (711, 398)	1
  (711, 936)	1
  (711, 770)	1
  (711, 1937)	1
  (711, 2839)	1
  (711, 93)	1
  (711, 2933)	1
  (711, 1025)	1
  (711, 1721)	1
  (711, 399)	2
  (711, 1338)	1
  (711, 150)	1
  (711, 1798)	1
  (711, 969)	1
  (711, 3071)	1
  (711, 6)	1
  (711, 3072)	1


In [249]:
print(y_train)

331    0
733    0
382    0
704    1
813    1
      ..
106    0
270    0
860    1
435    0
102    0
Name: polarity, Length: 712, dtype: int64


In [250]:
# Train Naive Bayes models
models = {
    "GaussianNB": GaussianNB(),
    "MultinomialNB": MultinomialNB(),
    "BernoulliNB": BernoulliNB()
}

In [251]:
best_model = None
best_accuracy = 0

In [252]:
# Convert sparse matrices to dense arrays for GaussianNB
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

In [253]:
# Evaluate each Naive Bayes model
for name, model in models.items():
    if name == "GaussianNB":  # Use dense arrays for GaussianNB
        model.fit(X_train_dense, y_train)
        y_pred = model.predict(X_test_dense)
    else:  # Other models work with sparse data
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")
    if accuracy > best_accuracy:
        best_model = model
        best_accuracy = accuracy


GaussianNB Accuracy: 0.80
MultinomialNB Accuracy: 0.82
BernoulliNB Accuracy: 0.77


In [254]:
# Optimize with Random Forest if applicable
if best_model:
    # Use Random Forest directly on the original features
    rf = RandomForestClassifier(random_state=42, n_estimators=100)
    rf.fit(X_train, y_train)
    rf_y_pred = rf.predict(X_test)
    
    print("\nRandom Forest Results:")
    print("Accuracy:", accuracy_score(y_test, rf_y_pred))
    print("Classification Report:\n", classification_report(y_test, rf_y_pred))


Random Forest Results:
Accuracy: 0.7988826815642458
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.83      0.85       126
           1       0.64      0.74      0.68        53

    accuracy                           0.80       179
   macro avg       0.76      0.78      0.77       179
weighted avg       0.81      0.80      0.80       179



In [255]:
# Train XGBoost
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

# Evaluate XGBoost
xgb_y_pred = xgb_model.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_y_pred))
print("XGBoost Classification Report:\n", classification_report(y_test, xgb_y_pred))

XGBoost Accuracy: 0.8044692737430168
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.82      0.85       126
           1       0.64      0.77      0.70        53

    accuracy                           0.80       179
   macro avg       0.77      0.80      0.78       179
weighted avg       0.82      0.80      0.81       179



In [256]:
# Train the model
model.fit(X_train, y_train)