# SENTIMENT ANALYSIS (Classification Problem): 

## Google Play Store Reviews using NAIVE BAYES

In [None]:
#Libraries

import datetime
import pickle
from pathlib import Path
from pickle import dump

import numpy as np
import pandas as pd
from scipy.stats import randint

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier

**STEP 1: PROBLEM STATEMENT & DATA COLLECTION**

***1.1 PROBLEM STATEMENT***

**Goal** -  automatically classify Google Play Store reviews as positive or negative

***1.2 DATA COLLECTION***

In [39]:
# Lift the column-display limit, then download the raw Play Store reviews
# straight from the tutorial repository.
pd.options.display.max_columns = None

reviews_url = (
    "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
)
df = pd.read_csv(reviews_url)
df.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [40]:
# Keep a local copy of the data exactly as downloaded.
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists first; otherwise a fresh checkout fails on this cell.
raw_data_path = Path("../data/raw/playstore_reviews_data.csv")
raw_data_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(raw_data_path, index=False)

**STEP 2: EXPLORATION & DATA CLEANING**

***2.1.1 Understanding the features***

To develop the predictive model, we use a dataset with the following variables:

* `package_name` - Name of the mobile application (categorical)
* `review` - Comment about the mobile application (categorical)
* `polarity` - Class variable (0 or 1), where 0 is a negative comment and 1 is a positive one (numeric)

In [41]:
# Report the dataset size as a (rows, columns) tuple, label on its own line.
print("Dataset dimensions:", df.shape, sep="\n")

Dataset dimensions:
(891, 3)


In [42]:
# Summarize columns, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
print("Dataset informations:")
df.info()

Dataset informations:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB
None


In [43]:
# Analyze the types of information we have
print("Data types present in the dataset:", df.dtypes.unique())

# Identify numerical and categorical variables.
# "number" selects every numeric dtype (any integer or float width), so a
# column stored with a different numeric dtype is not silently left out.
numerical_vars = df.select_dtypes(include="number").columns
categorical_vars = df.select_dtypes(include=['O']).columns

# Count the number of numerical and categorical variables
num_numerical_vars = len(numerical_vars)
num_categorical_vars = len(categorical_vars)

print(f"Number of categorical variables: {num_categorical_vars}")
print("Categorical variables:", list(categorical_vars))
print('\n')
print(f"Number of numerical variables: {num_numerical_vars}")
print("Numerical variables:", list(numerical_vars))

Data types present in the dataset: [dtype('O') dtype('int64')]
Number of categorical variables: 2
Categorical variables: ['package_name', 'review']


Number of numerical variables: 1
Numerical variables: ['polarity']


#### Statements

* This DataFrame is composed of 891 rows and 3 columns (variables).
* The data has:
    * 2 categorical variables
    * 1 numerical variable

**2.2 DATA CLEANING**

***2.2.1 ELIMINATE DUPLICATES***

In this step, I will eliminate duplicates, which is essential to ensure data integrity. Duplicates can distort analyses, introduce bias, and affect model accuracy. This step helps keep the dataset clean by representing each input uniquely and reliably. 

In [44]:
# Remember the size before cleaning so the reduction can be reported.
initial_row_count = len(df)

# Count fully identical rows and drop them only when there are any.
num_duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicates}")

if num_duplicates == 0:
    print("No duplicate rows found.")
else:
    df = df.drop_duplicates()
    print(f"Duplicate rows have been removed. Row count reduced from {initial_row_count} to {df.shape[0]}.")

Number of duplicate rows: 0
No duplicate rows found.


#### Statement

* There are no duplicates in this DataFrame.

***2.2.2 ELIMINATE IRRELEVANT INFORMATION***

In [45]:
# The package name is treated as irrelevant for the classifier, so only the
# review text and its polarity label are kept.
df = df.drop("package_name", axis="columns")
df.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0


***2.2.3 DATA PROCESSING***

In [46]:
# Normalize the review text: trim surrounding whitespace, then lowercase it.
normalized_reviews = df["review"].str.strip().str.lower()
df["review"] = normalized_reviews
df.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


**STEP 3 SPLIT TRAIN & TEST**

In [None]:
# Features are the review texts; the target is the 0/1 polarity label.
X, y = df["review"], df["polarity"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


***3.1 Transform the text into a word count matrix***

In [49]:
# Bag-of-words encoding with English stop words removed.
vec_model = CountVectorizer(stop_words="english")

# The vocabulary is learned from the training reviews only; the test reviews
# are encoded with that same vocabulary.
train_counts = vec_model.fit_transform(X_train)
test_counts = vec_model.transform(X_test)

# Convert the sparse count matrices to dense arrays for the models below.
X_train, X_test = train_counts.toarray(), test_counts.toarray()


## **MACHINE LEARNING**

##### **NAIVE BAYES**

#### MultinomialNB

In [None]:
# Train multinomial Naive Bayes on the word counts and predict the held-out reviews.
model_MultinominalNB = MultinomialNB().fit(X_train, y_train)
y_pred_MultinominalNB = model_MultinominalNB.predict(X_test)
MultinominalNB_accuracy = accuracy_score(y_test, y_pred_MultinominalNB)

# Per-class precision/recall/F1 first, overall accuracy last.
print("MultinomialNB Classification Report:\n")
print(classification_report(y_test, y_pred_MultinominalNB))
print("MultinomialNB accuracy:", MultinominalNB_accuracy)

MultinomialNB Classification Report:

              precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.73      0.60      0.66        53

    accuracy                           0.82       179
   macro avg       0.79      0.75      0.77       179
weighted avg       0.81      0.82      0.81       179

MultinomialNB accuracy: 0.8156424581005587


#### GaussianNB

In [None]:
# Train Gaussian Naive Bayes on the same features and predict the held-out reviews.
model_Gaussian = GaussianNB().fit(X_train, y_train)
y_pred_Gaussian = model_Gaussian.predict(X_test)
GaussianNB_accuracy = accuracy_score(y_test, y_pred_Gaussian)

# Per-class precision/recall/F1 first, overall accuracy last.
print("GaussianNB Classification Report:\n")
print(classification_report(y_test, y_pred_Gaussian))
print("GaussianNB accuracy:", GaussianNB_accuracy)

GaussianNB Classification Report:

              precision    recall  f1-score   support

           0       0.85      0.88      0.86       126
           1       0.69      0.62      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179

GaussianNB accuracy: 0.8044692737430168


#### BernoulliNB

In [None]:
# Train Bernoulli Naive Bayes on the same features and predict the held-out reviews.
model_Bernoulli = BernoulliNB().fit(X_train, y_train)
y_pred_Bernoulli = model_Bernoulli.predict(X_test)
BernoulliNB_accuracy = accuracy_score(y_test, y_pred_Bernoulli)

# Per-class precision/recall/F1 first, overall accuracy last.
print("BernoulliNB Classification Report:\n")
print(classification_report(y_test, y_pred_Bernoulli))
print("BernoulliNB accuracy:", BernoulliNB_accuracy)

BernoulliNB Classification Report:

              precision    recall  f1-score   support

           0       0.79      0.93      0.85       126
           1       0.70      0.40      0.51        53

    accuracy                           0.77       179
   macro avg       0.74      0.66      0.68       179
weighted avg       0.76      0.77      0.75       179

BernoulliNB accuracy: 0.770949720670391


#### Statements
* Based on the metrics results, the MultinomialNB implementation achieved the best accuracy (81.56%), followed by GaussianNB (80.45%) and BernoulliNB (77.09%).
* F1-Score Weighted (MultinomialNB): 0.81, indicating a good balance between precision and recall.

### DECISION
I chose MultinomialNB as the best Naive Bayes model.

### **Comparing MultinomialNB with RANDOM FOREST**

##### using Default hyperparameters:

In [None]:
# Baseline Random Forest: library defaults, with a fixed seed for repeatability.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
Random_Forest_accuracy = accuracy_score(y_test, y_pred_rf)

# Per-class precision/recall/F1 first, overall accuracy last.
print("Random Forest Classification Report:\n")
print(classification_report(y_test, y_pred_rf))
print("Random Forest accuracy:", Random_Forest_accuracy)

Random Forest Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.83      0.85       126
           1       0.64      0.74      0.68        53

    accuracy                           0.80       179
   macro avg       0.76      0.78      0.77       179
weighted avg       0.81      0.80      0.80       179

Random Forest accuracy: 0.7988826815642458


#### adjusting hyperparameters to improve Random Forest

In [None]:
# Random Forest with hand-picked hyperparameters, collected in one mapping so
# the configuration can be read at a glance.
tuned_rf_params = {
    "n_estimators": 200,
    "max_depth": 10,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "max_features": 'sqrt',
    "random_state": 42,
}
rf_model_tuned = RandomForestClassifier(**tuned_rf_params).fit(X_train, y_train)

# Score the held-out reviews with the adjusted forest.
y_pred_rf_tuned = rf_model_tuned.predict(X_test)
rf_tuned_accuracy = accuracy_score(y_test, y_pred_rf_tuned)

print("\nRandom Forest Classification Report (Adjusted):")
print(classification_report(y_test, y_pred_rf_tuned))
print(f"Random Forest (Adjusted) accuracy: {rf_tuned_accuracy:.2f}")


In [101]:
# Search space for the randomized hyperparameter search.
# scipy's randint(low, high) draws integers from [low, high), so for example
# n_estimators is sampled from 50..499.
# (RandomizedSearchCV and randint are imported in the notebook's import cell.)
params = {
    "n_estimators": randint(50, 500),
    "criterion": ['gini', 'entropy', 'log_loss'],
    "max_depth": randint(3, 100),
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 10),
    "max_features": ['sqrt', 'log2', None]
}

# Base estimator whose hyperparameters are being searched.
model = RandomForestClassifier(random_state=42)

# Configure RandomizedSearchCV: 25 sampled candidates, each scored by accuracy
# with 3-fold cross-validation on the training data, using 2 parallel jobs.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=25,
    cv=3,
    scoring="accuracy",
    random_state=42,
    verbose=1,
    n_jobs=2
)

# Run the search on the training split only; the test split stays untouched
# until the final evaluation below.
random_search.fit(X_train, y_train)

print("Best hyperparameters:", random_search.best_params_)
print("Improved accuracy in cross-validation:", random_search.best_score_)

# Evaluate the estimator selected by the search on the held-out reviews.
best_rf_model = random_search.best_estimator_
y_pred_rf_best = best_rf_model.predict(X_test)

print("\nRandom Forest Classification Report (Optimized):")
print(classification_report(y_test, y_pred_rf_best))

best_rf_accuracy = accuracy_score(y_test, y_pred_rf_best)
print(f"Random Forest (Optimized) accuracy: {best_rf_accuracy:.2f}")


Fitting 3 folds for each of 25 candidates, totalling 75 fits
Best hyperparameters: {'criterion': 'gini', 'max_depth': 82, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 102}
Improved accuracy in cross-validation: 0.7850819652755616

Random Forest Classification Report (Optimized):
              precision    recall  f1-score   support

           0       0.84      0.91      0.87       126
           1       0.74      0.58      0.65        53

    accuracy                           0.82       179
   macro avg       0.79      0.75      0.76       179
weighted avg       0.81      0.82      0.81       179

Random Forest (Optimized) accuracy: 0.82


#### Statement:
After optimization, Random Forest achieved results similar to MultinomialNB, making it an equally viable final model.
However, I chose to keep the MultinomialNB model because of its simplicity and efficiency for this sentiment-analysis task.

## SAVING MODEL

In [105]:
# Persist the chosen classifier.
with open('model_MultinominalNB_42.sav', 'wb') as file:
    pickle.dump(model_MultinominalNB, file)

# The classifier was trained on the word-count columns produced by the fitted
# CountVectorizer, so the vectorizer is saved next to it. Without it, new raw
# reviews cannot be converted into the feature matrix the model was trained on.
with open('vectorizer_CountVectorizer_42.sav', 'wb') as file:
    pickle.dump(vec_model, file)

print("Model saved successfully.")

Model saved successfully.
