# Explore here

In [2]:
# Your code here

import os
import pandas as pd
import requests
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# 1. Download the data
# The CSV is saved under ../data/raw/ so that the loading cell, which reads
# '../data/raw/playstore_reviews.csv', actually picks up the file fetched here.

url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
nombre_archivo = "../data/raw/playstore_reviews.csv"

# Make sure the target folder exists before writing into it.
os.makedirs(os.path.dirname(nombre_archivo), exist_ok=True)

# A timeout keeps the cell from hanging forever on a stalled connection.
respuesta = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as if it were the CSV.
respuesta.raise_for_status()

with open(nombre_archivo, 'wb') as archivo:
    archivo.write(respuesta.content)


In [3]:
# 2. Load the CSV into a DataFrame and print its first rows

df = pd.read_csv(filepath_or_buffer='../data/raw/playstore_reviews.csv')
print(df.head(n=5))

          package_name                                             review  \
0  com.facebook.katana   privacy at least put some option appear offli...   
1  com.facebook.katana   messenger issues ever since the last update, ...   
2  com.facebook.katana   profile any time my wife or anybody has more ...   
3  com.facebook.katana   the new features suck for those of us who don...   
4  com.facebook.katana   forced reload on uploading pic on replying co...   

   polarity  
0         0  
1         0  
2         0  
3         0  
4         0  


In [4]:
# Drop the 'package_name' column; the result is stored in df_redux

df_redux = df.drop(columns=['package_name'])
df_redux.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0


In [5]:
# Normalize the review text: trim surrounding whitespace, then lowercase it
df_redux = df_redux.assign(
    review=lambda frame: frame["review"].str.strip().str.lower()
)
df_redux.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


In [6]:
# Split the reviews (features) and polarity (target) into train and test sets

from sklearn.model_selection import train_test_split

X, y = df_redux['review'], df_redux['polarity']

# 20% of the rows go to the test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.head()

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

In [7]:
# Turn the review text into a word-count matrix
from sklearn.feature_extraction.text import CountVectorizer

vec_model = CountVectorizer(stop_words="english")

# The vectorizer is fitted on the training reviews only; the test reviews are
# transformed with that already-fitted vectorizer.
train_counts_sparse = vec_model.fit_transform(X_train)
test_counts_sparse = vec_model.transform(X_test)

# Convert both results to arrays with .toarray() for the models trained below
X_train_counts = train_counts_sparse.toarray()
X_test_counts = test_counts_sparse.toarray()

X_train_counts



array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
# Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB

# accuracy_score and classification_report are already imported in the first code cell.

# Train on the training counts (fit returns the fitted estimator)
nb_classifier = MultinomialNB().fit(X_train_counts, y_train)

# Predict on the test counts
y_pred = nb_classifier.predict(X_test_counts)

# Score the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}', 'Classification Report:', report, sep='\n')

Accuracy: 0.8156424581005587
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.73      0.60      0.66        53

    accuracy                           0.82       179
   macro avg       0.79      0.75      0.77       179
weighted avg       0.81      0.82      0.81       179



In [9]:
# Gaussian Naive Bayes

from sklearn.naive_bayes import GaussianNB

# accuracy_score and classification_report are already imported in the first code cell.

# Train on the training counts (fit returns the fitted estimator)
nb_classifier = GaussianNB().fit(X_train_counts, y_train)

# Predict on the test counts
y_pred = nb_classifier.predict(X_test_counts)

# Score the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}', 'Classification Report:', report, sep='\n')

Accuracy: 0.8044692737430168
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       126
           1       0.69      0.62      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179



In [10]:
# Bernoulli Naive Bayes

from sklearn.naive_bayes import BernoulliNB

# accuracy_score and classification_report are already imported in the first code cell.

# Train on the training counts (fit returns the fitted estimator)
nb_classifier = BernoulliNB().fit(X_train_counts, y_train)

# Predict on the test counts
y_pred = nb_classifier.predict(X_test_counts)

# Score the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}', 'Classification Report:', report, sep='\n')

Accuracy: 0.770949720670391
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.93      0.85       126
           1       0.70      0.40      0.51        53

    accuracy                           0.77       179
   macro avg       0.74      0.66      0.68       179
weighted avg       0.76      0.77      0.75       179



 - Multinomial Naive Bayes: accuracy = 0.82
 - Gaussian Naive Bayes : accuracy = 0.80
 - Bernoulli Naive Bayes: accuracy = 0.77

EL MODELO MÁS ADECUADO ES EL MULTINOMIAL (como habíamos elegido originalmente)

In [13]:
# Check whether a Random Forest gives a better result than the Naive Bayes models

from sklearn.ensemble import RandomForestClassifier

# Fixed random_state so the forest is reproducible between runs
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_counts, y_train)
y_pred = rf_model.predict(X_test_counts)


# Compute the accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
# Only a RandomForestClassifier is trained in this cell (there is no Naive Bayes
# component) and the metric is accuracy_score, so the label states exactly that.
print(f'Accuracy del modelo Random Forest: {accuracy}')

Precisión del modelo híbrido Naive Bayes + Random Forest: 0.7988826815642458


RESULTADO: el modelo Random Forest (accuracy = 0.80) no mejora la accuracy del Multinomial Naive Bayes (0.82).