Paso 1: Cargar el conjunto de datos

In [1]:
import pandas as pd

# Download the Play Store reviews CSV hosted on GitHub into a DataFrame.
url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows to confirm the load worked.
preview = df.head(n=5)
print(preview)


          package_name                                             review  \
0  com.facebook.katana   privacy at least put some option appear offli...   
1  com.facebook.katana   messenger issues ever since the last update, ...   
2  com.facebook.katana   profile any time my wife or anybody has more ...   
3  com.facebook.katana   the new features suck for those of us who don...   
4  com.facebook.katana   forced reload on uploading pic on replying co...   

   polarity  
0         0  
1         0  
2         0  
3         0  
4         0  


Paso 2: Estudio de variables y preprocesamiento (en este caso no es necesario un EDA)


In [2]:
# Drop the package_name column; only the review text is used as a feature.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns=['package_name'], errors='ignore')


In [3]:
# Normalize the review text: trim surrounding whitespace, then lowercase.
review_text = df['review']
df['review'] = review_text.str.strip().str.lower()


In [4]:
from sklearn.model_selection import train_test_split

# Features are the review text; the target is the polarity label.
X, y = df['review'], df['polarity']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Vectorización del texto

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words token counts with English stop words removed.
vec_model = CountVectorizer(stop_words='english')

# Learn the vocabulary on the training split only, then reuse it for the test split.
train_counts = vec_model.fit_transform(X_train)
test_counts = vec_model.transform(X_test)

# Convert the count matrices to dense arrays; these arrays are what the
# models in the following cells are trained and evaluated on.
X_train_vec = train_counts.toarray()
X_test_vec = test_counts.toarray()


Paso 3: Construimos el modelo Naive Bayes: 

3.1 MultinomialNB

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Fit a multinomial Naive Bayes classifier on the training counts
# (fit returns the estimator itself, so construction and fitting are chained).
model_mnb = MultinomialNB().fit(X_train_vec, y_train)

# Predict labels for the held-out test set.
y_pred_mnb = model_mnb.predict(X_test_vec)

# Report overall accuracy followed by the per-class precision/recall/F1 table.
acc_mnb = accuracy_score(y_test, y_pred_mnb)
print("MultinomialNB Accuracy:", acc_mnb)
report_mnb = classification_report(y_test, y_pred_mnb)
print(report_mnb)


MultinomialNB Accuracy: 0.8156424581005587
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.73      0.60      0.66        53

    accuracy                           0.82       179
   macro avg       0.79      0.75      0.77       179
weighted avg       0.81      0.82      0.81       179



3.2 GaussianNB

In [7]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the same dense count features.
model_gnb = GaussianNB().fit(X_train_vec, y_train)

# Predict labels for the held-out test set.
y_pred_gnb = model_gnb.predict(X_test_vec)

# Report overall accuracy followed by the per-class precision/recall/F1 table.
acc_gnb = accuracy_score(y_test, y_pred_gnb)
print("GaussianNB Accuracy:", acc_gnb)
report_gnb = classification_report(y_test, y_pred_gnb)
print(report_gnb)


GaussianNB Accuracy: 0.8044692737430168
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       126
           1       0.69      0.62      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179



3.3 BernoulliNB

In [8]:
from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli Naive Bayes classifier on the same count features.
model_bnb = BernoulliNB().fit(X_train_vec, y_train)

# Predict labels for the held-out test set.
y_pred_bnb = model_bnb.predict(X_test_vec)

# Report overall accuracy followed by the per-class precision/recall/F1 table.
acc_bnb = accuracy_score(y_test, y_pred_bnb)
print("BernoulliNB Accuracy:", acc_bnb)
report_bnb = classification_report(y_test, y_pred_bnb)
print(report_bnb)


BernoulliNB Accuracy: 0.770949720670391
              precision    recall  f1-score   support

           0       0.79      0.93      0.85       126
           1       0.70      0.40      0.51        53

    accuracy                           0.77       179
   macro avg       0.74      0.66      0.68       179
weighted avg       0.76      0.77      0.75       179



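Nos quedamos con el modelo MultinomialNB, que obtiene la mejor accuracy (0.82) de las tres variantes de Naive Bayes probadas. Tiene sentido, ya que MultinomialNB está pensado para variables de conteo, como las frecuencias de palabras que genera CountVectorizer.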

Paso 4: Optimización con Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the same features as an alternative to Naive Bayes;
# the fixed seed keeps the result reproducible across runs.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_vec, y_train)

# Predict labels for the held-out test set.
y_pred_rf = rf_model.predict(X_test_vec)

# Report overall accuracy followed by the per-class precision/recall/F1 table.
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", acc_rf)
report_rf = classification_report(y_test, y_pred_rf)
print(report_rf)


Random Forest Accuracy: 0.7988826815642458
              precision    recall  f1-score   support

           0       0.88      0.83      0.85       126
           1       0.64      0.74      0.68        53

    accuracy                           0.80       179
   macro avg       0.76      0.78      0.77       179
weighted avg       0.81      0.80      0.80       179



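Random Forest no mejora la accuracy (0.80 frente a 0.82 de MultinomialNB), aunque sí aumenta el recall de la clase 1 (0.74 frente a 0.60). Mantenemos MultinomialNB como modelo final.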

In [11]:
import joblib
import os

# Make sure the output folder exists (no error if it is already there).
output_dir = "modelos_guardados"
os.makedirs(output_dir, exist_ok=True)

# Persist the trained MultinomialNB model.
model_path = 'modelos_guardados/modelo_multinomialnb.pkl'
joblib.dump(model_mnb, model_path)

# Persist the fitted CountVectorizer; as the last expression in the cell,
# this call's return value is what the cell displays.
vectorizer_path = 'modelos_guardados/vectorizador_count.pkl'
joblib.dump(vec_model, vectorizer_path)


['modelos_guardados/vectorizador_count.pkl']