# Explore here

In [7]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

from sklearn.metrics import accuracy_score

In [2]:

# Location of the Play Store reviews CSV (hosted on GitHub).
REVIEWS_CSV_URL = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"

# Read the raw dataset straight from the URL into a DataFrame.
reviews = pd.read_csv(REVIEWS_CSV_URL)

# Preview the first rows as the cell output.
reviews.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [5]:
# Build the modelling frame from `reviews` in a single chain:
#   - drop 'package_name', which is not useful as a feature for the model;
#   - normalise 'review' by stripping surrounding whitespace and lower-casing.
# Both `drop` and `assign` return new frames, so `reviews` itself is left untouched.
reviews_final = (
    reviews
    .drop(columns='package_name')
    .assign(review=lambda frame: frame['review'].str.strip().str.lower())
)

reviews_final.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


In [6]:
# Feature (review text) and target (polarity label).
X, y = reviews_final['review'], reviews_final['polarity']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

X_train.head()

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

In [10]:
# Turn the review text into TF-IDF features, ignoring English stop words.
vectorizador = TfidfVectorizer(stop_words="english")

# The vocabulary and IDF weights are learned from the training split only;
# the test split is transformed with that already-fitted vectorizer.
X_train_vec = vectorizador.fit_transform(X_train)
X_test_vec = vectorizador.transform(X_test)



In [11]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training matrix.
modelo_multinominal = MultinomialNB()

# Left as the last expression so the fitted estimator is shown as the cell output.
modelo_multinominal.fit(X=X_train_vec, y=y_train)

In [12]:
# Predict the polarity of the held-out reviews with the multinomial model.
y_pred_multinominal = modelo_multinominal.predict(X=X_test_vec)

In [13]:
# Fraction of test reviews whose polarity the multinomial model predicted correctly.
acc_multinominal = accuracy_score(y_true=y_test, y_pred=y_pred_multinominal)
acc_multinominal

0.7988826815642458

In [21]:
# Try the other Naive Bayes variants, starting with Bernoulli NB,
# trained on the same TF-IDF training matrix.
modelo_bernoulli = BernoulliNB()

# Left as the last expression so the fitted estimator is shown as the cell output.
modelo_bernoulli.fit(X=X_train_vec, y=y_train)


In [24]:
# Predict the polarity of the held-out reviews with the Bernoulli model.
y_pred_bernoulli = modelo_bernoulli.predict(X=X_test_vec)

In [26]:
# Fraction of test reviews whose polarity the Bernoulli model predicted correctly.
acc_bernoulli = accuracy_score(y_true=y_test, y_pred=y_pred_bernoulli)
acc_bernoulli

0.770949720670391

In [None]:
# Gaussian NB cannot work with sparse matrices, so convert the sparse
# TF-IDF training matrix to a dense array first.
X_train_dense = X_train_vec.toarray()

modelo_gaussian = GaussianNB()

# Left as the last expression so the fitted estimator is shown as the cell output.
modelo_gaussian.fit(X=X_train_dense, y=y_train)

In [31]:
# Dense copy of the test features, matching the dense format the Gaussian model was trained on.
X_test_dense = X_test_vec.toarray()

# Predict the polarity of the held-out reviews with the Gaussian model.
y_pred_gaussian = modelo_gaussian.predict(X=X_test_dense)

In [32]:
# Fraction of test reviews whose polarity the Gaussian model predicted correctly.
acc_gaussian = accuracy_score(y_true=y_test, y_pred=y_pred_gaussian)
acc_gaussian

0.8100558659217877

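Al entrenar los tres modelos Naive Bayes comprobamos que, con el método de vectorización usado (TF-IDF), el que obtiene la mayor exactitud (accuracy) sobre el conjunto de prueba es Gaussian NB, con un 81%, frente al 80% de Multinomial NB y el 77% de Bernoulli NB.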

Vamos a intentar optimizar el modelo para mejorar la predicción

In [35]:
from sklearn.model_selection import GridSearchCV

# Candidate values to try for each GaussianNB hyperparameter.
parametros = dict(
    priors=[None, [0.75, 0.25], [0.5, 0.5], [0.25, 0.75]],
    var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6, 1e-5],
)

# Search over every combination with 5-fold cross-validation,
# fitted on the dense training matrix.
grid_search = GridSearchCV(
    estimator=modelo_gaussian,
    param_grid=parametros,
    cv=5,
)
grid_search.fit(X_train_dense, y_train)

print("Mejores hiperparámetros encontrados:", grid_search.best_params_)


Mejores hiperparámetros encontrados: {'priors': None, 'var_smoothing': 1e-09}


In [36]:
# Build the tuned classifier from the hyperparameters the grid search selected,
# rather than re-typing them by hand: if the grid or the data changes, this cell
# stays in sync with the search result instead of silently training a stale
# configuration.
modelo_gaussian_opt = GaussianNB(**grid_search.best_params_)

# Left as the last expression so the fitted estimator is shown as the cell output.
modelo_gaussian_opt.fit(X_train_dense, y_train)

In [38]:
# Score the tuned Gaussian model on the dense held-out test matrix.
y_pred_opt = modelo_gaussian_opt.predict(X=X_test_dense)
acc_opt = accuracy_score(y_true=y_test, y_pred=y_pred_opt)
acc_opt

0.8100558659217877

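Tras la búsqueda de hiperparámetros, la exactitud (accuracy) sobre el conjunto de prueba es la misma (81%). Esto se debe a que la búsqueda seleccionó `priors=None` y `var_smoothing=1e-09`, que son precisamente los valores por defecto de Gaussian NB; por tanto, el modelo «optimizado» es idéntico al modelo original.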

In [39]:
from pickle import dump

# Persist the tuned classifier to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises, instead of leaking an
# open handle as a bare `open(...)` argument does.
# NOTE(review): only the classifier is saved. It was trained on features produced
# by the fitted TF-IDF vectorizer (`vectorizador`), so scoring new text will also
# require that vectorizer — consider persisting it alongside the model.
with open("../models/naive_bayes_gaussian_opt.sav", "wb") as model_file:
    dump(modelo_gaussian_opt, model_file)