# SIAA: Tarea Naïve Bayes

#### Autor: Javier, Raúl, Laura, Martín, Hugo

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.preprocessing import StandardScaler, Binarizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#### Enlace URL del dataset obtenido:

https://www.kaggle.com/datasets/denisadutca/customer-behaviour

#### Columnas:

**User ID** (Identificación del usuario)

**Gender** (Género)

**Age** (Edad)

**EstimatedSalary** (Salario Estimado)

**Purchased** (Producto Comprado)

### Leemos el dataset

In [12]:
ruta_del_dataset = "Customer_Behaviour.csv"
df = pd.read_csv(ruta_del_dataset)

### Convertimos los datos a numéricos

In [13]:
encoder = OneHotEncoder()

colegios_valores_encoded = encoder.fit_transform(df[["Gender"]])
colegios_columnas_encoded = encoder.get_feature_names_out(["Gender"])
encoded_df = pd.DataFrame(colegios_valores_encoded.toarray(), columns=colegios_columnas_encoded)

df = pd.concat([df, encoded_df], axis=1)
df = df.drop(columns=["Gender"])

In [14]:
df

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Female,Gender_Male
0,15624510,19,19000,0,0.0,1.0
1,15810944,35,20000,0,0.0,1.0
2,15668575,26,43000,0,1.0,0.0
3,15603246,27,57000,0,1.0,0.0
4,15804002,19,76000,0,0.0,1.0
...,...,...,...,...,...,...
395,15691863,46,41000,1,1.0,0.0
396,15706071,51,23000,1,0.0,1.0
397,15654296,50,20000,1,1.0,0.0
398,15755018,36,33000,0,0.0,1.0


### Dividimos el dataset en variables predictoras (X) y variable objetivo (Y), después escalamos los datos

In [15]:
# Separar variables predictoras (X) y objetivo (y)
X = df[["Gender_Female", "Gender_Male", "Age", "EstimatedSalary"]] 
Y = df["Purchased"]

# Dividir en conjunto de entrenamiento y prueba
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### Creamos el modelo de naive bayes Gaussiano

In [16]:
# === 1. Naïve Bayes Gaussiano (para variables continuas) ===
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

gaussian_model = GaussianNB()
gaussian_model.fit(X_train_scaled, Y_train)
y_pred_gaussian = gaussian_model.predict(X_test_scaled)

print("=== Naïve Bayes Gaussiano ===")
print("Accuracy:", accuracy_score(Y_test, y_pred_gaussian))
print("Reporte de clasificación:\n", classification_report(Y_test, y_pred_gaussian))

print(confusion_matrix(Y_test, y_pred_gaussian))

=== Naïve Bayes Gaussiano ===
Accuracy: 0.925
Reporte de clasificación:
               precision    recall  f1-score   support

           0       0.93      0.96      0.94        52
           1       0.92      0.86      0.89        28

    accuracy                           0.93        80
   macro avg       0.92      0.91      0.92        80
weighted avg       0.92      0.93      0.92        80

[[50  2]
 [ 4 24]]


### Creamos el modelo de naive bayes Multinomial

In [17]:
# === 2. Naïve Bayes Multinomial (para datos de conteo) ===
# Convertimos los datos en valores positivos ya que MultinomialNB no maneja valores negativos
X_train_multinomial = X_train - X_train.min()
X_test_multinomial = X_test - X_test.min()

multinomial_model = MultinomialNB()
multinomial_model.fit(X_train_multinomial, Y_train)
y_pred_multinomial = multinomial_model.predict(X_test_multinomial)

print("=== Naïve Bayes Multinomial ===")
print("Accuracy:", accuracy_score(Y_test, y_pred_multinomial))
print("Reporte de clasificación:\n", classification_report(Y_test, y_pred_multinomial))

print(confusion_matrix(Y_test, y_pred_multinomial))

=== Naïve Bayes Multinomial ===
Accuracy: 0.725
Reporte de clasificación:
               precision    recall  f1-score   support

           0       0.78      0.81      0.79        52
           1       0.62      0.57      0.59        28

    accuracy                           0.72        80
   macro avg       0.70      0.69      0.69        80
weighted avg       0.72      0.72      0.72        80

[[42 10]
 [12 16]]


### Creamos el modelo de naive bayes Bernoulli

In [18]:
X_train.median().mean()
X_train

Unnamed: 0,Gender_Female,Gender_Male,Age,EstimatedSalary
3,1.0,0.0,27,57000
18,0.0,1.0,46,28000
202,1.0,0.0,39,134000
250,1.0,0.0,44,39000
274,1.0,0.0,57,26000
...,...,...,...,...
71,1.0,0.0,24,27000
106,1.0,0.0,26,35000
270,1.0,0.0,43,133000
348,0.0,1.0,39,77000


In [None]:
# === 3. Naïve Bayes Bernoulli (para datos binarios) ===
binarizer_salary = Binarizer(threshold=np.median(X_train['EstimatedSalary']))
binarizer_age = Binarizer(threshold=np.median(X_train['Age']))

# Transformar los datos a binario y unirlos en un dataframe de train y otro de test
X_train_binary_salary = pd.DataFrame(binarizer_salary.fit_transform(X_train[['EstimatedSalary']]), columns=['EstimatedSalary'])
X_train_binary_age = pd.DataFrame(binarizer_age.fit_transform(X_train[['Age']]), columns=['Age'])
X_train_binary = pd.concat([X_train_binary_salary, X_train_binary_age], axis=1)

X_test_binary_salary = pd.DataFrame(binarizer_salary.transform(X_test[['EstimatedSalary']]), columns=['EstimatedSalary'])
X_test_binary_age = pd.DataFrame(binarizer_age.transform(X_test[['Age']]), columns=['Age'])
X_test_binary = pd.concat([X_test_binary_salary, X_test_binary_age], axis=1)

bernoulli_model = BernoulliNB()
bernoulli_model.fit(X_train_binary, Y_train)
y_pred_bernoulli = bernoulli_model.predict(X_test_binary)

print("=== Naïve Bayes Bernoulli ===")
print("Accuracy:", accuracy_score(Y_test, y_pred_bernoulli))
print("Reporte de clasificación:\n", classification_report(Y_test, y_pred_bernoulli))

print(confusion_matrix(Y_test, y_pred_bernoulli))

=== Naïve Bayes Bernoulli ===
Accuracy: 0.7875
Reporte de clasificación:
               precision    recall  f1-score   support

           0       0.78      0.94      0.85        52
           1       0.82      0.50      0.62        28

    accuracy                           0.79        80
   macro avg       0.80      0.72      0.74        80
weighted avg       0.79      0.79      0.77        80

[[49  3]
 [14 14]]


### Según la presición sacada de todos los modelos, el mejor modelo es el gaussiano para este dataset