In [None]:
%%capture

!pip install -U scikit-learn

In [None]:
import pandas as pd
from sklearn.datasets import load_iris

# Keep displayed DataFrames/Series short: show at most 7 rows.
pd.set_option("display.max_rows", 7)

# Despite its name, `original_df` holds the whole object returned by the
# loader; the individual pieces are read from it below.
original_df = load_iris(as_frame=True)

# Feature table and per-row class index.
data = original_df["data"]
labels = original_df["target"]

# Human-readable names of the feature columns and of the classes.
feature_names = original_df["feature_names"]
target_names = original_df["target_names"]

In [None]:
# Display the feature table as this cell's output.
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
...,...,...,...,...
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
149,5.9,3.0,5.1,1.8


In [None]:
# Display the target Series (one integer class index per row).
labels

0      0
1      0
2      0
      ..
147    2
148    2
149    2
Name: target, Length: 150, dtype: int64

In [None]:
# Put features and target side by side in a single frame: the target Series
# becomes the last column, aligned with the features on the row index.
df = pd.concat(objs=[data, labels], axis="columns")

# Bare last expression so the notebook renders the combined frame.
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
...,...,...,...,...,...
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2
149,5.9,3.0,5.1,1.8,2


In [None]:
# Print a short summary of the dataset: number of rows, the features, and the
# target classes.
#
# Each group is printed with sep="\n" so that no stray space is emitted
# before the line breaks (print's default separator is a space).
print(f"Número de filas: {len(df)}")

print(
    # Every column of df except the trailing "target" column is a feature.
    f"\nNúmero de features: {len(df.columns) - 1}",
    f"Nombre de los features: {list(feature_names)}",
    sep="\n",
)

print(
    f"\nNúmero de categorías a clasificar: {df['target'].nunique()}",
    f"Nombres de las categorías: {original_df.target_names}",
    # .tolist() converts NumPy integers to plain Python ints, so the list
    # prints as [0, 1, 2] regardless of the installed NumPy version
    # (NumPy >= 2 would otherwise show np.int64(0), ...).
    f"Índices de categorías: {df['target'].unique().tolist()}",
    sep="\n",
)

Número de filas: 150

Número de features: 4 
Nombre de los features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

Número de categorías a clasificar: 3 
Nombres de las categorías: ['setosa' 'versicolor' 'virginica'] 
Índices de categorías: [0, 1, 2]


---

In [None]:
from sklearn.model_selection import train_test_split

# Columns used as model inputs; the remaining "target" column is the label.
feature_columns = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]
X = df[feature_columns]
y = df['target']

# Stage 1: keep 80% of the rows for training and hold out the other 20%.
X_train, X_pretest, y_train, y_pretest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Stage 2: cut the held-out rows in half. The first half is the test set and
# the second half (the `test_size` portion of this call) is the validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_pretest,
    y_pretest,
    test_size=0.5,
    random_state=0,
)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Baseline model: Gaussian Naive Bayes with default settings, fitted on the
# training split. fit() returns the estimator itself, so the fitted model can
# be bound directly; ending the cell on an assignment produces no output.
nb_clf = GaussianNB().fit(X_train, y_train)

In [None]:
# Predict a class index for every row of the validation split with the
# fitted classifier.
val_pred = nb_clf.predict(X_val)
# Bare last expression so the notebook displays the predicted array.
val_pred

array([0, 1, 1, 1, 0, 2, 1, 1, 1, 0, 2, 1, 1, 1, 0])

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Fraction of validation rows whose predicted class equals the true class.
val_acc = accuracy_score(y_true=y_val, y_pred=val_pred)
print(f"La accuracy obtenida en el set de validación es de {val_acc * 100:.2f}%\n")

print("Matriz de confusión: ")

# Rows (vertical axis) are the true class; columns (horizontal axis) are the
# predicted class. In the recorded run, one sample whose true class is 2 was
# predicted as class 1.
confusion_matrix(y_true=y_val, y_pred=val_pred)

La accuracy obtenida en el set de validación es de 93.33%

Matriz de confusión: 


array([[4, 0, 0],
       [0, 8, 0],
       [0, 1, 2]])

---

In [None]:
# Experiment: raise var_smoothing to 1 (the baseline above was fitted with the
# estimator's default value), refit on the training split, and re-measure the
# validation accuracy.
nb_clf.set_params(var_smoothing=1).fit(X_train, y_train)

val_pred = nb_clf.predict(X_val)
val_acc = accuracy_score(y_true=y_val, y_pred=val_pred)

print(f"La accuracy obtenida en el set de validación es de {val_acc * 100:.2f}%\n")

La accuracy obtenida en el set de validación es de 80.00%



In [None]:
# Candidate values to try, from strongest to weakest smoothing.
candidate_smoothings = [1, 0.8, 0.6, 0.4, 0.2, 0.1, 0.01]

# For each candidate: reconfigure the shared classifier, refit it on the
# training split, and report its accuracy on the validation split.
# Note that nb_clf is left configured with the last candidate after the loop.
for var_smoothing in candidate_smoothings:
    nb_clf.set_params(var_smoothing=var_smoothing).fit(X_train, y_train)

    val_pred = nb_clf.predict(X_val)
    val_acc = accuracy_score(y_true=y_val, y_pred=val_pred)

    print(
        f"[VAR SMOOTHING]: {var_smoothing}",
        f"\n[VAL ACC]:       {val_acc:.2f}\n"
    )

[VAR SMOOTHING]: 1 
[VAL ACC]:       0.80

[VAR SMOOTHING]: 0.8 
[VAL ACC]:       0.87

[VAR SMOOTHING]: 0.6 
[VAL ACC]:       0.93

[VAR SMOOTHING]: 0.4 
[VAL ACC]:       0.93

[VAR SMOOTHING]: 0.2 
[VAL ACC]:       0.93

[VAR SMOOTHING]: 0.1 
[VAL ACC]:       0.93

[VAR SMOOTHING]: 0.01 
[VAL ACC]:       1.00



In [None]:
# Value picked by hand from the sweep: it gave the highest validation accuracy
# in the recorded run.
chosen_var_smoothing = 0.01

# Configure and refit on the training split. The chained call returns the
# estimator, so the cell still displays its repr as the output.
nb_clf.set_params(var_smoothing=chosen_var_smoothing).fit(X_train, y_train)

GaussianNB(var_smoothing=0.01)

---

In [None]:
# Final check: score the tuned classifier on the test split, which was not
# used for fitting or for choosing var_smoothing.
test_pred = nb_clf.predict(X_test)
test_acc = accuracy_score(y_true=y_test, y_pred=test_pred)

print(f"La accuracy obtenida en el set de test es de {test_acc * 100:.2f}%")

La accuracy obtenida en el set de test es de 100.00%


---