<a href="https://colab.research.google.com/github/SergioManuelJob/CrossValidation-NaiveBayes/blob/master/CrossValidation_NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sergio Manuel Suárez Suárez

In [34]:
# importación de datos
import seaborn as sns
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels
iris = sns.load_dataset('iris')

In [36]:
X_iris = iris.drop('species', axis=1)
y_iris = iris['species']
Xtrain, Xtest, ytrain, ytest = train_test_split(X_iris, y_iris, test_size=0.20, random_state=1)

# Accuracies con CrossValidation

### Estas son las accuracies obtenidas sin el Cross Validation

In [38]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB, CategoricalNB

# classifiers
names = ["GaussianNB", "MultiNomialNB", 'BernouilliNB', 'ComplementNB', 'CategoricalNB']
classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB(), ComplementNB(), CategoricalNB()]

for name, clf in zip(names, classifiers):

  # FIT THE MODEL
  clf.fit(Xtrain, ytrain)
  # PREDIT AND SCORE
  score = clf.score(Xtest, ytest) #Returns the mean accuracy on the given test data and labels.
  print ("Accuracy sin cross validation del modelo: %s = %6.2f" % (name, score))


Accuracy sin cross validation del modelo: GaussianNB =   0.97
Accuracy sin cross validation del modelo: MultiNomialNB =   0.57
Accuracy sin cross validation del modelo: BernouilliNB =   0.20
Accuracy sin cross validation del modelo: ComplementNB =   0.57
Accuracy sin cross validation del modelo: CategoricalNB =   0.97


### Y estas con el Cross Validation

In [40]:
from sklearn.model_selection import KFold
cv = KFold(n_splits = 5, shuffle = True) # shuffle = False si hay dimensión temporal
total_scores = []
for name, clf in zip(names, classifiers):
  fold_accuracy = []
  for train_fold, test_fold in cv.split(Xtrain):
    # División train test aleatoria
    f_train_x = Xtrain.iloc[train_fold]           # Extrae la información (iloc), atendiendo a los indices obtenidos por CrossValidation
    f_train_y = ytrain.iloc[train_fold]
    # entrenamiento y ejeución del modelo
    clf.fit(f_train_x, f_train_y)
    # Realizamos la predicción (Final evaluation) y guardamos la precisión para calcular la media posteriormente
    y_pred = clf.predict(Xtrain.iloc[test_fold])
    # evaluación del modelo
    acc = accuracy_score(ytrain.iloc[test_fold], y_pred)
    fold_accuracy.append(acc)
  total_scores.append(sum(fold_accuracy)/len(fold_accuracy))

for i in range(len(names)):
  print ("Accuracy Con Cross Validation del modelo: %s = %6.2f" % (names[i], total_scores[i]))

Accuracy Con Cross Validation del modelo: GaussianNB =   0.95
Accuracy Con Cross Validation del modelo: MultiNomialNB =   0.78
Accuracy Con Cross Validation del modelo: BernouilliNB =   0.29
Accuracy Con Cross Validation del modelo: ComplementNB =   0.69
Accuracy Con Cross Validation del modelo: CategoricalNB =   0.93


## Ahora hacemos lo mismo pero con el dataset de los pinguinos.

In [43]:
dfPenguins = sns.load_dataset("penguins")
dfPenguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


## Hacemos un studio de los valores categóricos del dataset para preparar los datos para el modelo Naive Bayes.

In [44]:
print(dfPenguins["island"].unique())
print(dfPenguins["species"].unique())
print(dfPenguins["sex"].unique())

['Torgersen' 'Biscoe' 'Dream']
['Adelie' 'Chinstrap' 'Gentoo']
['Male' 'Female' nan]


In [45]:
dfPenguins.dropna(inplace=True)
# Borró 10 filas que tenian NA
dfPenguins.shape

(333, 7)

In [46]:
dfPenguins['island'] = pd.factorize(dfPenguins['island'], sort=True)[0] + 1
dfPenguins['species'] = pd.factorize(dfPenguins['species'], sort=True)[0] + 1
dfPenguins['sex'] = pd.factorize(dfPenguins['sex'], sort=True)[0] + 1
dfPenguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,1,3,39.1,18.7,181.0,3750.0,2
1,1,3,39.5,17.4,186.0,3800.0,1
2,1,3,40.3,18.0,195.0,3250.0,1
4,1,3,36.7,19.3,193.0,3450.0,1
5,1,3,39.3,20.6,190.0,3650.0,2
...,...,...,...,...,...,...,...
338,3,1,47.2,13.7,214.0,4925.0,1
340,3,1,46.8,14.3,215.0,4850.0,1
341,3,1,50.4,15.7,222.0,5750.0,2
342,3,1,45.2,14.8,212.0,5200.0,1


## Ya con el dataset listo, podemos usarlo para el modelo Naive Bayes. Pero lo más aconsejable ahora mismo sería escalar las variables, y es lo que voy a hacer.

In [47]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
columns_to_scale = ['body_mass_g', 'flipper_length_mm', 'bill_length_mm', 'bill_depth_mm']

dfPenguins[columns_to_scale] = scaler.fit_transform(dfPenguins[columns_to_scale])
dfPenguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,1,3,0.254545,0.666667,0.152542,0.291667,2
1,1,3,0.269091,0.511905,0.237288,0.305556,1
2,1,3,0.298182,0.583333,0.389831,0.152778,1
4,1,3,0.167273,0.738095,0.355932,0.208333,1
5,1,3,0.261818,0.892857,0.305085,0.263889,2


## Ahora creamos la variable 'target' que es el valor a predecir por nuestro modelo, el cual viene de la variable a adivinar especies, del dataset. Para después entrenar a nuestros modelos NaiveBayes.

In [48]:
# Establecer la columna 'species' como variable target
target = dfPenguins['species']

# Eliminar la columna 'species' del DataFrame
dfPenguins.drop('species', axis=1, inplace=True)
print(target.shape)
dfPenguins.head()

(333,)


Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,3,0.254545,0.666667,0.152542,0.291667,2
1,3,0.269091,0.511905,0.237288,0.305556,1
2,3,0.298182,0.583333,0.389831,0.152778,1
4,3,0.167273,0.738095,0.355932,0.208333,1
5,3,0.261818,0.892857,0.305085,0.263889,2


## Y terminamos de preparar los datos para darselos al modelo, dividiendolos entre datos de train y de test.

In [85]:
Xtrain, Xtest, ytrain, ytest = train_test_split(dfPenguins, target, test_size=0.21, random_state=1)

## Ahora entrenamos los distintos modelos de Naive Bayes, y mostramos sus matrices de confusión.

In [87]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB, CategoricalNB

# classifiers
names = ["GaussianNB", "MultiNomialNB", 'BernouilliNB', 'ComplementNB']
classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB(), ComplementNB()]

for name, clf in zip(names, classifiers):

  # FIT THE MODEL
  clf.fit(Xtrain, ytrain)
  # PREDIT AND SCORE
  score = clf.score(Xtest, ytest) #Returns the mean accuracy on the given test data and labels.
  print ("Accuracy sin cross validation del modelo: %s = %6.2f" % (name, score))

Accuracy sin cross validation del modelo: GaussianNB =   0.87
Accuracy sin cross validation del modelo: MultiNomialNB =   0.73
Accuracy sin cross validation del modelo: BernouilliNB =   0.40
Accuracy sin cross validation del modelo: ComplementNB =   0.67


In [89]:
from sklearn.model_selection import KFold
cv = KFold(n_splits = 4, shuffle = True) # shuffle = False si hay dimensión temporal
total_scores = []
for name, clf in zip(names, classifiers):
  fold_accuracy = []
  for train_fold, test_fold in cv.split(Xtrain):
    # División train test aleatoria
    f_train_x = Xtrain.iloc[train_fold]           # Extrae la información (iloc), atendiendo a los indices obtenidos por CrossValidation
    f_train_y = ytrain.iloc[train_fold]
    # entrenamiento y ejeución del modelo
    clf.fit(f_train_x, f_train_y)
    # Realizamos la predicción (Final evaluation) y guardamos la precisión para calcular la media posteriormente
    y_pred = clf.predict(Xtrain.iloc[test_fold])
    # evaluación del modelo
    acc = accuracy_score(ytrain.iloc[test_fold], y_pred)
    fold_accuracy.append(acc)
  total_scores.append(sum(fold_accuracy)/len(fold_accuracy))

for i in range(len(names)):
  print ("Accuracy Con Cross Validation del modelo: %s = %6.2f" % (names[i], total_scores[i]))

Accuracy Con Cross Validation del modelo: GaussianNB =   0.83
Accuracy Con Cross Validation del modelo: MultiNomialNB =   0.81
Accuracy Con Cross Validation del modelo: BernouilliNB =   0.45
Accuracy Con Cross Validation del modelo: ComplementNB =   0.79
