# Importación de módulos generales

In [None]:
%pylab inline
%load_ext memory_profiler

# %pylab

import os
import tempfile
import pandas as pd
# import numpy as np
import networkx as nx
# import matplotlib
# import pylab  as plt
import pygraphviz

from pomegranate import BayesianNetwork

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Funciones auxiliares de visualización

Importamos las funciones auxiliares para visualizar redes que hemos definido:
- **plot_pomegranate_bn_nx**(pgm, layout=None, node_size=2000, node_color='pink')
- **plot_pomegranate_bn_pgvz**(pgm, filename=None, prog='dot', color='red')
- **plot_pgm_bn**(pgm, layout=None, node_size=2000, node_color='pink')

In [None]:
from funciones_auxiliares import *

# Lectura de datos

Podemos obtener los datos originales de https://www.kaggle.com/uciml/pima-indians-diabetes-database.

In [None]:
# Read the diabetes dataset from its local CSV copy and preview the first rows.
data = pd.read_csv(filepath_or_buffer="data/diabetes/prima_indian_diabetes_dataset.csv")
data.head(n=5)

Como podemos observar, hay valores nulos en ciertas variables, en este caso representados mediante el valor 0.

In [None]:
# In these measurement columns a literal 0 stands for a missing reading, so
# convert those zeros to NaN to make the missing values explicit.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[zero_as_missing_cols] = data[zero_as_missing_cols].replace(0, np.nan)
print("Valores nulos en cada variable:\n", data.isnull().sum())

# len() counts the fully populated rows directly; the previous count()[0]
# relied on positional indexing of a label-indexed Series, which pandas deprecates.
print("\nNúmero de registros completos: ", len(data.dropna()))

Discretizamos las variables numéricas:

In [None]:
# Number of intervals each numeric column is cut into.
num_bins = 15
columns_to_discretize = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Overwrite every listed column of `data` with its binned version.
# NOTE(review): this mutates `data` in place, so the cell is presumably not
# safe to re-run without reloading the CSV first — confirm.
for col in columns_to_discretize:
    data[col] = pd.cut(x=data[col], bins=num_bins)

data.head(n=5)

Y separamos los registros completos (sin valores nulos) de los iniciales:

In [None]:
# Keep only the rows without any missing value and renumber them from 0.
complete_data = (
    data
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)

Dividimos los datos en conjuntos de entrenamiento y validación:

In [None]:
# Fixed seed so both partitions (and every metric computed from them) are
# reproducible across kernel restarts; without it each run shuffles differently.
RANDOM_STATE = 42
TEST_SIZE = 1/3

# Features / target for the full dataset (rows may contain missing values).
X = data.drop(columns=['Outcome'])
y = data.Outcome

# Features / target restricted to the rows without missing values.
X_complete = complete_data.drop(columns=['Outcome'])
y_complete = complete_data.Outcome

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
X_complete_train, X_complete_test, y_complete_train, y_complete_test = train_test_split(
    X_complete, y_complete, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# Report the shape of every partition (labels kept identical to the original output).
print("Particiones obtenidas sobre los datos iniciales:")
for label, part in (("X_train: ", X_train), ("y_train: ", y_train),
                    ("X_test:  ", X_test), ("y_test:  ", y_test)):
    print(label, part.shape)

print("\nParticiones obtenidas sobre los datos con los registros completos:")
for label, part in (("X_complete_train: ", X_complete_train), ("y_complete_train: ", y_complete_train),
                    ("X_complete_test:  ", X_complete_test), ("y_complete_test:  ", y_complete_test)):
    print(label, part.shape)

# Algoritmos de aprendizaje

### Chow-Liu

Aplicamos el algoritmo de Chow-Liu para aprender una estructura de árbol:

In [None]:
# Learn a tree-structured Bayesian network with the Chow-Liu algorithm, rooted
# at column index 8, while measuring wall time (%time) and memory (%memit).
# Node names are taken from the column names of `data`.
# NOTE(review): the model is learned from ALL of `complete_data`, which also
# contains the rows held out as X_complete_test, so the validation metrics
# computed from this model are not measured on unseen records. Fitting only on
# the training partition would avoid that, but may then require handling bins
# that never appear in training — confirm before changing.
%time %memit pomegranate_tree = BayesianNetwork.from_samples(complete_data, algorithm='chow-liu', root=8, \
                                                             state_names=list(data))

# Finalize the network structure before it is plotted and queried.
pomegranate_tree.bake()

Visualizamos el árbol resultante:

In [None]:
# Draw the learned Chow-Liu tree.
plot_pomegranate_bn_nx(
    pomegranate_tree,
    node_color='pink',
    node_size=1000,
)

Podemos hacer inferencia, prediciendo la variable *Outcome* para los registros de entrenamiento:

In [None]:
nan_vec = np.array([[np.NaN]*X_complete_train.shape[0]]).T
complete_train_uknown_outcome = np.concatenate((X_complete_train.to_numpy(),nan_vec), axis=1)

%time %memit y_pred_train = np.array(pomegranate_tree.predict(complete_train_uknown_outcome))[:,8]

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped, the report's precision and recall columns are exchanged and the
# support counts describe the predictions instead of the real labels.
print(classification_report(list(y_complete_train), list(y_pred_train)))
print(accuracy_score(list(y_complete_train), list(y_pred_train)))

O sobre los registros reservados en la partición de validación:

In [None]:
nan_vec = np.array([[np.NaN]*X_complete_test.shape[0]]).T
complete_data_matrix_uknown_outcome = np.concatenate((X_complete_test.to_numpy(),nan_vec), axis=1)

%time %memit y_pred = np.array(pomegranate_tree.predict(complete_data_matrix_uknown_outcome))[:,8]

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped, the report's precision and recall columns are exchanged and the
# support counts describe the predictions instead of the real labels.
print(classification_report(list(y_complete_test), list(y_pred)))
print(accuracy_score(list(y_complete_test), list(y_pred)))

# Comparación Chow-Liu vs Greedy vs Exact

In [None]:
# Learn one network per structure-learning algorithm, timing (%time) and
# memory-profiling (%memit) each fit. Node names come from the columns of `data`.
# NOTE(review): Chow-Liu is fitted on `complete_data` (rows without missing
# values) while greedy and exact are fitted on `data` (rows that may contain
# NaN), so the three models are not trained on the same records.
# NOTE(review): both training inputs include the rows later used as
# X_complete_test / X_test, so the comparison is not made on unseen data — confirm
# whether that is intended.
%time %memit chow_liu_model = BayesianNetwork.from_samples(complete_data, algorithm='chow-liu', root=8, \
                                                             state_names=list(data))
%time %memit greedy_model = BayesianNetwork.from_samples(data, algorithm='greedy', state_names=list(data))
%time %memit exact_model = BayesianNetwork.from_samples(data, algorithm='exact', state_names=list(data))

# Finalize each network before it is plotted and queried.
chow_liu_model.bake()
greedy_model.bake()
exact_model.bake()

In [None]:
# Draw the three learned structures with one shared node style; only the
# exact model is given an explicit (planar) layout.
node_style = {'node_size': 1000, 'node_color': 'pink'}
plot_pomegranate_bn_nx(chow_liu_model, **node_style)
plot_pomegranate_bn_nx(greedy_model, **node_style)
plot_pomegranate_bn_nx(exact_model, layout=nx.planar_layout, **node_style)

In [None]:
nan_vec = np.array([[np.NaN]*X_complete_test.shape[0]]).T
complete_data_matrix_uknown_outcome = np.concatenate((X_complete_test.to_numpy(),nan_vec), axis=1)

%time %memit y_pred_complete_chow_liu = np.array(chow_liu_model.predict(complete_data_matrix_uknown_outcome))[:,8]

nan_vec = np.array([[np.NaN]*X_test.shape[0]]).T
data_matrix_uknown_outcome = np.concatenate((X_test.to_numpy(),nan_vec), axis=1)

# %time %memit y_pred_chow_liu = np.array(chow_liu_model.predict(data_matrix_uknown_outcome))[:,8]
%time %memit y_pred_greedy = np.array(greedy_model.predict(data_matrix_uknown_outcome))[:,8]
%time %memit y_pred_exact = np.array(exact_model.predict(data_matrix_uknown_outcome))[:,8]

In [None]:
# scikit-learn's classification_report takes (y_true, y_pred) in that order.
# With the arguments swapped, precision and recall are exchanged and the
# support counts describe the predictions instead of the real labels.
# Chow-Liu is scored on the complete-rows partition; greedy and exact on the
# partition of the full dataset.
print("Chow-Liu:\n", classification_report(list(y_complete_test), list(y_pred_complete_chow_liu)))
print("Greedy:\n", classification_report(list(y_test), list(y_pred_greedy)))
print("Exact:\n", classification_report(list(y_test), list(y_pred_exact)))