# Importación de módulos generales

In [1]:
%pylab inline
%load_ext memory_profiler

# %pylab

import os
import tempfile
import pandas as pd
# import numpy as np
import networkx as nx
# import matplotlib
# import pylab  as plt
import pygraphviz

from pomegranate import BayesianNetwork

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

Populating the interactive namespace from numpy and matplotlib


# Funciones auxiliares visualización

Importamos las funciones auxiliares para visualizar redes que hemos definido:
- **plot_pomegranate_bn_nx**(pgm, layout=None, node_size=2000, node_color='pink')
- **plot_pomegranate_bn_pgvz**(pgm, filename=None, prog='dot', color='red')
- **plot_pgm_bn**(pgm, layout=None, node_size=2000, node_color='pink')

In [2]:
from funciones_auxiliares import *

# Lectura de datos

Los datos originales proceden del *UCSD Data Mining Contest 2009* (Task 1). **TODO:** añadir el enlace a la fuente original.

In [3]:
# Features and labels live in two separate files of the same contest task.
inputs_path = "data/fraud_detection/ucsd/DataminingContest2009.Task1.Train.Inputs"
targets_path = "data/fraud_detection/ucsd/DataminingContest2009.Task1.Train.Targets"

X = pd.read_csv(inputs_path)
# The targets file is read without a header row; its only column is named 'target'.
y = pd.read_csv(targets_path, header=None, names=['target'])

# Join features and labels row by row through their shared index.
data = pd.merge(X, y, left_index=True, right_index=True)
X.head()

Unnamed: 0,amount,hour1,state1,zip1,field1,domain1,field2,hour2,flag1,total,field3,field4,field5,indicator1,indicator2,flag2,flag3,flag4,flag5
0,12.95,0,CA,925,3,AOL.COM,1,0,1,12.95,-4276,7,0,1,0,1,1,0,1
1,11.01,0,CA,925,3,AOL.COM,1,0,1,11.01,-4276,7,0,1,0,1,1,0,1
2,38.85,0,CA,928,3,HOTMAIL.COM,1,0,0,38.85,2602,21,1,0,0,0,0,0,1
3,25.9,0,NJ,77,0,AOL.COM,1,0,0,25.9,4139,6,0,0,0,1,1,0,1
4,12.95,0,CA,945,3,YAHOO.COM,0,0,1,12.95,3826,9,1,0,0,1,0,0,1


Vemos si hay valores nulos:

In [4]:
# Number of missing values per feature column.
print(X.isna().sum())

amount        0
hour1         0
state1        0
zip1          0
field1        0
domain1       1
field2        0
hour2         0
flag1         0
total         0
field3        0
field4        0
field5        0
indicator1    0
indicator2    0
flag2         0
flag3         0
flag4         0
flag5         0
dtype: int64


Encontramos el índice del valor nulo:

In [5]:
# Index label of the first row whose 'domain1' value is missing
# (idxmax on a boolean mask returns the first True position).
domain_is_missing = X['domain1'].isna()
null_index = domain_is_missing.idxmax()
null_index

70382

Lo eliminamos tanto de X como de y:

In [6]:
# Remove every row of X that contains a missing value, then keep exactly the same
# rows in y. Selecting y through X's index (instead of dropping one precomputed
# label) keeps features and labels aligned for any number of incomplete rows and
# makes this cell safe to re-run: dropping an already-removed label would raise.
X.dropna(inplace=True)
y = y.loc[X.index]
assert X.index.equals(y.index), "X and y must cover exactly the same rows"

# Joined frame (features + target) restricted to rows without missing values.
complete_data = data.dropna()

Vemos el número de valores únicos de cada variable

In [7]:
# Cardinality of every feature, computed in one pass over the frame.
for col, n_unique in X.nunique().items():
    print("{}: {}".format(col, n_unique))

amount: 88
hour1: 24
state1: 53
zip1: 899
field1: 5
domain1: 9809
field2: 2
hour2: 24
flag1: 2
total: 88
field3: 15786
field4: 38
field5: 25
indicator1: 2
indicator2: 2
flag2: 2
flag3: 2
flag4: 2
flag5: 36


Discretizamos las variables numéricas:

In [8]:
# Continuous features to be replaced by equal-width interval categories,
# and the number of intervals used for each of them (same order).
columns_to_discretize = ['amount', 'total', 'field3']
num_bins = [20, 20, 100]

bins_per_column = dict(zip(columns_to_discretize, num_bins))
for col in columns_to_discretize:
    # pd.cut with an integer `bins` splits the column's value range into that many equal-width intervals.
    X[col] = pd.cut(X[col], bins=bins_per_column[col])

X.head()

Unnamed: 0,amount,hour1,state1,zip1,field1,domain1,field2,hour2,flag1,total,field3,field4,field5,indicator1,indicator2,flag2,flag3,flag4,flag5
0,"(9.54, 14.31]",0,CA,925,3,AOL.COM,1,0,1,"(9.54, 14.31]","(-4348.98, -3944.4]",7,0,1,0,1,1,0,1
1,"(9.54, 14.31]",0,CA,925,3,AOL.COM,1,0,1,"(9.54, 14.31]","(-4348.98, -3944.4]",7,0,1,0,1,1,0,1
2,"(38.16, 42.93]",0,CA,928,3,HOTMAIL.COM,1,0,0,"(38.16, 42.93]","(2528.88, 2933.46]",21,1,0,0,0,0,0,1
3,"(23.85, 28.62]",0,NJ,77,0,AOL.COM,1,0,0,"(23.85, 28.62]","(3742.62, 4147.2]",6,0,0,0,1,1,0,1
4,"(9.54, 14.31]",0,CA,945,3,YAHOO.COM,0,0,1,"(9.54, 14.31]","(3742.62, 4147.2]",9,1,0,0,1,0,0,1


Dividimos los datos en conjunto de entrenamiento y validación

In [9]:
# Fixed seed so the partitions (and every metric computed from them) are
# reproducible across kernel restarts.
SPLIT_SEED = 42

# NOTE: rows with missing values were already removed from X and y in an earlier
# cell, so the "complete" names below are aliases of the same data.
X_complete = X
y_complete = y

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=SPLIT_SEED)
X_complete_train, X_complete_test, y_complete_train, y_complete_test = train_test_split(
    X_complete, y_complete, test_size=1/3, random_state=SPLIT_SEED)

print("Particiones obtenidas sobre los datos iniciales:")
for label, part in [("X_train: ", X_train), ("y_train: ", y_train),
                    ("X_test:  ", X_test), ("y_test:  ", y_test)]:
    print(label, part.shape)

print("\nParticiones obtenidas sobre los datos con los registros completos:")
for label, part in [("X_complete_train: ", X_complete_train), ("y_complete_train: ", y_complete_train),
                    ("X_complete_test:  ", X_complete_test), ("y_complete_test:  ", y_complete_test)]:
    print(label, part.shape)

Particiones obtenidas sobre los datos iniciales:
X_train:  (63120, 19)
y_train:  (63120, 1)
X_test:   (31561, 19)
y_test:   (31561, 1)

Particiones obtenidas sobre los datos con los registros completos:
X_complete_train:  (63120, 19)
y_complete_train:  (63120, 1)
X_complete_test:   (31561, 19)
y_complete_test:   (31561, 1)


# Algoritmos de aprendizaje

### Chow-Liu

Aplicamos el algoritmo de Chow-Liu para aprender una estructura de árbol:

In [None]:
# %time %memit pomegranate_tree = BayesianNetwork.from_samples(complete_data, algorithm='chow-liu', root=19, \
#                                                              state_names=list(data))
%time %memit pomegranate_tree = BayesianNetwork.from_samples(complete_data, algorithm='chow-liu', root=19, \
                                                             state_names=list(data), max_parents=1)

pomegranate_tree.bake()

Visualizamos el árbol resultante:

In [None]:
# Draw the learned tree structure with networkx.
plot_pomegranate_bn_nx(
    pomegranate_tree,
    node_color='pink',
    node_size=1000,
)

Podemos hacer inferencia, prediciendo la variable *target* para los registros de entrenamiento:

In [None]:
nan_vec = np.array([[np.NaN]*X_complete_train.shape[0]]).T
complete_train_uknown_outcome = np.concatenate((X_complete_train.to_numpy(),nan_vec), axis=1)

%time %memit y_pred_train = np.array(pomegranate_tree.predict(complete_train_uknown_outcome))[:,19]

In [None]:
# Use the label column itself: list(DataFrame) yields the column NAMES (['target']),
# not the label values, which makes the metrics fail on a length mismatch.
y_true_train = y_complete_train['target']

# scikit-learn expects (y_true, y_pred); swapping them exchanges precision and recall.
print(classification_report(y_true_train, list(y_pred_train)))
print(accuracy_score(y_true_train, list(y_pred_train)))

O sobre los registros reservados en la partición de validación:

In [None]:
nan_vec = np.array([[np.NaN]*X_complete_test.shape[0]]).T
complete_data_matrix_uknown_outcome = np.concatenate((X_complete_test.to_numpy(),nan_vec), axis=1)

%time %memit y_pred = np.array(pomegranate_tree.predict(complete_data_matrix_uknown_outcome))[:,19]

In [None]:
# Use the label column itself: list(DataFrame) yields the column NAMES (['target']),
# not the label values, which makes the metrics fail on a length mismatch.
y_true_test = y_complete_test['target']

# scikit-learn expects (y_true, y_pred); swapping them exchanges precision and recall.
print(classification_report(y_true_test, list(y_pred)))
print(accuracy_score(y_true_test, list(y_pred)))

# Comparación Chow-Liu vs Greedy vs Exact

In [None]:
%time %memit chow_liu_model = BayesianNetwork.from_samples(complete_data, algorithm='chow-liu', root=19, \
                                                             state_names=list(data))
%time %memit greedy_model = BayesianNetwork.from_samples(data, algorithm='greedy', state_names=list(data))
%time %memit exact_model = BayesianNetwork.from_samples(data, algorithm='exact', state_names=list(data))

chow_liu_model.bake()
greedy_model.bake()
exact_model.bake()

In [None]:
# Draw the three learned structures in order; only the exact model overrides the
# plotting helper's default layout (None).
models_and_layouts = [
    (chow_liu_model, None),
    (greedy_model, None),
    (exact_model, nx.planar_layout),
]
for bn_model, bn_layout in models_and_layouts:
    plot_pomegranate_bn_nx(bn_model, layout=bn_layout, node_size=1000, node_color='pink')

In [None]:
nan_vec = np.array([[np.NaN]*X_complete_test.shape[0]]).T
complete_data_matrix_uknown_outcome = np.concatenate((X_complete_test.to_numpy(),nan_vec), axis=1)

%time %memit y_pred_complete_chow_liu = np.array(chow_liu_model.predict(complete_data_matrix_uknown_outcome))[:,19]

nan_vec = np.array([[np.NaN]*X_test.shape[0]]).T
data_matrix_uknown_outcome = np.concatenate((X_test.to_numpy(),nan_vec), axis=1)

# %time %memit y_pred_chow_liu = np.array(chow_liu_model.predict(data_matrix_uknown_outcome))[:,19]
%time %memit y_pred_greedy = np.array(greedy_model.predict(data_matrix_uknown_outcome))[:,19]
%time %memit y_pred_exact = np.array(exact_model.predict(data_matrix_uknown_outcome))[:,19]

In [None]:
# Use the label columns themselves: list(DataFrame) yields the column NAMES
# (['target']), not the label values, which makes the metrics fail on a length mismatch.
y_true_complete_test = y_complete_test['target']
y_true_test = y_test['target']

# scikit-learn expects (y_true, y_pred); swapping them exchanges precision and recall.
print("Chow-Liu:\n", classification_report(y_true_complete_test, list(y_pred_complete_chow_liu)))
# print(classification_report(y_true_test, list(y_pred_chow_liu)))
print("Greedy:\n", classification_report(y_true_test, list(y_pred_greedy)))
print("Exact:\n", classification_report(y_true_test, list(y_pred_exact)))