In [55]:
import pandas as pd
import numpy as np
from pgmpy.estimators import K2Score
from pgmpy.models import BayesianModel
from pgmpy.estimators import HillClimbSearch, BayesianEstimator

def load_data(DATA_CSV):
    D = pd.read_csv(DATA_CSV)
    V = D.columns
    N = len(D.index)
    V_CARD = {v: len(D[v].unique()) for v in V}
    #print(f'ARQUIVO: {D}')
    print(f'VARIÁVEIS na ordem inicial: {V}')
    #print(f'NÚMERO DE AMOSTRAS: {N}')
    #print(f'MAPEAMENTO DAS VARIÁVEIS COM NÚMEROS DE VALORES ÚNICOS: {V_CARD}')
    return D, V, N, V_CARD

def calcular_k2(D):
    k2score = K2Score(D)
    return k2score

def estimar_modelo(D, scoring_method):
    estimator_k2 = HillClimbSearch(D)
    max_possible_edges = len(V) * (len(V) - 1) / 2
    max_iter = min(max_possible_edges, 1000)
    best_model = estimator_k2.estimate(scoring_method='k2score', tabu_length=50, max_indegree=4, max_iter=max_iter)
    return best_model

def tabular_cpd(best_model, D):
    bayesian_network = BayesianModel(best_model)
    estimator = BayesianEstimator(bayesian_network, D)
    cpds = []
    for node in bayesian_network.nodes():
        cpd = estimator.estimate_cpd(node)
        cpds.append(cpd)
    return cpds, bayesian_network

# Caminho do arquivo CSV
DATA_CSV = 'contact-lenses.csv'

# Carregar dados
D, V, N, V_CARD = load_data(DATA_CSV)

# Embaralhar os dados
D_shuffled_columns = D.sample(frac=1, axis=1)
#gerando arquivos csv para cada ordenação
D_shuffled_columns.to_csv('data_shuffled.csv', index=False)

V_shuffled = D_shuffled_columns.columns
print(f'Dados embaralhados: {V_shuffled}')


# Calcula a pontuação K2
k2score = calcular_k2(D_shuffled_columns)

# Estima a estrutura do modelo com o K2
best_model = estimar_modelo(D_shuffled_columns, k2score)
print(f'Melhor modelo: {best_model}')

# Valor do Score gerado pelo K2
k2_score = k2score.score(best_model)
print(f'Valor do score K2: {k2_score}')

# Exibe a estrutura do modelo
structure = (best_model)
print(f'Estrutura da rede: {structure}')

# Estima as CPDs e passa a bayesian_network
cpds, bayesian_network = tabular_cpd(best_model, D_shuffled_columns)
print(f'CPDs: {cpds}')
print(f'Bayesian Network: {bayesian_network}')


VARIÁVEIS na ordem inicial: Index(['age', 'spectacle-prescrip', 'astigmatism', 'tear-prod-rate',
       'contact-lenses'],
      dtype='object')
Dados embaralhados: Index(['tear-prod-rate', 'spectacle-prescrip', 'astigmatism', 'age',
       'contact-lenses'],
      dtype='object')


 20%|██        | 2/10 [00:00<00:00, 19.10it/s]


Melhor modelo: DAG with 5 nodes and 2 edges
Valor do score K2: -98.72503647411928
Estrutura da rede: DAG with 5 nodes and 2 edges
CPDs: [<TabularCPD representing P(tear-prod-rate:2 | contact-lenses:3) at 0x2570dca8f80>, <TabularCPD representing P(spectacle-prescrip:2) at 0x2570dca8b30>, <TabularCPD representing P(astigmatism:2 | contact-lenses:3) at 0x2570dcab890>, <TabularCPD representing P(age:3) at 0x2570dcaa600>, <TabularCPD representing P(contact-lenses:3) at 0x2570dcaaba0>]
Bayesian Network: BayesianModel with 5 nodes and 2 edges


In [93]:
import pandas as pd
import numpy as np
from pgmpy.estimators import K2Score
from pgmpy.models import BayesianModel
from pgmpy.estimators import HillClimbSearch, BayesianEstimator

def load_data(DATA_CSV):
    D = pd.read_csv(DATA_CSV)
    V = D.columns
    N = len(D.index)
    V_CARD = {v: len(D[v].unique()) for v in V}
    print(f'ARQUIVO: {D}')
    print(f'VARIÁVEIS: {V}')
    #print(f'NÚMERO DE AMOSTRAS: {N}')
    #print(f'MAPEAMENTO DAS VARIÁVEIS COM NÚMEROS DE VALORES ÚNICOS: {V_CARD}')
    return D, V, N, V_CARD

def calcular_k2(D):
    k2score = K2Score(D)
    return k2score

def estimar_modelo(D, V, V_CARD):
    estimator_k2 = HillClimbSearch(D)
    max_possible_edges = len(V) * (len(V) - 1) / 2
    max_iter = min(max_possible_edges, 1000)
    best_model = estimator_k2.estimate(scoring_method='k2score', tabu_length=50, max_indegree=4, max_iter=max_iter)
    return best_model

def tabular_cpd(best_model, D):
    bayesian_network = BayesianModel(best_model)
    estimator = BayesianEstimator(bayesian_network, D)
    cpds = []
    for node in bayesian_network.nodes():
        cpd = estimator.estimate_cpd(node)
        cpds.append(cpd)
    return cpds, bayesian_network

def generate_shuffled_csv(DATA_CSV):
    D = pd.read_csv(DATA_CSV)
    D_shuffled_columns = D.sample(frac=1, axis=1)
    D_shuffled_columns.to_csv('data_shuffled.csv', index=False)

# Caminho do arquivo CSV original
DATA_CSV = 'contact-lenses.csv'

# Gerar um novo arquivo CSV com os dados embaralhados
generate_shuffled_csv(DATA_CSV)

# Carregar dados
D, V, N, V_CARD = load_data('data_shuffled.csv')

# Calcula a pontuação K2
k2score = calcular_k2(D)

# Estima a estrutura do modelo com o K2
best_model = estimar_modelo(D, V, V_CARD)
print(f'Melhor modelo: {best_model}')

# Valor do Score gerado pelo K2
k2_score = k2score.score(best_model)
print(f'Valor do score K2: {k2_score}')

# Exibe a estrutura do modelo
structure = (best_model)
print(f'Estrutura da rede: {structure}')

# Estima as CPDs e passa a bayesian_network
cpds, bayesian_network = tabular_cpd(best_model, D)
print(f'CPDs: {cpds}')
print(f'Bayesian Network: {bayesian_network}')


ARQUIVO:    astigmatism             age contact-lenses spectacle-prescrip  \
0           no           young           none              myope   
1           no           young           soft              myope   
2          yes           young           none              myope   
3          yes           young           hard              myope   
4           no           young           none       hypermetrope   
5           no           young           soft       hypermetrope   
6          yes           young           none       hypermetrope   
7          yes           young           hard       hypermetrope   
8           no  pre-presbyopic           none              myope   
9           no  pre-presbyopic           soft              myope   
10         yes  pre-presbyopic           none              myope   
11         yes  pre-presbyopic           hard              myope   
12          no  pre-presbyopic           none       hypermetrope   
13          no  pre-presbyopic         

 20%|██        | 2/10 [00:00<00:00, 27.34it/s]


Melhor modelo: DAG with 5 nodes and 2 edges
Valor do score K2: -98.72503647411929
Estrutura da rede: DAG with 5 nodes and 2 edges
CPDs: [<TabularCPD representing P(astigmatism:2 | contact-lenses:3) at 0x2570dca4740>, <TabularCPD representing P(age:3) at 0x2570dca3ef0>, <TabularCPD representing P(contact-lenses:3) at 0x2570dca0fb0>, <TabularCPD representing P(spectacle-prescrip:2) at 0x2570dc9fe00>, <TabularCPD representing P(tear-prod-rate:2 | contact-lenses:3) at 0x2570dca11f0>]
Bayesian Network: BayesianModel with 5 nodes and 2 edges
