In [1]:
import warnings
warnings.filterwarnings('ignore')
import pgmpy
from pgmpy.readwrite import BIFReader
from pgmpy.sampling import BayesianModelSampling
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import pickle
import networkx as nx
import pgmpy
from pgmpy.estimators import BDeuScore, K2Score, BicScore
from pgmpy.estimators import PC, HillClimbSearch, ExhaustiveSearch
from IPython import display
import pandas as pd

In [2]:
def _parent_index_lists(nodes, edges):
    """Return one list per node: ``[own_index, parent_index, ...]``.

    Indices are positions in ``nodes``. Parent indices are appended in the
    order their incoming edges appear in ``edges``.
    """
    node_to_idx = {node: idx for idx, node in enumerate(nodes)}
    local_lists = [[idx] for idx in range(len(nodes))]
    # Single pass over the edges instead of rescanning every edge per node.
    for parent, child in edges:
        local_lists[node_to_idx[child]].append(node_to_idx[parent])
    return local_lists


def preprocess(dataset_name, num_samples=10000, ntrails=10,
               label_node='xray', domain_node='asia', seed=None):
    """Sample a Bayesian network from a BIF file and save training artifacts.

    Reads ``./data/<dataset_name>.bif``, forward-samples ``num_samples`` rows,
    and splits the columns into features (every node except ``label_node`` and
    ``domain_node``), label (``label_node``) and domain (``domain_node``).

    Files written, all prefixed with ``./data/<dataset_name>``:

    * ``_model.pkl``    - the pickled model returned by the BIF reader
    * ``_layout.pkl``   - spring layout of the feature-only graph
    * ``_nfeatures.npy``, ``_nclass.npy``, ``_ndomain.npy`` - number of
      distinct sampled values per feature node / label / domain
    * ``_encoders.pkl`` - tuple ``(x_enc, y_enc, d_enc)`` of fitted encoders
    * ``_{x,y,d}_{train,test}.npy`` - one-hot arrays; the first
      ``num_samples // 5 * 4`` rows are train, the remainder test
    * ``_network.pkl``  - per-node lists ``[node_idx, parent_idx, ...]``

    Parameters
    ----------
    dataset_name : str
        Base name of the BIF file under ``./data`` and prefix of all outputs.
    num_samples : int
        Number of rows to sample.
    ntrails : int
        Currently unused; kept so existing callers keep working.
    label_node : str
        Node whose samples become ``y``; removed from the feature graph.
    domain_node : str
        Node whose samples become ``d``; removed from the feature graph.
    seed : int or None
        When given, NumPy's global RNG is seeded before sampling and the same
        seed is passed to the layout. ``None`` keeps the unseeded behaviour.

    Raises
    ------
    KeyError
        If ``label_node`` or ``domain_node`` is not a node of the model.
    """
    data_prefix = './data/{}'.format(dataset_name)

    reader = BIFReader(data_prefix + '.bif')
    bayesmodel = reader.get_model()

    graph = nx.DiGraph(bayesmodel)
    for required in (label_node, domain_node):
        if required not in graph:
            raise KeyError(
                'node {!r} not found in model {!r}'.format(required, dataset_name))

    with open(data_prefix + '_model.pkl', 'wb') as f:
        pickle.dump(bayesmodel, f)

    if seed is not None:
        # NOTE(review): presumably the pgmpy sampler draws from NumPy's global
        # RNG, so seeding it here makes the samples repeatable - confirm for
        # the installed pgmpy version.
        np.random.seed(seed)
    samples_dataframe = BayesianModelSampling(bayesmodel).forward_sample(
        size=num_samples, return_type='dataframe')

    # Feature graph: the model without the label and domain nodes.
    graph.remove_node(label_node)
    graph.remove_node(domain_node)
    nodes = list(graph.nodes())
    edges = list(graph.edges())

    # Select feature columns in graph-node order so that column block i of the
    # encoded features, nfeatures[i] and index i in the saved network all
    # refer to the same node, whatever column order the sampler returned.
    x_df = samples_dataframe[nodes]
    y_df = samples_dataframe[[label_node]]
    d_df = samples_dataframe[[domain_node]]

    layout = nx.spring_layout(graph, seed=seed)
    with open(data_prefix + '_layout.pkl', 'wb') as f:
        pickle.dump(layout, f)

    nfeatures = np.array([x_df[node].nunique() for node in nodes])
    nclass = y_df[label_node].nunique()
    ndomain = d_df[domain_node].nunique()
    np.save(data_prefix + '_nfeatures.npy', nfeatures)
    np.save(data_prefix + '_nclass.npy', nclass)
    np.save(data_prefix + '_ndomain.npy', ndomain)

    x_enc, y_enc, d_enc = OneHotEncoder(), OneHotEncoder(), OneHotEncoder()
    x = x_enc.fit_transform(x_df).toarray()
    y = y_enc.fit_transform(y_df).toarray()
    d = d_enc.fit_transform(d_df).toarray()
    with open(data_prefix + '_encoders.pkl', 'wb') as f:
        pickle.dump((x_enc, y_enc, d_enc), f)

    # First four fifths of the rows are the training split, the rest is test.
    split = num_samples // 5 * 4
    for tag, encoded in (('x', x), ('y', y), ('d', d)):
        np.save('{}_{}_train.npy'.format(data_prefix, tag), encoded[:split, :])
        np.save('{}_{}_test.npy'.format(data_prefix, tag), encoded[split:, :])

    parent_lists = _parent_index_lists(nodes, edges)
    with open(data_prefix + '_network.pkl', 'wb') as f:
        pickle.dump(parent_lists, f)

In [3]:
# Base names of the datasets to preprocess; each name is passed to
# preprocess(), which reads ./data/<name>.bif.
DATASET_NAMES = ['asia']

# Run preprocessing once per dataset, using preprocess()'s default arguments.
for dataset_name in DATASET_NAMES:
    preprocess(dataset_name)

Generating for node: xray: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 53.64it/s]
