# Implementing Bayesian Network for Hott Partitions

In [1]:
# Base directory holding the partition CSV files; replace the placeholder before running
path = r'----path to data----'

In [4]:
import os
import time
import random
import pickle
import numpy as np
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import BayesianEstimator
from pgmpy.models import BayesianModel
from pgmpy.sampling import BayesianModelSampling

# Wall-clock reference used to report the total execution time at the end of the run
start_time = time.time()

# Suppressing unnecessary warnings
# NOTE: the "ignore" action is installed without any category/module filter,
# so it applies to every warning raised afterwards, not only library noise.
import warnings
warnings.filterwarnings("ignore")

# Module-level accumulator: bayesian_net_fit appends one synthetic sample per partition
synthetic_partitions = []

class hottbayesian_net(object):
    """Generate synthetic data per partition with a chain-structured Bayesian network.

    For every entry of ``subpaths`` a CSV file is loaded from ``sample_path``,
    a Bayesian network whose edges link each column to the next one is fitted
    on it, and a synthetic sample with as many rows as the input is drawn,
    appended to the module-level ``synthetic_partitions`` list and pickled.

    Parameters
    ----------
    subpaths : list of str
        Base names (without the ``.csv`` extension) of the partition files.
    sample_path : str
        Directory the partitions are read from; the generated samples are
        written below ``<sample_path>/samples/hottbayesian/``.
    seed : int, optional
        Seed applied to ``random`` and ``numpy.random`` before generation.
        ``None`` (the default) leaves the global RNG state untouched.
    """

    def __init__(self, subpaths, sample_path, seed=None):
        self.subpaths = subpaths
        self.sample_path = sample_path
        # Store the caller's seed; it used to be overwritten with None, which
        # silently disabled the seeding done in bayesian_net_fit.
        self.seed = seed

    def bayesian_net(self, data, no):
        """Fit a chain Bayesian network on ``data`` and sample from it.

        Parameters
        ----------
        data : pandas.DataFrame
            Training data; every column becomes a node and each column is
            connected to the column that follows it.
        no : int
            Currently unused; kept for interface compatibility. The sample
            size is always ``len(data)``.

        Returns
        -------
        The object returned by ``BayesianModelSampling.forward_sample`` for
        ``len(data)`` samples.
        """
        columns = list(data.columns)

        # Initialize the model and declare one node per column
        model = BayesianNetwork()
        for col in columns:
            model.add_node(col)

        # Chain structure: column i -> column i + 1
        for parent, child in zip(columns, columns[1:]):
            model.add_edge(parent, child)

        # NOTE(review): an unused BayesianEstimator(model, data) instance was
        # created here and never passed to fit(); it has been removed. fit()
        # therefore keeps using pgmpy's default estimator. If Bayesian
        # parameter estimation was intended, pass
        # ``estimator=BayesianEstimator`` to fit() -- confirm with the author.
        model.fit(data)

        # Draw as many synthetic rows as there are training rows
        sampler = BayesianModelSampling(model)
        return sampler.forward_sample(len(data))

    def bayesian_net_fit(self):
        """Start the training and data generation.

        For each partition name in ``self.subpaths``: read
        ``<sample_path>/<name>.csv``, drop its ``ind`` column, generate a
        synthetic sample, append it to ``synthetic_partitions`` and pickle it
        to ``<sample_path>/samples/hottbayesian/<name>/<name>.pk``.

        Raises
        ------
        KeyError
            If a partition file has no ``ind`` column (raised by pandas).
        """
        # `is not None` so that seed=0 is honoured as a valid seed
        if self.seed is not None:
            random.seed(self.seed)  # replicable
            np.random.seed(self.seed)

        # Iterating over partition names; the index identifies the pattern
        for index, sub_path in enumerate(self.subpaths):

            # os.path.join instead of a hard-coded '\\' keeps this portable
            csv_path = os.path.join(self.sample_path, sub_path + '.csv')
            data = pd.read_csv(csv_path)
            data = data.drop('ind', axis=1)

            print('now running Bayesian for pattern :', index)

            # Generating synthetic data
            dataX_hat = self.bayesian_net(data, data.shape[0])
            synthetic_partitions.append(dataX_hat)

            # Persist the sample, creating the target directory when missing
            out_dir = os.path.join(
                self.sample_path, 'samples', 'hottbayesian', sub_path
            )
            os.makedirs(out_dir, exist_ok=True)
            with open(os.path.join(out_dir, sub_path + '.pk'), 'wb') as f:
                pickle.dump(dataX_hat, f)
                
# ------------------ Execution ---------------------
# Partition base names; each one is loaded as '<name>.csv' from `path`
subpaths = ['sub1', 'sub2', 'sub3', 'sub4', 'sub5']
sample_path = path

# Build the generator and produce one synthetic sample per partition
my_bn = hottbayesian_net(subpaths=subpaths, sample_path=sample_path, seed=1)
my_bn.bayesian_net_fit()

# Report the wall-clock time elapsed since start_time was taken
elapsed_seconds = time.time() - start_time
print("--- Execution Time for Hott-Bayesian Network: %s seconds ---" % (elapsed_seconds,))

now running Bayesian for pattern : 0


  0%|          | 0/2 [00:00<?, ?it/s]

now running Bayesian for pattern : 1


  0%|          | 0/3 [00:00<?, ?it/s]

now running Bayesian for pattern : 2


  0%|          | 0/4 [00:00<?, ?it/s]

now running Bayesian for pattern : 3


  0%|          | 0/5 [00:00<?, ?it/s]

now running Bayesian for pattern : 4


  0%|          | 0/6 [00:00<?, ?it/s]

--- Execution Time for Hott-Bayesian Network: 71.30917477607727 seconds ---


In [5]:
synthetic_partitions[0].head()

Unnamed: 0,0,1
0,-28.631023,-87.150273
1,-141.275976,2.768324
2,-137.786146,-93.48031
3,-15.881477,-72.8511
4,-110.943941,-103.660544
