<a href="https://colab.research.google.com/github/VineetMalik14/Causal-Structure-Learning-Survey/blob/main/discrete_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pgmpy
!pip install pgmpy networkx numpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import networkx as nx
from pgmpy.models import BayesianModel
from pgmpy.sampling import BayesianModelSampling
from pgmpy.factors.discrete import TabularCPD

def generate_random_dag(num_nodes, num_edges):
    """Build a random directed acyclic graph by rejection sampling.

    Nodes are labelled ``0 .. num_nodes - 1``. Ordered pairs of distinct nodes
    are drawn with ``np.random.choice`` and the edge ``u -> v`` is kept only
    when it is not already present and does not close a directed cycle. The
    loop stops once the graph holds ``num_edges`` edges.

    Args:
        num_nodes: Number of nodes to create.
        num_edges: Number of directed edges the returned graph must contain.
            Values <= 0 yield a graph without edges.

    Returns:
        networkx.DiGraph: An acyclic graph with ``num_nodes`` nodes and
        ``num_edges`` edges.

    Raises:
        ValueError: If ``num_edges`` is larger than
            ``num_nodes * (num_nodes - 1) / 2``, the maximum number of edges a
            DAG on ``num_nodes`` nodes can hold. Without this guard the
            sampling loop below would never terminate.
    """
    # Validate before touching the graph: a DAG admits at most one edge per
    # unordered node pair, so larger requests can never be satisfied.
    usable_nodes = max(num_nodes, 0)
    max_edges = usable_nodes * (usable_nodes - 1) // 2
    if num_edges > max_edges:
        raise ValueError(
            f"num_edges={num_edges} exceeds the maximum of {max_edges} edges "
            f"for a DAG with {num_nodes} nodes"
        )

    G = nx.DiGraph()
    G.add_nodes_from(range(num_nodes))

    while G.number_of_edges() < num_edges:
        # replace=False guarantees u != v, so no self-loop check is needed.
        # Cast to plain ints so edge endpoints have the same type as the
        # node labels created from range() above.
        u, v = (int(node) for node in
                np.random.choice(num_nodes, size=2, replace=False))
        # An existing path v -> ... -> u means adding u -> v would close a
        # cycle; testing reachability first avoids the add-then-remove dance.
        if G.has_edge(u, v) or nx.has_path(G, v, u):
            continue
        G.add_edge(u, v)

    return G

def generate_random_cpd(variable, variable_card, evidence, evidence_cards):
    """Create a ``TabularCPD`` for ``variable`` with randomly drawn probabilities.

    Every column of the probability table is an independent sample from a
    Dirichlet distribution whose concentration parameters are all 1, so each
    column has ``variable_card`` entries that sum to one.

    Args:
        variable: Name/label of the variable the CPD is defined on.
        variable_card: Number of states of ``variable`` (rows of the table).
        evidence: Sequence of parent variables; empty for a root variable.
        evidence_cards: Number of states of each parent, in the same order as
            ``evidence``. Only read when ``evidence`` is non-empty.

    Returns:
        TabularCPD: A table with ``variable_card`` rows and one column per
        joint parent configuration (a single column when there are no
        parents).
    """
    flat_prior = np.ones(variable_card)

    # Root variable: one marginal distribution, shaped as a single column.
    if not len(evidence):
        marginal = np.random.dirichlet(flat_prior).reshape(-1, 1)
        return TabularCPD(variable, variable_card, marginal)

    # One distribution per joint parent configuration. dirichlet() returns the
    # samples row-wise, so transpose to get one column per configuration.
    n_parent_configs = np.prod(evidence_cards)
    draws = np.random.dirichlet(flat_prior, size=n_parent_configs)
    return TabularCPD(variable, variable_card, np.transpose(draws),
                      evidence=evidence, evidence_card=evidence_cards)

# Seed NumPy's global RNG so the random DAG and the random CPD tables are the
# same on every run (generate_random_dag and generate_random_cpd both draw
# from np.random).
# NOTE(review): whether forward_sample also draws from this global RNG depends
# on the installed pgmpy version — TODO confirm.
SEED = 42
np.random.seed(SEED)

# Set the number of nodes and edges
num_nodes = 10
num_edges = 2 * num_nodes

# Set the number of categories for each variable (assuming the same for all)
variable_card = 2

# Generate a random DAG
random_dag = generate_random_dag(num_nodes, num_edges)
edges = list(random_dag.edges)

# Create a Bayesian network with the random DAG
model = BayesianModel(edges)
# Building the model from the edge list only registers nodes that appear in at
# least one edge. Add every DAG node explicitly so that an isolated node still
# exists in the model when its CPD is attached below.
model.add_nodes_from(random_dag.nodes)

# Generate random CPD tables and add them to the Bayesian network
for node in random_dag.nodes:
    parent_nodes = list(random_dag.predecessors(node))
    # Every variable shares the same cardinality, so each parent has
    # variable_card states as well.
    parent_cards = [variable_card] * len(parent_nodes)
    cpd = generate_random_cpd(node, variable_card, parent_nodes, parent_cards)
    model.add_cpds(cpd)

# Verify if the model is consistent
assert model.check_model()

# Set the number of samples you want to generate
num_samples = 1000

# Instantiate the sampling class and generate samples
inference = BayesianModelSampling(model)
synthetic_data = inference.forward_sample(size=num_samples)

# Print the synthetic data
print(synthetic_data)




  0%|          | 0/10 [00:00<?, ?it/s]

     0  8  1  9  5  2  7  3  6  4
0    1  0  0  1  0  0  0  0  1  1
1    1  0  1  1  1  0  0  0  1  1
2    0  0  1  0  1  0  0  0  0  1
3    1  0  1  1  1  0  0  1  1  0
4    0  0  1  0  0  0  0  1  0  1
..  .. .. .. .. .. .. .. .. .. ..
995  0  1  1  0  1  0  1  0  0  1
996  0  0  1  0  1  1  0  1  1  1
997  1  0  1  0  1  1  1  0  1  1
998  0  1  1  0  1  1  0  0  1  1
999  1  1  0  0  0  1  0  1  1  1

[1000 rows x 10 columns]


  warn(
  warn(
  warn(


In [None]:
# Write the sampled rows to a CSV file in the working directory;
# index=False leaves the DataFrame's row index out of the file.
synthetic_data.to_csv(path_or_buf='synthetic_data.csv', index=False)