# Making one set of 10,000 samples

In [1]:
import numpy as np
import random
import networkx as nx

In [2]:
# Step 1: Define the problem size and make the run reproducible
num_original_bits = 50                 # independent input bits x1..x50
start_index = num_original_bits + 1    # index of the first derived bit (x51)
end_index = 1000                       # index of the last derived bit (x1000)
num_samples = 10000                    # number of rows to generate

# Seed the stdlib RNG so that Restart Kernel -> Run All regenerates the same
# formulas and the same samples. The generation cells in this notebook draw
# from the `random` module, so seeding it here is sufficient for them.
random_seed = 42
random.seed(random_seed)

In [3]:
# Step 2: Build one XOR formula for every derived bit (x{start_index} .. x{end_index}).
# `formulas` is a dict: derived bit name (e.g. "x51") -> list of distinct source
# bit indices, each in 1..num_original_bits, that are XOR-ed to produce it.
source_indices = range(1, num_original_bits + 1)
formulas = {}
for i in range(start_index, end_index + 1):
    # Pick how many source bits take part (always at least 2) ...
    term_count = random.randint(2, num_original_bits)
    # ... then pick that many distinct source bits, without replacement.
    formulas[f"x{i}"] = random.sample(source_indices, term_count)

In [4]:
# Step 3: Draw `num_samples` rows. Each row holds the random source bits followed
# by the derived bits obtained by evaluating the XOR formulas on that row.
dataset = []  # one list of 0/1 values per sample, ordered x1 .. x{end_index}
for sample_idx in range(num_samples):

    # Fresh random 0/1 value for every source bit x1 .. x{num_original_bits}
    variables = {f"x{i}": random.randint(0, 1) for i in range(1, num_original_bits + 1)}

    # Evaluate every derived bit from its formula
    for i in range(start_index, end_index + 1):
        operands = formulas[f"x{i}"]
        # XOR of 0/1 values equals their parity, i.e. the sum modulo 2
        variables[f"x{i}"] = sum(variables[f"x{var}"] for var in operands) % 2

    # Flatten the named values into a positional row and keep it
    row = [variables[f"x{i}"] for i in range(1, end_index + 1)]
    dataset.append(row)

In [5]:
# Sanity check: report how many rows were generated and how wide the first row is
shape_report = f"Dataset shape: {len(dataset)} samples, {len(dataset[0])} features per sample"
print(shape_report)

Dataset shape: 10000 samples, 1000 features per sample


In [6]:
# Step 4: Build a directed graph of the XOR system.
# Nodes are the bits x1 .. x{end_index}; an edge a -> b means bit a appears in
# the formula that defines bit b.
G = nx.DiGraph()

# Only the first `num_samples` rows contribute to the node features
sample_rows = dataset[:num_samples]

# Each node carries its own column of the dataset (one value per sample)
for i in range(1, end_index + 1):
    column = [row[i - 1] for row in sample_rows]
    G.add_node(f"x{i}", features=column)

# Wire every derived bit to the source bits named in its formula
for i in range(start_index, end_index + 1):
    G.add_edges_from((f"x{var}", f"x{i}") for var in formulas[f"x{i}"])

In [9]:
# Spot check on a single derived variable (x51): the number of incoming edges
# in the graph should equal the number of terms in its formula.

# Gather both quantities first ...
in_degree_x51 = G.in_degree("x51")
num_elements_x51 = len(formulas['x51'])

# ... then report them side by side for comparison
print(f"In-degree of x51: {in_degree_x51}")
print(f"Number of elements in the formula for x51: {num_elements_x51}")

In-degree of x51: 9
Number of elements in the formula for x51: 9
