### Generate Spurious-Motif Dataset

In [22]:
from BA3_loc import *
from tqdm import tqdm
import os.path as osp
import warnings
warnings.filterwarnings("ignore")

# `import os.path as osp` binds only the name `osp`; `os.makedirs` below needs
# `os` itself, so import it explicitly instead of relying on the star import.
import os

global_b = '0.9' # Set bias degree here
# Output directory for the generated splits; the bias value is part of the path.
data_dir = f'../data/SPMotif-{global_b}/raw/'
os.makedirs(data_dir, exist_ok=True)

In [23]:
def get_house(basis_type, nb_shapes=80, width_basis=8, feature_generator=None, m=3, draw=True):
    """ Build one synthetic graph carrying "house" shapes.

    The graph comes from `synthetic_structsim.build_graph`, is passed through
    `perturb` with ratio 0.05, and then receives node features.

    Args:
        basis_type        :  Base-graph type forwarded to `build_graph`; also the prefix of the returned name.
        nb_shapes         :  The number of "house" shapes requested from `build_graph`.
        width_basis       :  Size argument of the base graph, forwarded to `build_graph`.
        feature_generator :  Object whose `gen_node_features(G)` is called on the graph. If `None`, `featgen.ConstFeatureGen(1)` is used.
        m                 :  Not read by this function.
        draw              :  If true, call `plt.figure` with the module-level `figsize` before building.

    Returns:
        G                 :  First element of the list returned by `perturb` (presumably a networkx graph -- confirm)
        role_id           :  Second value returned by `build_graph` (presumably per-node role IDs -- confirm)
        name              :  "<basis_type>_<width_basis>_<nb_shapes>"
    """
    # One single-element spec per requested shape.
    motif_specs = nb_shapes * [["house"]]

    if draw:
        plt.figure(figsize=figsize)

    G, role_id, _plugins = synthetic_structsim.build_graph(
        width_basis,
        basis_type,
        motif_specs,
        start=0,
        rdm_basis_plugins=True,
    )
    perturbed = perturb([G], 0.05, id=role_id)
    G = perturbed[0]

    # Fall back to the constant feature generator when none is supplied.
    generator = featgen.ConstFeatureGen(1) if feature_generator is None else feature_generator
    generator.gen_node_features(G)

    name = "_".join([basis_type, str(width_basis), str(nb_shapes)])
    return G, role_id, name

def get_cycle(basis_type, nb_shapes=80, width_basis=8, feature_generator=None, m=3, draw=True):
    """ Build one synthetic graph carrying "dircycle" shapes.

    The graph comes from `synthetic_structsim.build_graph`, is passed through
    `perturb` with ratio 0.05, and then receives node features.

    Args:
        basis_type        :  Base-graph type forwarded to `build_graph`; also the prefix of the returned name.
        nb_shapes         :  The number of "dircycle" shapes requested from `build_graph`.
        width_basis       :  Size argument of the base graph, forwarded to `build_graph`.
        feature_generator :  Object whose `gen_node_features(G)` is called on the graph. If `None`, `featgen.ConstFeatureGen(1)` is used.
        m                 :  Not read by this function.
        draw              :  If true, call `plt.figure` with the module-level `figsize` before building.

    Returns:
        G                 :  First element of the list returned by `perturb` (presumably a networkx graph -- confirm)
        role_id           :  Second value returned by `build_graph` (presumably per-node role IDs -- confirm)
        name              :  "<basis_type>_<width_basis>_<nb_shapes>"
    """
    # One single-element spec per requested shape.
    motif_specs = nb_shapes * [["dircycle"]]

    if draw:
        plt.figure(figsize=figsize)

    G, role_id, _plugins = synthetic_structsim.build_graph(
        width_basis,
        basis_type,
        motif_specs,
        start=0,
        rdm_basis_plugins=True,
    )
    perturbed = perturb([G], 0.05, id=role_id)
    G = perturbed[0]

    # Fall back to the constant feature generator when none is supplied.
    generator = featgen.ConstFeatureGen(1) if feature_generator is None else feature_generator
    generator.gen_node_features(G)

    name = "_".join([basis_type, str(width_basis), str(nb_shapes)])
    return G, role_id, name

def get_crane(basis_type, nb_shapes=80, width_basis=8, feature_generator=None, m=3, draw=True):
    """ Build one synthetic graph carrying "crane" shapes.

    The graph comes from `synthetic_structsim.build_graph`, is passed through
    `perturb` with ratio 0.05, and then receives node features.

    Args:
        basis_type        :  Base-graph type forwarded to `build_graph`; also the prefix of the returned name.
        nb_shapes         :  The number of "crane" shapes requested from `build_graph`.
        width_basis       :  Size argument of the base graph, forwarded to `build_graph`.
        feature_generator :  Object whose `gen_node_features(G)` is called on the graph. If `None`, `featgen.ConstFeatureGen(1)` is used.
        m                 :  Not read by this function.
        draw              :  If true, call `plt.figure` with the module-level `figsize` before building.

    Returns:
        G                 :  First element of the list returned by `perturb` (presumably a networkx graph -- confirm)
        role_id           :  Second value returned by `build_graph` (presumably per-node role IDs -- confirm)
        name              :  "<basis_type>_<width_basis>_<nb_shapes>"
    """
    # One single-element spec per requested shape.
    motif_specs = nb_shapes * [["crane"]]

    if draw:
        plt.figure(figsize=figsize)

    G, role_id, _plugins = synthetic_structsim.build_graph(
        width_basis,
        basis_type,
        motif_specs,
        start=0,
        rdm_basis_plugins=True,
    )
    perturbed = perturb([G], 0.05, id=role_id)
    G = perturbed[0]

    # Fall back to the constant feature generator when none is supplied.
    generator = featgen.ConstFeatureGen(1) if feature_generator is None else feature_generator
    generator.gen_node_features(G)

    name = "_".join([basis_type, str(width_basis), str(nb_shapes)])
    return G, role_id, name

## Training Dataset

In [24]:
# Fresh per-split accumulators; each receives one entry per generated graph.
edge_index_list = []
label_list = []
ground_truth_list = []
role_id_list = []
pos_list = []
# Numeric bias degree, parsed from the string set at the top of the notebook.
bias = float(global_b)

def graph_stats(base_num):
    """Map a base-type code to its name and draw a random base size.

    Args:
        base_num: 1 ('tree'), 2 ('ladder') or 3 ('wheel').

    Returns:
        (base, width_basis): the base-type name and a size picked by
        `np.random.choice` from range(3), range(8, 12) or range(15, 20)
        respectively.

    Raises:
        ValueError: if `base_num` is not 1, 2 or 3 (previously this surfaced
            as an UnboundLocalError at the return statement).
    """
    if base_num == 1:
        base = 'tree'
        width_basis = np.random.choice(range(3))
    elif base_num == 2:
        base = 'ladder'
        width_basis = np.random.choice(range(8, 12))
    elif base_num == 3:
        base = 'wheel'
        width_basis = np.random.choice(range(15, 20))
    else:
        raise ValueError(f"base_num must be 1, 2 or 3, got {base_num!r}")
    return base, width_basis

# Generation plan for the biased training split:
#   (class label, motif builder, probabilities of base types [1, 2, 3]).
# With probability `bias` each class is paired with "its" base type
# (label 0 -> base 1, label 1 -> base 2, label 2 -> base 3); the remaining
# probability mass is split evenly between the other two base types.
train_plan = (
    (0, get_cycle, [bias, (1 - bias) / 2, (1 - bias) / 2]),
    (1, get_house, [(1 - bias) / 2, bias, (1 - bias) / 2]),
    (2, get_crane, [(1 - bias) / 2, (1 - bias) / 2, bias]),
)

for label, build_motif_graph, base_probs in train_plan:
    # Per-class edge / node counts, used only for the summary line below.
    e_mean, n_mean = [], []
    for _ in tqdm(range(3000)):
        base_num = np.random.choice([1, 2, 3], p=base_probs)
        base, width_basis = graph_stats(base_num)

        G, role_id, name = build_motif_graph(basis_type=base, nb_shapes=1,
                                             width_basis=width_basis, feature_generator=None, m=3, draw=False)
        label_list.append(label)
        e_mean.append(len(G.edges))
        n_mean.append(len(G.nodes))

        role_id = np.array(role_id)
        # `np.int` was deprecated in NumPy 1.20 and removed in 1.24; it was an
        # alias of the builtin `int`, so this yields the same dtype.
        edge_index = np.array(G.edges, dtype=int).T

        role_id_list.append(role_id)
        edge_index_list.append(edge_index)
        pos_list.append(np.array(list(nx.spring_layout(G).values())))
        ground_truth_list.append(find_gd(edge_index, role_id))

    # The graph count is cumulative over classes; the means cover this class only.
    print("#Graphs: %d    #Nodes: %.2f    #Edges: %.2f " % (len(ground_truth_list), np.mean(n_mean), np.mean(e_mean)))

# The five lists hold differently-shaped entries, so the bundle is ragged.
# NumPy >= 1.24 refuses to build a ragged array implicitly; request
# dtype=object explicitly to get the object array older versions produced.
np.save(osp.join(data_dir, 'train.npy'),
        np.array((edge_index_list, label_list, ground_truth_list, role_id_list, pos_list), dtype=object))

 35%|███▌      | 1057/3000 [00:05<00:10, 177.15it/s]

## Val Dataset

In [None]:
# Fresh per-split accumulators; each receives one entry per generated graph.
edge_index_list = []
label_list = []
ground_truth_list = []
role_id_list = []
pos_list = []
# NOTE(review): the validation loops in this cell sample the base type without
# a `p=` argument, so `bias` does not appear to be read here -- confirm intended.
bias = float(global_b)

def graph_stats(base_num):
    """Map a base-type code to its name and draw a random base size.

    Args:
        base_num: 1 ('tree'), 2 ('ladder') or 3 ('wheel').

    Returns:
        (base, width_basis): the base-type name and a size picked by
        `np.random.choice` from range(3), range(8, 12) or range(15, 20)
        respectively.

    Raises:
        ValueError: if `base_num` is not 1, 2 or 3 (previously this surfaced
            as an UnboundLocalError at the return statement).
    """
    if base_num == 1:
        base = 'tree'
        width_basis = np.random.choice(range(3))
    elif base_num == 2:
        base = 'ladder'
        width_basis = np.random.choice(range(8, 12))
    elif base_num == 3:
        base = 'wheel'
        width_basis = np.random.choice(range(15, 20))
    else:
        raise ValueError(f"base_num must be 1, 2 or 3, got {base_num!r}")
    return base, width_basis

# Generation plan for the validation split:
#   (class label, motif builder, number of graphs, summary format string).
# The base type is sampled uniformly from [1, 2, 3] for every class.
# NOTE(review): the per-class counts differ (1000 / 2000 / 3000), unlike the
# train and test cells -- confirm this imbalance is intended.
val_plan = (
    (0, get_cycle, 1000, "#Graphs: %d    #Nodes: %.2f    #Edges: %.2f "),
    (1, get_house, 2000, "#Graphs: %d    #Nodes: %.2f    #Edges: %.2f "),
    (2, get_crane, 3000, "# Graphs: %d    # Nodes: %.2f    # Edges: %.2f "),
)

for label, build_motif_graph, num_graphs, report_fmt in val_plan:
    # Per-class edge / node counts, used only for the summary line below.
    e_mean, n_mean = [], []
    for _ in tqdm(range(num_graphs)):
        base_num = np.random.choice([1, 2, 3])
        base, width_basis = graph_stats(base_num)

        G, role_id, name = build_motif_graph(basis_type=base, nb_shapes=1,
                                             width_basis=width_basis, feature_generator=None, m=3, draw=False)
        label_list.append(label)
        e_mean.append(len(G.edges))
        n_mean.append(len(G.nodes))

        role_id = np.array(role_id)
        # `np.int` was deprecated in NumPy 1.20 and removed in 1.24; it was an
        # alias of the builtin `int`, so this yields the same dtype.
        edge_index = np.array(G.edges, dtype=int).T

        role_id_list.append(role_id)
        edge_index_list.append(edge_index)
        pos_list.append(np.array(list(nx.spring_layout(G).values())))
        ground_truth_list.append(find_gd(edge_index, role_id))

    # The graph count is cumulative over classes; the means cover this class only.
    print(report_fmt % (len(ground_truth_list), np.mean(n_mean), np.mean(e_mean)))

# The five lists hold differently-shaped entries, so the bundle is ragged.
# NumPy >= 1.24 refuses to build a ragged array implicitly; request
# dtype=object explicitly to get the object array older versions produced.
np.save(osp.join(data_dir, 'val.npy'),
        np.array((edge_index_list, label_list, ground_truth_list, role_id_list, pos_list), dtype=object))

100%|██████████| 1000/1000 [00:06<00:00, 153.69it/s]
  1%|          | 16/2000 [00:00<00:12, 154.92it/s]

#Graphs: 1000    #Nodes: 18.54    #Edges: 27.45 


100%|██████████| 2000/2000 [00:12<00:00, 156.22it/s]
  1%|          | 18/3000 [00:00<00:17, 168.91it/s]

#Graphs: 3000    #Nodes: 18.60    #Edges: 28.56 


100%|██████████| 3000/3000 [00:18<00:00, 158.51it/s]


#Graphs: 6000    #Nodes: 18.48    #Edges: 28.22 


## Testing Dataset

In [None]:
# No bias for the test dataset.
# Fresh per-split accumulators; each receives one entry per generated graph.
edge_index_list = []
label_list = []
ground_truth_list = []
role_id_list = []
pos_list = []

def graph_stats_large(base_num):
    """Map a base-type code to its name and draw a random (larger) base size.

    Args:
        base_num: 1 ('tree'), 2 ('ladder') or 3 ('wheel').

    Returns:
        (base, width_basis): the base-type name and a size picked by
        `np.random.choice` from range(3, 6), range(30, 50) or range(60, 80)
        respectively.

    Raises:
        ValueError: if `base_num` is not 1, 2 or 3 (previously this surfaced
            as an UnboundLocalError at the return statement).
    """
    if base_num == 1:
        base = 'tree'
        width_basis = np.random.choice(range(3, 6))
    elif base_num == 2:
        base = 'ladder'
        width_basis = np.random.choice(range(30, 50))
    elif base_num == 3:
        base = 'wheel'
        width_basis = np.random.choice(range(60, 80))
    else:
        raise ValueError(f"base_num must be 1, 2 or 3, got {base_num!r}")
    return base, width_basis

# Generation plan for the unbiased test split: (class label, motif builder).
# Every class gets 2000 graphs whose base type is sampled uniformly from
# [1, 2, 3] and sized by `graph_stats_large`.
test_plan = (
    (0, get_cycle),
    (1, get_house),
    (2, get_crane),
)

for label, build_motif_graph in test_plan:
    # Per-class edge / node counts, used only for the summary line below.
    e_mean, n_mean = [], []
    for _ in tqdm(range(2000)):
        base_num = np.random.choice([1, 2, 3]) # uniform
        base, width_basis = graph_stats_large(base_num)

        G, role_id, name = build_motif_graph(basis_type=base, nb_shapes=1,
                                             width_basis=width_basis, feature_generator=None, m=3, draw=False)
        label_list.append(label)
        e_mean.append(len(G.edges))
        n_mean.append(len(G.nodes))

        role_id = np.array(role_id)
        # `np.int` was deprecated in NumPy 1.20 and removed in 1.24; it was an
        # alias of the builtin `int`, so this yields the same dtype.
        edge_index = np.array(G.edges, dtype=int).T

        role_id_list.append(role_id)
        edge_index_list.append(edge_index)
        pos_list.append(np.array(list(nx.spring_layout(G).values())))
        ground_truth_list.append(find_gd(edge_index, role_id))

    # The graph count is cumulative over classes; the means cover this class only.
    print("#Graphs: %d    #Nodes: %.2f    #Edges: %.2f " % (len(ground_truth_list), np.mean(n_mean), np.mean(e_mean)))

# The five lists hold differently-shaped entries, so the bundle is ragged.
# NumPy >= 1.24 refuses to build a ragged array implicitly; request
# dtype=object explicitly to get the object array older versions produced.
np.save(osp.join(data_dir, 'test.npy'),
        np.array((edge_index_list, label_list, ground_truth_list, role_id_list, pos_list), dtype=object))

100%|██████████| 2000/2000 [01:06<00:00, 30.07it/s]
  0%|          | 5/2000 [00:00<00:45, 44.25it/s]

#Graphs: 2000    #Nodes: 89.94    #Edges: 131.95 


100%|██████████| 2000/2000 [01:07<00:00, 29.65it/s]
  0%|          | 5/2000 [00:00<00:40, 49.34it/s]

#Graphs: 4000    #Nodes: 90.93    #Edges: 134.21 


100%|██████████| 2000/2000 [01:07<00:00, 29.84it/s]


#Graphs: 6000    #Nodes: 90.61    #Edges: 134.20 
