In [1]:
import pathlib
import numpy as np
import pandas as pd
import networkx as nx
import torch
from torch_geometric.utils import from_networkx
from torch_geometric.data import InMemoryDataset
from collections import Counter
import matplotlib.pyplot as plt

In [2]:
def summary(G, graph=False):
    """
    Prints summary statistics for a given graph G.

    Reports connectivity, node and edge counts, degree statistics, density and
    the average clustering coefficient. Both undirected and directed graphs are
    supported: for directed graphs the number of *weakly* connected components
    is reported, because ``nx.number_connected_components`` is not implemented
    for directed graphs.

    Parameters:
    -----------
    G : NetworkX graph object
        The input graph.
    graph : bool, optional (default=False)
        Whether to plot the degree distribution of the graph or not.

    Returns:
    --------
    None
        All statistics are printed to stdout.
    """
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()

    # An empty graph has no maximum degree and would divide by zero below.
    if num_nodes == 0:
        print("Graph is empty.")
        return

    directed = G.is_directed()

    # Count the components once; the directed case needs the weak variant.
    if directed:
        num_components = nx.number_weakly_connected_components(G)
    else:
        num_components = nx.number_connected_components(G)

    if num_components != 1:
        print("Number of connected components: ", num_components)
    else:
        print("Graph is connected.")
    print("Number of nodes: ", num_nodes)
    print("Number of edges: ", num_edges)

    if directed:
        in_degrees = dict(G.in_degree())
        out_degrees = dict(G.out_degree())
        print("Average in-degree: ", sum(in_degrees.values()) / num_nodes)
        print("Average out-degree: ", sum(out_degrees.values()) / num_nodes)
        # max() over the dict returns the first node that attains the maximum
        max_node = max(in_degrees, key=in_degrees.get)
        print("Maximum in-degree: ", in_degrees[max_node], "for node", max_node)
    else:
        degrees = dict(G.degree())
        max_node = max(degrees, key=degrees.get)
        print("Maximum degree: ", degrees[max_node], "for node", max_node)

    print("Average degree: ", 2 * num_edges / num_nodes)
    print("Density: ", nx.density(G))
    print("Average clustering coefficient: ", nx.average_clustering(G))

    if graph:
        # Histogram of degrees: degree k -> number of nodes with that degree.
        # NOTE: the y axis shows raw counts even though it is labelled p_k.
        degree_counts = Counter(dict(G.degree()).values())
        plt.loglog()
        plt.scatter(list(degree_counts.keys()), list(degree_counts.values()))
        plt.xlabel("k")
        plt.ylabel("p_k")
        plt.show()


In [3]:
###########################################################################################
# Notebook-wide configuration constants.
# SEED: integer seed value; pass it to the random number generators in use
# (e.g. torch.manual_seed) wherever reproducible results are needed.
SEED = 42
#FILE_ABS_PATH = pathlib.Path(__file__) # absolute path of this file
###########################################################################################

# TODO:
# 1. add features to all the nodes
# 2. add the option to return the dataset as one big graph or a list of disconnected graphs
# 3. ...


# build a graph from a dataframe with source and target columns
# the columns contain the ids of the nodes in the graph
def tables_to_graph(df, source, target, directed=True):
    """Build a NetworkX graph from an edge-list dataframe.

    Every row of ``df`` becomes one edge that goes from the node id stored in
    column ``source`` to the node id stored in column ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge list; must contain the ``source`` and ``target`` columns.
    source, target : str
        Names of the columns holding the edge end points.
    directed : bool, optional (default=True)
        Build an ``nx.DiGraph`` when true, otherwise an ``nx.Graph``.

    Returns
    -------
    The generated NetworkX graph.
    """
    # pick the empty container that decides the graph type
    if directed:
        empty_graph = nx.DiGraph()
    else:
        empty_graph = nx.Graph()

    # TODO: features (if the df is a join of the two tables we can do it by specifying the source and target columns we want for features)
    # TODO: disconnected graphs (figure out if we want strongly/weakly connected graphs or just all the reachable nodes from the source node in the parent table)

    return nx.from_pandas_edgelist(df, source=source, target=target, create_using=empty_graph)


# read in rossmann data and convert it to a graph
def rossman_to_graph(dir_path, train=True):
    """Read the Rossmann tables and build the directed store -> sale graph.

    Stores receive the node ids ``0 .. n_stores - 1`` (in order of first
    appearance in ``store.csv``); every row of the sales table receives the
    node id ``n_stores + row position``. One edge is created per sales row,
    pointing from the store to the sale.

    Parameters
    ----------
    dir_path : str or pathlib.Path
        Directory containing ``store.csv`` and ``train.csv`` / ``test.csv``.
    train : bool, optional (default=True)
        Read ``train.csv`` when true, otherwise ``test.csv``.

    Returns
    -------
    G : nx.DiGraph
        The store -> sale graph.
    root_nodes : list
        Node ids of the stores that occur in the sales table.
    """
    # Accept plain strings as well as Path objects, like mutagenesis_to_graph does.
    dir_path = pathlib.Path(dir_path)

    store_df = pd.read_csv(dir_path / "store.csv")
    sales_df = pd.read_csv(dir_path / ("train.csv" if train else "test.csv"))

    # Sale ids are offset by the number of stores so both id ranges stay disjoint.
    store_id_mapping = {store_id: i for i, store_id in enumerate(store_df["Store"].unique())}
    sales_id_mapping = {sales_id: i + len(store_id_mapping) for i, sales_id in enumerate(sales_df.index)}

    store_sales_df = pd.DataFrame()
    store_sales_df["Store"] = sales_df["Store"].map(store_id_mapping)
    store_sales_df["Sale"] = sales_df.index.map(sales_id_mapping)

    root_nodes = store_sales_df["Store"].unique().tolist()
    G = tables_to_graph(store_sales_df, source="Store", target="Sale")

    return G, root_nodes


# read in mutagenesis data and convert it to a graph
def mutagenesis_to_graph(dir_path):
    """Read the mutagenesis tables and build the molecule -> atom -> bond graph.

    Node ids are consecutive integers: molecules first, then atoms, then bonds
    (one bond node per row of ``bond.csv``). The directed edges are
    molecule -> atom, atom1 -> bond and atom2 -> bond. Every node carries an
    integer attribute ``'y'`` that encodes its type: 0 = molecule, 1 = atom,
    2 = bond.

    Parameters
    ----------
    dir_path : str or pathlib.Path
        Directory containing ``molecule.csv``, ``atom.csv`` and ``bond.csv``.

    Returns
    -------
    G : nx.DiGraph
        The combined graph.
    root_nodes : list
        Node ids of the molecules that occur in the atom table.
    """
    # Accept Path objects as well as plain strings, like rossman_to_graph does.
    dir_path = pathlib.Path(dir_path)

    molecule_df = pd.read_csv(dir_path / "molecule.csv")
    atom_df = pd.read_csv(dir_path / "atom.csv")
    bond_df = pd.read_csv(dir_path / "bond.csv")

    # Each table gets its own id range, offset by the sizes of the previous ones.
    molecule_id_mapping = {molecule_id: i for i, molecule_id in enumerate(molecule_df["molecule_id"].unique())}
    atom_id_mapping = {atom_id: i + len(molecule_id_mapping) for i, atom_id in enumerate(atom_df["atom_id"].unique())}
    bond_id_mapping = {bond_id: i + len(molecule_id_mapping) + len(atom_id_mapping) for i, bond_id in enumerate(bond_df.index)}

    # Sets give constant-time membership tests in the labelling loops below
    # (testing against dict.values() scans the whole mapping for every node).
    molecule_nodes = set(molecule_id_mapping.values())
    atom_nodes = set(atom_id_mapping.values())

    # first bipartite component
    molecule_atom_df = pd.DataFrame()
    molecule_atom_df["Molecule"] = atom_df["molecule_id"].map(molecule_id_mapping)
    molecule_atom_df["Atom"] = atom_df["atom_id"].map(atom_id_mapping)
    G_molecule_to_atom = tables_to_graph(molecule_atom_df, source="Molecule", target="Atom")

    # Label molecules (0) and atoms (1) in G_molecule_to_atom
    for node in G_molecule_to_atom.nodes():
        G_molecule_to_atom.nodes[node]['y'] = 0 if node in molecule_nodes else 1

    # second bipartite component
    atom_bond_df = pd.DataFrame()
    atom_bond_df["Atom1"] = bond_df["atom1_id"].map(atom_id_mapping)
    atom_bond_df["Atom2"] = bond_df["atom2_id"].map(atom_id_mapping)
    atom_bond_df["Bond"] = bond_df.index.map(bond_id_mapping)
    # connect atom1 to bond
    G_atom1_to_bond = tables_to_graph(atom_bond_df, source="Atom1", target="Bond")
    # connect atom2 to bond
    G_atom2_to_bond = tables_to_graph(atom_bond_df, source="Atom2", target="Bond")
    # combine the two graphs
    G_atom_to_bond = nx.compose(G_atom1_to_bond, G_atom2_to_bond)

    # Label atoms (1) and bonds (2) in G_atom_to_bond
    for node in G_atom_to_bond.nodes():
        G_atom_to_bond.nodes[node]['y'] = 1 if node in atom_nodes else 2

    root_nodes = molecule_atom_df["Molecule"].unique().tolist()
    # combine the two bipartite components
    G = nx.compose(G_molecule_to_atom, G_atom_to_bond)

    return G, root_nodes


# assume we have a parent table with 1:N relationship
# -> we can split the graph by choosing the nodes in the parent table as root nodes and generating a tree for each root node
# NOTE: we can imagine this as modeling the dataset tables as a list of connected rows
def graph_to_subgraphs(G, root_nodes):
    """Split ``G`` into one subgraph per root node.

    For every entry of ``root_nodes`` the subgraph consists of the root itself
    plus all nodes reachable from it (``nx.descendants``).

    Returns a list of ``G.subgraph(...)`` results, in the order of ``root_nodes``.
    """
    return [
        G.subgraph(nx.descendants(G, root) | {root})  # reachable nodes + the root itself
        for root in root_nodes
    ]

In [4]:
# Build the mutagenesis graph; G1_roots holds the node ids of the molecule (root) nodes.
G1, G1_roots = mutagenesis_to_graph("../../data/mutagenesis")

In [5]:
# transform G1 to an undirected graph
G1_undirected = G1.to_undirected()
# print the summary statistics of the undirected version (no degree plot)
summary(G1_undirected)

Number of connected components:  188
Number of nodes:  10324
Number of edges:  15379
Maximum degree:  40 for node 135
Average degree:  2.979271600154979
Density:  0.00028860521167828915
Average clustering coefficient:  0.0


In [6]:
# G1_components = graph_to_subgraphs(G1, G1_roots)
# nx.draw(G1_components[5])

In [20]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx

import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx

def convert_networkx_to_pyg(graph):
    """Turn a NetworkX graph into a PyG ``Data`` object for degree regression.

    The node attribute ``'y'`` of the NetworkX graph becomes the single input
    feature ``data.x`` (shape ``[num_nodes, 1]``), and the node degree becomes
    the regression target ``data.y`` (shape ``[num_nodes, 1]``).

    Returns
    -------
    data : PyG Data object
    node_id_mapping : dict
        Maps each original node id to its position in ``graph.nodes``.
    """
    data = from_networkx(graph)

    # Walk the nodes once, in graph order, and keep (id, attributes) pairs
    node_items = list(graph.nodes(data=True))

    # original node id -> row index
    node_id_mapping = {node_id: row for row, (node_id, _) in enumerate(node_items)}

    # input feature: the 'y' attribute stored on every node
    type_feature = [[attrs['y']] for _, attrs in node_items]
    data.x = torch.tensor(type_feature, dtype=torch.float)

    # target: node degree as a column vector (this replaces the 'y' copied by from_networkx)
    node_degrees = [degree for _, degree in graph.degree()]
    data.y = torch.tensor(node_degrees, dtype=torch.float).view(-1, 1)

    return data, node_id_mapping


In [21]:
# get data from our graph
# data: PyG Data object built from G1; node_mapping: original node id -> row index in data.x
data, node_mapping = convert_networkx_to_pyg(G1)

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GINConv

class GINModel(nn.Module):
    """Two-layer GIN network with a linear head producing one value per node.

    Parameters
    ----------
    num_features : int
        Number of input features per node.
    hidden_dim : int, optional (default=32)
        Width of the hidden layers and of the node embeddings.
    """

    def __init__(self, num_features, hidden_dim=32):
        super().__init__()
        # MLPs applied inside the two GIN convolutions
        nn1 = nn.Sequential(nn.Linear(num_features, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim))
        nn2 = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim))

        self.conv1 = GINConv(nn1)
        self.conv2 = GINConv(nn2)
        self.fc = nn.Linear(hidden_dim, 1)  # Output layer for degree prediction

    def forward(self, x, edge_index, return_embeds=False):
        """Run the network.

        Returns the ``hidden_dim``-wide node embeddings after the second GIN
        layer when ``return_embeds`` is true, otherwise the output of the
        final linear layer (one value per node).
        """
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))

        if return_embeds:
            return x  # Return embeddings after the last GINConv layer

        return self.fc(x)  # Continue to the final output layer

In [23]:
# Seed torch before the model is created so that the weight initialisation,
# and therefore the losses and the extracted embeddings, are reproducible.
torch.manual_seed(SEED)

# Assuming each node has one feature
model = GINModel(num_features=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_func = nn.MSELoss()
epochs = 200

# Training loop (full batch: every step uses the whole graph)
model.train()  # nothing switches the mode inside the loop, so set it once
for epoch in range(epochs):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = loss_func(out, data.y)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

# Extract embeddings after the final epoch
model.eval()
with torch.no_grad():
    embeddings = model(data.x, data.edge_index, return_embeds=True)

Epoch 0, Loss: 20.16192626953125
Epoch 10, Loss: 13.418213844299316
Epoch 20, Loss: 10.96291446685791
Epoch 30, Loss: 8.75473690032959
Epoch 40, Loss: 2.9852118492126465
Epoch 50, Loss: 1.347115159034729
Epoch 60, Loss: 1.227333664894104
Epoch 70, Loss: 1.2706952095031738
Epoch 80, Loss: 1.2169338464736938
Epoch 90, Loss: 1.2114359140396118
Epoch 100, Loss: 1.211816668510437
Epoch 110, Loss: 1.2093065977096558
Epoch 120, Loss: 1.2096365690231323
Epoch 130, Loss: 1.2092987298965454
Epoch 140, Loss: 1.2093380689620972
Epoch 150, Loss: 1.2097351551055908
Epoch 160, Loss: 1.213737964630127
Epoch 170, Loss: 1.2095575332641602
Epoch 180, Loss: 1.2093513011932373
Epoch 190, Loss: 1.209480881690979


In [29]:
# Collect the embedding of every root node, keyed by the original node id.
# node_mapping translates the node id into the matching row of `embeddings`.
root_embeddings = {
    root_id: embeddings[node_mapping[root_id]]
    for root_id in G1_roots
}

In [32]:
import json

import json

# Tensors cannot be written as JSON, so turn every embedding into a plain list
root_embeddings_serializable = dict(
    (root_id, vector.tolist()) for root_id, vector in root_embeddings.items()
)

# Destination of the dump
file_path = "root_embeddings.json"

# Write the embeddings to disk
with open(file_path, 'w') as out_file:
    json.dump(root_embeddings_serializable, out_file)

print(f"Saved embeddings to {file_path}")

Saved embeddings to root_embeddings.json


In [None]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torch_geometric.nn import GINConv

# class GINModel(nn.Module):
#     def __init__(self, num_features):
#         super(GINModel, self).__init__()
#         nn1 = nn.Sequential(nn.Linear(num_features, 32), nn.ReLU(), nn.Linear(32, 32))
#         nn2 = nn.Sequential(nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 32))

#         self.conv1 = GINConv(nn1)
#         self.conv2 = GINConv(nn2)
#         self.fc = nn.Linear(32, 1)  # Output layer for degree prediction

#     def forward(self, x, edge_index):
#         x = F.relu(self.conv1(x, edge_index))
#         x = F.relu(self.conv2(x, edge_index))
#         return self.fc(x)
