## Initialize the Graph, ImpactCalculator and GNNAnalyzer


In [1]:
from lib.graph.graph.graph import Graph
from lib.gnnanalyzer.gnnanalyzer.localgnnanalyzer import LocalGNNAnalyzer
from lib.gnnanalyzer.gnnanalyzer.localimpactcalculator import (
    LocalImpactCalculator,
    LocalImpactCalculationMethod,
)

In [2]:
NODE_IDX = 0  # Index for the identifier column in nodes
NODE_FEATURE_START_IDX = 1  # Start index for node features columns
NODE_FEATURE_END_IDX = 1433  # End index for node features columns
CLASS_IDX = 1434  # Index for the class column in nodes
SOURCE_IDX = 0  # Index for the source column in edges
TARGET_IDX = 1  # Index for the target column in edges

In [3]:
g = Graph()
g.import_edges_from_edge_list(
    data="./datasets/cora/cora.cites",
    source_target_col=(SOURCE_IDX, TARGET_IDX),
    override_data_file_extension=".txt",
    delimiter="	",
)
g.import_nodes_from_node_list(
    data="./datasets/cora/cora.content",
    node_identifier_col=NODE_IDX,
    features_cols=[i for i in range(NODE_FEATURE_START_IDX, NODE_FEATURE_END_IDX)],
    class_col=CLASS_IDX,
    override_data_file_extension=".txt",
    delimiter="	",
)

# To please PyLance
assert g.nodes is not None
assert g.edges is not None
assert g.nodes.columns is not None
assert g.edges.columns is not None

In [4]:
impact = LocalImpactCalculator(method=LocalImpactCalculationMethod.ABSOLUTE_DIFFERENCE)
analyzer = LocalGNNAnalyzer(graph=g, local_impact_calculator=impact)

## Initial testings


### Creation of a basic model


In [17]:
from lib.graph.graph.graph import ExportFileFormat
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data
import torch
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
import polars as pl
import numpy as np

We define a very basic GNN


In [6]:
# Define the GNN model
class GNNModel(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNNModel, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

We define the device to be used


In [7]:
device_str = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available() and torch.backends.mps.is_built()
    else "cpu"
)
device = torch.device(device_str)

The Graph Class allows us to get back the dataset in a multitude of formats; here we are getting a Polars DataFrame.


In [8]:
# Let's see how is our datasets once exported

nodes_df = g.export_nodes_as_node_list(ExportFileFormat.POLARS_DF)
edges_df = g.export_edges_as_edge_list(ExportFileFormat.POLARS_DF)

assert isinstance(nodes_df, pl.DataFrame)
assert isinstance(edges_df, pl.DataFrame)

print(nodes_df)
print(edges_df)

shape: (2_708, 1_435)
┌─────────┬───────────┬───────────┬───────────┬───┬────────────┬───────────┬───────────┬───────────┐
│ Node    ┆ Feature 0 ┆ Feature 1 ┆ Feature 2 ┆ … ┆ Feature    ┆ Feature   ┆ column_14 ┆ Class     │
│ ---     ┆ ---       ┆ ---       ┆ ---       ┆   ┆ 1430       ┆ 1431      ┆ 34        ┆ ---       │
│ i64     ┆ i64       ┆ i64       ┆ i64       ┆   ┆ ---        ┆ ---       ┆ ---       ┆ str       │
│         ┆           ┆           ┆           ┆   ┆ i64        ┆ i64       ┆ i64       ┆           │
╞═════════╪═══════════╪═══════════╪═══════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡
│ 31336   ┆ 0         ┆ 0         ┆ 0         ┆ … ┆ 0          ┆ 0         ┆ 0         ┆ Neural_Ne │
│         ┆           ┆           ┆           ┆   ┆            ┆           ┆           ┆ tworks    │
│ 1061127 ┆ 0         ┆ 0         ┆ 0         ┆ … ┆ 0          ┆ 0         ┆ 0         ┆ Rule_Lear │
│         ┆           ┆           ┆           ┆   ┆            ┆     

We need to map the name of the columns in the dataframe to the indices we defined ealier


In [9]:
# idx to column name
nodes_idx_to_col_name = {idx: col_name for idx, col_name in enumerate(nodes_df.columns)}  # type: ignore

edges_idx_to_col_name = {idx: col_name for idx, col_name in enumerate(edges_df.columns)}  # type: ignore

We also need a function that converts the Graph Class to a PyTorch Geometric Data object, as this will repeatedly be used in the analysis process


In [22]:
def create_torch_geometric_data(
    graph: Graph,
    class_idx: int,
    node_col_idx: int,
    node_feature_start_idx: int,
    node_feature_end_idx: int,
    source_idx: int,
    target_idx: int,
) -> Data:
    # Export the nodes and edges to the desired format
    nodes_df = graph.export_nodes_as_node_list(ExportFileFormat.POLARS_DF)
    edges_df = graph.export_edges_as_edge_list(ExportFileFormat.POLARS_DF)

    assert isinstance(nodes_df, pl.DataFrame)
    assert isinstance(edges_df, pl.DataFrame)

    # From nodes idx to column name
    nodes_col_node_name = nodes_idx_to_col_name[node_col_idx]
    nodes_col_class_name = nodes_idx_to_col_name[class_idx]
    node_col_features_name = [
        nodes_idx_to_col_name[i]
        for i in range(node_feature_start_idx, node_feature_end_idx)
    ]

    # From edges idx to column name
    edges_col_source_name = edges_idx_to_col_name[source_idx]
    edges_col_target_name = edges_idx_to_col_name[target_idx]

    # Convert categorical class labels to numeric
    class_labels = nodes_df.select(nodes_col_class_name).to_numpy()
    encoder = LabelEncoder()
    Y = encoder.fit_transform(class_labels)

    # Process the nodes DataFrame
    x = torch.tensor(
        nodes_df.select(node_col_features_name).to_numpy(),
        dtype=torch.float,
    )
    y = torch.tensor(Y, dtype=torch.long)

    # Process the edges DataFrame
    edge_index = (
        torch.tensor(
            edges_df.select([edges_col_source_name, edges_col_target_name]).to_numpy(),
            dtype=torch.long,
        )
        .t()
        .contiguous()
    )

    data = Data(x=x, edge_index=edge_index, y=y)
    return data

### Training and testing the model


In [23]:
# Convert the graph to a PyTorch Geometric Data object
torch_geo_data = create_torch_geometric_data(
    graph=g,
    class_idx=CLASS_IDX,
    node_col_idx=NODE_IDX,
    node_feature_start_idx=NODE_FEATURE_START_IDX,
    node_feature_end_idx=NODE_FEATURE_END_IDX,
    source_idx=SOURCE_IDX,
    target_idx=TARGET_IDX,
)

# Define some parameters for the GNN model
input_dim = torch_geo_data.num_node_features
hidden_dim = 64  # Example hidden dimension size
output_dim = torch_geo_data.y.max().item() + 1  # Number of classes

model = GNNModel(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim).to(
    device=device
)
torch_geo_data = torch_geo_data.to(device=device_str)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()  # Criterion assumes model returns raw logits

# Train the GNN model
model.train()
for epoch in range(200):  # A default number of epochs for the example
    optimizer.zero_grad()
    output = model(torch_geo_data)
    loss = criterion(
        output[torch_geo_data.train_mask], torch_geo_data.y[torch_geo_data.train_mask]
    )
    loss.backward()
    optimizer.step()
    # Print the loss every 10 epochs or the last epoch
    if epoch % 10 == 0 or epoch == 199:
        print(f"Epoch {epoch}: Loss {loss.item()}")

# Get prediction for a specific node (example node index 0)
model.eval()
with torch.no_grad():
    output = model(torch_geo_data)
    # Use softmax to convert logits to probabilities
    probs = F.softmax(output, dim=1)
    predicted_class = (
        probs[0].argmax().item()
    )  # Get the predicted class for node at index 0

print(f"Predicted class for the node at index 0: {predicted_class}")

  y = column_or_1d(y, warn=True)


AttributeError: 'GlobalStorage' object has no attribute 'train_mask'

In [None]:
# Step 7: Integrate the trained model with the LocalGNNAnalyzer
def predict_fn(node_features, edge_index):
    # Prepare a tensor for node_features and edge_index based on GNN requirements
    # Example: For PyG, use the following, adapted for the complete graph.
    model.eval()
    with torch.no_grad():
        out = model(data)
    return out


# Assuming we run the ablation step for node 5 (as an example)
analyzer.prepare_ablation_plan(starting_node="5", max_depth=3)
while analyzer.has_next_step():
    current_prediction = predict_fn(data.x, data.edge_index)
    analyzer.execute_ablation_step(current_prediction)

# Get the interpretation of the GNN's predictions
interpretation = analyzer.get_interpretation()
print(interpretation)