## Initialize the Graph, ImpactCalculator and GNNAnalyzer


In [1]:
from lib.graph.graph.graph import Graph
from lib.gnnanalyzer.gnnanalyzer.localgnnanalyzer import LocalGNNAnalyzer
from lib.gnnanalyzer.gnnanalyzer.localimpactcalculator import (
    LocalImpactCalculator,
    LocalImpactCalculationMethod,
)

In [7]:
# 0-based column positions used when importing the Cora node and edge files.
NODE_IDX = 0  # Index for the identifier column in nodes
NODE_FEATURE_START_IDX = 1  # Start index for node features columns
# NOTE(review): CLASS_IDX is 1434, so column 1433 appears to be the LAST feature
# column, i.e. this end index looks inclusive. Any `range(start, end)` built
# from it needs NODE_FEATURE_END_IDX + 1 to keep that column. TODO confirm
# against cora.content.
NODE_FEATURE_END_IDX = 1433  # End index for node features columns
CLASS_IDX = 1434  # Index for the class column in nodes
SOURCE_IDX = 0  # Index for the source column in edges
TARGET_IDX = 1  # Index for the target column in edges

In [2]:
# Build the graph from the tab-separated Cora edge and node files.
g = Graph()
g.import_edges_from_edge_list(
    data="./datasets/cora/cora.cites",
    source_target_col=(SOURCE_IDX, TARGET_IDX),
    override_data_file_extension=".txt",
    # Escaped tab instead of a raw tab character, which editors silently
    # convert to spaces.
    delimiter="\t",
)
g.import_nodes_from_node_list(
    data="./datasets/cora/cora.content",
    node_identifier_col=NODE_IDX,
    # NODE_FEATURE_END_IDX is the index of the last feature column (the class
    # column follows at CLASS_IDX), so the half-open range needs +1; without it
    # the final feature column is silently dropped.
    features_cols=list(range(NODE_FEATURE_START_IDX, NODE_FEATURE_END_IDX + 1)),
    class_col=CLASS_IDX,
    override_data_file_extension=".txt",
    delimiter="\t",
)

# To please PyLance
assert g.nodes is not None
assert g.edges is not None
assert g.nodes.columns is not None
assert g.edges.columns is not None

In [3]:
# Impact calculator configured with the absolute-difference method
impact = LocalImpactCalculator(
    method=LocalImpactCalculationMethod.ABSOLUTE_DIFFERENCE,
)
# Analyzer bound to the imported graph and the impact calculator above
analyzer = LocalGNNAnalyzer(
    graph=g,
    local_impact_calculator=impact,
)

## Initial tests


### Creation of a basic model


In [4]:
from lib.graph.graph.graph import ExportFileFormat
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data
import torch
import torch.nn.functional as F
import polars as pl

In [5]:
# Two-layer GCN used as the model under analysis
class GNNModel(torch.nn.Module):
    """Two stacked ``GCNConv`` layers that output per-node log-probabilities."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the input->hidden and hidden->output graph convolutions."""
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Run both convolutions on ``data.x`` / ``data.edge_index``.

        Dropout is only active while the module is in training mode.
        """
        hidden = self.conv1(data.x, data.edge_index)
        hidden = F.relu(hidden)
        hidden = F.dropout(hidden, training=self.training)
        logits = self.conv2(hidden, data.edge_index)
        return F.log_softmax(logits, dim=1)

In [6]:
# Inspect the node and edge tables as the graph exports them

nodes_df = g.export_nodes_as_node_list(ExportFileFormat.POLARS_DF)
edges_df = g.export_edges_as_edge_list(ExportFileFormat.POLARS_DF)

# Narrow both export results to polars frames before using them below
assert isinstance(nodes_df, pl.DataFrame) and isinstance(edges_df, pl.DataFrame)

print(nodes_df, edges_df, sep="\n")

shape: (2_708, 1_435)
┌─────────┬───────────┬───────────┬───────────┬───┬────────────┬───────────┬───────────┬───────────┐
│ Node    ┆ Feature 0 ┆ Feature 1 ┆ Feature 2 ┆ … ┆ Feature    ┆ Feature   ┆ Feature   ┆ Class     │
│ ---     ┆ ---       ┆ ---       ┆ ---       ┆   ┆ 1430       ┆ 1431      ┆ 1432      ┆ ---       │
│ i64     ┆ i64       ┆ i64       ┆ i64       ┆   ┆ ---        ┆ ---       ┆ ---       ┆ str       │
│         ┆           ┆           ┆           ┆   ┆ i64        ┆ i64       ┆ i64       ┆           │
╞═════════╪═══════════╪═══════════╪═══════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡
│ 31336   ┆ 0         ┆ 0         ┆ 0         ┆ … ┆ 0          ┆ 0         ┆ 0         ┆ Neural_Ne │
│         ┆           ┆           ┆           ┆   ┆            ┆           ┆           ┆ tworks    │
│ 1061127 ┆ 0         ┆ 0         ┆ 0         ┆ … ┆ 0          ┆ 0         ┆ 0         ┆ Rule_Lear │
│         ┆           ┆           ┆           ┆   ┆            ┆     

In [14]:
# Positional index -> column name lookups for the exported node and edge tables
nodes_idx_to_col_name = dict(enumerate(nodes_df.columns))  # type: ignore

edges_idx_to_col_name = dict(enumerate(edges_df.columns))  # type: ignore

In [None]:
def create_torch_geometric_data(
    graph: Graph,
    class_idx: int,
    node_col_idx: int,
    node_feature_start_idx: int,
    node_feature_end_idx: int,
    source_idx: int,
    target_idx: int,
) -> Data:
    """Convert ``graph`` into a PyTorch Geometric ``Data`` object.

    All ``*_idx`` arguments are 0-based column positions in the polars frames
    exported from ``graph`` (node list and edge list respectively).

    Args:
        graph: Graph whose nodes and edges are exported.
        class_idx: Position of the class column in the exported node list.
        node_col_idx: Position of the node identifier column in the node list.
        node_feature_start_idx: Position of the first feature column.
        node_feature_end_idx: Position one past the last feature column
            (half-open, as in ``range``). Callers holding the index of the
            last feature column must pass that index + 1.
        source_idx: Position of the source column in the exported edge list.
        target_idx: Position of the target column in the exported edge list.

    Returns:
        ``Data`` with ``x`` (num_nodes, num_features) as float, ``y``
        (num_nodes,) holding integer class ids assigned in sorted label order,
        and ``edge_index`` (2, num_edges) holding node row positions.

    Raises:
        ValueError: If an edge endpoint does not appear in the node list.
    """
    # Export as polars: the assertions and the polars calls below need polars
    # frames (requesting PANDAS_DF made the isinstance checks fail).
    nodes_df = graph.export_nodes_as_node_list(ExportFileFormat.POLARS_DF)
    edges_df = graph.export_edges_as_edge_list(ExportFileFormat.POLARS_DF)

    assert isinstance(nodes_df, pl.DataFrame)
    assert isinstance(edges_df, pl.DataFrame)

    # Resolve positions against the frames actually exported here rather than
    # notebook-level lookup tables that may be stale or undefined.
    node_columns = nodes_df.columns
    edge_columns = edges_df.columns
    node_id_col = node_columns[node_col_idx]
    class_col = node_columns[class_idx]
    feature_cols = [
        node_columns[i] for i in range(node_feature_start_idx, node_feature_end_idx)
    ]
    source_col = edge_columns[source_idx]
    target_col = edge_columns[target_idx]

    # Feature matrix. `DataFrame.item()` only works on 1x1 frames, so convert
    # the whole selection to a 2-D array instead.
    x = torch.tensor(nodes_df.select(feature_cols).to_numpy(), dtype=torch.float)

    # Class labels may be strings; encode them as consecutive integer ids in
    # sorted label order so the mapping is deterministic.
    class_values = nodes_df.get_column(class_col).to_list()
    class_to_id = {label: i for i, label in enumerate(sorted(set(class_values)))}
    y = torch.tensor([class_to_id[label] for label in class_values], dtype=torch.long)

    # PyG expects edge endpoints to be row positions in `x`, not arbitrary node
    # identifiers, so translate every endpoint through the node list order.
    node_ids = nodes_df.get_column(node_id_col).to_list()
    node_position = {node_id: pos for pos, node_id in enumerate(node_ids)}
    try:
        sources = [node_position[n] for n in edges_df.get_column(source_col).to_list()]
        targets = [node_position[n] for n in edges_df.get_column(target_col).to_list()]
    except KeyError as exc:
        raise ValueError(
            f"Edge endpoint {exc.args[0]!r} is not present in the node list"
        ) from exc

    # Built directly in (2, num_edges) layout, so no transpose is needed.
    edge_index = torch.tensor([sources, targets], dtype=torch.long)

    return Data(x=x, edge_index=edge_index, y=y)

In [None]:
# Reproducibility and training hyper-parameters
SEED = 42
TRAIN_FRACTION = 0.8  # assumption: random 80% of nodes used for training
N_EPOCHS = 200
torch.manual_seed(SEED)

# Build the PyG data from the imported graph (previously read from an undefined
# `dataset`). NODE_FEATURE_END_IDX is the index of the last feature column,
# while the helper takes a half-open range, hence the +1.
data = create_torch_geometric_data(
    graph=g,
    class_idx=CLASS_IDX,
    node_col_idx=NODE_IDX,
    node_feature_start_idx=NODE_FEATURE_START_IDX,
    node_feature_end_idx=NODE_FEATURE_END_IDX + 1,
    source_idx=SOURCE_IDX,
    target_idx=TARGET_IDX,
)

# The helper does not provide a split, so create the boolean training mask the
# loss below indexes with.
num_nodes = data.x.size(0)
shuffled_nodes = torch.randperm(num_nodes)
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[shuffled_nodes[: int(TRAIN_FRACTION * num_nodes)]] = True
data.train_mask = train_mask

# Initialize the GNN model with dimensions derived from the tensors it will see
input_dim = data.x.size(1)  # Number of features
output_dim = int(data.y.max()) + 1  # Number of classes (labels are 0..k-1)
hidden_dim = 16  # Example hidden dimension size
model = GNNModel(input_dim, hidden_dim, output_dim)

# Train the GNN model on the Cora dataset
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
for epoch in range(N_EPOCHS):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

In [None]:
# Step 7: Integrate the trained model with the LocalGNNAnalyzer
def predict_fn(node_features, edge_index):
    """Return the trained model's output for the given graph tensors.

    Args:
        node_features: Node feature matrix to feed to the model as ``x``.
        edge_index: Edge index tensor to feed to the model.

    Returns:
        The model output (per-node log-probabilities), computed without
        gradient tracking. The model is left in evaluation mode.
    """
    model.eval()
    with torch.no_grad():
        # Wrap the arguments so the prediction reflects what the caller passed
        # in; the previous version ignored both parameters and always used the
        # notebook-level `data`.
        out = model(Data(x=node_features, edge_index=edge_index))
    return out


# Assuming we run the ablation step for node 5 (as an example)
# NOTE(review): the starting node is passed as the string "5", while the
# exported node identifiers shown earlier are integers such as 31336 — confirm
# that this id exists and that the analyzer expects a string.
analyzer.prepare_ablation_plan(starting_node="5", max_depth=3)
while analyzer.has_next_step():
    # NOTE(review): the prediction is recomputed from the same `data.x` /
    # `data.edge_index` on every iteration. Unless execute_ablation_step
    # updates `data`, each step receives an identical prediction — TODO confirm
    # whether the tensors should be rebuilt from the ablated graph here.
    current_prediction = predict_fn(data.x, data.edge_index)
    analyzer.execute_ablation_step(current_prediction)

# Get the interpretation of the GNN's predictions
interpretation = analyzer.get_interpretation()
print(interpretation)