In [1]:
# PyTorch and PyTorch Geometric imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
import torch_geometric.utils as pyg_utils
from torch_geometric.utils import from_networkx

# NetworkX imports
import networkx as nx
from networkx.algorithms.centrality import betweenness_centrality

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression

# Other imports
import numpy as np
import matplotlib.pyplot as plt
import pickle as pkl
import random

# Custom imports
from Datasets.synthetics import BA_2grid, BA_2grid_house, ProbingDataset, BA_2grid_to_test
from models.models_BA_2grid import GIN_framework as framework

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
def set_seed(seed):
    if seed == -1:
        seed = random.randint(0, 1000)
    # Pandas also uses np random state by default
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

    # if you are using GPU
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [4]:
set_seed(37)

Model

In [5]:
MODEL = "GCN"
DATASET = "BA_2grid_house"

# import the model
from models.models_BA_2grid_house import GCN_framework as framework
# import the dataset
from Datasets.synthetics import BA_2grid_house

In [6]:
# inizialize the framework
dataset = BA_2grid_house()
gnn = framework(dataset,device="cpu")



In [7]:
#dataset length
len(dataset)

2000

In [None]:
#pritn the distribution of the number of nodes per graph in the dataset

# number of nodes per graph
nodes = [len(graph.x) for graph in dataset]
plt.hist(nodes, bins=range(0, max(nodes)+1, 1), alpha=0.75, rwidth=0.85, color='b', edgecolor='black')
plt.xlabel('Number of nodes')
plt.ylabel('Frequency')
plt.title('Number of nodes per graph')
plt.show()

#print the min and max number of nodes in the dataset
min(nodes), max(nodes)

In [None]:
#number of nodes of the first graph 
len(dataset[0].x)

In [None]:
# the gnn object contains the train test split and the model.

print(gnn.model)
print(gnn.train_idx)
print(gnn.dataset[gnn.train_idx])

In [7]:
#now that the model is instaziated, we have to load the weights
gnn.load_model("models/"+DATASET+"_"+MODEL)
print("we loaded the weights")
# right now the model has trained weights.
#print(gnn.model.mlp1.weight[0])

we loaded the weights


In [8]:
# train the model
gnn.evaluate()

Test Loss: 0.029, Train Acc: 0.971 Test Acc: 0.965


In [None]:
def compute_graph_properties(data):
    properties = []
    for graph_data in data:
        G = nx.from_edgelist(graph_data.edge_index.t().tolist())
        num_nodes = G.number_of_nodes()
        num_edges = G.number_of_edges()
        density = nx.density(G)
        avg_path_len = nx.average_shortest_path_length(G) if nx.is_connected(G) else None
        num_cliques = len(list(nx.find_cliques(G)))
        num_triangles = sum(nx.triangles(G).values()) / 3
        num_squares = sum(nx.square_clustering(G).values()) / 4
        number_of_nodes_in_the_largest_fully_connected_component = len(max(nx.connected_components(G), key=len))
        small_world = nx.algorithms.smallworld.sigma(G)

        properties.append((num_nodes, num_edges, density, avg_path_len, num_cliques, num_triangles, num_squares, number_of_nodes_in_the_largest_fully_connected_component, small_world))
    return properties

train_properties = compute_graph_properties(gnn.dataset[gnn.train_idx])
test_properties = compute_graph_properties(gnn.dataset[gnn.test_idx])

#save the properties in a file
# with open("results/"+DATASET+"_"+MODEL+"_train_properties.pkl", "wb") as f:
#     pkl.dump(train_properties, f)

# with open("results/"+DATASET+"_"+MODEL+"_test_properties.pkl", "wb") as f:
#     pkl.dump(test_properties, f)

In [None]:
print(train_properties[0:3])
print(test_properties[0:3])

In [None]:
#save the properties in a file
# with open("results/"+DATASET+"_"+MODEL+"_train_properties_special.pkl", "wb") as f:
#     pkl.dump(train_properties, f)

# with open("results/"+DATASET+"_"+MODEL+"_test_properties_special.pkl", "wb") as f:
#     pkl.dump(test_properties, f)

In [9]:
#open the properties
with open("results/"+DATASET+"_"+MODEL+"_train_properties_special.pkl", "rb") as f:
    train_properties = pkl.load(f)

with open("results/"+DATASET+"_"+MODEL+"_test_properties_special.pkl", "rb") as f:
    test_properties = pkl.load(f)

### Probing pipeline 

from extracting features to training a diagnostic classifier

In [10]:
"""
train_features and test_features are lists of lists. Each element correspond to a graph, each graph has 4 layers of features, and each features has a shape of (num_nodes, num_features) where each row is a node and each column is a feature. 
To probe our model we will use the four different layers and probe for node properties based on their corresponding row in the feature matrix.
"""
train_features, test_features = gnn.evaluate_with_features2(return_node_embeddings=True)

x1 shape: torch.Size([25, 60])
x2 shape: torch.Size([25, 60])
x3 shape: torch.Size([25, 60])
x4 shape: torch.Size([25, 60])
len of features:  4
features[0].shape:  torch.Size([25, 60])
features[1].shape:  torch.Size([25, 60])
features[2].shape:  torch.Size([25, 60])
features[3].shape:  torch.Size([25, 60])
train_features[0][0].shape:  (25, 60)
x1 shape: torch.Size([27, 60])
x2 shape: torch.Size([27, 60])
x3 shape: torch.Size([27, 60])
x4 shape: torch.Size([27, 60])
len of features:  4
features[0].shape:  torch.Size([27, 60])
features[1].shape:  torch.Size([27, 60])
features[2].shape:  torch.Size([27, 60])
features[3].shape:  torch.Size([27, 60])
train_features[0][0].shape:  (25, 60)
x1 shape: torch.Size([27, 60])
x2 shape: torch.Size([27, 60])
x3 shape: torch.Size([27, 60])
x4 shape: torch.Size([27, 60])
len of features:  4
features[0].shape:  torch.Size([27, 60])
features[1].shape:  torch.Size([27, 60])
features[2].shape:  torch.Size([27, 60])
features[3].shape:  torch.Size([27, 60])


In [40]:
train_features[0][3].shape

(25, 60)

In [38]:
len(train_features[0])

4

In [None]:
# Check the shape of the first graph's features
first_graph_features = train_features[0]
for i, feature in enumerate(first_graph_features):
    print(f"Feature {i+1} shape:", feature.shape)

In [49]:
len(train_features), len(test_features)

(1600, 400)

In [12]:
len(train_features[0])

4

In [18]:
len(train_features[1][0])

27

In [19]:
print("train_features[0][0].shape: ", train_features[0][0].shape)

train_features[0][0].shape:  (25, 60)


In [22]:
train_features[0][0][24].shape

(60,)

In [23]:
# Check the shape of the first graph's features
first_graph_features = test_features[0]
for i, feature in enumerate(first_graph_features):
    print(f"Feature {i+1} shape:", feature.shape)

Feature 1 shape: (26, 60)
Feature 2 shape: (26, 60)
Feature 3 shape: (26, 60)
Feature 4 shape: (26, 60)


In [47]:
# Extract x embeddings
train_x = np.array([feat[0] for graph_feat in train_features for feat in graph_feat])
test_x = np.array([feat[0] for graph_feat in test_features for feat in graph_feat])

# Extract 2, 3, 4 embeddings
train_x2 = np.array([feat[1] for graph_feat in train_features for feat in graph_feat])
test_x2 = np.array([feat[1] for graph_feat in test_features for feat in graph_feat])

train_x3 = np.array([feat[2] for graph_feat in train_features for feat in graph_feat])
test_x3 = np.array([feat[2] for graph_feat in test_features for feat in graph_feat])

train_x4 = np.array([feat[3] for graph_feat in train_features for feat in graph_feat])
test_x4 = np.array([feat[3] for graph_feat in test_features for feat in graph_feat])

In [43]:
class LinearModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearModel, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        return self.linear(x)


In [45]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Assuming you've already defined LinearModel class and computed node properties

# Define property names and embeddings
property_names = ['degree', 'clustering_coefficient', 'centrality']
embeddings = train_features  # Assuming train_features is a list of embeddings for each layer
test_embeddings = test_features  # Assuming test_features is a list of embeddings for each layer
embeddings_names = ['x1', 'x2', 'x3', 'x4']

# Create a dictionary to store the results
results = {}

for ii, (train_embedding, test_embedding) in enumerate(zip(embeddings, test_embeddings)):
    # Convert list of embeddings to a single numpy array
    train_embedding = np.concatenate(train_embedding)
    test_embedding = np.concatenate(test_embedding)
    
    input_size = train_embedding.shape[1]

    for i, property_name in enumerate(property_names):
        model = LinearModel(input_size, 1)  # Predicting one property at a time
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        num_epochs = 1000  # Adjust as needed

        # Convert to PyTorch tensors
        train_X = torch.FloatTensor(train_embedding)
        train_y = torch.FloatTensor(train_node_properties)[:, i].unsqueeze(1)
        test_X = torch.FloatTensor(test_embedding)
        test_y = torch.FloatTensor(test_node_properties)[:, i].unsqueeze(1)

        for epoch in range(num_epochs):
            model.train()
            optimizer.zero_grad()

            outputs = model(train_X)
            loss = criterion(outputs, train_y)
            loss.backward()
            optimizer.step()

            if (epoch+1) % 100 == 0:  # Adjust for desired print frequency
                print(f'Epoch [{epoch+1}/{num_epochs}], Embedding: {embeddings_names[ii]}, Property: {property_name}, Loss: {loss.item():.4f}')

        # Evaluate the model
        model.eval()
        with torch.no_grad():
            train_pred = model(train_X).cpu().numpy()
            test_pred = model(test_X).cpu().numpy()

            train_mse = mean_squared_error(train_y.cpu().numpy(), train_pred)
            test_mse = mean_squared_error(test_y.cpu().numpy(), test_pred)

            train_r2 = r2_score(train_y.cpu().numpy(), train_pred)
            test_r2 = r2_score(test_y.cpu().numpy(), test_pred)

            print(f'Embedding: {embeddings_names[ii]}, Shape: {train_embedding.shape}')
            print(f'Property: {property_name}')
            print(f'  Train MSE: {train_mse:.4f}, Test MSE: {test_mse:.4f}')
            print(f'  Train R²: {train_r2:.4f}, Test R²: {test_r2:.4f}')

            # Add the results to the dictionary
            results[(embeddings_names[ii], property_name)] = (train_mse, test_mse, train_r2, test_r2)

# Print summary of results
print("\nSummary of Results:")
for (embedding, property_name), (train_mse, test_mse, train_r2, test_r2) in results.items():
    print(f"Embedding: {embedding}, Property: {property_name}")
    print(f"  Test MSE: {test_mse:.4f}, Test R²: {test_r2:.4f}")
    print()

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\tomdu\miniconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\tomdu\AppData\Local\Temp\ipykernel_11028\3424013990.py", line 33, in <module>
    train_y = torch.FloatTensor(train_node_properties)[:, i].unsqueeze(1)
ValueError: expected sequence of length 25 at dim 2 (got 27)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\tomdu\miniconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2105, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "c:\Users\tomdu\miniconda3\lib\site-packages\IPython\core\ultratb.py", line 1428, in structured_traceback
    return FormattedTB.structured_traceback(
  File "c:\Users\tomdu\miniconda3\lib\site-packages\IPython\core\ultratb.py", line 1319, in structured_traceback
    return VerboseTB.structur

##### Testes of different posibilities

In [None]:
#check if the gnn.dataset[gnn.train_idx] is the same as the one in the dataset to check if the split is reproducible

# let's create the train and test loaders from dataset
idx = torch.arange(len(dataset))
train_idx, test_idx = train_test_split(idx, train_size=0.8, stratify=dataset.data.y,random_state=10)

train_loader = DataLoader(dataset[train_idx],batch_size=256)
test_loader = DataLoader(dataset[test_idx],batch_size=256)

# let's check if the loaders are the same as the one in the gnn object
for i, (data) in enumerate(train_loader):
    print(data)
    print(gnn.dataset[gnn.train_idx[i]])
    if i == 3: 
        break


In [None]:
# save gnn.dataset[gnn.train_idx] and gnn.dataset[gnn.test_idx] in a file
with open("models/"+DATASET+"_"+MODEL+"_train.pkl", "wb") as f:
    pkl.dump(gnn.dataset[gnn.train_idx], f)

with open("models/"+DATASET+"_"+MODEL+"_test.pkl", "wb") as f:
    pkl.dump(gnn.dataset[gnn.test_idx], f)

In [None]:
# load the data
with open("models/"+DATASET+"_"+MODEL+"_train.pkl", "rb") as f:
    train_data = pkl.load(f)

In [None]:
# Evaluate to get features
train_features, test_features = gnn.evaluate_with_features2()

# Extract x_global and x_lin1 embeddings for training set
train_x_global = np.array([feat[0] for feat in train_features])
train_x_lin1 = np.array([feat[1] for feat in train_features])

#take only the first 10 elements
# train_x_global = train_x_global[:10]
# train_x_lin1 = train_x_lin1[:10]

# Extract x_global and x_lin1 embeddings for test set
test_x_global = np.array([feat[0] for feat in test_features])
test_x_lin1 = np.array([feat[1] for feat in test_features])

#take only the first 10 elements
# test_x_global = test_x_global[:10]
# test_x_lin1 = test_x_lin1[:10]


In [None]:
#print the number of embeddings for the train and test set
print(len(train_x_global), len(train_x_lin1), len(test_x_global), len(test_x_lin1))

linear regression

In [None]:
import numpy as np

# Prepare training data
train_x = train_x_global  # or use train_x_lin1 if you prefer
train_y = np.array(train_properties)

# Prepare testing data
test_x = test_x_global  # or use test_x_lin1 if you prefer
test_y = np.array(test_properties)


In [None]:
# Train a regression model for each graph property
models = []
for i in range(train_y.shape[1]):
    model = LinearRegression()
    model.fit(train_x, train_y[:, i])
    models.append(model)

# Evaluate the models
for i, model in enumerate(models):
    train_pred = model.predict(train_x)
    test_pred = model.predict(test_x)
    
    train_mse = mean_squared_error(train_y[:, i], train_pred)
    test_mse = mean_squared_error(test_y[:, i], test_pred)
    
    print(f'Property {i}: Train MSE: {train_mse:.3f}, Test MSE: {test_mse:.3f}')


Linear model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class LinearModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearModel, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        return self.linear(x)


In [None]:
# Convert to PyTorch tensors
train_x = torch.tensor(train_x_global, dtype=torch.float32)
train_y = torch.tensor(train_properties, dtype=torch.float32)

test_x = torch.tensor(test_x_global, dtype=torch.float32)
test_y = torch.tensor(test_properties, dtype=torch.float32)


In [None]:
len(train_x)

In [None]:
train_y[:, 4]

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Define the linear model
class LinearModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearModel, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        return self.linear(x)

# Assume we have already evaluated to get features
# train_features, test_features = gnn.evaluate_with_features()

# Extract x_global embeddings
train_x_global = np.array([feat[0] for feat in train_features])
test_x_global = np.array([feat[0] for feat in test_features])

# Extract 5th, 6th and 7th embeddings
train_x5 = np.array([feat[1] for feat in train_features])
test_x5 = np.array([feat[1] for feat in test_features])

train_x6 = np.array([feat[2] for feat in train_features])
test_x6 = np.array([feat[2] for feat in test_features])

train_x7 = np.array([feat[3] for feat in train_features])
test_x7 = np.array([feat[3] for feat in test_features])

# Compute graph properties
train_properties = compute_graph_properties(gnn.dataset[gnn.train_idx])
test_properties = compute_graph_properties(gnn.dataset[gnn.test_idx])

# Convert to PyTorch tensors
train_x = torch.tensor(train_x_global, dtype=torch.float32)
train_y = torch.tensor(train_properties, dtype=torch.float32)

test_x = torch.tensor(test_x_global, dtype=torch.float32)
test_y = torch.tensor(test_properties, dtype=torch.float32)

train_x5 = torch.tensor(train_x5, dtype=torch.float32)
train_x6 = torch.tensor(train_x6, dtype=torch.float32)
train_x7 = torch.tensor(train_x7, dtype=torch.float32)



# Train and evaluate a model for each graph property and each embeddings
input_size = train_x.shape[1]
output_size = 1  # Predicting one property at a time

property_names = ['num_nodes', 'num_edges', 'density', 'avg_path_len', 'num_cliques']
""" 
for i, property_name in enumerate(property_names):
    model = LinearModel(input_size, output_size)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    num_epochs = 10000

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        
        outputs = model(train_x).squeeze()  # Ensure outputs is the correct shape
        target = train_y[:, i].squeeze()  # Ensure target is the correct shape

        # Debugging information
        #print(f"Epoch [{epoch+1}/{num_epochs}], Property: {property_name}")
        #print(f"outputs shape: {outputs.shape}, target shape: {target.shape}")

        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        
        if (epoch+1) % 1000 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    # Evaluate the model
    model.eval()
    with torch.no_grad():
        train_pred = model(train_x).squeeze().cpu().numpy()
        test_pred = model(test_x).squeeze().cpu().numpy()
        
        train_target = train_y[:, i].cpu().numpy()
        test_target = test_y[:, i].cpu().numpy()
        
        train_mse = mean_squared_error(train_target, train_pred)
        test_mse = mean_squared_error(test_target, test_pred)
        
        train_r2 = r2_score(train_target, train_pred)
        test_r2 = r2_score(test_target, test_pred)
        
        print(f'Property: {property_name}')
        print(f'  Train MSE: {train_mse:.4f}, Test MSE: {test_mse:.4f}')
        print(f'  Train R²: {train_r2:.4f}, Test R²: {test_r2:.4f}') """

# Train and evaluate a model for each graph property and each embeddings
input_size = train_x.shape[1]
output_size = 1  # Predicting one property at a time

property_names = ['num_nodes', 'num_edges', 'density', 'avg_path_len', 'num_cliques', 'num_triangles', 'num_squares', 'number_of_nodes_in_the_largest_fully_connected_component']

embeddings = [train_x, train_x5, train_x6, train_x7]

for embedding in embeddings:
    # Train and evaluate a model for each graph property
    for i, property_name in enumerate(property_names):
        model = LinearModel(input_size, output_size)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        num_epochs = 10000

        for epoch in range(num_epochs):
            model.train()
            optimizer.zero_grad()

            outputs = model(embedding).squeeze()
            target = train_y[:, i].squeeze()

            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()

            if (epoch+1) % 1000 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

        # Evaluate the model
        model.eval()
        with torch.no_grad():
            train_pred = model(embedding).squeeze().cpu().numpy()
            test_pred = model(test_x).squeeze().cpu().numpy()

            train_target = train_y[:, i].cpu().numpy()
            test_target = test_y[:, i].cpu().numpy()

            train_mse = mean_squared_error(train_target, train_pred)
            test_mse = mean_squared_error(test_target, test_pred)

            train_r2 = r2_score(train_target, train_pred)
            test_r2 = r2_score(test_target, test_pred)

            print(f'Embedding: {embedding}')
            print(f'Property: {property_name}')
            print(f'  Train MSE: {train_mse:.4f}, Test MSE: {test_mse:.4f}')
            print(f'  Train R²: {train_r2:.4f}, Test R²: {test_r2:.4f}')


### Diagnostic classifier

In [None]:
"""
The embeddings of GCN are like this:
return F.log_softmax(x7, dim=-1), (x1, x2, x3, x4, x_global, x5, x6, x7)
"""

output_size = 1  # Predicting one property at a time
# Define the linear model
class LinearModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearModel, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        return self.linear(x)

# Assume we have already evaluated to get features
# train_features, test_features = gnn.evaluate_with_features2()

# Extract x embeddings
train_x = np.array([feat[0] for feat in train_features])
test_x = np.array([feat[0] for feat in test_features])

# Extract 2, 3, 4, global, 5, 6, 7 embeddings
train_x2 = np.array([feat[1] for feat in train_features])
test_x2 = np.array([feat[1] for feat in test_features])

train_x3 = np.array([feat[2] for feat in train_features])
test_x3 = np.array([feat[2] for feat in test_features])

train_x4 = np.array([feat[3] for feat in train_features])
test_x4 = np.array([feat[3] for feat in test_features])

train_x_global = np.array([feat[4] for feat in train_features])
test_x_global = np.array([feat[4] for feat in test_features])

train_x5 = np.array([feat[5] for feat in train_features])
test_x5 = np.array([feat[5] for feat in test_features])

train_x6 = np.array([feat[6] for feat in train_features])
test_x6 = np.array([feat[6] for feat in test_features])

train_x7 = np.array([feat[7] for feat in train_features])
test_x7 = np.array([feat[7] for feat in test_features])

# Compute graph properties
# train_properties = compute_graph_properties(gnn.dataset[gnn.train_idx])
# test_properties = compute_graph_properties(gnn.dataset[gnn.test_idx])

# Convert to PyTorch tensors
train_x = torch.tensor(train_x, dtype=torch.float32)
train_x2 = torch.tensor(train_x2, dtype=torch.float32)
train_x3 = torch.tensor(train_x3, dtype=torch.float32)
train_x4 = torch.tensor(train_x4, dtype=torch.float32)
train_x_global = torch.tensor(train_x_global, dtype=torch.float32)
train_x5 = torch.tensor(train_x5, dtype=torch.float32)
train_x6 = torch.tensor(train_x6, dtype=torch.float32)
train_x7 = torch.tensor(train_x7, dtype=torch.float32)

test_x = torch.tensor(test_x, dtype=torch.float32)
test_x2 = torch.tensor(test_x2, dtype=torch.float32)
test_x3 = torch.tensor(test_x3, dtype=torch.float32)
test_x4 = torch.tensor(test_x4, dtype=torch.float32)
test_x_global = torch.tensor(test_x_global, dtype=torch.float32)
test_x5 = torch.tensor(test_x5, dtype=torch.float32)
test_x6 = torch.tensor(test_x6, dtype=torch.float32)
test_x7 = torch.tensor(test_x7, dtype=torch.float32)

train_y = torch.tensor(train_properties, dtype=torch.float32)
test_y = torch.tensor(test_properties, dtype=torch.float32)

# Train and evaluate a model for each graph property and each embedding
property_names = ['num_nodes', 'num_edges', 'density', 'avg_path_len', 'num_cliques', 'num_triangles', 'num_squares', 'number_of_nodes_in_the_largest_fully_connected_component', 'small_world']
embeddings = [(train_x, test_x), (train_x2, test_x2), (train_x3, test_x3), (train_x4, test_x4), (train_x_global, test_x_global), (train_x5, test_x5), (train_x6, test_x6), (train_x7, test_x7)]
embeddings_names = ['x', 'x2', 'x3', 'x4', 'x_global', 'x5', 'x6', 'x7']

#create a dictionary where we will sotre the results for each embeddings, each property
results = {}

ii = 0

for train_embedding, test_embedding in embeddings:
    input_size = train_embedding.shape[1]

    for i, property_name in enumerate(property_names):
        model = LinearModel(input_size, output_size)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        num_epochs = 10000  # Adjust this as needed

        for epoch in range(num_epochs):
            model.train()
            optimizer.zero_grad()

            outputs = model(train_embedding).squeeze()
            target = train_y[:, i].squeeze()

            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()

            if (epoch+1) % 1000 == 0:  # Adjust this for more frequent/lower print frequency
                print(f'Epoch [{epoch+1}/{num_epochs}], Property: {property_name}, Loss: {loss.item():.4f}')

        # Evaluate the model
        model.eval()
        with torch.no_grad():
            train_pred = model(train_embedding).squeeze().cpu().numpy()
            test_pred = model(test_embedding).squeeze().cpu().numpy()

            train_target = train_y[:, i].cpu().numpy()
            test_target = test_y[:, i].cpu().numpy()

            train_mse = mean_squared_error(train_target, train_pred)
            test_mse = mean_squared_error(test_target, test_pred)

            train_r2 = r2_score(train_target, train_pred)
            test_r2 = r2_score(test_target, test_pred)

            print(f'Embedding: {train_embedding.shape}')
            print(f'Property: {property_name}')
            print(f'  Train MSE: {train_mse:.4f}, Test MSE: {test_mse:.4f}')
            print(f'  Train R²: {train_r2:.4f}, Test R²: {test_r2:.4f}')

            #add the results to the dictionary
            name_of_embdedding = embeddings_names[ii]
            results[(name_of_embdedding, property_name)] = (train_mse, test_mse, train_r2, test_r2)

    ii += 1

#save the results in a file
# with open("results/"+DATASET+"_"+MODEL+"_results.pkl", "wb") as f:
#     pkl.dump(results, f)

In [None]:
#load the results
with open("results/"+DATASET+"_"+MODEL+"_results.pkl", "rb") as f:
    results = pkl.load(f)

In [None]:
import matplotlib.pyplot as plt

# Assuming results, embeddings, and other necessary variables are defined as in your context
property_names = ['num_nodes', 'num_edges', 'density', 'avg_path_len', 'num_cliques', 'num_triangles', 'num_squares', 'number_of_nodes_in_the_largest_fully_connected_component', 'small_world']
embeddings = [(train_x, test_x), (train_x2, test_x2), (train_x3, test_x3), (train_x4, test_x4), (train_x_global, test_x_global), (train_x5, test_x5), (train_x6, test_x6), (train_x7, test_x7)]
embeddings_names = ['x', 'x2', 'x3', 'x4', 'x_global', 'x5', 'x6', 'x7']
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'tab:orange', 'tab:purple']

plt.figure(figsize=(12, 8))

for i, property_name in enumerate(property_names):
    x_points = []
    y_points = []
    for j, embedding in enumerate(embeddings):
        name_of_embedding = embeddings_names[j]
        test_r2 = results[(name_of_embedding, property_name)][3]
        if test_r2 < -0.05:  # Handle negative R² values
            test_r2 = -0.05
        x_points.append(j)
        y_points.append(test_r2)
    
    # Plotting the line for the current property
    plt.plot(x_points, y_points, label=property_name, color=colors[i], marker='x')

plt.xticks(range(len(embeddings)), embeddings_names)
plt.xlabel('Embedding')
plt.ylabel('R²')
plt.legend()
plt.title('R² for different embeddings and properties')
plt.show()


### Node embedding probing

This time we will try to probe for node embedding learnt representations. What would make sense is that the GNN uses some node characteristics like the betweeness centrality of some nodes to predict if a graph could contain both a grid and a house. 

In [44]:
# Define function to compute node-level properties
def compute_node_properties(data):
    properties = []
    for graph_data in data:
        G = nx.from_edgelist(graph_data.edge_index.t().tolist())
        node_degrees = list(dict(G.degree()).values())
        clustering_coeffs = list(nx.clustering(G).values())
        betweenness_centralities = list(nx.betweenness_centrality(G).values())
        eigenvector_centralities = list(nx.eigenvector_centrality(G, max_iter=10000).values())
        Local_clustering_coefficients = list(nx.clustering(G).values())

        properties.append((node_degrees, clustering_coeffs, betweenness_centralities, eigenvector_centralities, Local_clustering_coefficients))
    return properties

# Compute node-level properties for train and test sets
train_node_properties = compute_node_properties(gnn.dataset[gnn.train_idx])
test_node_properties = compute_node_properties(gnn.dataset[gnn.test_idx])

#train_node_properties is a list of tuples, where each tuple contains 5 lists, each list contains the node-level property for each node in the graph

In [None]:
len(train_node_properties), len(test_node_properties)

In [None]:
#visualise the first graph of the test set to see if the properties are correct
G = nx.from_edgelist(gnn.dataset[gnn.test_idx][0].edge_index.t().tolist())
nx.draw(G, with_labels=True)

#print the properties of the first graph
print(test_node_properties[0])

In [None]:
#represent the properties of betweenness centrality on the graph by chaging the size of the nodes
betweenness_centrality = test_node_properties[0][2]
node_degrees = test_node_properties[0][0]

plt.figure(figsize=(10, 10))
nx.draw(G, with_labels=True, node_size=[v * 1000 for v in betweenness_centrality], node_color=node_degrees, cmap='viridis')
plt.show()


In [None]:
#same for eigenvector centrality
eigenvector_centralities = test_node_properties[0][3]

plt.figure(figsize=(10, 10))
nx.draw(G, with_labels=True, node_size=[v * 1000 for v in eigenvector_centralities], node_color=node_degrees, cmap='viridis')
plt.show()

In [None]:
#same for local clustering coefficients
Local_clustering_coefficients = test_node_properties[0][4]

plt.figure(figsize=(10, 10))
nx.draw(G, with_labels=True, node_size=[v * 1000 for v in Local_clustering_coefficients], node_color=node_degrees, cmap='viridis')
plt.show()

Hypothesis : it would make a lot of sense for the GNN to use both the node propeties of local clustering coefficient and eigenvector centrality. The interesting thing is that these two features individually are not enough to render he problem linearly separable. It would thus make sense to use both of these properties and combine them. At the end, the problem makes more sense to be solved in a global property fashion with the number of squares. 

Test avec concatenate

In [None]:
train_x2.shape

In [None]:
# Concatenate properties with embeddings
def concatenate_embeddings_with_properties(node_embeddings, node_properties):
    concatenated_features = []
    for embeddings, properties in zip(node_embeddings, node_properties):
        combined = [np.concatenate((embedding, np.array(property))) for embedding, property in zip(embeddings, zip(*properties))]
        concatenated_features.append(combined)
    return concatenated_features

train_combined_features = concatenate_embeddings_with_properties(train_x2, train_node_properties)
test_combined_features = concatenate_embeddings_with_properties(test_x2, test_node_properties)


test avec extend() and flatten

Here we use padding in order to flatten the inconsistent of train_node_properties_flat and test_node_properties_flat. 

In [None]:
import numpy as np
import torch

# Padding and Masking for Consistency
def pad_sequences(sequences, pad_value=0):
    max_length = max(len(seq) for seq in sequences)
    padded_sequences = []
    for seq in sequences:
        padded_seq = np.pad(seq, (0, max_length - len(seq)), 'constant', constant_values=pad_value)
        padded_sequences.append(padded_seq)
    return np.array(padded_sequences)

# Pad node embeddings
train_x_padded = pad_sequences(train_features)
test_x_padded = pad_sequences(test_features)

# Pad node properties
train_y_padded = pad_sequences(train_node_properties)
test_y_padded = pad_sequences(test_node_properties)

# Convert to PyTorch tensors
train_x_tensor = torch.tensor(train_x_padded, dtype=torch.float32)
test_x_tensor = torch.tensor(test_x_padded, dtype=torch.float32)

train_y_tensor = torch.tensor(train_y_padded, dtype=torch.float32)
test_y_tensor = torch.tensor(test_y_padded, dtype=torch.float32)


In [None]:
#print the lendgth of the elements of train_node_properties_flat

for i in range(len(train_node_properties_flat)):
    print(len(train_node_properties_flat[i]))

Try the diag

In [None]:
# Extract x embeddings
train_x = np.array([feat[0] for feat in train_features])
test_x = np.array([feat[0] for feat in test_features])

# Extract 2, 3, 4 embeddings
train_x2 = np.array([feat[1] for feat in train_features])
test_x2 = np.array([feat[1] for feat in test_features])

train_x3 = np.array([feat[2] for feat in train_features])
test_x3 = np.array([feat[2] for feat in test_features])

train_x4 = np.array([feat[3] for feat in train_features])
test_x4 = np.array([feat[3] for feat in test_features])

# Convert to PyTorch tensors
train_x = torch.tensor(train_x, dtype=torch.float32)
train_x2 = torch.tensor(train_x2, dtype=torch.float32)
train_x3 = torch.tensor(train_x3, dtype=torch.float32)
train_x4 = torch.tensor(train_x4, dtype=torch.float32)

test_x = torch.tensor(test_x, dtype=torch.float32)
test_x2 = torch.tensor(test_x2, dtype=torch.float32)
test_x3 = torch.tensor(test_x3, dtype=torch.float32)
test_x4 = torch.tensor(test_x4, dtype=torch.float32)

# Compute node-level properties for train and test sets
train_node_properties = compute_node_properties(gnn.dataset[gnn.train_idx])
test_node_properties = compute_node_properties(gnn.dataset[gnn.test_idx])

# Convert properties to tensors
train_y_node = torch.tensor(train_node_properties, dtype=torch.float32)
test_y_node = torch.tensor(test_node_properties, dtype=torch.float32)

# Train and evaluate a model for each graph property and each embedding
property_names = ['degree', 'clustering_coeff', 'betweenness_centrality', 'eigenvector_centrality', 'local_clustering_coeff']
embeddings = [(train_x, test_x), (train_x2, test_x2), (train_x3, test_x3), (train_x4, test_x4)]
embeddings_names = ['x', 'x2', 'x3', 'x4']

#### Diagnostic classifier

In [None]:
# Assume we have already evaluated to get features
# train_features, test_features = gnn.evaluate_with_features2()

# Extract x embeddings
train_x = np.array([feat[0] for feat in train_features])
test_x = np.array([feat[0] for feat in test_features])

# Extract 2, 3, 4, global, 5, 6, 7 embeddings
train_x2 = np.array([feat[1] for feat in train_features])
test_x2 = np.array([feat[1] for feat in test_features])

train_x3 = np.array([feat[2] for feat in train_features])
test_x3 = np.array([feat[2] for feat in test_features])

train_x4 = np.array([feat[3] for feat in train_features])
test_x4 = np.array([feat[3] for feat in test_features])

train_x_global = np.array([feat[4] for feat in train_features])
test_x_global = np.array([feat[4] for feat in test_features])

train_x5 = np.array([feat[5] for feat in train_features])
test_x5 = np.array([feat[5] for feat in test_features])

train_x6 = np.array([feat[6] for feat in train_features])
test_x6 = np.array([feat[6] for feat in test_features])

train_x7 = np.array([feat[7] for feat in train_features])
test_x7 = np.array([feat[7] for feat in test_features])

# Compute graph properties
# train_properties = compute_graph_properties(gnn.dataset[gnn.train_idx])
# test_properties = compute_graph_properties(gnn.dataset[gnn.test_idx])

# Convert to PyTorch tensors
train_x = torch.tensor(train_x, dtype=torch.float32)
train_x2 = torch.tensor(train_x2, dtype=torch.float32)
train_x3 = torch.tensor(train_x3, dtype=torch.float32)
train_x4 = torch.tensor(train_x4, dtype=torch.float32)
train_x_global = torch.tensor(train_x_global, dtype=torch.float32)
train_x5 = torch.tensor(train_x5, dtype=torch.float32)
train_x6 = torch.tensor(train_x6, dtype=torch.float32)
train_x7 = torch.tensor(train_x7, dtype=torch.float32)

test_x = torch.tensor(test_x, dtype=torch.float32)
test_x2 = torch.tensor(test_x2, dtype=torch.float32)
test_x3 = torch.tensor(test_x3, dtype=torch.float32)
test_x4 = torch.tensor(test_x4, dtype=torch.float32)
test_x_global = torch.tensor(test_x_global, dtype=torch.float32)
test_x5 = torch.tensor(test_x5, dtype=torch.float32)
test_x6 = torch.tensor(test_x6, dtype=torch.float32)
test_x7 = torch.tensor(test_x7, dtype=torch.float32)

# train_y_node = torch.tensor(train_node_properties, dtype=torch.float32)
# test_y_node = torch.tensor(test_node_properties, dtype=torch.float32)

# Train and evaluate a model for each graph property and each embedding
property_names = ['num_nodes', 'num_edges', 'density', 'avg_path_len', 'num_cliques', 'num_triangles', 'num_squares', 'number_of_nodes_in_the_largest_fully_connected_component', 'small_world']
embeddings = [(train_x, test_x), (train_x2, test_x2), (train_x3, test_x3), (train_x4, test_x4), (train_x_global, test_x_global), (train_x5, test_x5), (train_x6, test_x6), (train_x7, test_x7)]
embeddings_names = ['x', 'x2', 'x3', 'x4', 'x_global', 'x5', 'x6', 'x7']

In [None]:
#compare train_node_properties and train_propertiesn their length
print(train_node_properties[0])
print(train_properties[0])

print(len(train_node_properties), len(train_properties))

In [None]:
train_y_node = torch.tensor(train_node_properties, dtype=torch.float32)

In [48]:
#check the length of the embeddings for train_x, test_x, train_x2, test_x2, train_x3, test_x3, train_x4, test_x4
len(train_x), len(test_x), len(train_x2), len(test_x2), len(train_x3), len(test_x3), len(train_x4), len(test_x4)

(6400, 1600, 6400, 1600, 6400, 1600, 6400, 1600)

In [None]:
train_x = np.array([feat[0] for feat in train_features])

print(len(train_x))
print(train_x[0])
print(train_x[0].shape)

In [None]:
#pritn the distribution of the number of elements per embeddings in train_x
elements = [len(feat) for feat in train_x2]
plt.hist(elements, bins=range(0, max(elements)+1, 1), alpha=0.75, rwidth=0.85, color='b', edgecolor='black')
plt.xlabel('Number of elements')
plt.ylabel('Frequency')
plt.title('Number of elements per embeddings in train_x')
plt.show()

#print the min and max number of elements in the embeddings in train_x
min(elements), max(elements), len(elements)

In [None]:
# Flatten node properties into a single list for training
train_node_degrees = np.concatenate([props[0] for props in train_node_properties])
train_clustering_coeffs = np.concatenate([props[1] for props in train_node_properties])
train_betweenness_centralities = np.concatenate([props[2] for props in train_node_properties])

test_node_degrees = np.concatenate([props[0] for props in test_node_properties])
test_clustering_coeffs = np.concatenate([props[1] for props in test_node_properties])
test_betweenness_centralities = np.concatenate([props[2] for props in test_node_properties])

# Convert node properties to PyTorch tensors
train_y_degrees = torch.tensor(train_node_degrees, dtype=torch.float32)
train_y_clustering = torch.tensor(train_clustering_coeffs, dtype=torch.float32)
train_y_betweenness = torch.tensor(train_betweenness_centralities, dtype=torch.float32)

test_y_degrees = torch.tensor(test_node_degrees, dtype=torch.float32)
test_y_clustering = torch.tensor(test_clustering_coeffs, dtype=torch.float32)
test_y_betweenness = torch.tensor(test_betweenness_centralities, dtype=torch.float32)

# Define node properties names and graph properties names
node_properties_names = ['node_degrees', 'clustering_coeffs', 'betweenness_centralities']
embeddings_for_node_properties = [(train_x, test_x), (train_x2, test_x2), (train_x3, test_x3), (train_x4, test_x4)]

In [None]:
# Define the node property tensors for training and testing
train_y_list = [train_y_degrees, train_y_clustering, train_y_betweenness]
test_y_list = [test_y_degrees, test_y_clustering, test_y_betweenness]

# Train and evaluate model for node-level properties
for train_embedding, test_embedding in embeddings_for_node_properties:
    input_size = train_embedding.shape[2]  # Since train_embedding is 3D: [num_graphs, num_nodes, embedding_dim]

    for i, property_name in enumerate(node_properties_names):
        model = LinearModel(input_size, output_size=1)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        num_epochs = 100  # Adjust this as needed

        train_y = train_y_list[i]
        test_y = test_y_list[i]

        for epoch in range(num_epochs):
            model.train()
            optimizer.zero_grad()

            # Flatten the embeddings and the targets to handle as a batch
            train_embedding_flat = train_embedding.view(-1, input_size)
            train_y_flat = train_y.view(-1)

            outputs = model(train_embedding_flat).squeeze()
            target = train_y_flat.squeeze()

            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()

            if (epoch+1) % 10 == 0:  # Adjust this for more frequent/lower print frequency
                print(f'Epoch [{epoch+1}/{num_epochs}], Property: {property_name}, Loss: {loss.item():.4f}')

        # Evaluate the model
        model.eval()
        with torch.no_grad():
            train_pred = model(train_embedding_flat).squeeze().cpu().numpy()
            test_embedding_flat = test_embedding.view(-1, input_size)
            test_pred = model(test_embedding_flat).squeeze().cpu().numpy()

            train_target = train_y_flat.cpu().numpy()
            test_target = test_y.view(-1).cpu().numpy()

            train_mse = mean_squared_error(train_target, train_pred)
            test_mse = mean_squared_error(test_target, test_pred)

            train_r2 = r2_score(train_target, train_pred)
            test_r2 = r2_score(test_target, test_pred)

            print(f'Embedding: {train_embedding.shape}')
            print(f'Node Property: {property_name}')
            print(f'  Train MSE: {train_mse:.4f}, Test MSE: {test_mse:.4f}')
            print(f'  Train R²: {train_r2:.4f}, Test R²: {test_r2:.4f}')

In [None]:
class LinearModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearModel, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        return self.linear(x)


In [None]:
node_properties = ['node_degrees', 'clustering_coeffs', 'betweenness_centralities']
node_y_train = [train_y_degrees, train_y_clustering, train_y_betweenness]
node_y_test = [test_y_degrees, test_y_clustering, test_y_betweenness]

embeddings = [(train_x, test_x), (train_x2, test_x2), (train_x3, test_x3), (train_x4, test_x4)]
embeddings_names = ['x', 'x2', 'x3', 'x4']

for embedding_name, (train_embedding, test_embedding) in zip(embeddings_names, embeddings):
    input_size = train_embedding.shape[1]

    for property_name, train_y, test_y in zip(node_properties, node_y_train, node_y_test):
        model = LinearModel(input_size, 1)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        num_epochs = 100  # Adjust this as needed

        for epoch in range(num_epochs):
            model.train()
            optimizer.zero_grad()

            outputs = model(train_embedding).squeeze()
            target = train_y.squeeze()

            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()

            if (epoch+1) % 10 == 0:  # Adjust this for more frequent/lower print frequency
                print(f'Epoch [{epoch+1}/{num_epochs}], Embedding: {embedding_name}, Property: {property_name}, Loss: {loss.item():.4f}')

        # Evaluate the model
        model.eval()
        with torch.no_grad():
            train_pred = model(train_embedding).squeeze().cpu().numpy()
            test_pred = model(test_embedding).squeeze().cpu().numpy()

            train_target = train_y.cpu().numpy()
            test_target = test_y.cpu().numpy()

            train_mse = mean_squared_error(train_target, train_pred)
            test_mse = mean_squared_error(test_target, test_pred)

            train_r2 = r2_score(train_target, train_pred)
            test_r2 = r2_score(test_target, test_pred)

            print(f'Embedding: {embedding_name}, Node Property: {property_name}')
            print(f'  Train MSE: {train_mse:.4f}, Test MSE: {test_mse:.4f}')
            print(f'  Train R²: {train_r2:.4f}, Test R²: {test_r2:.4f}')


In [None]:
# Extract node-level embeddings
# This example assumes the node-level embeddings are already extracted
# and the train_node_properties and test_node_properties are computed as before

# Flatten node properties into a single list for training
train_node_degrees = np.concatenate([props[0] for props in train_node_properties])
train_clustering_coeffs = np.concatenate([props[1] for props in train_node_properties])
train_betweenness_centralities = np.concatenate([props[2] for props in train_node_properties])

test_node_degrees = np.concatenate([props[0] for props in test_node_properties])
test_clustering_coeffs = np.concatenate([props[1] for props in test_node_properties])
test_betweenness_centralities = np.concatenate([props[2] for props in test_node_properties])

# Convert node properties to PyTorch tensors
train_y_degrees = torch.tensor(train_node_degrees, dtype=torch.float32)
train_y_clustering = torch.tensor(train_clustering_coeffs, dtype=torch.float32)
train_y_betweenness = torch.tensor(train_betweenness_centralities, dtype=torch.float32)

test_y_degrees = torch.tensor(test_node_degrees, dtype=torch.float32)
test_y_clustering = torch.tensor(test_clustering_coeffs, dtype=torch.float32)
test_y_betweenness = torch.tensor(test_betweenness_centralities, dtype=torch.float32)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error, r2_score

# Define the linear model for node-level properties
class LinearModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearModel, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        return self.linear(x)

node_properties = ['node_degrees', 'clustering_coeffs', 'betweenness_centralities']
node_y_train = [train_y_degrees, train_y_clustering, train_y_betweenness]
node_y_test = [test_y_degrees, test_y_clustering, test_y_betweenness]

embeddings = [(train_x, test_x), (train_x2, test_x2), (train_x3, test_x3), (train_x4, test_x4)]
embeddings_names = ['x', 'x2', 'x3', 'x4']

for embedding_name, (train_embedding, test_embedding) in zip(embeddings_names, embeddings):
    input_size = train_embedding.shape[1]

    for property_name, train_y, test_y in zip(node_properties, node_y_train, node_y_test):
        model = LinearModel(input_size, 1)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        num_epochs = 100  # Adjust this as needed

        for epoch in range(num_epochs):
            model.train()
            optimizer.zero_grad()

            outputs = model(train_embedding).squeeze()
            target = train_y.squeeze()

            # Debugging information
            print(f"Epoch [{epoch+1}/{num_epochs}], Embedding: {embedding_name}, Property: {property_name}")
            print(f"outputs shape: {outputs.shape}, target shape: {target.shape}")

            if outputs.shape != target.shape:
                print(f"Shape mismatch: outputs shape {outputs.shape}, target shape {target.shape}")
                continue

            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()

            if (epoch+1) % 10 == 0:  # Adjust this for more frequent/lower print frequency
                print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

        # Evaluate the model
        model.eval()
        with torch.no_grad():
            train_pred = model(train_embedding).squeeze().cpu().numpy()
            test_pred = model(test_embedding).squeeze().cpu().numpy()

            train_target = train_y.cpu().numpy()
            test_target = test_y.cpu().numpy()

            train_mse = mean_squared_error(train_target, train_pred)
            test_mse = mean_squared_error(test_target, test_pred)

            train_r2 = r2_score(train_target, train_pred)
            test_r2 = r2_score(test_target, test_pred)

            print(f'Embedding: {embedding_name}, Node Property: {property_name}')
            print(f'  Train MSE: {train_mse:.4f}, Test MSE: {test_mse:.4f}')
            print(f'  Train R²: {train_r2:.4f}, Test R²: {test_r2:.4f}')


In [None]:
# Define function to compute node-level properties
def compute_node_properties(data):
    properties = []
    for graph_data in data:
        G = nx.from_edgelist(graph_data.edge_index.t().tolist())
        node_degrees = list(dict(G.degree()).values())
        clustering_coeffs = list(nx.clustering(G).values())
        betweenness_centralities = list(nx.betweenness_centrality(G).values())

        properties.append((node_degrees, clustering_coeffs, betweenness_centralities))
    return properties

# Compute node-level properties for train and test sets
train_node_properties = compute_node_properties(gnn.dataset[gnn.train_idx])
test_node_properties = compute_node_properties(gnn.dataset[gnn.test_idx])

In [None]:


# Evaluate to get node-level features
train_node_features, test_node_features = gnn.evaluate_with_features()
