In [1]:
import numpy as np
import torch
from torch_geometric.data import Data
import networkx as nx
from pyvis.network import Network

  from .autonotebook import tqdm as notebook_tqdm


In [57]:
import random
import copy
from torch_geometric.nn import GCNConv, GATConv
import torch.nn as nn

In [29]:
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torch_geometric.utils import to_networkx


In [4]:
# Read the incidence matrix from a plain-text file in the working directory
# (np.loadtxt defaults: whitespace-separated values parsed as floats).
inc_matrix_aug = np.loadtxt("Aug_inc_matrix")

In [5]:
# Force a 2-D layout with 50 columns; the row count is inferred from the data.
# Rows are later treated as nodes and columns as edges.
inc_matrix_aug = inc_matrix_aug.reshape(-1,50)

In [6]:
inc_matrix_aug.shape

(45, 50)

In [7]:
num_nodes, num_edges = inc_matrix_aug.shape

# --- Step 2: Convert to edge_index for PyG (multi-edges allowed) ---
# Each column of the oriented incidence matrix encodes one directed edge:
# the row holding +1 is the source node, the row holding -1 is the destination.
edge_list = []
for j in range(num_edges):
    col = inc_matrix_aug[:, j]
    src = np.flatnonzero(col == 1)
    dst = np.flatnonzero(col == -1)
    # Columns that do not contain exactly one +1 and one -1 are skipped.
    if len(src) == 1 and len(dst) == 1:
        edge_list.append((int(src[0]), int(dst[0])))  # directed edge

edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()  # shape [2, kept_edges]

# One-hot identity features, sized from the matrix itself rather than a
# hard-coded 45 so the cell keeps working if the input file changes.
x = torch.eye(num_nodes, dtype=torch.float)

# --- Step 3: Create PyG Data object ---
data_inp = Data(x=x, edge_index=edge_index)

# --- Step 4: Visualize with Pyvis ---
# G = nx.MultiDiGraph()
# edge_tuples = edge_index.t().tolist()
# G.add_edges_from(edge_tuples)

# # Assign label, color, and tooltip (identity vector)
# for node in G.nodes():
#     G.nodes[node]["label"] = str(node)
#     G.nodes[node]["title"] = f"Feature: {x[node].tolist()}"
#     G.nodes[node]["color"] = "green" if node < 26 else "blue"

# # Create Pyvis graph
# net = Network(height='600px', width='100%', directed=True, notebook=True)
# net.from_nx(G)
# net.save_graph("incidence_multigraph.html")


In [8]:
def generate_connected_subgraphs(G, k, n, seed=None):
    """Sample up to ``n`` weakly connected subgraphs of ``G``, each missing ``k`` nodes.

    Args:
        G: directed NetworkX graph to sample from; it is copied, never modified.
        k: number of nodes removed from each sampled copy.
        n: number of connected subgraphs wanted.
        seed: when not None, reseeds Python's global ``random`` module first,
            which also affects later ``random`` calls elsewhere.

    Returns:
        List of graph copies. It may hold fewer than ``n`` entries if
        ``100 * n`` attempts were not enough, and it may contain duplicates
        because every attempt is drawn independently.

    Raises:
        ValueError: if ``k`` is not smaller than the number of nodes in ``G``.
    """
    if seed is not None:
        random.seed(seed)

    if G.number_of_nodes() <= k:
        raise ValueError("Cannot remove more nodes than exist in the graph.")

    node_pool = list(G.nodes())
    found = []

    # Bounded number of tries so an unlucky graph cannot loop forever.
    for _ in range(100 * n):
        if len(found) >= n:
            break

        candidate = G.copy()
        candidate.remove_nodes_from(random.sample(node_pool, k))

        if nx.is_weakly_connected(candidate):
            found.append(candidate)

    return found

In [9]:
def pyg_data_to_nx_multigraph(data):
    """Convert a PyG ``Data`` object into a ``networkx.MultiDiGraph``.

    Every node index in ``range(data.num_nodes)`` is added with its feature
    row stored as a plain list under the ``"x"`` attribute. Every column of
    ``data.edge_index`` becomes one directed edge, so parallel edges are kept.
    """
    graph = nx.MultiDiGraph()

    # Nodes first, so isolated nodes are present even if no edge touches them.
    graph.add_nodes_from(
        (idx, {"x": data.x[idx].tolist()}) for idx in range(data.num_nodes)
    )

    # A MultiDiGraph keeps repeated (u, v) pairs as separate edges.
    graph.add_edges_from(data.edge_index.t().tolist())

    return graph
# NetworkX view of the full input graph; later cells sample subgraphs from it.
G = pyg_data_to_nx_multigraph(data=data_inp)

In [10]:
graph_data_obj_ls = []
subgraph_ls = []

# Collect connected subgraphs with 0..4 nodes removed (up to 10 per k).
# NOTE(review): the same seed is passed for every k, and k=0 removes nothing,
# so the k=0 entries are identical copies of G - confirm this is intended.
for k in range(5):
    subgraphs = generate_connected_subgraphs(G, k, n=10, seed=123)
    subgraph_ls.extend(subgraphs)

# All subgraphs keep G's node ids, so size the one-hot features from G. Sizing
# them from each subgraph's largest surviving id (as before) changes the shape
# of x whenever the highest-numbered node is removed.
feature_dim = max(G.nodes()) + 1
identity = torch.eye(feature_dim)

for nx_graph in subgraph_ls:
    # Get all edges with duplicates preserved
    edge_list = [(u, v) for u, v, _ in nx_graph.edges(keys=True)]
    edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()

    # One-hot rows for surviving nodes; rows of removed nodes stay all-zero.
    all_nodes = list(nx_graph.nodes())
    x_subset = torch.zeros_like(identity)
    x_subset[all_nodes] = identity[all_nodes]

    data = Data(x=x_subset, edge_index=edge_index)
    graph_data_obj_ls.append(data)



In [30]:
subgraph_data_obj_ls = []

for data in graph_data_obj_ls:
    # The rank of the oriented incidence matrix is the smallest number of
    # edges masked from this graph.
    G_nx = to_networkx(data, to_undirected=False)
    incidence_matrix = nx.incidence_matrix(G_nx, oriented=True).toarray()
    rank = np.linalg.matrix_rank(incidence_matrix)
    num_edges = data.edge_index.size(1)

    has_edge_attr = hasattr(data, 'edge_attr') and data.edge_attr is not None
    edge_indices = list(range(num_edges))
    masked_graphs_per_data = []  # all masked variants of this one graph

    # Mask rank, rank+1, ..., rank+5 edges, always stopping below num_edges
    # so that at least one edge survives.
    removal_cap = min(rank + 6, num_edges)
    for edges_to_remove in range(rank, removal_cap):
        for _ in range(15):  # 15 random masks per removal count
            to_remove = random.sample(edge_indices, edges_to_remove)

            mask = torch.ones(num_edges, dtype=torch.bool)
            mask[to_remove] = False

            data_copy = copy.deepcopy(data)
            data_copy.edge_index = data.edge_index[:, mask]
            if has_edge_attr:
                data_copy.edge_attr = data.edge_attr[mask]

            masked_graphs_per_data.append(data_copy)

    subgraph_data_obj_ls.append(masked_graphs_per_data)


In [61]:

# -------------------------------
# CONFIG
# -------------------------------
# Values shared by the model definition and the training pipeline below.
TOTAL_NODES = 45  # Size of node space (from G); also the one-hot feature width fed to the model
HIDDEN_DIM1 = 64  # hidden_channels passed to GraphCompletionModel in run_pipeline
HIDDEN_DIM2 = 128  # NOTE(review): not referenced by any visible cell
EPOCHS = 20  # number of passes over the training set
LEARNING_RATE = 0.01  # Adam step size used by train_model


class GCNEncoder(nn.Module):
    """Two-layer graph encoder producing one embedding per node.

    Despite the name, both layers are ``GATConv`` (graph attention) layers.
    """

    def __init__(self, in_channels, hidden_channels):
        """Create the two layers: ``in_channels -> hidden -> hidden``."""
        super().__init__()
        self.conv1 = GATConv(in_channels, hidden_channels)
        self.conv2 = GATConv(hidden_channels, hidden_channels)

    def forward(self, x, edge_index):
        """Return node embeddings; ReLU is applied after the first layer only."""
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        embeddings = self.conv2(hidden, edge_index)
        return embeddings

class EdgeDecoder(nn.Module):
    """Score candidate edges from node embeddings with a two-layer MLP.

    The embeddings of an edge's source and destination nodes are concatenated
    and mapped to a single unnormalised logit.
    """

    def __init__(self, in_channels):
        """
        Args:
            in_channels: width of one node embedding; the MLP input is twice
                this because source and destination are concatenated.
        """
        super().__init__()
        self.linear1 = nn.Linear(in_channels * 2, 64)
        self.linear2 = nn.Linear(64, 1)

    def forward(self, z, edge_index):
        """
        Args:
            z: node embeddings, indexed by node id along dim 0.
            edge_index: pair of index tensors (sources, destinations), e.g. a
                ``[2, N]`` long tensor.

        Returns:
            1-D tensor of ``N`` logits, one per candidate edge.
        """
        src, dst = edge_index
        edge_feats = torch.cat([z[src], z[dst]], dim=1)
        edge_feats = F.relu(self.linear1(edge_feats))

        # Drop only the trailing unit dimension. A bare squeeze() would also
        # collapse the batch dimension when N == 1, producing a 0-d tensor that
        # no longer matches the label shape and cannot be iterated.
        return self.linear2(edge_feats).squeeze(-1)

class GraphCompletionModel(nn.Module):
    """Encoder/decoder pair that scores candidate edges of a partially observed graph."""

    def __init__(self, in_channels, hidden_channels):
        """Build a ``GCNEncoder`` and an ``EdgeDecoder`` sharing ``hidden_channels``."""
        super().__init__()
        self.encoder = GCNEncoder(in_channels, hidden_channels)
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x, edge_index, candidate_edges):
        """Embed nodes using the observed edges, then score ``candidate_edges``.

        Args:
            x: node feature matrix.
            edge_index: observed edges used for message passing.
            candidate_edges: edges to score.

        Returns:
            Whatever the decoder returns for ``candidate_edges``.
        """
        node_embeddings = self.encoder(x, edge_index)
        return self.decoder(node_embeddings, candidate_edges)


In [62]:
def sample_non_edges(num_nodes, existing_edges, num_samples):
    """Draw random directed node pairs that are not in ``existing_edges``.

    Args:
        num_nodes: node ids are taken from ``range(num_nodes)``.
        existing_edges: iterable of hashable ``(src, dst)`` pairs to exclude.
        num_samples: number of pairs wanted.

    Returns:
        List of distinct ``(src, dst)`` tuples with ``src != dst``. It is
        shorter than ``num_samples`` when fewer non-edges exist. Direction
        matters: ``(j, i)`` may be returned even if ``(i, j)`` is an edge.
    """
    excluded = set(existing_edges)

    # Every ordered pair without self-loops.
    universe = set()
    for u in range(num_nodes):
        for v in range(num_nodes):
            if u != v:
                universe.add((u, v))

    pool = list(universe - excluded)
    draw_count = min(num_samples, len(pool))
    return random.sample(pool, draw_count)

def compute_accuracy(scores, labels, threshold=0.5):
    """Fraction of entries whose thresholded sigmoid score equals the label.

    Args:
        scores: tensor of raw logits.
        labels: tensor of 0/1 targets with the same shape as ``scores``.
        threshold: a probability strictly above this counts as a positive.

    Returns:
        Python float in [0, 1]. Raises ``ZeroDivisionError`` for empty labels.
    """
    probabilities = torch.sigmoid(scores)
    predicted = (probabilities > threshold).float()
    hits = (predicted == labels).sum().item()
    return hits / len(labels)

In [63]:
def prepare_supervised_data(G_prime_list, G_double_prime_LOL, total_nodes):
    """Build (observed graph, positive edges, negative edges) training triples.

    Args:
        G_prime_list: complete graphs; each exposes an ``edge_index`` tensor
            whose columns are edges.
        G_double_prime_LOL: list of lists; entry ``i`` holds the edge-masked
            variants of ``G_prime_list[i]``.
        total_nodes: size of the node-id space used to sample negatives.

    Returns:
        List of ``(G_double_prime, positive_edges, negative_edges)`` tuples.
        Positives are the edges of the complete graph that are absent from the
        masked variant. Negatives are non-edges of the complete graph, at most
        as many as there are positives.
    """
    data = []
    for i, G_prime in enumerate(G_prime_list):
        true_edges = list(map(tuple, G_prime.edge_index.t().tolist()))

        for G_double_prime in G_double_prime_LOL[i]:
            # A set makes each membership test O(1); the previous list lookup
            # made this step quadratic in the number of edges.
            observed_edges = set(map(tuple, G_double_prime.edge_index.t().tolist()))
            positive_edges = [e for e in true_edges if e not in observed_edges]
            negative_edges = sample_non_edges(total_nodes, true_edges, len(positive_edges))

            data.append((G_double_prime, positive_edges, negative_edges))
    return data


In [64]:
def _build_edge_batch(pos_edges, neg_edges, device):
    """Stack positive then negative edges into a [2, N] index tensor plus matching 1/0 labels."""
    candidate_edges = torch.tensor(pos_edges + neg_edges, dtype=torch.long).t().contiguous().to(device)
    labels = torch.tensor([1] * len(pos_edges) + [0] * len(neg_edges), dtype=torch.float).to(device)
    return candidate_edges, labels


def train_model(model, train_data, test_data, total_nodes, epochs=20, lr=0.01, device=torch.device('cuda')):
    """Train ``model`` for edge prediction and print per-epoch validation metrics.

    Args:
        model: module called as ``model(x, edge_index, candidate_edges)`` that
            returns one logit per candidate edge.
        train_data: iterable of ``(graph, positive_edges, negative_edges)``,
            where ``graph.edge_index`` holds the observed edges.
        test_data: same structure as ``train_data``, used for validation.
        total_nodes: size of the identity matrix used as node features.
        epochs: number of passes over ``train_data``.
        lr: Adam learning rate.
        device: device the model and tensors are moved to.

    Returns:
        Sigmoid probabilities for the last validation sample scored in the
        final epoch, or ``None`` if no validation sample was ever scored
        (previously an unbound-variable error).

    Notes:
        One optimiser step is taken per sample. The printed losses are sums
        over samples, not means. Samples with no candidate edges are skipped.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # The one-hot node features are the same for every sample, so build them once.
    x = torch.eye(total_nodes, device=device)
    probs = None

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        for G_double_prime, pos_edges, neg_edges in train_data:
            if not pos_edges and not neg_edges:
                continue  # nothing to score for this sample

            edge_index = G_double_prime.edge_index.to(device)
            candidate_edges, labels = _build_edge_batch(pos_edges, neg_edges, device)

            optimizer.zero_grad()
            scores = model(x, edge_index, candidate_edges)
            loss = F.binary_cross_entropy_with_logits(scores, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Evaluation
        model.eval()
        total_val_loss = 0.0
        total_val_acc = 0.0
        total_samples = 0

        with torch.no_grad():
            for G_double_prime, pos_edges, neg_edges in test_data:
                if not pos_edges and not neg_edges:
                    continue

                edge_index = G_double_prime.edge_index.to(device)
                candidate_edges, labels = _build_edge_batch(pos_edges, neg_edges, device)

                scores = model(x, edge_index, candidate_edges)
                probs = torch.sigmoid(scores)
                val_loss = F.binary_cross_entropy_with_logits(scores, labels)
                total_val_loss += val_loss.item()
                total_val_acc += compute_accuracy(scores, labels) * len(labels)
                total_samples += len(labels)

        # Avoid dividing by zero when there is no validation data.
        val_acc = total_val_acc / total_samples if total_samples else float('nan')
        print(f"[Epoch {epoch+1}] Train Loss: {total_loss:.4f} | Val Loss: {total_val_loss:.4f} | Val Acc: {val_acc:.4f}")

    return probs


In [65]:
def run_pipeline(G, G_prime_list, G_double_prime_LOL):
    """Prepare the supervised samples, split them 80/20, and train a fresh model.

    Args:
        G: accepted for interface compatibility; not used in the body.
        G_prime_list: complete graphs serving as ground truth.
        G_double_prime_LOL: per-graph lists of edge-masked variants.

    Returns:
        ``(model, probs)`` where ``probs`` is whatever ``train_model`` returns.
    """
    samples = prepare_supervised_data(G_prime_list, G_double_prime_LOL, TOTAL_NODES)

    network = GraphCompletionModel(in_channels=TOTAL_NODES, hidden_channels=HIDDEN_DIM1)

    # Fixed random_state keeps the train/validation split reproducible.
    train_part, held_out = train_test_split(samples, test_size=0.2, random_state=42)

    last_probs = train_model(
        network,
        train_part,
        held_out,
        TOTAL_NODES,
        epochs=EPOCHS,
        lr=LEARNING_RATE,
    )
    return network, last_probs

In [66]:
# Train on the edge-masked graphs, using the connected subgraphs as ground truth.
# `probs` is the value returned by train_model (validation probabilities).
model, probs = run_pipeline(G=G, G_prime_list=graph_data_obj_ls, G_double_prime_LOL=subgraph_data_obj_ls)

[Epoch 1] Train Loss: 202.0785 | Val Loss: 28.5726 | Val Acc: 0.9915
[Epoch 2] Train Loss: 113.8320 | Val Loss: 21.4275 | Val Acc: 0.9909
[Epoch 3] Train Loss: 122.2442 | Val Loss: 29.0307 | Val Acc: 0.9877
[Epoch 4] Train Loss: 136.5381 | Val Loss: 39.5173 | Val Acc: 0.9869
[Epoch 5] Train Loss: 151.7276 | Val Loss: 36.7098 | Val Acc: 0.9869
[Epoch 6] Train Loss: 162.9283 | Val Loss: 38.7807 | Val Acc: 0.9860
[Epoch 7] Train Loss: 165.6715 | Val Loss: 41.6221 | Val Acc: 0.9833
[Epoch 8] Train Loss: 161.5062 | Val Loss: 32.8479 | Val Acc: 0.9880
[Epoch 9] Train Loss: 147.1002 | Val Loss: 36.4201 | Val Acc: 0.9887
[Epoch 10] Train Loss: 143.2305 | Val Loss: 32.8868 | Val Acc: 0.9905
[Epoch 11] Train Loss: 140.4238 | Val Loss: 27.8203 | Val Acc: 0.9893
[Epoch 12] Train Loss: 131.1798 | Val Loss: 38.1622 | Val Acc: 0.9869
[Epoch 13] Train Loss: 146.1524 | Val Loss: 28.9722 | Val Acc: 0.9901
[Epoch 14] Train Loss: 135.5919 | Val Loss: 30.0220 | Val Acc: 0.9906
[Epoch 15] Train Loss: 140.20

In [67]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def evaluate_graph_completion(model, G_prime_list, G_double_prime_LOL, total_nodes=45, threshold=0.5, device=torch.device('cuda')):
    """Measure how many masked edges the model recovers for each masked graph.

    Args:
        model: module called as ``model(x, edge_index, candidate_edges)`` that
            returns one logit per candidate edge.
        G_prime_list: complete graphs (ground truth).
        G_double_prime_LOL: entry ``i`` holds the masked variants of
            ``G_prime_list[i]``.
        total_nodes: size of the identity matrix used as node features.
        threshold: a probability strictly above this counts as predicted.
        device: device used for inference.

    Returns:
        One dict per masked graph that has at least one missing edge, with
        keys ``G_index``, ``correct_predictions``, ``precision``, ``recall``,
        ``f1_score``, ``accuracy``, ``num_predicted``, ``num_true_missing``.

    Notes:
        Only truly missing edges are scored, so every label is 1. Accuracy
        therefore equals recall, and precision carries no information about
        false positives. Edges are compared as sets, so parallel edges count
        once.
    """
    all_results = []
    model.eval()

    # Identity features are the same for every graph, so build them once.
    x = torch.eye(total_nodes, device=device)

    for i, G_prime in enumerate(G_prime_list):
        edges_G_prime = set(map(tuple, G_prime.edge_index.t().tolist()))

        for G_double_prime in G_double_prime_LOL[i]:
            edges_G_double_prime = set(map(tuple, G_double_prime.edge_index.t().tolist()))
            true_missing_edges = list(edges_G_prime - edges_G_double_prime)

            if not true_missing_edges:
                continue  # skip if no missing edges

            candidate_edges_tensor = torch.tensor(true_missing_edges, dtype=torch.long).t().contiguous().to(device)
            edge_index = G_double_prime.edge_index.to(device)

            with torch.no_grad():
                scores = model(x, edge_index, candidate_edges_tensor)

            # Flatten so that a single candidate edge (which may arrive as a
            # 0-d tensor) is handled the same way as a batch.
            probs = torch.sigmoid(scores).reshape(-1).cpu()
            hits = (probs > threshold).tolist()

            # Predict only over true missing edges
            y_true = [1] * len(true_missing_edges)
            y_pred = [1 if hit else 0 for hit in hits]
            predicted_edges = [edge for edge, hit in zip(true_missing_edges, hits) if hit]

            result = {
                'G_index': i,
                'correct_predictions': len(predicted_edges),  # since only evaluated on true edges
                'precision': precision_score(y_true, y_pred, zero_division=0),
                'recall': recall_score(y_true, y_pred, zero_division=0),
                'f1_score': f1_score(y_true, y_pred, zero_division=0),
                'accuracy': accuracy_score(y_true, y_pred),
                'num_predicted': len(predicted_edges),
                'num_true_missing': len(true_missing_edges)
            }

            all_results.append(result)

    return all_results


# Score the trained model on every masked graph; metrics cover only the truly missing edges.
all_results = evaluate_graph_completion(model, graph_data_obj_ls, subgraph_data_obj_ls)

In [68]:
# Per masked graph: edges the model recovered vs. edges that were hidden.
for result in all_results:
    recovered, hidden = result['correct_predictions'], result['num_true_missing']
    print(f'correct_pred = {recovered}, num_masked = {hidden}')

correct_pred = 44, num_masked = 44
correct_pred = 44, num_masked = 44
correct_pred = 44, num_masked = 44
correct_pred = 44, num_masked = 44
correct_pred = 44, num_masked = 44
correct_pred = 44, num_masked = 44
correct_pred = 44, num_masked = 44
correct_pred = 44, num_masked = 44
correct_pred = 44, num_masked = 44
correct_pred = 44, num_masked = 44
correct_pred = 44, num_masked = 44
correct_pred = 44, num_masked = 44
correct_pred = 44, num_masked = 44
correct_pred = 44, num_masked = 44
correct_pred = 44, num_masked = 44
correct_pred = 45, num_masked = 45
correct_pred = 45, num_masked = 45
correct_pred = 45, num_masked = 45
correct_pred = 45, num_masked = 45
correct_pred = 45, num_masked = 45
correct_pred = 45, num_masked = 45
correct_pred = 45, num_masked = 45
correct_pred = 45, num_masked = 45
correct_pred = 43, num_masked = 45
correct_pred = 45, num_masked = 45
correct_pred = 45, num_masked = 45
correct_pred = 45, num_masked = 45
correct_pred = 45, num_masked = 45
correct_pred = 45, n

In [94]:
# Design matrix for a quadratic fit: each row is [1, t, t^2] for t in (0, 1.3, 4).
A = np.array([
    [1, 0, 0],
    [1, 1.3, 1.3 * 1.3],
    [1, 4, 16],
])

# Observed values as a column vector of shape (3, 1).
y = np.array([0, 1.5, 1.2]).reshape(-1, 1)

In [97]:
# Least-squares solution of A @ coeff ~= y.
# np.linalg.lstsq avoids forming and inverting A.T @ A, which squares the
# condition number and loses precision compared with a direct solver.
coeff, *_ = np.linalg.lstsq(A, y, rcond=None)
coeff

array([[ 7.25285217e-15],
       [ 1.56495726e+00],
       [-3.16239316e-01]])

In [4]:
import torch
cuda_available = torch.cuda.is_available()
print(cuda_available)
# get_device_name raises when no CUDA device is present, so only query it
# when CUDA is actually available.
if cuda_available:
    print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 3050 Laptop GPU
