# Creating a networkx graph from a given edge list and node list

In [126]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from collections import defaultdict
import networkx as nx
import community

In [127]:
edges = pd.read_csv("/content/edges.csv")
edges.head()

Unnamed: 0,source,target
0,0,1378
1,0,1544
2,0,6092
3,0,7636
4,0,14442


In [128]:
nodes = pd.read_csv("/content/nodes.csv")
nodes.head()

Unnamed: 0,id,branch,train_mask,test_mask
0,0,ECE,True,False
1,1,ECE,True,False
2,2,CSE,True,False
3,3,EEE,True,False
4,4,,False,True


In [129]:
def make_graph(nodes_df, edges_df):

    G = nx.Graph()

    # Add nodes to the graph
    for i, row in nodes_df.iterrows():
        attrs = row.to_dict()
        G.add_node(attrs.pop('id'), **attrs)

    # Add edges to the graph
    for i, row in edges_df.iterrows():
        attrs = row.to_dict()
        G.add_edge(attrs.pop('source'), attrs.pop('target'), **attrs)

    return G

G = make_graph(nodes, edges)

In [130]:
print(G)

Graph with 19717 nodes and 44324 edges


In [131]:
G.degree()

DegreeView({0: 5, 1: 3, 2: 3, 3: 1, 4: 1, 5: 2, 6: 22, 7: 17, 8: 1, 9: 9, 10: 6, 11: 1, 12: 10, 13: 1, 14: 1, 15: 6, 16: 29, 17: 6, 18: 8, 19: 8, 20: 1, 21: 2, 22: 2, 23: 1, 24: 4, 25: 1, 26: 5, 27: 3, 28: 1, 29: 2, 30: 1, 31: 1, 32: 1, 33: 1, 34: 3, 35: 17, 36: 1, 37: 1, 38: 4, 39: 1, 40: 8, 41: 4, 42: 1, 43: 1, 44: 1, 45: 1, 46: 11, 47: 31, 48: 18, 49: 2, 50: 1, 51: 1, 52: 1, 53: 1, 54: 6, 55: 2, 56: 1, 57: 3, 58: 7, 59: 3, 60: 25, 61: 1, 62: 22, 63: 1, 64: 2, 65: 3, 66: 1, 67: 2, 68: 1, 69: 7, 70: 1, 71: 3, 72: 5, 73: 1, 74: 1, 75: 1, 76: 3, 77: 9, 78: 1, 79: 2, 80: 3, 81: 2, 82: 1, 83: 2, 84: 3, 85: 7, 86: 6, 87: 1, 88: 14, 89: 1, 90: 1, 91: 2, 92: 1, 93: 1, 94: 1, 95: 4, 96: 1, 97: 2, 98: 1, 99: 13, 100: 2, 101: 19, 102: 3, 103: 2, 104: 1, 105: 15, 106: 1, 107: 1, 108: 5, 109: 2, 110: 23, 111: 1, 112: 17, 113: 2, 114: 1, 115: 2, 116: 24, 117: 27, 118: 2, 119: 1, 120: 1, 121: 1, 122: 1, 123: 1, 124: 16, 125: 1, 126: 1, 127: 2, 128: 1, 129: 7, 130: 1, 131: 2, 132: 11, 133: 1, 134: 1

In [None]:
plt.figure(figsize=(50,50))
pos = nx.fruchterman_reingold_layout(G)
nx.draw(G, with_labels=True, node_size=1000, font_size=6, pos=pos)
nx.draw_networkx_edges(G, pos=pos)
labels = nx.get_edge_attributes(G,'weight')
nx.draw_networkx_edge_labels(G, pos, edge_labels=labels)
plt.show()

# Converting networkx graph to Pytorch geometric graph and add the features given in features.pt

In [133]:
import networkx as nx
import numpy as np
import torch
from torch_geometric.utils.convert import from_networkx

pyg_graph = from_networkx(G)

In [None]:
pip install torch-geometric

In [59]:
import torch
from torch_geometric.data import Data

edge_index = torch.tensor(pyg_graph.edge_index, dtype=torch.long)
data = Data(edge_index=edge_index)

features = torch.load('/content/features.pt')

data.x = features

  edge_index = torch.tensor(pyg_graph.edge_index, dtype=torch.long)


# GNN model

In [None]:
import os
import torch


os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

In [74]:
import os.path as osp

import torch
from sklearn.metrics import roc_auc_score

import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv
from torch_geometric.utils import negative_sampling
from torch_geometric.transforms import RandomLinkSplit

import pandas as pd
import networkx as nx
import numpy as np
from sklearn.preprocessing import StandardScaler

import torch
import pandas as pd
from torch_geometric.data import InMemoryDataset, Data
from sklearn.model_selection import train_test_split
import torch_geometric.transforms as T

from torch_geometric.utils import convert
from torch_geometric.nn import SAGEConv

In [109]:
nodes['s'] = pd.factorize(nodes['id'])[0]
tag = dict(zip(nodes.id, nodes.s))
print(tag)

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67, 68: 68, 69: 69, 70: 70, 71: 71, 72: 72, 73: 73, 74: 74, 75: 75, 76: 76, 77: 77, 78: 78, 79: 79, 80: 80, 81: 81, 82: 82, 83: 83, 84: 84, 85: 85, 86: 86, 87: 87, 88: 88, 89: 89, 90: 90, 91: 91, 92: 92, 93: 93, 94: 94, 95: 95, 96: 96, 97: 97, 98: 98, 99: 99, 100: 100, 101: 101, 102: 102, 103: 103, 104: 104, 105: 105, 106: 106, 107: 107, 108: 108, 109: 109, 110: 110, 111: 111, 112: 112, 113: 113, 114: 114, 115: 115, 116: 116, 117: 117, 118: 118, 119: 119, 120: 120, 121: 121,

In [110]:
edges["source"] = edges['source'].map(tag)
edges["target"] = edges['target'].map(tag)
edges = edges[['source', 'target']]
edges.head()

Unnamed: 0,source,target
0,0,1378
1,0,1544
2,0,6092
3,0,7636
4,0,14442


In [111]:
G_edge = nx.Graph()
# Iterating through the DataFrame to add edges
for _, edge in edges.iterrows():
    G_edge.add_edge(edge['source'], edge['target'])

In [112]:
pyg = convert.from_networkx(G_edge)
pyg

Data(edge_index=[2, 88648], num_nodes=19717)

In [113]:
embeddings = np.array(list(dict(G_edge.degree()).values()))

In [114]:
# normalizing degree values
scale = StandardScaler()
embeddings = scale.fit_transform(embeddings.reshape(-1,1))
pyg.x = torch.from_numpy(embeddings).type(torch.float32)
pyg

Data(edge_index=[2, 88648], num_nodes=19717, x=[19717, 1])

In [115]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
transform = T.Compose([
    T.NormalizeFeatures(),
    T.ToDevice(device),
    T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=True,
                      add_negative_train_samples=False),
])

In [116]:
dataset = pyg
dataset = dataset.to(device)
transform = T.RandomLinkSplit(is_undirected=True)
train_data, val_data, test_data = transform(dataset)

In [117]:
class GCNNet(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)

    def decode(self, z, edge_label_index):
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)

    def decode_all(self, z):
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()

In [118]:
GCNmodel = GCNNet(dataset.num_features, 128, 64).to(device)

In [119]:
optimizer = torch.optim.Adam(params=GCNmodel.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

In [120]:
def train():
    GCNmodel.train()
    optimizer.zero_grad()
    z = GCNmodel.encode(train_data.x, train_data.edge_index)

    # We perform a new round of negative sampling for every training epoch:
    neg_edge_index = negative_sampling(
        edge_index=train_data.edge_index, num_nodes=train_data.num_nodes,
        num_neg_samples=train_data.edge_label_index.size(1), method='sparse')

    edge_label_index = torch.cat(
        [train_data.edge_label_index, neg_edge_index],
        dim=-1,
    )
    edge_label = torch.cat([
        train_data.edge_label,
        train_data.edge_label.new_zeros(neg_edge_index.size(1))
    ], dim=0)

    out = GCNmodel.decode(z, edge_label_index).view(-1)
    loss = criterion(out, edge_label)
    loss.backward()
    optimizer.step()
    return loss

In [121]:
@torch.no_grad()
def test(data):
    GCNmodel.eval()
    z = GCNmodel.encode(data.x, data.edge_index)
    out = GCNmodel.decode(z, data.edge_label_index).view(-1).sigmoid()
    return roc_auc_score(data.edge_label.cpu().numpy(), out.cpu().numpy())

In [122]:
best_val_auc = final_test_auc = 0
for epoch in range(1, 101):
    loss = train()
    val_auc = test(val_data)
    test_auc = test(test_data)
    if val_auc > best_val_auc:
        best_val = val_auc
        final_test_auc = test_auc
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_auc:.4f}, '
          f'Test: {test_auc:.4f}')

print(f'Final Test (GCN Model): {final_test_auc:.4f}')

Epoch: 001, Loss: 0.6687, Val: 0.6784, Test: 0.6836
Epoch: 002, Loss: 0.6690, Val: 0.6769, Test: 0.6745
Epoch: 003, Loss: 0.6560, Val: 0.6721, Test: 0.6680
Epoch: 004, Loss: 0.6525, Val: 0.6625, Test: 0.6589
Epoch: 005, Loss: 0.6495, Val: 0.6535, Test: 0.6507
Epoch: 006, Loss: 0.6520, Val: 0.6517, Test: 0.6487
Epoch: 007, Loss: 0.6513, Val: 0.6548, Test: 0.6515
Epoch: 008, Loss: 0.6489, Val: 0.6590, Test: 0.6549
Epoch: 009, Loss: 0.6493, Val: 0.6606, Test: 0.6559
Epoch: 010, Loss: 0.6472, Val: 0.6607, Test: 0.6557
Epoch: 011, Loss: 0.6469, Val: 0.6601, Test: 0.6551
Epoch: 012, Loss: 0.6473, Val: 0.6596, Test: 0.6548
Epoch: 013, Loss: 0.6480, Val: 0.6594, Test: 0.6550
Epoch: 014, Loss: 0.6480, Val: 0.6593, Test: 0.6551
Epoch: 015, Loss: 0.6468, Val: 0.6590, Test: 0.6550
Epoch: 016, Loss: 0.6480, Val: 0.6579, Test: 0.6541
Epoch: 017, Loss: 0.6463, Val: 0.6566, Test: 0.6527
Epoch: 018, Loss: 0.6453, Val: 0.6558, Test: 0.6519
Epoch: 019, Loss: 0.6443, Val: 0.6557, Test: 0.6519
Epoch: 020, 

In [125]:
# Evaluate the model
model.eval()
_, pred = model(data.x.to(device), data.edge_index.to(device)).max(dim=1)
test_preds = pred[test_mask].detach().cpu().numpy()
test_ids = node_data['id'][test_mask].values

# Decode the branch labels
test_preds = le.inverse_transform(test_preds)

# Create the submission file
submission_df = pd.DataFrame({'id': test_ids, 'branch': test_preds})
submission_df.to_csv('submission.csv', index=False)