# Initial Setup

## Import the required libraries

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
from torch_geometric.nn import SimpleConv
from torch_geometric.data import Data
import warnings
warnings.filterwarnings('ignore')
from torch_geometric.loader import NeighborLoader

## Set the Device

In [5]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


# Load the Dataset

In [6]:
# Read the training graph from 'train.txt': the first line is parsed as the
# node count, every following line as one "src dst weight" integer triple.
trainEdgeList = []
with open('train.txt') as inputFile:
    numNodes = int(inputFile.readline())
    # Stream the file line by line instead of materialising it with
    # readlines(), and skip blank lines (e.g. a trailing newline), which would
    # otherwise break the three-way unpacking below.
    for line in inputFile:
        if not line.strip():
            continue
        src, dst, weight = map(int, line.split())
        trainEdgeList.append([src, dst, weight])
len(trainEdgeList)

1113811

In [7]:
# Build the edge table, then divide each weight by the total weight of all
# edges sharing the same 'from' node (so each group sums to 1 when its total
# is non-zero).
trainEdgeDF = pd.DataFrame(trainEdgeList, columns = ['from', 'to', 'weight'])
# transform('sum') broadcasts every group's total back onto its rows in one
# vectorised pass, avoiding a Python-level lambda call per 'from' group.
trainEdgeDF['weight'] = trainEdgeDF['weight'] / trainEdgeDF.groupby('from')['weight'].transform('sum')
trainEdgeDF.head()

Unnamed: 0,from,to,weight
0,701683,871966,0.007194
1,729650,974012,0.1
2,93571,927327,0.010989
3,339405,563090,1.0
4,1003443,652581,0.01087


# Topological Feature Extraction

## Populate Initial Embeddings

In [8]:
# Initial node features: each node id written as a fixed-width binary code,
# most significant bit first, one 0/1 column per bit.
# Width = bits needed for the largest id (numNodes - 1), with a minimum of one
# column so that a graph with zero or one node still yields a valid frame.
maxLen = max(int(numNodes - 1).bit_length(), 1)
# Shifting every id right by (maxLen-1, ..., 1, 0) and masking with 1 extracts
# all bits at once, replacing the per-node string formatting loop.
# `nodes` is now a (numNodes, maxLen) int64 array rather than a list of lists.
nodes = (np.arange(numNodes, dtype = np.int64)[:, None] >> np.arange(maxLen - 1, -1, -1, dtype = np.int64)) & 1
features = pd.DataFrame(nodes)
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Create the PyTorch Geometric `Data` Object

In [9]:
# Edge endpoints stacked as two rows: row 0 holds 'from', row 1 holds 'to'.
train_edge_index = torch.tensor(trainEdgeDF[['from', 'to']].to_numpy().T)
# Normalised weights laid out as a single row, one entry per edge.
# NOTE(review): the weight column looks like float64 (result of a pandas
# division), which would explain the float64 embeddings shown further down
# while the node features are float32 — confirm whether float32 was intended.
train_edge_weight = torch.tensor(trainEdgeDF['weight'].to_numpy().reshape(1, -1))

In [10]:
# Node feature matrix as a float32 tensor (cast once on the NumPy side),
# followed by a shape summary of the three tensors that make up the graph.
train_features = torch.tensor(features.to_numpy(dtype = np.float32))
train_features.shape, train_edge_index.shape, train_edge_weight.shape

(torch.Size([1563421, 21]), torch.Size([2, 1113811]), torch.Size([1, 1113811]))

In [11]:
# Bundle node features, connectivity and edge weights into one graph object.
data = Data(x = train_features, edge_index = train_edge_index, edge_attr = train_edge_weight)
# One boolean flag per row of `features`, all True.
mask_length = len(features)
mask = np.ones(mask_length, dtype = bool)

## Prepare Data Loader

In [12]:
# Mini-batch loader over the graph: batches of 1024 seed nodes, with
# num_neighbors = [25, 10] controlling how many neighbours are sampled per hop.
# PyG documents `input_nodes` as a torch.Tensor. A NumPy boolean array is not
# recognised as a mask there (NOTE(review): in the PyG versions checked it is
# cast to integer indices, so an all-True mask would select node 1 repeatedly),
# hence the explicit conversion to a torch bool tensor.
loader = NeighborLoader(
    data,
    input_nodes = torch.as_tensor(mask, dtype = torch.bool),
    num_neighbors = [25, 10],
    batch_size = 1024,
)

## Define and Run the Aggregator Model

In [13]:
class embedding(torch.nn.Module):
    """Wrapper module holding a single ``SimpleConv`` layer.

    The layer is configured with ``aggr='add'`` and ``combine_root='sum'``.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = SimpleConv(aggr = 'add', combine_root = 'sum')

    def forward(self, x, edge_index, edge_weight):
        """Pass ``x``, ``edge_index`` and ``edge_weight`` to ``conv1`` and return its output."""
        propagated = self.conv1(x, edge_index, edge_weight)
        return propagated

In [14]:
# Instantiate the aggregator, then move it onto the selected device.
model = embedding()
model = model.to(device)

In [15]:
# NOTE(review): nothing in this loop computes a loss, calls backward() or steps
# an optimizer, and `embeddings` is reassigned from a full-graph forward pass in
# a later cell, so these 100 passes do not appear to influence the saved
# features. Confirm whether the loop is still needed.
# The passes are forward-only, so they run under no_grad to guarantee that no
# autograd state is recorded.
with torch.no_grad():
    for epoch in range(1, 101):
        for batch in loader:
            batch = batch.to(device)
            # edge_attr is stored as a single row; transpose it to one row per
            # edge and select the rows listed in batch.e_id (presumably the
            # original ids of the edges sampled for this batch — verify).
            embeddings = model(batch.x, batch.edge_index, batch.edge_attr.t()[batch.e_id])
        # Progress message every 10th epoch.
        if epoch % 10 == 0:
            print(f'Epoch: {epoch}')

Epoch: 10
Epoch: 20
Epoch: 30
Epoch: 40
Epoch: 50
Epoch: 60
Epoch: 70
Epoch: 80
Epoch: 90
Epoch: 100


## Extract the Topological Features

In [16]:
# Move the whole graph to the model's device and run a single forward pass
# over all nodes and edges; the result is displayed as the cell output.
data = data.to(device)
embeddings = model(
    x = data.x,
    edge_index = data.edge_index,
    edge_weight = data.edge_attr,
)
embeddings

tensor([[0.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
        [0.0000, 0.0000, 0.0035,  ..., 0.0035, 0.0035, 1.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        ...,
        [1.0000, 0.0000, 1.0000,  ..., 1.0000, 1.0000, 0.0000],
        [1.0000, 0.0000, 1.0000,  ..., 0.0000, 1.0000, 1.0000],
        [1.0000, 0.0000, 1.0000,  ..., 1.0000, 0.0000, 0.0000]],
       device='cuda:0', dtype=torch.float64)

In [17]:
# Bring the embeddings back to host memory and expose them as a NumPy array.
embeddings_numpy = embeddings.to('cpu').numpy()
embeddings_numpy.shape

(1563421, 21)

In [18]:
# Wrap the embedding matrix in a DataFrame with integer column labels 0..k-1.
# NOTE(review): this rebinds `features`, which earlier held the binary node
# codes; re-running earlier cells afterwards would pick up this frame instead.
features = pd.DataFrame(embeddings_numpy, columns = list(range(embeddings_numpy.shape[1])))
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0
1,0.0,0.0,0.003521,0.003521,0.003521,0.0,0.003521,0.003521,0.0,0.0,...,0.003521,0.003521,0.0,0.0,0.003521,0.003521,0.0,0.003521,0.003521,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,2.0
4,0.006667,0.0,0.0,0.0,0.0,0.006667,0.006667,0.0,0.006667,0.006667,...,0.0,0.006667,0.0,0.0,0.006667,0.006667,0.006667,1.0,0.006667,0.0


## Save the Topological Features

In [None]:
# Persist the feature table to 'features.csv' without writing the row index.
features.to_csv(path_or_buf = 'features.csv', index = False)