In [1]:
import pandas as pd
import torch
import json

edges_df = pd.read_csv('./git_web_ml/musae_git_edges.csv')
edge_index = torch.tensor(edges_df[['id_1', 'id_2']].values.T, dtype=torch.long)

with open('./git_web_ml/musae_git_features.json', 'r') as f:
    node_features_dict = json.load(f)

max_feats = max([len(item) for item in list(node_features_dict.values())])

l = []
for item in node_features_dict.values():
    if len(item) < max_feats:
        l.append(item + [0.] * (max_feats - len(item)))
        
node_features = torch.tensor(l, dtype=torch.float)

target_df = pd.read_csv('./git_web_ml/musae_git_target.csv')
node_labels = torch.tensor(target_df['ml_target'].values, dtype=torch.long)

In [2]:
num_edges = edge_index.size(1)
edge_features = torch.zeros(num_edges, 64)

In [3]:
from torch_geometric.data import Data

data = Data(
    x=node_features,         
    edge_index=edge_index,   
    edge_attr=edge_features, 
    y=node_labels            
)

print(f"Number of nodes: {data.x.size(0)}")
print(f"Number of edges: {data.edge_index.size(1)}")
print(f"Node features shape: {data.x.shape}")
print(f"Edge features shape: {data.edge_attr.shape}")
print(f"Node labels shape: {data.y.shape}")

Number of nodes: 37699
Number of edges: 289003
Node features shape: torch.Size([37699, 42])
Edge features shape: torch.Size([289003, 64])
Node labels shape: torch.Size([37700])


In [8]:
import torch.nn as nn
from torch_geometric.nn import MessagePassing

class MLP(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout_rate=0.0, final_activation=None):
        super().__init__()
        layers = []
        prev = in_channels
        for hidden in hidden_channels:
            layers.append(nn.Linear(prev, hidden))
            layers.append(nn.ReLU())
            if dropout_rate > 0:
                layers.append(nn.Dropout(dropout_rate))
            prev = hidden
        layers.append(nn.Linear(prev, out_channels))
        if final_activation is not None:
            layers.append(final_activation)
        self.net = nn.Sequential(*layers)
        
    def forward(self, x):
        return self.net(x)
    
    def reset_parameters(self):
        for m in self.net:
            if hasattr(m, 'reset_parameters'):
                m.reset_parameters()

class RGINConv(MessagePassing):
    def __init__(
        self,
        mlp_dims_node: list,
        mlp_dims_edge: list,
        aggr: str = 'sum',
        dropout_rate: float = 0.0,
    ):
        super().__init__(node_dim=0, aggr=aggr)
        self.node_dimses = mlp_dims_node
        self.edge_dimses = mlp_dims_edge
        self.aggr = aggr
        self.dropout_rate = dropout_rate
        
        self.node_mlp = nn.ModuleList([
            MLP(mlp_dims_node[0], mlp_dims_node[1:-1], mlp_dims_node[-1],
                final_activation=None, dropout_rate=dropout_rate)
            for _ in range(mlp_dims_edge[-1])
        ])
        self.edge_mlp = MLP(mlp_dims_edge[0], mlp_dims_edge[1:-1], mlp_dims_edge[-1],
                            dropout_rate=dropout_rate, final_activation=None)
        self.reset_parameters()
        
    def reset_parameters(self):
        for m in self.node_mlp:
            m.reset_parameters()
        self.edge_mlp.reset_parameters()
        
    @classmethod
    def from_config(cls, config):
        return cls(**config)
        
    def get_config(self):
        return {
            'mlp_dims_node': self.node_dimses,
            'mlp_dims_edge': self.edge_dimses,
            'aggr': self.aggr,
            'dropout_rate': self.dropout_rate,
        }
    
    def forward(self, x: torch.Tensor, edge_index, edge_attr: torch.Tensor):
        # Proses edge_attr melalui MLP dan softmax-kan di sepanjang dim terakhir
        edge_attr = self.edge_mlp(edge_attr).softmax(-1)
        size = (x.size(0), x.size(0))
        out = torch.zeros(x.size(0), self.node_dimses[-1], device=x.device)
        
        for edge_idx in range(self.edge_dimses[-1]):
            h = self.propagate(edge_index, edge_attr=edge_attr, x=x, size=size, edge_idx=edge_idx)
            h2 = self.node_mlp[edge_idx](h)
            out = out + h2
        return out
    
    def message(self, x_j: torch.Tensor, edge_attr, edge_idx) -> torch.Tensor:
        return x_j * edge_attr[:, edge_idx].unsqueeze(-1)

In [12]:
import torch.nn as nn

class GNN6Layer(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout_rate=0.0):
        super(GNN6Layer, self).__init__()

        
        mlp_dims_node = [input_dim, hidden_dim, hidden_dim]
        mlp_dims_edge = [64, 32, 16]  
        
        
        self.convs = nn.ModuleList([
            RGINConv(
                mlp_dims_node=mlp_dims_node,
                mlp_dims_edge=mlp_dims_edge,
                aggr='sum',  
                dropout_rate=dropout_rate
            ) for _ in range(6)
        ])
        
        
        self.out_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, edge_index, edge_attr):
        print("x:",x.shape)
        for conv in self.convs:
            x = conv(x, edge_index, edge_attr)
            print("x:",x.shape)
        
        
        out = self.out_layer(x)
        return out


In [38]:
edge_index.shape

torch.Size([2, 289003])

In [44]:
from torch_geometric.utils import scatter
scatter(node_features[edge_index[1]],edge_index[0])

IndexError: index 37699 is out of bounds for dimension 0 with size 37699

In [42]:
node_features.shape

torch.Size([37699, 42])

In [14]:
model = GNN6Layer(input_dim=42, hidden_dim=256, output_dim=2, dropout_rate=0.1)
output = model(node_features, edge_index, edge_features)

print(output.shape)

x: torch.Size([37699, 42])


ValueError: Encountered invalid 'dim_size' (got '37699' but expected >= '37700')