In [2]:
!pip install torch-geometric



In [3]:
import torch

import pandas as pd

from torch_geometric.data import Data
from torch_geometric.nn   import GATConv
from tqdm                 import tqdm

In [4]:
# edges
df_G = pd.read_csv('network.csv', header = None)
df_G.columns = ['source', 'target', 'weight']
df_G.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 834421 entries, 0 to 834420
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   source  834421 non-null  int64  
 1   target  834421 non-null  int64  
 2   weight  834421 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 19.1 MB


In [5]:
# features
df_nodes = pd.read_csv('node_features.csv')
df_nodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 677640 entries, 0 to 677639
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   user_id              677640 non-null  int64  
 1   user_rt              677640 non-null  int64  
 2   num_post             677640 non-null  int64  
 3   user_time_rt         677640 non-null  float64
 4   num_post_unverified  677640 non-null  int64  
 5   num_post_non-rumor   677640 non-null  int64  
 6   num_post_true        677640 non-null  int64  
 7   num_post_false       677640 non-null  int64  
 8   num_rt_unverified    677640 non-null  int64  
 9   num_rt_non-rumor     677640 non-null  int64  
 10  num_rt_true          677640 non-null  int64  
 11  num_rt_false         677640 non-null  int64  
 12  score                677640 non-null  float64
dtypes: float64(2), int64(11)
memory usage: 67.2 MB


In [6]:
df_nodes = df_nodes.drop(columns=['score'])

In [7]:
# they need to increment from 0
nodes = df_nodes['user_id'].tolist()
node2idx = {node: idx for idx, node in enumerate(nodes)}

In [8]:
# edges tensor
edge_index = torch.tensor([[node2idx[src] for src in df_G["source"]],
                           [node2idx[tgt] for tgt in df_G["target"]]], dtype=torch.long)
edge_index.shape

torch.Size([2, 834421])

In [9]:
# weights tensor
edge_weight = torch.tensor(df_G["weight"].values, dtype=torch.float)
edge_weight.shape

torch.Size([834421])

In [10]:
# features tensor
node_features = torch.tensor(df_nodes.values, dtype=torch.float)
node_features.shape

torch.Size([677640, 12])

In [11]:
data = Data(x = node_features, edge_index=edge_index, edge_attr=edge_weight)
data

Data(x=[677640, 12], edge_index=[2, 834421], edge_attr=[834421])

In [12]:
# architecture
class GAT(torch.nn.Module):
    def __init__(self, in_channels, out_channels, layer_size = 128, heads=1):
        super(GAT, self).__init__()
        self.gat1 = GATConv(in_channels, layer_size, heads=heads, concat=False)
        self.gat2 = GATConv(layer_size, out_channels, heads=heads, concat=False)

    def forward(self, x, edge_index, edge_attr):
        x = self.gat1(x, edge_index)
        x, attention_weights = self.gat2(x, edge_index, return_attention_weights=True)
        return x, attention_weights

In [13]:
model  = GAT(in_channels=node_features.shape[1], out_channels=1)
model

GAT(
  (gat1): GATConv(12, 128, heads=1)
  (gat2): GATConv(128, 1, heads=1)
)

In [14]:
def train_model(model, lr = 0.01, epochs = 100):

  optimizer = torch.optim.Adam(model.parameters(), lr=lr)

  for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    node_embeddings, attention_weights = model(data.x, data.edge_index, data.edge_attr)
    loss = torch.mean(attention_weights[1])
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

  return attention_weights

In [15]:
attention_weights = train_model(model, epochs = 10)

Epoch 1, Loss: 0.4484439194202423


In [16]:
attention_weights[1].shape

torch.Size([1511092, 1])

In [17]:
attention_scores = attention_weights[1].sum(dim=-1)

src, dst = data.edge_index
node_importance = torch.zeros(len(nodes), device=attention_scores.device)

with torch.no_grad():
  node_importance.scatter_add_(0, src, attention_scores)
  node_importance.scatter_add_(0, dst, attention_scores)

In [18]:
node_importance = node_importance.tolist()
node_importances = {node: node_importance[node2idx[node]] for node in nodes}

df = pd.DataFrame(list(node_importances.items()), columns=['user_id', 'importance'])

df = df.sort_values(by='importance', ascending=False)
df.head(10)

Unnamed: 0,user_id,importance
549576,5402612,34578.230469
265236,2557521,32386.173828
479813,428333,20113.384766
60311,1367531,7928.902344
325813,28785486,7671.791992
353527,30313925,7641.931152
624073,759251,5361.534668
326906,2883841,5351.5
79888,14780915,4447.5
113058,1652541,4209.017578


In [20]:
df.to_csv('GAT_importances.csv', index = False)