In [6]:
# Install required packages.
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

# Helper function for visualization.
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt

2.3.0+cu121
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone


In [7]:
import json
import pickle
import torch
import torch_geometric
from torch_geometric.data import Data
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv


In [8]:
def json_open(file_path):
  with open(file_path, 'r') as f:
    data = json.load(f)
  return data

def pickle_open(file_path):
  with open(file_path, 'rb') as f:
    data = pickle.load(f)
  return data

In [12]:
edges = json_open('edges.json')
labels = json_open('labels.json')
keyword_embeddings = pickle_open('keyword_embeddings.pkl')
tweet_embeddings = pickle_open('tweet_embeddings.pkl')
user_embeddings = pickle_open('user_embeddings.pkl')

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
import numpy as np

embeddings = np.concatenate((keyword_embeddings['embeddings'],
                            tweet_embeddings['embeddings'],
                            user_embeddings['embeddings']), axis = 0)

embeddings = torch.tensor(embeddings, dtype=torch.float)
print(embeddings.shape)

user_embeddings_tensor = torch.tensor(user_embeddings['embeddings'], dtype=torch.float)
print(user_embeddings_tensor.shape)


torch.Size([147824, 768])
torch.Size([28839, 768])


In [15]:
node_ids = keyword_embeddings['ids'] + tweet_embeddings['ids'] + user_embeddings['ids']
ids_index = {node_id: index for index, node_id in enumerate(node_ids)}


In [16]:
edge_index = torch.tensor(
    [[ids_index[edge['source_id']], ids_index[edge['target_id']]] for edge in edges],
    dtype= torch.long
)
edge_index = edge_index.t().contiguous()
print(edge_index.shape)

torch.Size([2, 729030])


In [17]:
seen = set()
for label in labels.values():
  if label not in seen:
    print(label)
    seen.add(label)


Buyer
Related
Seller
Negative


In [18]:
y_labels = [labels[node_id] for node_id in user_embeddings['ids']]
y = torch.tensor([1 if label == 'Buyer'
                  else 2 if label == 'Seller'
                  else 3 if label == 'related'
                  else 4
                  for label in y_labels],
                 dtype = torch.long)
print(y.shape)

torch.Size([28839])


In [19]:
num_nodes = len(user_embeddings['ids'])
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
print( train_mask.shape, test_mask.shape, val_mask.shape)

torch.Size([28839]) torch.Size([28839]) torch.Size([28839])


In [20]:
from sklearn.model_selection import train_test_split

train_ratio, val_ratio, test_ratio = 0.1, 0.1, 0.8
num_nodes = user_embeddings_tensor.shape[0]
indices = np.arange(num_nodes)
train_size = int(train_ratio * num_nodes)
val_size = int(val_ratio * num_nodes)
test_size = int(test_ratio * num_nodes)

train_indices, temp_indices = train_test_split(indices, train_size=train_size, random_state=42)
val_indices, test_indices = train_test_split(temp_indices, test_size=test_size, random_state=42)

train_mask[train_indices] = True
val_mask[val_indices] = True
test_mask[test_indices] = True
print(train_mask)
print(val_mask)
print(test_mask)


tensor([False, False, False,  ..., False,  True, False])
tensor([False,  True,  True,  ..., False, False, False])
tensor([ True, False, False,  ...,  True, False,  True])


In [22]:
data = Data(x=user_embeddings_tensor, y = y, edge_index= edge_index,
            train_mask = train_mask, val_mask = val_mask, test_mask = test_mask)

In [23]:
print(data.x.shape)
print(data.y.shape)
print(data.edge_index.shape)
print(data.train_mask.shape)
print(data.val_mask.shape)
print(data.test_mask.shape)
print(data.num_node_features)
print(data.y.max())


torch.Size([28839, 768])
torch.Size([28839])
torch.Size([2, 729030])
torch.Size([28839])
torch.Size([28839])
torch.Size([28839])
768
tensor(4)


In [24]:
import torch.nn

In [25]:
class GCN(torch.nn.Module):
    def __init__(self, num_features, hidden_channels, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        return x

def train(model, optimizer, data, train_mask):
    model.train()
    optimizer.zero_grad()
    if model == GCN:
        out = model(data.x, data.edge_index, data.edge_weight)
    else:
        out = model(data.x, data.edge_index)
    loss = F.cross_entropy(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def evaluate(model, data, mask):
    model.eval()
    logits = model(data.x, data.edge_index)
    loss = F.cross_entropy(logits[mask], data.y[mask])
    pred = logits[mask].max(1)[1]
    correct = pred.eq(data.y[mask]).sum().item()
    accuracy = correct / mask.sum().item()
    return accuracy, loss.item()

In [28]:
model = GCN(num_features= data.num_node_features,
            hidden_channels=25,
            num_classes=4)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

train_mask = data.train_mask
val_mask = data.val_mask
test_mask = data.test_mask
for epoch in range(10):
  train_loss = train(model, optimizer, data, train_mask)
  val_acc, val_loss = evaluate(model, data, val_mask)
  test_acc, test_loss = evaluate(model, data, test_mask)

  if epoch % 200 == 0:

      print(f"Epoch: {epoch + 1}/{10} | Train Loss: {train_loss:.4f} | Val Acc: {val_acc:.4f} | Test Acc: {test_acc:.4f}")


RuntimeError: index 121809 is out of bounds for dimension 0 with size 28839