In [1]:
import pandas as pd
from bert import generate_node_embeddings, preprocessing
from transformers import BertTokenizer
from tqdm import tqdm
import time
import torch_geometric.transforms as T
from ogb.nodeproppred import PygNodePropPredDataset
import torch


nodeidx2paperid = pd.read_csv(
    'dataset/ogbn_arxiv/mapping/nodeidx2paperid.csv.gz', compression='gzip')

raw_text = pd.read_csv('dataset/ogbn_arxiv/titleabs.tsv',
                       sep='\t', header=None, names=['paper id', 'title', 'abs'])
X = pd.merge(nodeidx2paperid, raw_text, on='paper id')
X


Unnamed: 0,node idx,paper id,title,abs
0,0,9657784,evasion attacks against machine learning at te...,"In security-sensitive applications, the succes..."
1,1,39886162,how hard is computing parity with noisy commun...,We show a tight lower bound of $\Omega(N \log\...
2,2,116214155,on the absence of the rip in real world applic...,The purpose of this paper is twofold. The firs...
3,3,121432379,a promise theory perspective on data networks,Networking is undergoing a transformation thro...
4,4,231147053,analysis of asymptotically optimal sampling ba...,Over the last 20 years significant effort has ...
...,...,...,...,...
169338,169338,3011696425,sentinet detecting localized universal attacks...,SentiNet is a novel detection framework for lo...
169339,169339,3011708313,interpretable mtl from heterogeneous domains u...,Multi-task learning (MTL) aims at improving th...
169340,169340,3011798063,learning compositional rules via neural progra...,"Many aspects of human reasoning, including lan..."
169341,169341,3012226457,certified defenses for adversarial patches,Adversarial patch attacks are among one of the...


In [2]:
device = 0
device = f'cuda:{device}' if torch.cuda.is_available() else 'cpu'
device = torch.device(device)
dataset = PygNodePropPredDataset(
    name='ogbn-arxiv', transform=T.ToSparseTensor())
data = dataset[0]
data.adj_t = data.adj_t.to_symmetric()
data = data.to(device)

split_idx = dataset.get_idx_split()
train_idx = split_idx['train'].to(device)


In [3]:
text = []
for ti, ab in zip(X['title'], X['abs']):
    t = 'Title: ' + ti + '\n'+'Abstract: ' + ab
    # t = ti + ab
    text.append(t)
token_id = []
attention_masks = []
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased', do_lower_case=True)
for sample in tqdm(text):
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids'])
    attention_masks.append(encoding_dict['attention_mask'])
token_id = torch.cat(token_id, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)


  0%|          | 0/169343 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 169343/169343 [09:05<00:00, 310.60it/s]


In [4]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import BertTokenizer, BertModel

# Prepare DataLoader
batch_size = 16
dataset = TensorDataset(token_id, attention_masks)
dataloader = DataLoader(
    dataset,
    sampler=RandomSampler(dataset),
    batch_size=batch_size
)

# Load the BertForSequenceClassification model
bert = BertModel.from_pretrained(
    'bert-base-uncased',
    output_attentions=False,
    output_hidden_states=False,
)

# Run on GPU
print("[!] Generating node embeddings")
start = time.time()
bert.cuda()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
features = generate_node_embeddings(bert, dataloader, device)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[!] Generating node embeddings


100%|██████████| 10584/10584 [16:24<00:00, 10.75it/s]


In [7]:
from main_arxiv_gnn import GCN, train, test
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator

dataset = PygNodePropPredDataset(name='ogbn-arxiv',
                                 transform=T.ToSparseTensor())

data = dataset[0]
data.adj_t = data.adj_t.to_symmetric()
data.x = features
data = data.cuda()

split_idx = dataset.get_idx_split()
train_idx = split_idx['train'].to(device)

gnn = GCN(data.num_features, hidden_channels=features.shape[1],
          out_channels=dataset.num_classes, num_layers=3, dropout=0.5).cuda()

evaluator = Evaluator(name='ogbn-arxiv')


gnn.reset_parameters()
EPOCHS = 500
optimizer = torch.optim.Adam(gnn.parameters(), lr=0.01)


for epoch in range(1, 1 + EPOCHS):
    loss = train(gnn, data, train_idx, optimizer)
    result = test(gnn, data, split_idx, evaluator)

    train_acc, valid_acc, test_acc = result
    print(f'Epoch: {epoch:02d}, '
          f'Loss: {loss:.4f}, '
          f'Train: {100 * train_acc:.2f}%, '
          f'Valid: {100 * valid_acc:.2f}% '
          f'Test: {100 * test_acc:.2f}%')


Epoch: 01, Loss: 4.0770, Train: 10.99%, Valid: 22.97% Test: 21.56%
Epoch: 02, Loss: 4.4701, Train: 7.69%, Valid: 14.96% Test: 22.10%
Epoch: 03, Loss: 4.0741, Train: 17.91%, Valid: 7.63% Test: 5.86%
Epoch: 04, Loss: 3.9564, Train: 4.77%, Valid: 9.56% Test: 9.53%
Epoch: 05, Loss: 3.4713, Train: 10.99%, Valid: 22.97% Test: 21.56%
Epoch: 06, Loss: 3.4462, Train: 5.97%, Valid: 3.45% Test: 2.92%
Epoch: 07, Loss: 3.4560, Train: 4.77%, Valid: 9.56% Test: 9.53%
Epoch: 08, Loss: 3.3950, Train: 7.69%, Valid: 14.96% Test: 22.10%
Epoch: 09, Loss: 3.3471, Train: 7.69%, Valid: 14.96% Test: 22.10%
Epoch: 10, Loss: 3.2365, Train: 10.99%, Valid: 22.97% Test: 21.56%
Epoch: 11, Loss: 3.1547, Train: 10.99%, Valid: 22.97% Test: 21.55%
Epoch: 12, Loss: 3.1448, Train: 5.57%, Valid: 12.54% Test: 14.02%
Epoch: 13, Loss: 3.1459, Train: 3.44%, Valid: 3.32% Test: 3.31%
Epoch: 14, Loss: 3.1255, Train: 7.55%, Valid: 13.89% Test: 20.52%
Epoch: 15, Loss: 3.0929, Train: 7.32%, Valid: 14.57% Test: 21.18%
Epoch: 16, Loss

In [8]:
features[0]

tensor([-0.2683, -0.3904, -0.9956,  0.3222,  0.8800, -0.0257, -0.6225,  0.1184,
        -0.9612, -0.9998, -0.7657,  0.9723,  0.7014,  0.8677, -0.1716, -0.4991,
         0.3457, -0.1709,  0.1376,  0.9450,  0.2497,  1.0000, -0.5088,  0.4366,
         0.1875,  0.9898, -0.5866,  0.2618,  0.6200,  0.4965,  0.3178,  0.1491,
        -0.9007, -0.0413, -0.9964, -0.9122,  0.4677,  0.0558,  0.1609,  0.0814,
        -0.2902,  0.1601,  1.0000, -0.3534,  0.7455,  0.0467, -1.0000,  0.2013,
        -0.0332,  0.9895,  0.9663,  0.9947,  0.1496,  0.2611,  0.4517, -0.7545,
        -0.3114,  0.0227, -0.2535, -0.3701, -0.3778,  0.1775, -0.9339, -0.2378,
         0.9820,  0.9800, -0.1038, -0.3181, -0.0381, -0.1220,  0.1029,  0.1095,
        -0.7900, -0.4588,  0.9399,  0.1473, -0.7871,  1.0000, -0.1108, -0.7061,
         0.9872,  0.9593,  0.6898, -0.8105,  0.9396, -1.0000,  0.6023,  0.0604,
        -0.8297,  0.0989,  0.5366, -0.1621,  0.9774,  0.7457, -0.6837, -0.7392,
        -0.1590, -0.9745, -0.3571, -0.67

In [9]:
dataset = PygNodePropPredDataset(
    name='ogbn-arxiv', transform=T.ToSparseTensor())
data = dataset[0]

In [12]:
features.shape

torch.Size([169343, 768])