In [1]:
import os
parent_path = os.path.dirname(os.getcwd())
os.chdir(parent_path)

In [2]:
from utils.dataloader import GraphTextDataset, GraphDataset, TextDataset
from torch_geometric.data import DataLoader
from torch.utils.data import DataLoader as TorchDataLoader
from models.Model import BaseModel
import numpy as np
from transformers import AutoTokenizer
import torch
from torch import optim
import time
import pandas as pd
from utils.variables import ROOT_DIR

torch.cuda.empty_cache()

In [3]:
train = pd.read_csv(ROOT_DIR + '/data/train.tsv', sep='\t', header=None)
sample_train = train.sample(6000, random_state=42)
sample_train.to_csv(ROOT_DIR + '/data/sample_train.tsv', sep='\t', header=False, index=False)

In [4]:
CE = torch.nn.CrossEntropyLoss()
def contrastive_loss(v1, v2):
  logits = torch.matmul(v1,torch.transpose(v2, 0, 1))
  labels = torch.arange(logits.shape[0], device=v1.device)
  return CE(logits, labels) + CE(torch.transpose(logits, 0, 1), labels)

In [5]:
model_name = 'allenai/scibert_scivocab_uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
gt = np.load("./data/token_embedding_dict.npy", allow_pickle=True)[()]
val_dataset = GraphTextDataset(root='./data/', gt=gt, split='val', tokenizer=tokenizer)
train_dataset = GraphTextDataset(root='./data/', gt=gt, split='sample_train', tokenizer=tokenizer)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
train_dataset.len()

6000

In [8]:
nb_epochs = 5
batch_size = 96
learning_rate = 2e-5

val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)



In [9]:
model = BaseModel(model_name=model_name, num_node_features=300, nout=768, nhid=300, graph_hidden_channels=300) # nout = bert model hidden dim
model.to(device)

optimizer = optim.AdamW(model.parameters(), lr=learning_rate,
                                betas=(0.9, 0.999),
                                weight_decay=0.01)

2024-11-19 18:22:12.202286: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732036932.214770   88153 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732036932.218617   88153 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-19 18:22:12.231214: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)



In [11]:
epoch = 0
loss = 0
losses = []
count_iter = 0
time1 = time.time()
printEvery = 10
best_validation_loss = 1000000

for i in range(nb_epochs):
    print('-----EPOCH{}-----'.format(i+1))
    model.train()
    for batch in train_loader:
        print(f"Remaining memory: {torch.cuda.memory_reserved(device) - torch.cuda.memory_allocated(device)} bytes")
        torch.cuda.empty_cache()
        print(f"Remaining memory after emptying: {torch.cuda.memory_reserved(device) - torch.cuda.memory_allocated(device)} bytes")
        input_ids = batch.input_ids
        batch.pop('input_ids')
        attention_mask = batch.attention_mask
        batch.pop('attention_mask')
        graph_batch = batch
        
        x_graph, x_text = model(graph_batch.to(device), 
                                input_ids.to(device), 
                                attention_mask.to(device))
        current_loss = contrastive_loss(x_graph, x_text)   
        optimizer.zero_grad()
        current_loss.backward()
        optimizer.step()
        loss += current_loss.item()
        
        count_iter += 1
        if count_iter % printEvery == 0:
            time2 = time.time()
            print("Iteration: {0}, Time: {1:.4f} s, training loss: {2:.4f}".format(count_iter,
                                                                        time2 - time1, loss/printEvery))
            losses.append(loss)
            loss = 0 
    model.eval()       
    val_loss = 0        
    for batch in val_loader:
        print(f"Remaining memory: {torch.cuda.memory_reserved(device) - torch.cuda.memory_allocated(device)} bytes")
        torch.cuda.empty_cache()
        print(f"Remaining memory after emptyin: {torch.cuda.memory_reserved(device) - torch.cuda.memory_allocated(device)} bytes")
        input_ids = batch.input_ids
        batch.pop('input_ids')
        attention_mask = batch.attention_mask
        batch.pop('attention_mask')
        graph_batch = batch
        x_graph, x_text = model(graph_batch.to(device), 
                                input_ids.to(device), 
                                attention_mask.to(device))
        current_loss = contrastive_loss(x_graph, x_text)   
        val_loss += current_loss.item()
    scheduler.step(val_loss)

    best_validation_loss = min(best_validation_loss, val_loss)
    print('-----EPOCH'+str(i+1)+'----- done.  Validation loss: ', str(val_loss/len(val_loader)) )
    # if best_validation_loss==val_loss:
    #     print('validation loss improoved saving checkpoint...')
    #     save_path = os.path.join('./logs/', 'model'+str(i)+'.pt')
    #     torch.save({
    #     'epoch': i,
    #     'model_state_dict': model.state_dict(),
    #     'optimizer_state_dict': optimizer.state_dict(),
    #     'validation_accuracy': val_loss,
    #     'loss': loss,
    #     }, save_path)
    #     print('checkpoint saved to: {}'.format(save_path))

  data = torch.load(osp.join(self.processed_dir, 'data_{}.pt'.format(self.idx_to_cid[idx])))


-----EPOCH1-----
Remaining memory: 55576576 bytes
Remaining memory after emptying: 55576576 bytes
Remaining memory: 14289734144 bytes
Remaining memory after emptying: 94112256 bytes
Remaining memory: 15616806912 bytes
Remaining memory after emptying: 114659328 bytes
Remaining memory: 15616107008 bytes
Remaining memory after emptying: 191554048 bytes
Remaining memory: 15633904128 bytes
Remaining memory after emptying: 228225536 bytes
Remaining memory: 15633894912 bytes
Remaining memory after emptying: 190467584 bytes
Remaining memory: 15614715392 bytes
Remaining memory after emptying: 190162432 bytes
Remaining memory: 15633121280 bytes
Remaining memory after emptying: 170819584 bytes
Remaining memory: 15615037440 bytes
Remaining memory after emptying: 228233216 bytes
Remaining memory: 15595379712 bytes
Remaining memory after emptying: 227449856 bytes
Iteration: 10, Time: 13.3528 s, training loss: 8.9535
Remaining memory: 15595357696 bytes
Remaining memory after emptying: 189679104 bytes

OutOfMemoryError: CUDA out of memory. Tried to allocate 288.00 MiB. GPU 0 has a total capacity of 23.65 GiB of which 180.19 MiB is free. Including non-PyTorch memory, this process has 23.35 GiB memory in use. Of the allocated memory 23.01 GiB is allocated by PyTorch, and 96.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [12]:
print(f"Remaining memory: {torch.cuda.memory_reserved(device) - torch.cuda.memory_allocated(device)} bytes")

Remaining memory: 101567488 bytes


In [None]:
print(x_graph)

tensor([[-0.0960, -0.0144,  0.0120,  ..., -0.0751,  0.0200, -0.0201],
        [-0.0612, -0.0020,  0.0129,  ..., -0.0721,  0.0245, -0.0284],
        [-0.0926, -0.0240,  0.0602,  ..., -0.0586, -0.0239, -0.0348],
        ...,
        [-0.0667, -0.0247,  0.0144,  ..., -0.0562,  0.0446, -0.0238],
        [-0.0623,  0.0007, -0.0461,  ..., -0.0858,  0.0347, -0.0224],
        [-0.0685, -0.0046,  0.0322,  ..., -0.0323,  0.0093,  0.0016]],
       device='cuda:0', grad_fn=<AddmmBackward0>)


In [None]:
print(x_text)

tensor([[ 0.6709, -0.7624, -1.4448,  ..., -0.2884, -1.3536,  0.7449],
        [ 0.2619,  1.4373, -0.7355,  ..., -0.3672,  0.3473,  0.0438],
        [ 0.4230, -0.9616,  0.3963,  ..., -0.0186, -0.4200,  0.2488],
        ...,
        [ 0.8151, -0.9137, -1.6336,  ..., -1.0619,  0.5583,  1.3338],
        [ 1.3773,  1.2581, -1.0856,  ...,  1.0481,  0.4775,  0.0367],
        [ 0.1642,  0.4170,  0.5409,  ...,  1.2186, -0.7704,  0.8937]],
       device='cuda:0', grad_fn=<SliceBackward0>)


In [None]:
print(input_ids)
print(attention_mask)
print(graph_batch)

tensor([[  102, 14194, 11597,  ...,     0,     0,     0],
        [  102, 13094,  6120,  ...,     0,     0,     0],
        [  102, 14188, 26909,  ...,     0,     0,     0],
        ...,
        [  102,  7592,   136,  ...,     0,     0,     0],
        [  102, 17130, 17501,  ...,     0,     0,     0],
        [  102,   170,   422,  ...,     0,     0,     0]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
DataBatch(x=[1444, 300], edge_index=[2, 3024], batch=[1444], ptr=[38])
