In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from models.gat_encoder import GATEncoder
from Dataset.gat_dataset import GATDataset

## Dummy Code for GAT

In [2]:
# Initialize the dataset and dataloader.
# The path uses a forward slash: the earlier "Dataset\processed.jsonl" depended on
# the unrecognized escape sequence "\p" (flagged as a warning by newer Python
# versions) and only resolved on Windows. "/" is accepted on every platform.
data_path = "Dataset/processed.jsonl"  # Replace with your dataset path
dataset = GATDataset(data_path)
# batch_size=1: the training loop below squeezes this leading batch dimension away.
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [3]:
# Build the GAT encoder and put it into training mode.
gat_model = GATEncoder(
    in_channels=768,
    hidden_channels=64,
    out_channels=32,
    heads=4,
)
gat_model.train()  # kept as the cell's last expression so the module summary is echoed

GATEncoder(
  (gat1): GATConv(768, 64, heads=4)
  (gat2): GATConv(256, 32, heads=1)
)

In [4]:
# Dummy objective: mean-squared error between the model output and a target.
criterion = nn.MSELoss()

# Adam over all GAT parameters with a learning rate of 1e-3.
optimizer = optim.Adam(
    params=gat_model.parameters(),
    lr=1e-3,
)

In [5]:
# Smoke-test training loop: fit the GAT encoder to freshly sampled random targets.
num_epochs = 100
for epoch in range(num_epochs):
    total_loss = 0
    for batch in dataloader:
        # Strip the leading batch dimension from both tensors.
        node_features = batch["node_features"].squeeze(0)  # [num_nodes, in_channels]
        edge_list = batch["edge_list"].squeeze(0)  # [2, num_edges]

        # Reset gradients up front; the forward pass does not read them.
        optimizer.zero_grad()

        # Encode the nodes, then draw a random target of the same shape.
        output_embeddings = gat_model(node_features, edge_list)  # [num_nodes, out_channels]
        target_embeddings = torch.rand_like(output_embeddings)

        # MSE against the random target, followed by one optimizer step.
        loss = criterion(output_embeddings, target_embeddings)
        loss.backward()
        optimizer.step()

        total_loss = total_loss + loss.item()

    # Report the mean per-batch loss for this epoch.
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")

Epoch 1/100, Loss: 0.21608763337135314
Epoch 2/100, Loss: 0.18426741063594818
Epoch 3/100, Loss: 0.13559029027819633
Epoch 4/100, Loss: 0.1079394280910492
Epoch 5/100, Loss: 0.1065760925412178
Epoch 6/100, Loss: 0.0951688177883625
Epoch 7/100, Loss: 0.09486806392669678
Epoch 8/100, Loss: 0.09466254562139512
Epoch 9/100, Loss: 0.09888159185647964
Epoch 10/100, Loss: 0.09477415382862091
Epoch 11/100, Loss: 0.09652042835950851
Epoch 12/100, Loss: 0.09184089899063111
Epoch 13/100, Loss: 0.0952316477894783
Epoch 14/100, Loss: 0.09304699525237084
Epoch 15/100, Loss: 0.09469532817602158
Epoch 16/100, Loss: 0.09131534323096276
Epoch 17/100, Loss: 0.0945090651512146
Epoch 18/100, Loss: 0.09299236536026001
Epoch 19/100, Loss: 0.09167257621884346
Epoch 20/100, Loss: 0.090387711673975
Epoch 21/100, Loss: 0.09034134224057197
Epoch 22/100, Loss: 0.09319190531969071
Epoch 23/100, Loss: 0.08991318494081497
Epoch 24/100, Loss: 0.09075582921504974
Epoch 25/100, Loss: 0.0924159824848175
Epoch 26/100, Los

## Dummy Code for Triple List Encoder


In [7]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from models.triple_list_encoder import TripleListEncoder 
from Dataset.triple_list_dataset import TripleListDataset

In [8]:
# Initialize dataset and dataloader.
# The path uses a forward slash: the earlier "Dataset\processed.jsonl" depended on
# the unrecognized escape sequence "\p" (flagged as a warning by newer Python
# versions) and only resolved on Windows. "/" is accepted on every platform.
data_path = "Dataset/processed.jsonl"  # Replace with the path to your preprocessed dataset
dataset = TripleListDataset(data_path)
# NOTE: this rebinds `dataset` and `dataloader`, replacing the GAT ones created earlier.
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [9]:
# Initialize TripleListEncoder
embedding_dim = 128  # embedding size passed to the encoder
# Tokenizer loaded from the project's local "tokenizer_w_special_tokens" directory.
tokenizer = BertTokenizer.from_pretrained("DataPrep/tokenizer_w_special_tokens")
encoder = TripleListEncoder(
    bert_model_name="bert-base-uncased",
    embedding_dim=embedding_dim,  # use the constant above instead of repeating the literal 128
    tokenizer=tokenizer,
)
encoder.train()  # Set to training mode

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TripleListEncoder(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30527, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 

In [10]:
# Dummy optimizer: Adam over every TripleListEncoder parameter.
# NOTE: rebinds `optimizer`, replacing the one created for the GAT model.
optimizer = torch.optim.Adam(
    params=encoder.parameters(),
    lr=1e-3,
)

In [11]:
# Dummy training loop: fit the triple encoder to random targets for a few epochs.
num_epochs = 3
for epoch in range(num_epochs):
    total_loss = 0
    for batch in dataloader:
        # Pull the tokenized triples and their attention mask out of the batch.
        triple_input_ids = batch["triple_input_ids"]  # [batch_size, max_seq_length]
        triple_attention_mask = batch["triple_attention_mask"]  # [batch_size, max_seq_length]

        # Reset gradients up front; the forward pass does not read them.
        optimizer.zero_grad()

        # Encode the triples, then draw a random target of the same shape.
        triple_embeddings = encoder(triple_input_ids, triple_attention_mask)  # [batch_size, embedding_dim]
        target_embeddings = torch.rand_like(triple_embeddings)  # [batch_size, embedding_dim]

        # MSE against the random target, followed by one optimizer step.
        loss = torch.nn.functional.mse_loss(triple_embeddings, target_embeddings)
        loss.backward()
        optimizer.step()

        total_loss = total_loss + loss.item()

    # Report the mean per-batch loss for this epoch.
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")

Epoch 1/3, Loss: 0.21631427407264708
Epoch 2/3, Loss: 0.4535333514213562
Epoch 3/3, Loss: 0.11575669646263123


## Dummy Code for Text Unicoder


In [19]:
from torch.utils.data import DataLoader
from models.text_encoder import TextUnicoder
from Dataset.text_dataset import TextDataset


In [20]:
# Initialize dataset and dataloader.
# The path uses a forward slash: the earlier "Dataset\processed.jsonl" depended on
# the unrecognized escape sequence "\p" (flagged as a warning by newer Python
# versions) and only resolved on Windows. "/" is accepted on every platform.
text_dataset = TextDataset("Dataset/processed.jsonl")
text_dataloader = DataLoader(text_dataset, batch_size=2, shuffle=True)
# BertTokenizer is imported in the Triple List Encoder section's import cell,
# so that cell must have been run before this one.
tokenizer = BertTokenizer.from_pretrained("DataPrep/tokenizer_w_special_tokens")

In [21]:
# Build the Text Unicoder and put it into training mode.
text_unicoder = TextUnicoder(
    bert_model_name="bert-base-uncased",
    embedding_dim=128,
    tokenizer=tokenizer,  # the tokenizer with added special tokens, loaded in the previous cell
)
text_unicoder.train()  # kept as the cell's last expression so the module summary is echoed

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TextUnicoder(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30527, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      

In [22]:
# Dummy optimizer: Adam over every TextUnicoder parameter.
# NOTE: rebinds `optimizer`, replacing the one created for the previous model.
optimizer = torch.optim.Adam(
    params=text_unicoder.parameters(),
    lr=1e-3,
)

In [23]:
# Dummy training loop: fit the Text Unicoder to random targets for a few epochs.
num_epochs = 3
for epoch in range(num_epochs):
    total_loss = 0
    for batch in text_dataloader:
        # Pull the tokenized text and its attention mask out of the batch.
        text_input_ids = batch["text_input_ids"]
        text_attention_mask = batch["text_attention_mask"]

        # Reset gradients up front; the forward pass does not read them.
        optimizer.zero_grad()

        # Encode the text, then draw a random target of the same shape.
        text_embeddings = text_unicoder(text_input_ids, text_attention_mask)
        target_embeddings = torch.rand_like(text_embeddings)

        # MSE against the random target, followed by one optimizer step.
        loss = torch.nn.functional.mse_loss(text_embeddings, target_embeddings)
        loss.backward()
        optimizer.step()

        total_loss = total_loss + loss.item()

    # Report the mean per-batch loss for this epoch.
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(text_dataloader)}")

Epoch 1/3, Loss: 0.4590041220188141
Epoch 2/3, Loss: 0.16240791082382203
Epoch 3/3, Loss: 0.12611118704080582
