In [2]:
import json
from transformers import BertTokenizer, BertTokenizerFast, BertModel
import numpy as np

In [3]:
# file_path = r"C:\Users\Ribhav\Desktop\UIUC\Classes\Git Repos\Work\BGTT\Dataset\WikiOFGraph-test.jsonl"
file_path=r"..\Dataset\WikiOFGraph-test.jsonl"

## Tokenizer

In [4]:
# ------------------------------------------------------------------------------
# 1. Define special tokens
# ------------------------------------------------------------------------------
special_tokens = [
    "[TRIPLE_START]",
    "[TRIPLE_END]",
    "[SUBJECT]",
    "[PREDICATE]",
    "[OBJECT]"
]

# ------------------------------------------------------------------------------
# 2. Initialize tokenizer and add special tokens
# ------------------------------------------------------------------------------

# Initialize tokenizer (fast or python version)
# tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Add custom tokens
num_added_toks = tokenizer.add_tokens(special_tokens)
print(f"Added {num_added_toks} new tokens:", special_tokens)

# Example max_length for both text and triple encodings
MAX_LENGTH = 128



# --------- Test Example ----------

# Example text
text_string = "Hello world, this is a short sentence."

# Encode (or tokenize) with padding and truncation
encoding = tokenizer(
    text_string,
    padding="max_length",  # can also use 'longest' if batching multiple sequences
    truncation=True,
    max_length=MAX_LENGTH,
    return_tensors="pt"    # return PyTorch tensors
)

print("Input IDs:", encoding["input_ids"])
print("Attention Mask:", encoding["attention_mask"])
print("Decoded back:", tokenizer.batch_decode(encoding["input_ids"]))


Added 5 new tokens: ['[TRIPLE_START]', '[TRIPLE_END]', '[SUBJECT]', '[PREDICATE]', '[OBJECT]']
Input IDs: tensor([[ 101, 7592, 2088, 1010, 2023, 2003, 1037, 2460, 6251, 1012,  102,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]])
Attention Mask: tensor([[

## Helper Functions

In [5]:
def parse_triples(triple_str):
    """
    Given a string like:
      '(<S> College of william & mary| <P> Represented by| <O> 1977 william & mary indians football team), (<S> ... ), ...'
    Return a list of dictionaries with keys 'subject', 'predicate', 'object'.
    """
    # Split on '),'
    triple_str = triple_str.strip()
    raw_triples = triple_str.split('),')
    
    # Clean up each triple chunk
    # We expect them to be in format: 
    # '(<S> College of william & mary| <P> Represented by| <O> 1977 william & mary indians football team)'
    parsed_triples = []
    for raw_triple in raw_triples:
        # Remove possible surrounding parentheses and spaces
        raw_triple = raw_triple.strip()
        # e.g., '(<S> College of william & mary| <P> Represented by| <O> 1977 william & mary indians football team)'
        raw_triple = raw_triple.replace('(', '').replace(')', '').strip()
        
        # Now split on '|'
        parts = raw_triple.split('|')
        # parts[0] should have '<S> ...', parts[1] => '<P> ...', parts[2] => '<O> ...'
        subject = parts[0].replace('<S>', '').strip()
        predicate = parts[1].replace('<P>', '').strip()
        obj = parts[2].replace('<O>', '').strip()
        
        parsed_triples.append({
            'subject': subject,
            'predicate': predicate,
            'object': obj
        })
    
    return parsed_triples

In [6]:
def linearize_triples(triples):
    """
    Convert the set of triples into a linear sequence like:
      [TRIPLE_START] [SUBJECT] Bob [PREDICATE] likes [OBJECT] apples [TRIPLE_END] ...
    """
    triple_str_list = []
    for t in triples:
        triple_repr = (
            "[TRIPLE_START] [SUBJECT] " + t['subject'] +
            " [PREDICATE] " + t['predicate'] +
            " [OBJECT] " + t['object'] +
            " [TRIPLE_END]"
        )
        triple_str_list.append(triple_repr)
    
    # Concat all
    return " ".join(triple_str_list)

In [7]:
# def build_edge_list(triples):
#     """
#     Build an edge list from the triples:
#     E.g. [("College of william & mary", "1977 william & mary indians football team"),
#           ("1977 william & mary indians football team", "1977 ncaa division i football season"),
#           ... ]
#     """
#     edges = []
#     for t in triples:
#         edges.append((t['subject'], t['object']))
#     return edges

def build_edge_list(triples):
    """
    Build an edge list and encode edge types:
    E.g. [("College of william & mary", "Represented by", "1977 william & mary indians football team"),
          ("1977 william & mary indians football team", "Played in", "1977 ncaa division i football season"),
          ...]
    """
    edges = []
    edge_types = []
    for t in triples:
        edges.append((t['subject'], t['object']))
        edge_types.append(t['predicate'])  # Include predicate (relationship)
    return edges, edge_types

In [8]:
def build_adjacency_matrix(triples):
    """
    Build an adjacency matrix from the given triples.
    1. Collect all unique entities.
    2. Create a matrix of size NxN (N = number of unique entities).
    3. Fill 1 where there's an edge from subject to object (can also store
       relationship/predicate if you want a more complex adjacency).
    """
    # Collect unique entities
    entities = set()
    for t in triples:
        entities.add(t['subject'])
        entities.add(t['object'])
    entities = list(entities)  # fix an ordering
    
    # Create a map from entity -> index
    entity2idx = {ent: i for i, ent in enumerate(entities)}
    
    # Initialize adjacency matrix
    N = len(entities)
    adjacency_matrix = np.zeros((N, N), dtype=int)
    
    # Fill adjacency matrix
    for t in triples:
        s_idx = entity2idx[t['subject']]
        o_idx = entity2idx[t['object']]
        adjacency_matrix[s_idx, o_idx] = 1  # or any weighting if needed
    
    return adjacency_matrix, entities


## Data Processing

In [12]:
max_test = 5  # Process only the first 5 entries as a test

processed_dataset = []

bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

with open(file_path, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= max_test:
            break  # Stop after processing max_test entries

        entry = json.loads(line)
        triple_string = entry['triplet']
        text_string = entry['text']

        # a) Parse triples
        parsed = parse_triples(triple_string)

        # b) Linearize the triple set
        linearized_triples = linearize_triples(parsed)

        # c) Tokenize the text & linearized triples (with padding & truncation)
        # ---------------------------------------------------------
        text_encodings = tokenizer(
            text_string,
            padding="max_length",
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors="pt"
        )
        triple_encodings = tokenizer(
            linearized_triples,
            padding="max_length",
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors="pt"
        )

        # Convert input_ids and attention_masks to Python lists
        text_input_ids = text_encodings["input_ids"].squeeze(0).tolist()
        text_attention_mask = text_encodings["attention_mask"].squeeze(0).tolist()

        triple_input_ids = triple_encodings["input_ids"].squeeze(0).tolist()
        triple_attention_mask = triple_encodings["attention_mask"].squeeze(0).tolist()
        # ---------------------------------------------------------

        # d) Graph structure encoding
        edge_list, edge_types = build_edge_list(parsed)  # Modified to include edge types
        adjacency_matrix, entities = build_adjacency_matrix(parsed)

        # e) Generate node features using BERT embeddings
        entity_embeddings = []
        for entity in entities:
            inputs = bert_tokenizer(entity, return_tensors="pt", padding=True, truncation=True)
            outputs = bert_model(**inputs)
            embedding = outputs.last_hidden_state.mean(dim=1)  # Mean pooling over tokens
            entity_embeddings.append(embedding.squeeze(0).tolist())

        # f) Store final outputs
        processed_dataset.append({
            "original_text": text_string,
            "original_triples": triple_string,
            "parsed_triples": parsed,
            "linearized_triples_str": linearized_triples,

            "text_input_ids": text_input_ids,
            "text_attention_mask": text_attention_mask,
            "triple_input_ids": triple_input_ids,
            "triple_attention_mask": triple_attention_mask,

            "edge_list": edge_list,  # Edge list without relationships
            "edge_types": edge_types,  # Encoded relationships
            "node_features": entity_embeddings,  # Precomputed entity embeddings
            "entities": entities  # Original entity list
        })


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Save processed output

In [13]:
out_file = r"..\Dataset\processed.jsonl"
with open(out_file, 'w', encoding='utf-8') as f_out:
    for item in processed_dataset:
        f_out.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"Processed {len(processed_dataset)} entries and saved to '{out_file}'.")

Processed 5 entries and saved to '..\Dataset\processed.jsonl'.


## Split Datasets

In [32]:
import random

# Final processed dataset is in a Python list called 'processed_dataset'
data = processed_dataset[:]  # make a copy if you'd like

# Shuffle the data in-place so we get a random mix
random.shuffle(data)

# Decide splits
train_ratio = 0.80  # 80% for training
val_ratio = 0.10    # 10% for validation
test_ratio = 0.10   # 10% for testing

data_size = len(data)
train_end = int(train_ratio * data_size)
val_end = train_end + int(val_ratio * data_size)

train_data = data[:train_end]
val_data = data[train_end:val_end]
test_data = data[val_end:]

print(f"Total: {data_size} records")
print(f"Train: {len(train_data)} records")
print(f"Val:   {len(val_data)} records")
print(f"Test:  {len(test_data)} records")

def save_jsonl(data_list, filename):
    """
    Save a list of dict records to a .jsonl file (one JSON record per line).
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for record in data_list:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

save_jsonl(train_data, r"..\FinalData\train_data.jsonl")
save_jsonl(val_data,   r"..\FinalData\val_data.jsonl")
save_jsonl(test_data,  r"..\FinalData\test_data.jsonl")


Total: 50 records
Train: 40 records
Val:   5 records
Test:  5 records


## Batch Preparation

In [33]:
import torch
from torch.utils.data import Dataset, DataLoader

In [34]:
class MixedGraphTextDataset(Dataset):
    def __init__(self, processed_dataset):
        """
        processed_dataset is assumed to be a list of dicts, something like:
          {
            'linearized_triples_str': ...
            'text_input_ids': ...
            'text_attention_mask': ...
            'triple_input_ids': ...
            'triple_attention_mask': ...
            'edge_list': ...
            'adjacency_matrix': ...
            ...
          }
        We'll create 2x many samples:
          - 1 Graph→Text sample
          - 1 Text→Graph sample
        """
        self.samples = []
        
        for entry in processed_dataset:
            # 1) Graph→Text
            self.samples.append({
                "task": "G2T",  # Graph→Text
                "input_ids": entry["triple_input_ids"],      # model input is triple tokens
                "attention_mask": entry["triple_attention_mask"],
                # We also store graph structure if needed
                "graph_adj": entry.get("adjacency_matrix", None),  # or edge_list
                # The output/label we want is the text token IDs
                "label_ids": entry["text_input_ids"],
                
                # Possibly store the text attention mask as a "label mask" 
                # if you need it for language modeling or such.
                "label_mask": entry["text_attention_mask"],
            })

            # 2) Text→Graph
            self.samples.append({
                "task": "T2G",  # Text→Graph
                "input_ids": entry["text_input_ids"],        # model input is text tokens
                "attention_mask": entry["text_attention_mask"],
                # For output, we want the triple tokens
                "label_ids": entry["triple_input_ids"],
                
                # Potentially, you might want to generate the adjacency matrix too,
                # but that often goes beyond simple token generation.
                "graph_adj": entry.get("adjacency_matrix", None),
                
                # If you do some sequence modeling for the triple tokens, store mask
                "label_mask": entry["triple_attention_mask"],
            })
        
        # Shuffle samples so we don’t get all G2T then T2G in order
        # (Though you can also rely on DataLoader shuffle.)
        # import random
        # random.shuffle(self.samples)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

In [35]:
def mixed_collate_fn(batch_list):
    """
    Example collate function that merges a list of samples into a batch.
    The 'batch_list' is a list of dictionaries from __getitem__().
    We'll produce batched tensors for input_ids, label_ids, etc.
    We also keep the 'task' so the model knows which forward pass to apply.
    """
    # Extract each field
    tasks = [item["task"] for item in batch_list]
    
    input_ids = [item["input_ids"] for item in batch_list]
    attention_masks = [item["attention_mask"] for item in batch_list]
    label_ids = [item["label_ids"] for item in batch_list]
    label_masks = [item["label_mask"] for item in batch_list]
    graph_adjs = [item["graph_adj"] for item in batch_list]  # or None
    
    # Convert to tensors (assuming everything is a list of int)
    input_ids = torch.tensor(input_ids, dtype=torch.long)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long)
    label_ids = torch.tensor(label_ids, dtype=torch.long)
    label_masks = torch.tensor(label_masks, dtype=torch.long)
    
    # graph_adjs might remain a list of adjacency matrices (each NxN), 
    # or you can turn them into a 3D tensor if they're all the same size.
    
    batch = {
        "task": tasks,  # list of tasks in this batch
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "label_ids": label_ids,
        "label_mask": label_masks,
        "graph_adj": graph_adjs
    }
    
    return batch

In [37]:
BATCH_SIZE = 4
# ------------------------------------------------------------------
# Example usage
# ------------------------------------------------------------------
def get_mixed_dataloader(processed_dataset, batch_size=4, shuffle=True):
    dataset = MixedGraphTextDataset(processed_dataset)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=mixed_collate_fn
    )
    return dataloader

# Suppose you have a processed_dataset from your earlier code:
# processed_dataset = [...]
# Now you can build a mixed dataloader:
mixed_loader = get_mixed_dataloader(processed_dataset, batch_size=BATCH_SIZE)


for batch in mixed_loader:
    print("Batch Dict Keys:", batch.keys())
    # batch is a dictionary with:
    #   batch["task"] -> e.g. ["G2T","T2G","G2T","T2G"]
    #   batch["input_ids"] -> shape [B, seq_len]
    #   batch["label_ids"] -> shape [B, seq_len]
    #   ...
    # You can decide how to handle them in your training loop.
    print("Tasks:", batch["task"])
    print("Input IDs shape:", batch["input_ids"].shape)
    print("Label IDs shape:", batch["label_ids"].shape)
    print(batch["graph_adj"])
    # ... Forward pass ...
    break  # just show the first batch

Batch Dict Keys: dict_keys(['task', 'input_ids', 'attention_mask', 'label_ids', 'label_mask', 'graph_adj'])
Tasks: ['G2T', 'G2T', 'T2G', 'T2G']
Input IDs shape: torch.Size([4, 128])
Label IDs shape: torch.Size([4, 128])
[[[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [1, 1, 0, 0]], [[0, 0, 0, 0], [1, 0, 1, 1], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [1, 0, 1, 1], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0], [0, 0, 0], [1, 1, 0]]]
