In [1]:
import json
import os
import re

import numpy as np
from transformers import BertTokenizer, BertTokenizerFast

In [2]:
# Raw WikiOFGraph test split, read line by line as JSON Lines further down.
# Set the WIKIOFGRAPH_TEST_PATH environment variable to point the notebook at a
# different copy; the original local path is kept as the default.
file_path = os.environ.get(
    "WIKIOFGRAPH_TEST_PATH",
    r"C:\Users\Ribhav\Desktop\UIUC\Classes\Git Repos\Work\BGTT\Dataset\WikiOFGraph-test.jsonl",
)

## Tokenizer

In [3]:
# ------------------------------------------------------------------------------
# 1. Define special tokens
# ------------------------------------------------------------------------------
# Structural markers emitted by linearize_triples(); they must survive
# tokenization as single, unmodified tokens.
special_tokens = [
    "[TRIPLE_START]",
    "[TRIPLE_END]",
    "[SUBJECT]",
    "[PREDICATE]",
    "[OBJECT]"
]

# ------------------------------------------------------------------------------
# 2. Initialize tokenizer and add special tokens
# ------------------------------------------------------------------------------

# Python ("slow") tokenizer; BertTokenizerFast.from_pretrained("bert-base-uncased")
# is the fast alternative.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Register the markers as *special* tokens. The checkpoint is uncased, so
# ordinary added tokens go through lower-casing normalization; special tokens
# are matched verbatim and are never lower-cased or split.
num_added_toks = tokenizer.add_tokens(special_tokens, special_tokens=True)
print(f"Added {num_added_toks} new tokens:", special_tokens)

# Example max_length for both text and triple encodings
MAX_LENGTH = 128



# --------- Test Example ----------

# Example text
text_string = "Hello world, this is a short sentence."

# Encode (or tokenize) with padding and truncation
encoding = tokenizer(
    text_string,
    padding="max_length",  # can also use 'longest' if batching multiple sequences
    truncation=True,
    max_length=MAX_LENGTH,
    return_tensors="pt"    # return PyTorch tensors
)

print("Input IDs:", encoding["input_ids"])
print("Attention Mask:", encoding["attention_mask"])
print("Decoded back:", tokenizer.batch_decode(encoding["input_ids"]))


Added 5 new tokens: ['[TRIPLE_START]', '[TRIPLE_END]', '[SUBJECT]', '[PREDICATE]', '[OBJECT]']
Input IDs: tensor([[ 101, 7592, 2088, 1010, 2023, 2003, 1037, 2460, 6251, 1012,  102,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]])
Attention Mask: tensor([[

## Helper Functions

In [4]:
def parse_triples(triple_str):
    """
    Parse a serialized triple set into a list of dictionaries.

    Expected input looks like:
      '(<S> College of william & mary| <P> Represented by| <O> 1977 william & mary indians football team), (<S> ... ), ...'

    Args:
        triple_str: Serialized triples. Each triple is wrapped in parentheses,
            its three fields are separated by '|' and tagged '<S>', '<P>', '<O>'.

    Returns:
        A list of dicts with keys 'subject', 'predicate', 'object', in input
        order. Empty, None or whitespace-only input yields an empty list.

    Raises:
        ValueError: If a triple does not contain three '|'-separated fields.
    """
    if not triple_str or not triple_str.strip():
        return []

    # Drop a dangling separator so "(...)," does not yield a phantom empty triple.
    text = triple_str.strip().rstrip(',').rstrip()

    # Split only where a closing ')' + ',' is followed by the opening '(<S>' of
    # the next triple, so "), " occurring inside an entity name is left alone.
    raw_triples = re.split(r'\)\s*,\s*(?=\(\s*<S>)', text)
    last_idx = len(raw_triples) - 1

    parsed_triples = []
    for idx, raw_triple in enumerate(raw_triples):
        raw_triple = raw_triple.strip()
        if not raw_triple:
            continue

        # Remove only the wrapping parentheses; parentheses that belong to an
        # entity name such as "Film (2010)" must be preserved.
        if raw_triple.startswith('('):
            raw_triple = raw_triple[1:]
            # The split consumed the closing ')' of every triple but the last.
            if idx == last_idx and raw_triple.endswith(')'):
                raw_triple = raw_triple[:-1]

        # maxsplit=2 keeps any further '|' characters inside the object text.
        parts = raw_triple.split('|', 2)
        if len(parts) != 3:
            raise ValueError(
                f"Malformed triple (expected '<S> ...| <P> ...| <O> ...'): {raw_triple!r}"
            )

        parsed_triples.append({
            'subject': parts[0].replace('<S>', '').strip(),
            'predicate': parts[1].replace('<P>', '').strip(),
            'object': parts[2].replace('<O>', '').strip()
        })

    return parsed_triples

In [5]:
def linearize_triples(triples):
    """
    Flatten parsed triples into one marker-delimited string, e.g.:
      [TRIPLE_START] [SUBJECT] Bob [PREDICATE] likes [OBJECT] apples [TRIPLE_END] ...

    `triples` is an iterable of dicts with 'subject', 'predicate' and 'object'
    string values. Triples are emitted in input order, separated by a single
    space; an empty input gives an empty string.
    """
    pieces = []
    for item in triples:
        # Markers and field values alternate; everything is joined with single
        # spaces at the end, which also separates consecutive triples.
        pieces.extend((
            "[TRIPLE_START]",
            "[SUBJECT]", item['subject'],
            "[PREDICATE]", item['predicate'],
            "[OBJECT]", item['object'],
            "[TRIPLE_END]",
        ))
    return " ".join(pieces)

In [6]:
def build_edge_list(triples):
    """
    Turn parsed triples into directed (subject, object) edges, one per triple
    and in input order. Predicates are dropped. Example result:
      [("College of william & mary", "1977 william & mary indians football team"),
       ("1977 william & mary indians football team", "1977 ncaa division i football season"),
       ... ]
    """
    return [(item['subject'], item['object']) for item in triples]


In [7]:
def build_adjacency_matrix(triples):
    """
    Build a directed adjacency matrix from the given triples.

    Entities are indexed in order of first appearance (subject before object
    within a triple), so the row/column layout is reproducible across runs
    instead of depending on set iteration order.

    Args:
        triples: Iterable of dicts with 'subject' and 'object' keys.

    Returns:
        (adjacency_matrix, entities) where `entities` is the list of unique
        entity names and `adjacency_matrix` is an N x N integer numpy array
        with a 1 at [i, j] when some triple has entities[i] as subject and
        entities[j] as object. Predicates are not stored. An empty input
        gives a 0 x 0 matrix and an empty list.
    """
    triples = list(triples)  # iterated twice below; tolerate one-shot iterables

    # Map each entity to its index; dicts preserve insertion order.
    entity2idx = {}
    for t in triples:
        for ent in (t['subject'], t['object']):
            entity2idx.setdefault(ent, len(entity2idx))
    entities = list(entity2idx)

    # Initialize adjacency matrix
    N = len(entities)
    adjacency_matrix = np.zeros((N, N), dtype=int)

    # Mark an edge subject -> object for every triple
    for t in triples:
        adjacency_matrix[entity2idx[t['subject']], entity2idx[t['object']]] = 1

    return adjacency_matrix, entities


## Data Processing

In [8]:
max_test = 5  # Process only the first 5 entries as a test; set to None to process the whole file


def _encode_fixed_length(text):
    """Tokenize `text` padded/truncated to MAX_LENGTH; return (input_ids, attention_mask) as Python lists."""
    # Without return_tensors the tokenizer already returns plain lists for a
    # single string, so no tensor round-trip (and no squeeze/tolist) is needed.
    encoded = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    return encoded["input_ids"], encoded["attention_mask"]


processed_dataset = []


with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        if max_test is not None and len(processed_dataset) >= max_test:
            break  # stop after a few entries

        # Blank lines (e.g. a trailing newline) are not JSON records; skip them
        # instead of letting json.loads raise.
        if not line.strip():
            continue

        entry = json.loads(line)
        triple_string = entry['triplet']
        text_string = entry['text']

        # a) Parse triples
        parsed = parse_triples(triple_string)

        # b) Linearize the triple set
        linearized_triples = linearize_triples(parsed)

        # c) Tokenize the text & linearized triples (with padding & truncation)
        text_input_ids, text_attention_mask = _encode_fixed_length(text_string)
        triple_input_ids, triple_attention_mask = _encode_fixed_length(linearized_triples)

        # d) Graph structure encoding
        edge_list = build_edge_list(parsed)
        adjacency_matrix, entities = build_adjacency_matrix(parsed)

        # e) Store final outputs (everything JSON-serializable)
        processed_dataset.append({
            "original_text": text_string,
            "original_triples": triple_string,
            "parsed_triples": parsed,
            "linearized_triples_str": linearized_triples,

            "text_input_ids": text_input_ids,
            "text_attention_mask": text_attention_mask,
            "triple_input_ids": triple_input_ids,
            "triple_attention_mask": triple_attention_mask,

            "edge_list": edge_list,
            "adjacency_matrix": adjacency_matrix.tolist(),
            "entities": entities
        })


## Save processed output

In [9]:
# Destination for the processed records, written as JSON Lines (one JSON object
# per line). Set the WIKIOFGRAPH_PROCESSED_PATH environment variable to write
# elsewhere; the original local path is kept as the default.
out_file = os.environ.get(
    "WIKIOFGRAPH_PROCESSED_PATH",
    r"C:\Users\Ribhav\Desktop\UIUC\Classes\Git Repos\Work\BGTT\Dataset\processed.jsonl",
)
with open(out_file, 'w', encoding='utf-8') as f_out:
    for item in processed_dataset:
        # ensure_ascii=False keeps non-ASCII characters readable in the output file
        f_out.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"Processed {len(processed_dataset)} entries and saved to '{out_file}'.")

Processed 5 entries and saved to 'C:\Users\Ribhav\Desktop\UIUC\Classes\Git Repos\Work\BGTT\Dataset\processed.jsonl'.
