In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
import torch
from tqdm import tqdm
import re

In [None]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub. The call takes no
# arguments, so no access token is hard-coded in the notebook.
# NOTE(review): presumably required because the model loaded below is gated — confirm.
login()

In [None]:
model_name = "google/gemma-3-4b-it"

# 4-bit NF4 quantization with double quantization to reduce GPU memory use.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Credentials come from the earlier `login()` call, so no token is passed
# explicitly (and none is stored in the notebook).
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Place the quantized weights on the current GPU at load time through
# `device_map`. Moving a bitsandbytes-quantized model afterwards with
# `.cuda()` is rejected by some transformers/bitsandbytes versions.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map={"": torch.cuda.current_device()},
)
model.eval()

In [None]:
# Shared text-generation pipeline built from the quantized model and its
# tokenizer; both extraction helpers below send their prompts through it.
generator = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    batch_size=64,
)
def extract_keywords_batch(sentences, max_new_tokens=64):
    """Prompt the model for the keywords/entities of each sentence.

    Args:
        sentences: Iterable of sentences; one prompt is built per sentence.
        max_new_tokens: Generation budget forwarded to ``generator``.

    Returns:
        One list of keyword strings per sentence, in input order. Keywords are
        the comma-separated, whitespace-trimmed, non-empty pieces found after
        the last ``"Keywords:"`` marker in the generated text.
    """

    def build_prompt(s):
        return (
            f'Find the important keywords or entities in the following sentence:\n'
            f'Sentence: "{s}"\nKeywords:'
        )

    def parse_keywords(candidates):
        # Only the first generated candidate is used.
        generated = candidates[0]['generated_text']
        answer = generated.split("Keywords:")[-1].strip()
        trimmed = (piece.strip() for piece in answer.split(","))
        return [piece for piece in trimmed if piece]

    prompts = list(map(build_prompt, sentences))
    generations = generator(prompts, max_new_tokens=max_new_tokens, do_sample=False)
    return [parse_keywords(candidates) for candidates in generations]

def generate_triplets_batch(keywords_list, sentences, max_new_tokens=128):
    """Ask the model for (subject; relation; object) triplets, one prompt per sentence.

    Args:
        keywords_list: One iterable of keyword strings per sentence.
        sentences: Sentences aligned index-by-index with ``keywords_list``.
        max_new_tokens: Generation budget forwarded to ``generator``.

    Returns:
        A list with one entry per (keywords, sentence) pair. Each entry is a
        list of unique strings formatted as ``"(subject; relation; object)"``,
        in the order they first appear in the model output. Components never
        contain ``;`` or ``,`` because both are treated as separators.
    """
    pairs = list(zip(keywords_list, sentences))
    if not pairs:
        # Nothing to generate; do not call the pipeline with an empty batch.
        return []

    prompts = [
        f'Given the keywords: {", ".join(kw)}\n'
        f'And the sentence: "{s}"\n'
        'Identify factual relationships between the keywords in the form:\n'
        '(subject; relation; object)\nTriplets:'
        for kw, s in pairs
    ]
    outputs = generator(
        prompts,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        return_full_text=False,
    )

    batch_triplets = []
    for prompt, group in zip(prompts, outputs):
        # group is a list of generated candidates; take the first
        text = group[0]['generated_text']
        # The prompt itself contains the line "(subject; relation; object)".
        # If the prompt is echoed back anyway, cut it off so that template
        # line is never parsed as a real triplet.
        if text.startswith(prompt):
            text = text[len(prompt):]

        seen = set()
        clean = []
        for line in text.splitlines():
            line = line.strip()  # tolerate indented triplet lines
            if not line.startswith("("):
                continue
            inner = line.strip("() ")
            parts = [p.strip() for p in re.split(r"\s*[;,]\s*", inner)]
            if len(parts) < 3:
                continue
            key = tuple(parts[:3])
            # Skip incomplete triplets such as "(a; ; c)" and duplicates.
            if not all(key) or key in seen:
                continue
            seen.add(key)
            clean.append(f"({key[0]}; {key[1]}; {key[2]})")
        batch_triplets.append(clean)
    return batch_triplets


In [None]:
# Smoke test on a single sentence. The batch helpers take lists, so the
# sentence is wrapped in a one-element batch and the first result is unpacked.
sentence = "INTRODUCTION A modern computer consists of one or more processors, some main memory, disks, printers, a keyboard, a mouse, a display, network interfaces, and various other input/output devices."
keywords = extract_keywords_batch([sentence])[0]
triplets = generate_triplets_batch([keywords], [sentence])[0]

print("Final Triplets for KG:")
for t in triplets:
    print(t)

In [None]:
import pandas as pd
from transformers import logging

# Limit transformers logging to error-level messages for the rest of the session.
logging.set_verbosity_error()

def parse_triplet_str(triplet_str):
    """Split a ``"(subject; relation; object)"`` string into its three parts.

    Args:
        triplet_str: Text of a single triplet, with or without the enclosing
            parentheses and surrounding whitespace.

    Returns:
        A ``(subject, relation, object)`` tuple of trimmed strings, or ``None``
        when the input is not a string or does not contain exactly three
        semicolon-separated parts.
    """
    if not isinstance(triplet_str, str):
        return None
    # Trim all surrounding whitespace first; otherwise a trailing newline or
    # tab would shield the closing parenthesis from being stripped.
    inner = triplet_str.strip().strip("() ")
    parts = [p.strip() for p in re.split(r"\s*;\s*", inner)]
    return tuple(parts) if len(parts) == 3 else None

# Run keyword and triplet extraction over every sentence in the CSV, batch by
# batch, and write the resulting (subject, relation, object) rows to disk.
df = pd.read_csv("sentences50k.csv")
triplet_data = []
batch_size = 64

# Empty CSV cells are read as NaN; drop them so the literal text "nan" is never
# sent to the model, and make sure every remaining entry is a string.
sentences_all = df["sentences"].dropna().astype(str).tolist()

for i in tqdm(range(0, len(sentences_all), batch_size), desc="Extracting triplets"):
    sentences_batch = sentences_all[i:i+batch_size]
    keywords_batch = extract_keywords_batch(sentences_batch)
    triplets_batch = generate_triplets_batch(keywords_batch, sentences_batch)

    for triplets in triplets_batch:
        for t in triplets:
            # Reuse the shared parser and skip anything malformed, so a single
            # bad triplet cannot abort the whole run.
            parsed = parse_triplet_str(t)
            if parsed is None:
                continue
            subj, rel, obj = parsed
            triplet_data.append({
                "subject": subj,
                "relation": rel,
                "object": obj
            })

# Explicit columns keep the header row even when no triplet was extracted.
pd.DataFrame(triplet_data, columns=["subject", "relation", "object"]).to_csv("triplets_50k.csv", index=False)
