In [None]:
import torch
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the KnowGL tokenizer and seq2seq model, then place the model on the
# selected device so it matches the tensors produced in later cells.
tokenizer = AutoTokenizer.from_pretrained("ibm/knowgl-large")
model = AutoModelForSeq2SeqLM.from_pretrained("ibm/knowgl-large")
model = model.to(device)

In [None]:

# Wikidata direct-property identifiers ("wdt:P...") keyed by relation label.
# Defined once at module level so the table is not rebuilt on every lookup.
_RELATION_TO_WIKIDATA = {
    "applies to jurisdiction": "wdt:P1001",
    "architectural style": "wdt:P149",
    "award received": "wdt:P166",
    # P3602 is "candidacy in election"; P726 is the different property "candidate".
    "candidacy in election": "wdt:P3602",
    "capital": "wdt:P36",
    "collection": "wdt:P195",
    "connects with": "wdt:P2789",
    "contains administrative territorial entity": "wdt:P150",
    "creator": "wdt:P170",
    "depicts": "wdt:P180",
    "different from": "wdt:P1889",
    "family name": "wdt:P734",
    "followed by": "wdt:P156",
    "has works in the collection": "wdt:P6379",
    "headquarters location": "wdt:P159",
    "instance of": "wdt:P31",
    "located in or next to body of water": "wdt:P206",
    "location": "wdt:P276",
    "made from material": "wdt:P186",
    "measured physical quantity": "wdt:P111",
    "member of political party": "wdt:P102",
    "mother": "wdt:P25",
    "movement": "wdt:P135",
    "notable work": "wdt:P800",
    "occupant": "wdt:P466",
    "owner of": "wdt:P1830",
    "parent organization": "wdt:P749",
    "part of": "wdt:P361",
    "participant in": "wdt:P1344",
    "significant person": "wdt:P3342",
    "shares border with": "wdt:P47",
    "spouse": "wdt:P26",
    "subclass of": "wdt:P279",
    "time period": "wdt:P2348",
    "twinned administrative body": "wdt:P190",
    "uses": "wdt:P2283",
    "work location": "wdt:P937",
}


def map_relation_to_wikidata(relation):
    """Return the Wikidata property identifier for a relation label.

    Args:
        relation: Relation label such as ``"creator"``. For strings, leading
            and trailing whitespace is ignored during the lookup.

    Returns:
        The ``"wdt:P..."`` identifier, or ``None`` when the label has no entry
        in the table. In the latter case a warning is printed to stdout.
    """
    # Only strings are normalised; any other value is looked up unchanged.
    key = relation.strip() if isinstance(relation, str) else relation
    wikidata_id = _RELATION_TO_WIKIDATA.get(key)
    if wikidata_id is None:
        print(f'WARNING: relation not mapped to wikidata "{relation}"')
    return wikidata_id


def _strip_one_delimiter(text, opening, closing):
    """Trim whitespace and at most one leading ``opening`` / trailing ``closing``."""
    text = text.strip()
    if text.startswith(opening):
        text = text[1:]
    if text.endswith(closing):
        text = text[:-1]
    return text.strip()


def _parse_entity(part, role):
    """Convert a ``mention#label#type`` part into a dict; ``role`` names it in errors."""
    fields = [field.strip() for field in _strip_one_delimiter(part, "(", ")").split("#")]
    if len(fields) < 3:
        raise ValueError(f'Triplet {role} needs mention#label#type, got "{part}"')
    # Only the first three fields are used; any further "#" fields are ignored.
    return {"mention": fields[0], "label": fields[1], "type": fields[2]}


def parse_string(s):
    """Parse one serialized triplet into a nested dictionary.

    The accepted form is ``[(mention#label#type)|relation|(mention#label#type)]``.
    The square brackets and parentheses are optional, and whitespace around
    the triplet, its three parts and the individual fields is ignored. Only a
    single enclosing delimiter is removed on each side, so parentheses that
    are part of a value (for example ``film (2000)``) are preserved.

    Args:
        s: The serialized triplet.

    Returns:
        A dict with the keys ``"subject"``, ``"relation"`` and ``"object"``.
        Subject and object hold ``"mention"``, ``"label"`` and ``"type"``;
        the relation holds ``"label"`` and ``"wikidata_id"`` (``None`` when
        the label is not known to ``map_relation_to_wikidata``).

    Raises:
        ValueError: If the string does not consist of exactly three
            ``|``-separated parts, or if the subject or object has fewer than
            three ``#``-separated fields.
    """
    parts = _strip_one_delimiter(s, "[", "]").split("|")
    if len(parts) < 3:
        raise ValueError('Triplet incomplete')
    if len(parts) > 3:
        # Extra parts cannot be assigned to a role unambiguously.
        raise ValueError(f'Triplet has {len(parts)} "|"-separated parts, expected 3')

    subject_part, relation_part, object_part = parts
    subject = _parse_entity(subject_part, "subject")
    # The relation label is whatever precedes the first "#", if there is one.
    relation_label = _strip_one_delimiter(relation_part, "(", ")").split("#")[0].strip()
    relation = {"label": relation_label,
                "wikidata_id": map_relation_to_wikidata(relation_label)}
    obj = _parse_entity(object_part, "object")

    return {"subject": subject, "relation": relation, "object": obj}

def convert_to_triplets(knowgl_outputs):
    """Parse a decoded model output string into a list of triplet dicts.

    The string is split on ``"$"`` and every non-blank segment is passed to
    ``parse_string``. A segment that fails to parse is reported on stdout and
    skipped, so one malformed triplet does not discard the others.

    Args:
        knowgl_outputs: Decoded output text containing zero or more
            ``$``-separated serialized triplets.

    Returns:
        The successfully parsed triplets, in their original order.
    """
    results = []
    for segment in knowgl_outputs.split("$"):
        # An empty output or a trailing "$" produces blank segments. They hold
        # no triplet, so skip them instead of reporting a parse failure.
        if not segment.strip():
            continue
        try:
            results.append(parse_string(segment))
        except Exception as e:
            print(f'Exception {e} when parsing:', segment)

    return results

In [None]:
##################### Change something here
text = "The Mona Lisa is a half-length portrait painting by Italian artist Leonardo da Vinci."
##################### Change something here

# Truncate the encoded input to the model's positional limit so that an
# over-long text is shortened rather than passed to the model at full length.
# If the config has no such attribute, max_length=None lets the tokenizer
# apply its own default maximum.
max_input_tokens = getattr(model.config, "max_position_embeddings", None)
inputs = tokenizer(
    text, return_tensors="pt", truncation=True, max_length=max_input_tokens
).to(device)
num_beams = 15
output = model.generate(**inputs, max_length=1000, num_beams=num_beams)

# Decode the first returned sequence and drop special tokens before parsing.
decoded_output = tokenizer.decode(output[0].to("cpu"), skip_special_tokens=True)

triplets = convert_to_triplets(decoded_output)
print(json.dumps(triplets, indent=2))

# Load some Digital Humanities (DH) resources

In [None]:
import requests

##################### Change something here
#master = "https://raw.githubusercontent.com/TIBHannover/ReflectAI-DHd2025/refs/heads/main/data/Man_with_a_Book.txt"
#master = "https://raw.githubusercontent.com/TIBHannover/ReflectAI-DHd2025/refs/heads/main/data/Hunter_Getting_Dress.txt"
#master = "https://raw.githubusercontent.com/TIBHannover/ReflectAI-DHd2025/refs/heads/main/data/Conversion_of_Mary_M.txt"
#master = "https://raw.githubusercontent.com/TIBHannover/ReflectAI-DHd2025/refs/heads/main/data/Perseus_und_Andromed.txt"
master = "https://raw.githubusercontent.com/TIBHannover/ReflectAI-DHd2025/refs/heads/main/data/Portrait_of_Laura_Di.txt"
##################### Change something here

# Bound the wait with a timeout and fail on HTTP error statuses, so that an
# error page is never passed on to the model as if it were the document text.
req = requests.get(master, timeout=30)
req.raise_for_status()
text = req.text
print(text)

In [None]:
# The downloaded document can be arbitrarily long. Truncate the encoded input
# to the model's positional limit so that an over-long text is shortened
# rather than passed to the model at full length. If the config has no such
# attribute, max_length=None lets the tokenizer apply its own default maximum.
max_input_tokens = getattr(model.config, "max_position_embeddings", None)
inputs = tokenizer(
    text, return_tensors="pt", truncation=True, max_length=max_input_tokens
).to(device)
num_beams = 15
output = model.generate(**inputs, max_length=1000, num_beams=num_beams)

# Decode the first returned sequence and drop special tokens before parsing.
decoded_output = tokenizer.decode(output[0].to("cpu"), skip_special_tokens=True)

triplets = convert_to_triplets(decoded_output)

In [None]:
# Show the triplets parsed in the previous cell as indented JSON.
print(json.dumps(triplets, indent=2))