In [2]:
import sys
import os

# Make the project root (the notebook's parent directory) importable so that
# `from src import ...` works. The membership check keeps re-runs of this cell
# from appending the same entry to sys.path again.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

Scan the first 100,000,000 lines of the dump and pick out all the humans.

In [2]:
from src import parser

file_path = "../data.nosync/latest-truthy.nt.bz2"

# Fast test: cap the scan with max_count=100_000_000 instead of reading the whole dump.
# NOTE(review): presumably max_count limits the number of triples scanned — confirm in src/parser.py.
human_qids = parser.extract_human_qids(file_path, max_count=100_000_000)

print(f"✅ Found {len(human_qids)} human entities")
print("🧍 Example QIDs:", list(human_qids)[:5])

✅ Found 629673 human entities
🧍 Example QIDs: ['Q52626843', 'Q6052353', 'Q13121768', 'Q57414255', 'Q63284286']


Dictionary of human facts

In [3]:
from collections import defaultdict
from tqdm import tqdm
import json

def collect_facts_for_humans(file_path, human_qids, max_lines=None, save_path=None):
    """
    Scan the .nt.bz2 dump and collect every triple whose subject is a known human.

    Args:
        file_path: Path to the dump; passed straight to ``parser.stream_triples``.
        human_qids: Collection of QIDs to keep. A list or tuple is converted to a
            frozenset once so the per-triple membership test does not scan it.
        max_lines: Maximum number of triples to examine. ``None`` (or 0) means
            no limit.
        save_path: If given, the result is also written there as UTF-8 JSON.

    Returns:
        dict {QID: {property: [values]}} (built from nested defaultdicts).
        Objects containing "/entity/" are reduced with ``parser.get_qid``;
        any other object is stored unchanged.
    """
    # `x in list` is a linear scan; with hundreds of thousands of QIDs that would
    # dominate the runtime of the loop below.
    if isinstance(human_qids, (list, tuple)):
        human_qids = frozenset(human_qids)

    facts = defaultdict(lambda: defaultdict(list))
    line_gen = parser.stream_triples(file_path)

    for i, (subj, pred, obj) in enumerate(tqdm(line_gen, desc="🔍 Parsing triples")):
        # Test the limit before handling the triple. Checking afterwards (as the
        # previous version did) processed max_lines + 1 triples.
        if max_lines and i >= max_lines:
            break

        subj_qid = parser.get_qid(subj)
        if subj_qid not in human_qids:
            continue

        pid = parser.get_qid(pred)
        # Entity objects are shortened to their ID; everything else is kept as-is.
        val = parser.get_qid(obj) if "/entity/" in obj else obj
        facts[subj_qid][pid].append(val)

    # Optional save
    if save_path:
        print(f"💾 Saving to {save_path}...")
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(facts, f, indent=2)

    return facts


In [None]:
file_path = "../data.nosync/latest-truthy.nt.bz2"
save_file = "../data.nosync/human_facts.json"

# Long-running cell: the recorded run needed about 19 minutes for 100M triples.
human_facts = collect_facts_for_humans(
    file_path=file_path,
    human_qids=human_qids,
    max_lines=100_000_000,
    save_path=save_file
)

# Sanity check on one entity. next(iter(...)) takes the first key without copying
# every key into a list, and the default avoids an IndexError when nothing matched.
qid = next(iter(human_facts), None)
if qid is None:
    print("⚠️ No human facts were collected")
else:
    sample = human_facts[qid]

    print(f"📌 Entity: {qid}")
    for prop, values in list(sample.items())[:5]:  # Show only first 5 properties
        print(f"  - {prop}: {values}")


🔍 Parsing triples: 100000000it [19:24, 85885.38it/s]


💾 Saving to ../data.nosync/human_facts.json...


Text generation

In [8]:
from src.text_builder import build_text_representation, load_labels

# Load the facts saved by collect_facts_for_humans. That file is written with
# encoding="utf-8", so read it back with the same explicit encoding instead of
# relying on the platform default.
import json
with open("../data.nosync/human_facts.json", "r", encoding="utf-8") as f:
    human_facts = json.load(f)

# Pick a QID to test: the second key in the file.
qid = list(human_facts)[1]
text = build_text_representation(qid, human_facts[qid])

print(text)


Q42 has the following attributes:
- 22-rdf-syntax-ns#type: http://schema.org/Dataset, http://wikiba.se/ontology#Item
- about: Q42
- P31: Q5
- P21: Q6581097
- P106: Q214917, Q28389, Q6625963, Q4853732, Q18844224, Q245068, Q36180, Q639669
- P800: Q25169, Q20736364, Q7758404
- P19: Q350
- P1196: Q3739104
- P509: Q12152
- P20: Q159288
- P119: Q533697
- P1442: http://commons.wikimedia.org/wiki/Special:FilePath/Douglas%20Adams%27%20gravestone.jpg
- P1015: https://livedata.bibsys.no/authority/90196888
- P735: Q463035, Q19688263
- P734: Q351735
- P27: Q145
- P551: Q159288, Q84, Q909993, Q350
- P103: Q1860
- P244: http://id.loc.gov/authorities/names/n80076765
- P214: http://viaf.org/viaf/113230702
- P349: http://id.ndl.go.jp/auth/ndlna/00430962
- P434: http://musicbrainz.org/artist/e9ed318d-8cc5-4cf8-ab77-505e39ab6ea4
- P268: http://data.bnf.fr/ark:/12148/cb11888092r#about
- P227: https://d-nb.info/gnd/119033364
- P22: Q14623675
- P25: Q14623678
- P40: Q14623683
- P906: http://libris.kb.se/reso

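DON'T RUN THE CELL BELOW EVERY TIME! It rescans the entire dump (almost 4 hours in the recorded run). Once the label cache exists, load it with the cell after it instead.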

In [2]:
import json
import os
from src.text_builder import load_labels_from_cache
from src.text_builder2 import extract_relevant_ids, load_labels_for_ids

# Load facts (written as UTF-8, so read with the same explicit encoding)
with open("../data.nosync/human_facts.json", "r", encoding="utf-8") as f:
    human_facts = json.load(f)

# Extract only the relevant QIDs and PIDs
relevant_ids = extract_relevant_ids(human_facts)

bz2_path = "../data.nosync/latest-truthy.nt.bz2"
label_cache = "../data.nosync/label_map_filtered.json"

# The label scan streams the whole dump (close to four hours in the recorded run),
# so reuse the cache file when it already exists. Delete the cache file to force a
# rebuild, e.g. after human_facts.json has been regenerated.
if os.path.exists(label_cache):
    label_map = load_labels_from_cache(label_cache)
else:
    label_map = load_labels_for_ids(bz2_path, relevant_ids, save_path=label_cache)


🔤 Loading selected labels: 7934396038it [3:53:32, 566237.46it/s]


💾 Saved 1,282,195 selected labels to ../data.nosync/label_map_filtered.json


In [3]:
# Load the label map from the cache file that load_labels_for_ids wrote via
# save_path, so the full-dump label scan does not have to be repeated.

from src.text_builder import load_labels_from_cache

label_map = load_labels_from_cache("../data.nosync/label_map_filtered.json")
print(f"✅ Loaded {len(label_map)} labels from cache")

✅ Loaded 1282195 labels from cache


In [9]:
# Build a readable description for one entity using the already-loaded labels.

from src.text_builder2 import build_text_representation, load_labels_for_ids

label_file = "../data.nosync/latest-truthy.nt.bz2"
# `label_map` is not rebuilt here: it must already be defined, either by the cell
# that calls load_labels_from_cache or by the full label scan.

# Load facts (written as UTF-8, so read with the same explicit encoding)
import json
with open("../data.nosync/human_facts.json", "r", encoding="utf-8") as f:
    human_facts = json.load(f)

# Build human-readable text for the first QID in the file; next(iter(...)) avoids
# copying every key into a list just to take one.
qid = next(iter(human_facts))
text = build_text_representation(qid, human_facts[qid], label_map)

print(text)

George Washington about George Washington, cause of death acute laryngitis, died in Mount Vernon, was married to Martha Washington, had a father named Augustine Washington, had a mother named Mary Ball Washington, was a citizen of Kingdom of Great Britain, United States, worked as politician, farmer, cartographer, geometer, engineer, statesperson, revolutionary, writer, army officer, coat of arms Coat of arms of George Washington, received the award Thanks of Congress, Congressional Gold Medal, Fellow of the American Academy of Arts and Sciences, Congressional Gold Medal, place of burial Washington's Tomb, held the position of President of the United States, Commanding General of the United States Army, Commanding General of the United States Army, chairperson, President-elect of the United States, member of the Virginia House of Burgesses, Delegate to the United States Constitutional Convention, member of the Virginia House of Delegates, held the military rank of major general, lieute