In [1]:
import sys
import os

# Add project root to sys.path so `src` works.
# The membership check keeps the cell idempotent: re-running it in a live kernel
# does not keep appending duplicate entries to sys.path.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

Scan the first 1,000,000 triples and pick out all the humans.

In [2]:
from src import parser

file_path = "../data.nosync/latest-truthy.nt.bz2"

# Scan only the first 1 million triples (fast test)
# NOTE(review): assumes `max_count` caps the number of triples scanned rather than
# the number of humans returned — confirm against parser.extract_human_qids.
# `human_qids` is reused by the fact-collection cell further down.
human_qids = parser.extract_human_qids(file_path, max_count=1_000_000)

# Report how many humans were found and show up to five sample QIDs.
print(f"✅ Found {len(human_qids)} human entities")
print("🧍 Example QIDs:", list(human_qids)[:5])

✅ Found 13434 human entities
🧍 Example QIDs: ['Q322228', 'Q29196', 'Q212436', 'Q139475', 'Q276404']


Dictionary of human facts

In [3]:
from collections import defaultdict
from tqdm import tqdm
import json

def collect_facts_for_humans(file_path, human_qids, max_lines=None, save_path=None):
    """
    Scan an .nt.bz2 dump and collect every triple whose subject is a known human.

    Args:
        file_path: Path passed to ``parser.stream_triples``.
        human_qids: Collection of QIDs to keep. Any iterable works; anything
            that is not already a set/frozenset/dict is copied into a set once
            so the per-triple membership test is a hash lookup.
        max_lines: Maximum number of triples to process. ``None`` (default)
            processes the whole stream; ``0`` processes nothing.
        save_path: If given, the result is also written there as indented JSON
            (UTF-8 encoded file).

    Returns:
        Nested mapping ``{QID: {property: [values]}}``
        (a defaultdict of defaultdict(list)).
    """
    # A list here would make every `in` check a linear scan over all humans.
    if not isinstance(human_qids, (set, frozenset, dict)):
        human_qids = set(human_qids)

    facts = defaultdict(lambda: defaultdict(list))
    line_gen = parser.stream_triples(file_path)

    for i, (subj, pred, obj) in enumerate(tqdm(line_gen, desc="🔍 Parsing triples")):
        # Check the limit BEFORE processing so exactly `max_lines` triples are
        # handled (checking afterwards let one extra triple through), and compare
        # against None so that max_lines=0 is honoured instead of meaning "no limit".
        if max_lines is not None and i >= max_lines:
            break

        subj_qid = parser.get_qid(subj)
        if subj_qid in human_qids:
            pid = parser.get_qid(pred)
            # Objects containing "/entity/" are reduced via get_qid; anything
            # else (literals, external URLs) is stored unchanged.
            val = parser.get_qid(obj) if "/entity/" in obj else obj
            facts[subj_qid][pid].append(val)

    # Optional save
    if save_path:
        print(f"💾 Saving to {save_path}...")
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(facts, f, indent=2)

    return facts


In [4]:
# Input dump and the JSON file the collected facts are written to.
file_path = "../data.nosync/latest-truthy.nt.bz2"
save_file = "../data.nosync/human_facts.json"

# Collect facts for the humans found earlier, reading a capped number of triples
# from the dump and saving the result to `save_file`.
human_facts = collect_facts_for_humans(
    file_path=file_path,
    human_qids=human_qids,
    max_lines=2_000_000,
    save_path=save_file
)

# Spot-check: take the first entity in the result and look at its facts.
qid = list(human_facts.keys())[0]
sample = human_facts[qid]

print(f"📌 Entity: {qid}")
for prop, values in list(sample.items())[:5]:  # Show only first 5 properties
    print(f"  - {prop}: {values}")


🔍 Parsing triples: 2000000it [00:33, 60059.22it/s]


💾 Saving to ../data.nosync/human_facts.json...
📌 Entity: Q23
  - 22-rdf-syntax-ns#type: ['http://schema.org/Dataset', 'http://wikiba.se/ontology#Item']
  - about: ['Q23']
  - P509: ['Q3827083']
  - P20: ['Q731635']
  - P26: ['Q191789']


Text generation

In [5]:
# `load_labels` is not used in this cell; a later cell calls it without importing
# it itself, so the import is kept here.
from src.text_builder import build_text_representation, load_labels

# Load saved human facts.
# The file is written with encoding="utf-8", so read it back the same way
# instead of relying on the platform's default encoding.
import json
with open("../data.nosync/human_facts.json", "r", encoding="utf-8") as f:
    human_facts = json.load(f)

# Pick a QID to test (the first entity in the saved facts) and render it
# without a label map, so raw QIDs / property IDs appear in the text.
qid = list(human_facts.keys())[0]
text = build_text_representation(qid, human_facts[qid])

print(text)


Q23 has the following attributes:
- 22-rdf-syntax-ns#type: http://schema.org/Dataset, http://wikiba.se/ontology#Item
- about: Q23
- P509: Q3827083
- P20: Q731635
- P26: Q191789
- P109: http://commons.wikimedia.org/wiki/Special:FilePath/George%20Washington%20signature.svg
- P22: Q768342
- P25: Q458119
- P27: Q161885, Q30
- P106: Q82955, Q131512, Q1734662, Q21772571, Q81096, Q372436, Q3242115, Q36180, Q38239859
- P237: Q5138446
- P94: http://commons.wikimedia.org/wiki/Special:FilePath/George%20Washington%20Arms.svg
- P18: http://commons.wikimedia.org/wiki/Special:FilePath/Gilbert%20Stuart%20Williamstown%20Portrait%20of%20George%20Washington.jpg
- P227: https://d-nb.info/gnd/11876439X
- P244: http://id.loc.gov/authorities/names/n86140996
- P214: http://viaf.org/viaf/31432428
- P166: Q3519573, Q721743, Q52382875, Q721743
- P119: Q56546631
- P39: Q11696, Q1115127, Q1115127, Q140686, Q1467287, Q88965329, Q127639560, Q20065408
- P410: Q3280545, Q2046665, Q3100539, Q104680
- P349: http://id.nd

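Don't run the cell below every time! It scans the whole dump to build the label map (the recorded run took more than 28 minutes before it was interrupted). Once the cache file exists, load the label map from the cache with the cell after it instead.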

In [6]:
# Builds the label map from the dump and saves it to a JSON cache file.
# Long-running: no max_lines is passed, so presumably the whole dump is scanned.

from src.text_builder import load_labels

bz2_path = "../data.nosync/latest-truthy.nt.bz2"
label_cache = "../data.nosync/label_map_full.json"

# In the recorded run, save_every=100_000_000 produced a checkpoint of the labels
# collected so far in `label_cache` every 100,000,000 lines.
label_map = load_labels(bz2_path, save_path=label_cache, save_every=100_000_000)  # No max_lines!


🔤 Loading labels: 100093011it [02:50, 278269.96it/s]

💾 [Checkpoint] Saved 834,699 labels to ../data.nosync/label_map_full.json at line 100,000,000


🔤 Loading labels: 200066689it [05:46, 202241.45it/s]

💾 [Checkpoint] Saved 1,554,766 labels to ../data.nosync/label_map_full.json at line 200,000,000


🔤 Loading labels: 300081327it [08:54, 120444.97it/s]

💾 [Checkpoint] Saved 2,502,445 labels to ../data.nosync/label_map_full.json at line 300,000,000


🔤 Loading labels: 400049604it [12:04, 67757.77it/s] 

💾 [Checkpoint] Saved 3,294,261 labels to ../data.nosync/label_map_full.json at line 400,000,000


🔤 Loading labels: 500064401it [15:23, 62608.19it/s] 

💾 [Checkpoint] Saved 4,336,525 labels to ../data.nosync/label_map_full.json at line 500,000,000


🔤 Loading labels: 600084322it [18:32, 65470.01it/s] 

💾 [Checkpoint] Saved 5,743,923 labels to ../data.nosync/label_map_full.json at line 600,000,000


🔤 Loading labels: 700059328it [21:55, 41430.53it/s] 

💾 [Checkpoint] Saved 7,351,504 labels to ../data.nosync/label_map_full.json at line 700,000,000


🔤 Loading labels: 800062016it [25:14, 39919.08it/s] 

💾 [Checkpoint] Saved 8,458,756 labels to ../data.nosync/label_map_full.json at line 800,000,000


🔤 Loading labels: 900053015it [28:05, 37925.11it/s] 

💾 [Checkpoint] Saved 9,511,597 labels to ../data.nosync/label_map_full.json at line 900,000,000


🔤 Loading labels: 918893437it [28:41, 533633.77it/s]


KeyboardInterrupt: 

In [None]:
# Loads the label map from the cache
# (same path the label-building cell saves to, so the dump is not rescanned).

from src.text_builder import load_labels_from_cache

label_map = load_labels_from_cache("../data.nosync/label_map_full.json")
print(f"✅ Loaded {len(label_map)} labels from cache")

In [None]:
# Load labels from the first 2,000,000 lines of the dump only (quick test).
# Relies on `load_labels` / `build_text_representation` having been imported by
# an earlier cell.
# NOTE(review): this rebinds `label_map`, replacing any map loaded from the cache
# by a previous cell — confirm that is intended.
label_file = "../data.nosync/latest-truthy.nt.bz2"
label_map = load_labels(label_file, max_lines=2_000_000)

# Load facts.
# The file is written with encoding="utf-8", so read it back the same way
# instead of relying on the platform's default encoding.
import json
with open("../data.nosync/human_facts.json", "r", encoding="utf-8") as f:
    human_facts = json.load(f)

# Build human-readable text for one QID (the first entity in the saved facts),
# this time passing the label map to build_text_representation.
qid = list(human_facts.keys())[0]
text = build_text_representation(qid, human_facts[qid], label_map)

print(text)

🔤 Loading labels: 1999999it [00:03, 586403.03it/s]


George Washington has the following attributes:
- 22-rdf-syntax-ns#type: http://schema.org/Dataset, http://wikiba.se/ontology#Item
- about: George Washington
- P509: Q3827083
- P20: Q731635
- P26: Q191789
- P109: http://commons.wikimedia.org/wiki/Special:FilePath/George%20Washington%20signature.svg
- P22: Q768342
- P25: Q458119
- P27: Q161885, Q30
- P106: Q82955, Q131512, Q1734662, Q21772571, Q81096, Q372436, Q3242115, Q36180, Q38239859
- P237: Q5138446
- P94: http://commons.wikimedia.org/wiki/Special:FilePath/George%20Washington%20Arms.svg
- P18: http://commons.wikimedia.org/wiki/Special:FilePath/Gilbert%20Stuart%20Williamstown%20Portrait%20of%20George%20Washington.jpg
- P227: https://d-nb.info/gnd/11876439X
- P244: http://id.loc.gov/authorities/names/n86140996
- P214: http://viaf.org/viaf/31432428
- P166: Q3519573, Q721743, Q52382875, Q721743
- P119: Q56546631
- P39: Q11696, Q1115127, Q1115127, Q140686, Q1467287, Q88965329, Q127639560, Q20065408
- P410: Q3280545, Q2046665, Q3100539, 