In [None]:

from pathlib import Path
import sqlite3
import subprocess

# Data directory: `WikiData.nosync`, two levels above the current working directory.
BASE = Path.cwd().parent.parent.joinpath('WikiData.nosync')


### Query wikidata_labeled_wo.db

Connects to the labeled database and prints every row in the `texts` table whose `qid` is `Q42`.

In [None]:

# Print every row of the `texts` table whose qid matches `qid`.
path = BASE / 'wikidata_labeled_wo.db'
qid = 'Q42'

# sqlite3.connect() silently creates an empty database file when the path is
# missing; fail early with a clear error instead of "no such table".
if not path.is_file():
    raise FileNotFoundError(f"SQLite database not found: {path}")

conn = sqlite3.connect(path)
try:
    cur = conn.cursor()
    # Bind the QID as a parameter rather than embedding it in the SQL text.
    cur.execute("SELECT * FROM texts WHERE qid = ?;", (qid,))
    rows = cur.fetchall()
    for row in rows:
        print(row)
finally:
    # `with sqlite3.connect(...)` only commits/rolls back the transaction and
    # never closes the connection, so close it explicitly.
    conn.close()


### Query qid_texts_wo_clean.db

Looks up the entry for Q42 in the cleaned text database.

In [None]:

# Print every row of the `texts` table whose qid matches `qid`.
path = BASE / 'qid_texts_wo_clean.db'
qid = 'Q42'

# sqlite3.connect() silently creates an empty database file when the path is
# missing; fail early with a clear error instead of "no such table".
if not path.is_file():
    raise FileNotFoundError(f"SQLite database not found: {path}")

conn = sqlite3.connect(path)
try:
    cur = conn.cursor()
    # Bind the QID as a parameter rather than embedding it in the SQL text.
    cur.execute("SELECT * FROM texts WHERE qid = ?;", (qid,))
    rows = cur.fetchall()
    for row in rows:
        print(row)
finally:
    # `with sqlite3.connect(...)` only commits/rolls back the transaction and
    # never closes the connection, so close it explicitly.
    conn.close()


### Inspect people_embeddings_test.h5

Uses `h5dump` to display the first 1024 values of the first row of `/embeddings`, together with the first entry of the `/qids`, `/dob`, and `/dob_year` datasets.

In [None]:

# Dump the first entry of each dataset in the HDF5 file with the `h5dump` CLI.
path = BASE / 'people_embeddings_test.h5'

# h5dump options: -d selects the dataset, -s is the start offset and -c the
# element count of the slice to print.
# NOTE(review): 1024 is presumably the embedding width -- confirm against the file.
cmds = [
    ['h5dump', '-d', '/embeddings', '-s', '0,0', '-c', '1,1024', str(path)],
    ['h5dump', '-d', '/qids', '-s', '0', '-c', '1', str(path)],
    ['h5dump', '-d', '/dob', '-s', '0', '-c', '1', str(path)],
    ['h5dump', '-d', '/dob_year', '-s', '0', '-c', '1', str(path)]
]
for cmd in cmds:
    # Echo the command first so it is visible even if launching h5dump raises.
    print(' '.join(cmd))
    result = subprocess.run(cmd, text=True, capture_output=True)
    print(result.stdout)
    # stderr is captured, so a failure would otherwise be invisible.
    if result.returncode != 0:
        print(f"h5dump exited with status {result.returncode}:\n{result.stderr}")
