In [12]:

from pathlib import Path
import sqlite3
import subprocess
import h5py

# Root folder holding the databases and HDF5 files read below. It is resolved
# two levels above the kernel's current working directory, so the result
# depends on where the notebook is launched from.
BASE = Path.cwd().parent.parent / 'WikiData.nosync'


### Query wikidata_labeled_wo.db

Connects to the labeled database and prints every row of `properties_labeled` whose QID is Q42 (one row per property–value pair).

In [3]:

path = BASE / 'wikidata_labeled_wo.db'
# sqlite3's connection context manager only commits or rolls back the current
# transaction; it does not close the connection. Close it explicitly so the
# database handle is released when the cell finishes.
conn = sqlite3.connect(path)
try:
    cur = conn.cursor()
    # Bind the QID as a parameter instead of splicing it into the SQL text.
    cur.execute("SELECT * FROM properties_labeled WHERE qid = ?;", ('Q42',))
    rows = cur.fetchall()
finally:
    conn.close()

# Print one tuple per matching row.
for row in rows:
    print(row)


('Q42', 'Douglas Adams', 'about', None, 'Q42', 'Douglas Adams')
('Q42', 'Douglas Adams', 'P31', 'instance of', 'Q5', 'human')
('Q42', 'Douglas Adams', 'P21', 'sex or gender', 'Q6581097', 'male')
('Q42', 'Douglas Adams', 'P106', 'occupation', 'Q214917', 'playwright')
('Q42', 'Douglas Adams', 'P106', 'occupation', 'Q28389', 'screenwriter')
('Q42', 'Douglas Adams', 'P106', 'occupation', 'Q6625963', 'novelist')
('Q42', 'Douglas Adams', 'P106', 'occupation', 'Q4853732', "children's writer")
('Q42', 'Douglas Adams', 'P106', 'occupation', 'Q18844224', 'science fiction writer')
('Q42', 'Douglas Adams', 'P106', 'occupation', 'Q245068', 'comedian')
('Q42', 'Douglas Adams', 'P106', 'occupation', 'Q36180', 'writer')
('Q42', 'Douglas Adams', 'P106', 'occupation', 'Q639669', 'musician')
('Q42', 'Douglas Adams', 'P800', 'notable work', 'Q25169', "The Hitchhiker's Guide to the Galaxy pentalogy")
('Q42', 'Douglas Adams', 'P800', 'notable work', 'Q20736364', 'Dirk Gently series')
('Q42', 'Douglas Adams'

### Query qid_texts_wo_m_clean.db

Looks up the entry for Q42 in the cleaned text database.

In [8]:
path = BASE / 'qid_texts_wo_m_clean.db'
# The sqlite3 connection context manager only commits/rolls back; it leaves the
# connection open. Close it explicitly once the rows have been fetched.
conn = sqlite3.connect(path)
try:
    cur = conn.cursor()
    # Only fetch the two columns that are displayed.
    cur.execute("SELECT qid, text FROM texts WHERE qid = ?;", ('Q42',))
    rows = cur.fetchall()
finally:
    conn.close()

for qid, raw_text in rows:
    # The stored text may contain the literal two-character sequence
    # backslash + 'n'; turn those into real newlines. A NULL text column is
    # shown as an empty body instead of raising AttributeError.
    text = (raw_text or '').replace('\\n', '\n')

    print(f"--- QID: {qid} ---\n")
    # Printing the text in one call emits the same lines as printing each
    # newline-separated piece individually.
    print(text)
    print("\n")   # spacer between records

--- QID: Q42 ---

Douglas Adams
English science fiction writer and humorist (1952–2001)
Attributes include:
about: Douglas Adams
instance of: human
sex or gender: male
occupation: playwright, screenwriter, novelist, children's writer, science fiction writer, comedian, writer, musician
notable work: The Hitchhiker's Guide to the Galaxy pentalogy, Dirk Gently series, The Private Life of Genghis Khan
date of birth: 1952-03-11
place of birth: Cambridge
date of death: 2001-05-11
manner of death: natural causes
cause of death: myocardial infarction
place of death: Santa Barbara
place of burial: Highgate Cemetery
given name: Douglas, Noël
family name: Adams
country of citizenship: United Kingdom
residence: Santa Barbara, London, Brentwood, Cambridge
native language: English
father: Christopher Douglas Adams
mother: Janet Adams
child: Polly Adams
employer: BBC, The Digital Village
nominated for: Hugo Award for Best Dramatic Presentation, Locus Award for Best Science Fiction Novel
topic's main 

### Inspect people_embeddings_death10k.h5

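Uses `h5py` to list the datasets in the file and show the `qids`, `dod`, and `dod_year` values stored at one row index; the following cell prints the embedding vector for that same row.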

In [None]:
path = BASE / 'people_embeddings_death10k.h5'
# Row index to inspect. Defined at cell level (not inside the branch below)
# because the next cell reads `nr` as well.
nr = 9000
# Check that the file exists before trying to open it
if not path.exists():
    print(f"File {path} does not exist. Please check the path.")
else:
    with h5py.File(path, "r") as h5:
        print(h5.keys())            # should show 'embeddings', 'qids', 'dod', 'dod_year'
        # Metadata stored at row `nr` of each per-row dataset
        print(h5["qids"][nr], h5["dod"][nr], h5["dod_year"][nr])

<KeysViewHDF5 ['dod', 'dod_year', 'embeddings', 'qids']>
b'Q100882593' b'1903-09-03' 1903.0


In [None]:
# Print the embedding stored at row `nr` (the same index used in the cell
# above); the array repr elides the middle of the vector.
with h5py.File(path, "r") as h5:
    embedding = h5["embeddings"][nr]
print(embedding)

[ 2.3    -2.115   0.682  ...  0.0433 -0.2769  0.0874]


In [None]:

path = BASE / 'people_embeddings_death10k.h5'
# Dump the first entry of each dataset with the `h5dump` command-line tool.
# The datasets in this file are 'embeddings', 'qids', 'dod' and 'dod_year'
# (see the keys printed by the h5py cell above), so the date datasets are
# addressed as /dod and /dod_year.
cmds = [
    ['h5dump', '-d', '/embeddings', '-s', '0,0', '-c', '1,1024', str(path)],
    ['h5dump', '-d', '/qids', '-s', '0', '-c', '1', str(path)],
    ['h5dump', '-d', '/dod', '-s', '0', '-c', '1', str(path)],
    ['h5dump', '-d', '/dod_year', '-s', '0', '-c', '1', str(path)]
]
for cmd in cmds:
    result = subprocess.run(cmd, text=True, capture_output=True)
    print(' '.join(cmd))
    print(result.stdout)
    # stderr is captured above, so without this a failing command would be
    # indistinguishable from an empty dump.
    if result.returncode != 0:
        print(f"h5dump exited with code {result.returncode}:\n{result.stderr}")


h5dump -d /embeddings -s 0,0 -c 1,1024 /Users/Saxe/Desktop/GitHub/Text-Embeddings/WikiData.nosync/people_embeddings_death10k.h5
HDF5 "/Users/Saxe/Desktop/GitHub/Text-Embeddings/WikiData.nosync/people_embeddings_death10k.h5" {
DATASET "/embeddings" {
   DATATYPE  H5T_IEEE_F16LE
   DATASPACE  SIMPLE { ( 10000, 1024 ) / ( 10000, 1024 ) }
   SUBSET {
      START ( 0, 0 );
      STRIDE ( 1, 1 );
      COUNT ( 1, 1024 );
      BLOCK ( 1, 1 );
      DATA {
      (0,0): 1.93164, -1.57812, 0.769043, 1.97559, 0.468506, -0.223999,
      (0,6): -2.33789, 0.942383, 0.249634, 0.467773, 1.69336, 2.07812,
      (0,12): -0.906738, -0.687988, -0.821777, -2.45312, -0.880859,
      (0,17): -0.737305, 1.59766, 0.535156, 0.896484, 1.22949, -0.851074,
      (0,23): 1.48047, -1.07324, -0.64209, 3.375, -0.776367, -1.39648,
      (0,29): 0.671875, 1.4209, -0.993164, 1.29297, -0.820801, -0.00688171,
      (0,35): 1.07031, 1.13477, -0.797852, -2.00195, 0.145874, 1.14258,
      (0,41): 1.18555, 0.498535, -1.74316,

In [26]:
with h5py.File(path, "r") as h5:
    qid_dataset = h5["qids"]
    # First 20 QIDs, in stored order
    qids = qid_dataset[:20]
    print("First 20 QIDs: ", qids)
    # Last 20 QIDs, in stored order
    qids = qid_dataset[-20:]
    print("Last 20 QIDs: ", qids)

First 20 QIDs:  [b'Q10000001' b'Q1000002' b'Q1000005' b'Q100000832' b'Q100000854'
 b'Q100000857' b'Q100000889' b'Q10000102' b'Q100001260' b'Q1000023'
 b'Q1000026' b'Q1000034' b'Q1000044' b'Q1000045' b'Q1000048' b'Q100005'
 b'Q1000051' b'Q1000070' b'Q1000079' b'Q1000085']
Last 20 QIDs:  [b'Q100955565' b'Q100955595' b'Q100955790' b'Q100955916' b'Q100955957'
 b'Q100956' b'Q100956016' b'Q100956057' b'Q100956091' b'Q100956112'
 b'Q100956209' b'Q100956250' b'Q100956262' b'Q100956279' b'Q100956304'
 b'Q100956317' b'Q100956358' b'Q100956364' b'Q100956370' b'Q100956391']
