In [13]:
import json
from langchain_community.graphs.age_graph import AGEGraph
from dataclasses import dataclass
import tqdm
from multiprocessing import Pool, get_context
import os


database = {
    "database": "postgres",
    "user": "postgres",
    "password": "password",
    "host": "localhost",
    "port": "5432"
}

In [14]:
AGEGraph.refresh_schema = lambda self: None

graph = AGEGraph(graph_name="gnd", conf=database, create=True)

In [15]:
@dataclass
class Subject:
    code: str
    name: str
    classification_name: str
    alternative_names: list[str]
    related: list[str]

In [16]:
import psycopg2
conn: psycopg2.extensions.connection = graph.connection

with conn.cursor() as cursor:
    cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_gnd_edge_end_id ON gnd._ag_label_edge USING btree (end_id);
    """)
    cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_gnd_edge_start_end ON gnd._ag_label_edge USING btree (start_id, end_id);
    """)
    cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_gnd_edge_start_id ON gnd._ag_label_edge USING btree (start_id);
    """)

conn.commit()

In [17]:
os.getpid()

365526

In [18]:
with open("llms4subjects/shared-task-datasets/GND/dataset/GND-Subjects-all.json", "r") as gnd_subjects_file:
    gnd_subjects = json.load(gnd_subjects_file)

subjects = [
    Subject(
        code=subject["Code"].replace("\"", "'"),
        name=subject["Name"].replace("\"", "'"),
        classification_name=subject["Classification Name"].replace("\"", "'"),
        alternative_names=[s.replace("\"", "'") for s in subject["Alternate Name"]],
        related=[s.replace("\"", "'") for s in subject["Related Subjects"]]
    )
    for subject in gnd_subjects
]

In [19]:

conn.commit()
print("Merging data into graph")

graph.query("""
    MERGE (s:Subject {code: "root", classification_name: "root"})
""")

with conn.cursor() as cursor:
    cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_subject_id ON gnd."Subject" USING btree (id);
    """)
    cursor.execute("""
        CREATE INDEX IF NOT EXISTS subject_idx ON gnd."Subject" USING gin (properties);
    """)

conn.commit()

local_graphs = {}
def add_subject(subject: Subject):
    local_graph = local_graphs.get(os.getpid())
    if local_graph is None:
        local_graph = AGEGraph(graph_name="gnd", conf=database, create=False)
        local_graphs[os.getpid()] = local_graph
    local_graph.query(f"""
        MERGE (s:Subject {{code: "{subject.code}", classification_name: "{subject.classification_name}"}})
    """)

with Pool(8) as pool:
    results = pool.imap_unordered(add_subject, subjects)
    tqdm_bar = tqdm.tqdm(total=len(subjects))
    while True:
        try:
            result = results.next(timeout=1000)
            tqdm_bar.update()
        except StopIteration:
            break
        except Exception as e:
            break
    for g in local_graphs.values():
        g.connection.commit()
        g.connection.close()
    tqdm_bar.close()
    pool.close()
conn.commit()

Merging data into graph


100%|██████████| 204739/204739 [04:18<00:00, 793.08it/s] 


In [20]:
graph.connection.commit()

graph.query("""
    MERGE (a:AlternativeName {name: "root"})
""")
graph.query(
    f"""
    MATCH (s:Subject {{code: "root"}}), (a:AlternativeName {{name: "root"}})
        MERGE (s)-[:ALTERNATIVE_NAME]->(a)
    """
)

with conn.cursor() as cursor:
    cursor.execute("""
        CREATE INDEX IF NOT EXISTS alternativename_idx ON gnd."AlternativeName" USING gin (properties)
    """)
    cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_alternative_name_id ON gnd."AlternativeName" USING btree (id);
    """)
    cursor.execute("""
        CREATE INDEX IF NOT EXISTS age_default_alias_start_id_idx ON gnd."ALTERNATIVE_NAME" (start_id);
    """)
    cursor.execute("""
        CREATE INDEX IF NOT EXISTS age_default_alias_end_id_idx ON gnd."ALTERNATIVE_NAME" (end_id);
    """)

conn.commit()

local_graphs = {}
def add_alternative_names(subject: Subject):
    local_graph = local_graphs.get(os.getpid())
    if local_graph is None:
        local_graph = AGEGraph(graph_name="gnd", conf=database, create=False)
        local_graphs[os.getpid()] = local_graph
    for alternative_name in [*subject.alternative_names, subject.name]:
        if "return" in alternative_name.lower():
            print(f"Skipping {alternative_name} due to weird bug")
            continue
        local_graph.query(
            f"""
            MERGE (a:AlternativeName {{name: "{alternative_name}"}})
            """
        )
        local_graph.query(
            f"""
            MATCH (s:Subject {{code: "{subject.code}"}}), (a:AlternativeName {{name: "{alternative_name}"}})
                MERGE (s)-[:ALTERNATIVE_NAME]->(a)
            """
        )

with Pool(8) as pool:
    results = pool.imap_unordered(add_alternative_names, subjects)
    tqdm_bar2 = tqdm.tqdm(total=len(subjects))
    while True:
        try:
            result = results.next(timeout=1000)
            tqdm_bar2.update()
        except StopIteration:
            break
        except Exception as e:
            print(e)
            break
    for g in local_graphs.values():
        g.connection.commit()
        g.connection.close()
    tqdm_bar2.close()
    pool.close()

 20%|██        | 41059/204739 [02:58<20:48, 131.08it/s]

Skipping Return on Capital Employed due to weird bug
Skipping Investition,Return on Investment due to weird bug
Skipping Return on Investment due to weird bug


 20%|██        | 41316/204739 [03:00<17:14, 157.93it/s]

Skipping Social Return on Investment due to weird bug


 21%|██        | 42422/204739 [03:06<09:12, 293.82it/s]

Skipping Cashflow Return on Investment due to weird bug


 22%|██▏       | 44578/204739 [03:14<11:39, 229.11it/s]

Skipping Return to education due to weird bug
Skipping Return to schooling due to weird bug


 22%|██▏       | 44799/204739 [03:15<11:29, 231.97it/s]

Skipping Returns to scale due to weird bug


 25%|██▌       | 51203/204739 [03:46<18:51, 135.67it/s]

Skipping Absolute-Return-Strategie due to weird bug
Skipping Absolute-Return-Konzept due to weird bug
Skipping Absolute-Return-Ansatz due to weird bug
Skipping Absolute Return due to weird bug


 75%|███████▍  | 153288/204739 [13:47<07:48, 109.93it/s]

Skipping Return of Spontaneous Circulation due to weird bug


 99%|█████████▉| 203388/204739 [19:59<00:10, 130.51it/s]

Skipping Return due to weird bug


100%|██████████| 204739/204739 [20:16<00:00, 168.30it/s]


In [12]:
for subject in tqdm.tqdm(subjects):
    if "return" in subject.name.lower():
        print(f"Skipping {subject.name} due to weird bug")
        continue
    for related_subject in subject.related:
        if "return" in related_subject.lower():
            print(f"Skipping {related_subject} due to weird bug")
            continue
        graph.query(
            f"""
            MATCH (s1:Subject {{code: "{subject.code}"}}), (s2:Subject)-[:ALTERNATIVE_NAME]->(a:AlternativeName {{name: "{related_subject}"}})
                MERGE (s1)-[:RELATED]->(s2)
            """
        )

graph.connection.commit()

 20%|██        | 41077/204739 [21:18<1:30:57, 29.99it/s]

Skipping Return on Investment due to weird bug


 20%|██        | 41329/204739 [21:28<1:56:33, 23.37it/s]

Skipping Return on Investment due to weird bug


 21%|██        | 42440/204739 [22:24<2:08:49, 21.00it/s]

Skipping Cashflow Return on Investment due to weird bug


 25%|██▌       | 51214/204739 [30:27<2:51:27, 14.92it/s]

Skipping Absolute Return due to weird bug


 35%|███▌      | 71879/204739 [55:04<15:14, 145.22it/s] 