In [None]:
import json
import logging
import os
import sys
import urllib.parse
from dataclasses import dataclass
from multiprocessing import Pool

from langchain_community.graphs.age_graph import AGEGraph
from tqdm import tqdm
# Send DEBUG-and-above log records to stdout so they show up in cell output.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)


# Connection settings passed to AGEGraph as `conf`.
# The password may be overridden through the PGPASSWORD environment variable
# so real credentials do not have to be stored in the notebook; the fallback
# keeps the previous local default.
database = {
    "dbname": "postgres",
    "host": "localhost",
    "user": "postgres",
    "password": os.environ.get("PGPASSWORD", "password"),
    "port": "5432"
}

try:
    graph = AGEGraph(graph_name="gnd", conf=database, create=True)
    print("AGEGraph instance created successfully.")
except Exception as e:
    print(f"Failed to create AGEGraph instance: {e}")
    # Later cells all use `graph`; re-raise so the failure surfaces here
    # instead of as a NameError further down the notebook.
    raise

In [2]:
from dataclasses import dataclass

@dataclass
class Subject:
    """In-memory form of one record from a GND subjects JSON file."""

    # Taken from the record's "Code" key.
    code: str
    # Taken from the record's "Name" key.
    name: str
    # Taken from the record's "Classification Name" key.
    classification_name: str
    # One entry per element of the record's "Alternate Name" list.
    alternative_names: list[str]
    # One entry per element of the record's "Related Subjects" list.
    related: list[str]

In [3]:
import json, urllib
# Parse the subject records stored in the tib-core JSON file.
with open("llms4subjects/shared-task-datasets/GND/dataset/GND-Subjects-tib-core.json") as gnd_subjects_file:
    gnd_subjects = json.loads(gnd_subjects_file.read())

# Wrap each record in a Subject, passing every string through
# urllib.parse.quote_plus first.
subjects = [
    Subject(
        code=urllib.parse.quote_plus(record["Code"]),
        name=urllib.parse.quote_plus(record["Name"]),
        classification_name=urllib.parse.quote_plus(record["Classification Name"]),
        alternative_names=list(map(urllib.parse.quote_plus, record["Alternate Name"])),
        related=list(map(urllib.parse.quote_plus, record["Related Subjects"])),
    )
    for record in gnd_subjects
]

# Create one Subject vertex per record, with a progress bar.
# The `name` property is filled from subject.name (it was previously written
# from subject.code, so every vertex carried its code as its name).
# `subjects` as built just above holds quote_plus-encoded strings, so the
# values cannot contain a single quote that would end the Cypher literal.
for subject in tqdm(subjects):
    graph.query(
        f"""
        CREATE (s:Subject {{code: '{subject.code}', name: '{subject.name}', classification_name: '{subject.classification_name}'}})
        """,
    )

100%|██████████| 79427/79427 [01:42<00:00, 777.68it/s]


In [4]:
import psycopg2
# Work directly on the connection held by the AGEGraph wrapper and commit
# whatever is pending before issuing DDL on it.
conn: psycopg2.extensions.connection = graph.connection
conn.commit()

# Unique index over the "code" entry of each Subject row's properties.
with conn.cursor() as cursor:
    cursor.execute(
        'CREATE UNIQUE INDEX IF NOT EXISTS subject_code '
        'ON gnd."Subject"'
        "(ag_catalog.agtype_access_operator(properties, '\"code\"'::agtype))"
    )

conn.commit()

In [17]:
# Display the id of the current process; add_subject further down uses the
# same call to key its per-process graph handles.
os.getpid()

365526

In [18]:
# Replace `gnd_subjects` / `subjects` with the contents of the "all" JSON file.
with open("llms4subjects/shared-task-datasets/GND/dataset/GND-Subjects-all.json") as gnd_subjects_file:
    gnd_subjects = json.loads(gnd_subjects_file.read())

# Every double quote in a string field becomes a single quote; the cells that
# follow embed these values inside double-quoted Cypher string literals.
subjects = [
    Subject(
        code=record["Code"].replace('"', "'"),
        name=record["Name"].replace('"', "'"),
        classification_name=record["Classification Name"].replace('"', "'"),
        alternative_names=[alias.replace('"', "'") for alias in record["Alternate Name"]],
        related=[neighbour.replace('"', "'") for neighbour in record["Related Subjects"]],
    )
    for record in gnd_subjects
]

In [19]:

conn.commit()
print("Merging data into graph")

# Ensure a Subject vertex whose code and classification_name are "root" exists.
graph.query("""
    MERGE (s:Subject {code: "root", classification_name: "root"})
""")

# Two secondary indexes on the Subject table: a btree on `id` and a GIN on
# `properties`. Both statements are no-ops when the index is already there.
with conn.cursor() as cursor:
    for index_ddl in (
        """
        CREATE INDEX IF NOT EXISTS idx_subject_id ON gnd."Subject" USING btree (id);
    """,
        """
        CREATE INDEX IF NOT EXISTS subject_idx ON gnd."Subject" USING gin (properties);
    """,
    ):
        cursor.execute(index_ddl)

conn.commit()

# Maps a process id to the AGEGraph handle opened by that process, so a
# process opens its own handle on first use and reuses it afterwards.
local_graphs = {}


def add_subject(subject: Subject):
    """MERGE one Subject vertex into the "gnd" graph.

    Called through the process pool below. The first call in a given process
    opens an AGEGraph handle (create=False) and caches it in `local_graphs`
    under that process's pid; later calls in the same process reuse it.

    The vertex carries `name` alongside `code` and `classification_name`,
    because the RELATED-edge cell looks target subjects up by `name`.

    Args:
        subject: Record to merge. Its strings are interpolated into
            double-quoted Cypher literals, so they must not contain
            double quotes.
    """
    pid = os.getpid()
    local_graph = local_graphs.get(pid)
    if local_graph is None:
        local_graph = AGEGraph(graph_name="gnd", conf=database, create=False)
        local_graphs[pid] = local_graph
    local_graph.query(f"""
        MERGE (s:Subject {{code: "{subject.code}", name: "{subject.name}", classification_name: "{subject.classification_name}"}})
    """)

# Spread the add_subject calls over 8 worker processes and advance the
# progress bar once per completed subject.
with Pool(8) as pool:
    results = pool.imap_unordered(add_subject, subjects)
    # `tqdm` is the class imported via `from tqdm import tqdm`, so it is
    # called directly (`tqdm.tqdm` does not exist on the class).
    tqdm_bar = tqdm(total=len(subjects))
    while True:
        try:
            results.next(timeout=1000)
            tqdm_bar.update()
        except StopIteration:
            break
        except Exception:
            # Reached when a worker raised or when no result arrived within
            # the timeout. Still stop, but record why instead of ending the
            # import silently.
            logging.exception("Stopping subject import early")
            break
    # NOTE(review): add_subject only runs in the worker processes here, so in
    # this process `local_graphs` is normally empty and the loop does nothing.
    # Presumably each worker's query call commits on its own; verify.
    for g in local_graphs.values():
        g.connection.commit()
        g.connection.close()
    tqdm_bar.close()
    pool.close()
conn.commit()

Merging data into graph


100%|██████████| 204739/204739 [04:18<00:00, 793.08it/s] 


In [20]:
from tqdm import tqdm
conn.commit()
# Attach every alternative name to its subject through an ALTERNATIVE_NAME edge.
for subject in tqdm(subjects):
    for alternative_name in subject.alternative_names:
        # MERGE instead of CREATE: an alternative name that occurs more than
        # once (or a re-run of this cell) reuses the existing vertex. With
        # CREATE, duplicates piled up and the MATCH below then linked the
        # subject to every one of them.
        graph.query(
            f"""
            MERGE (a:AlternativeName {{name: "{alternative_name}"}})
            """
        )
        graph.query(
            f"""
            MATCH (s:Subject {{code: "{subject.code}"}}), (a:AlternativeName {{name: "{alternative_name}"}})
                MERGE (s)-[:ALTERNATIVE_NAME]->(a)
            """
        )

100%|██████████| 79427/79427 [44:22<00:00, 29.83it/s]  


In [21]:
# Add a RELATED edge from each subject to every subject it lists as related.
for subject in tqdm(subjects):
    for related_subject in subject.related:
        try:
            # The source vertex is found by `code` (covered by the unique
            # index and also used by the alternative-name cell) rather than
            # by `name`. The target can only be found by name, since
            # `related` holds names.
            graph.query(
                f"""
                MATCH (s1:Subject {{code: "{subject.code}"}}), (s2:Subject {{name: "{related_subject}"}})
                    MERGE (s1)-[:RELATED]->(s2)
                """
            )
        except Exception as error:
            # One bad query must not abort the whole run.
            # NOTE(review): the recorded output shows names containing
            # "Return" being skipped; presumably the query wrapper mistakes
            # the word for a Cypher RETURN clause. Confirm.
            logging.warning("Skipping %s: %s", related_subject, error)

 20%|██        | 41077/204739 [21:18<1:30:57, 29.99it/s]

Skipping Return on Investment due to weird bug


 20%|██        | 41329/204739 [21:28<1:56:33, 23.37it/s]

Skipping Return on Investment due to weird bug


 21%|██        | 42440/204739 [22:24<2:08:49, 21.00it/s]

Skipping Cashflow Return on Investment due to weird bug


 25%|██▌       | 51214/204739 [30:27<2:51:27, 14.92it/s]

Skipping Absolute Return due to weird bug


 75%|███████▍  | 153307/204739 [1:06:22<15:41, 54.62it/s] 

Skipping Return of Spontaneous Circulation due to weird bug


 99%|█████████▉| 203402/204739 [1:19:40<00:12, 103.33it/s]

Skipping Return due to weird bug


100%|██████████| 204739/204739 [1:20:04<00:00, 42.61it/s] 
