In [None]:
import json
import logging
import os
import sys
import urllib.parse
from dataclasses import dataclass

from langchain_community.graphs.age_graph import AGEGraph
from tqdm import tqdm
# Route log records at DEBUG level and above to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)


# Connection settings for the PostgreSQL instance that hosts the AGE graph.
# Every value can be overridden through the standard libpq environment
# variables, so real credentials do not have to be stored in the notebook.
# The fallbacks are the previous hard-coded local-development values.
database = {
    "dbname": os.environ.get("PGDATABASE", "postgres"),
    "host": os.environ.get("PGHOST", "localhost"),
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "password"),
    "port": os.environ.get("PGPORT", "5432"),
}

try:
    # Open the "gnd" graph; create=True asks AGEGraph to create it when missing.
    graph = AGEGraph(graph_name="gnd", conf=database, create=True)
    print("AGEGraph instance created successfully.")
except Exception as e:
    print(f"Failed to create AGEGraph instance: {e}")
    # Every later cell needs `graph`. Re-raise so execution stops here with the
    # real cause instead of failing further down with an unrelated NameError.
    raise

In [2]:
from dataclasses import dataclass

@dataclass
class Subject:
    """One subject record taken from the GND subjects JSON file.

    The loading cell of this notebook fills every string field with the
    ``urllib.parse.quote_plus``-encoded form of the raw value (presumably so
    the values can be placed inside quoted Cypher literals without escaping).
    """
    # Value of the record's "Code" field.
    code: str
    # Value of the record's "Name" field.
    name: str
    # Value of the record's "Classification Name" field.
    classification_name: str
    # Entries of the record's "Alternate Name" list.
    alternative_names: list[str]
    # Entries of the record's "Related Subjects" list.
    related: list[str]

In [3]:
import json, urllib
# Read the raw GND subject records. The encoding is given explicitly so that
# non-ASCII characters decode the same way regardless of the platform's
# default locale encoding.
with open("llms4subjects/shared-task-datasets/GND/dataset/GND-Subjects-tib-core.json", "r", encoding="utf-8") as gnd_subjects_file:
    gnd_subjects = json.load(gnd_subjects_file)

# Convert each record into a Subject. Every string is URL-encoded with
# quote_plus, which percent-escapes quote characters, so the values can be
# embedded in the quoted Cypher literals below.
subjects = [
    Subject(
        code=urllib.parse.quote_plus(subject["Code"]),
        name=urllib.parse.quote_plus(subject["Name"]),
        classification_name=urllib.parse.quote_plus(subject["Classification Name"]),
        alternative_names=[
            urllib.parse.quote_plus(alternative_name)
            for alternative_name in subject["Alternate Name"]
        ],
        related=[
            urllib.parse.quote_plus(related)
            for related in subject["Related Subjects"]
        ]
    )
    for subject in gnd_subjects
]

# Create one Subject vertex per record.
# `name` must hold the subject's name, not its code: the RELATED-edge cell
# looks Subject vertices up by `name`.
for subject in tqdm(subjects):
    graph.query(
        f"""
        CREATE (s:Subject {{code: '{subject.code}', name: '{subject.name}', classification_name: '{subject.classification_name}'}})
        """,
    )

100%|██████████| 79427/79427 [01:42<00:00, 777.68it/s]


In [4]:
import psycopg2
# DDL for a unique expression index over the "code" property of the
# gnd."Subject" table.
subject_code_index_sql = "CREATE UNIQUE INDEX IF NOT EXISTS subject_code ON gnd.\"Subject\"(ag_catalog.agtype_access_operator(properties, '\"code\"'::agtype))"

# Reuse the connection held by the AGEGraph wrapper and commit the current
# transaction before issuing the DDL.
conn: psycopg2.extensions.connection = graph.connection
conn.commit()

with conn.cursor() as cur:
    cur.execute(subject_code_index_sql)

# Make the new index permanent.
conn.commit()

In [5]:
from tqdm import tqdm
# Commit the current transaction before starting the bulk load.
conn.commit()

# Attach every alternative name to its subject.
# The AlternativeName vertex is MERGEd rather than CREATEd: an unconditional
# CREATE made a new vertex each time a name recurred, and the name-based MATCH
# below then linked the subject to every earlier duplicate as well. With MERGE
# there is one vertex per distinct name, and re-running the cell adds nothing.
for subject in tqdm(subjects):
    for alternative_name in subject.alternative_names:
        graph.query(
            f"""
            MERGE (a:AlternativeName {{name: "{alternative_name}"}})
            """
        )
        # Link the subject (looked up by code) to the name vertex; MERGE
        # leaves an existing edge untouched.
        graph.query(
            f"""
            MATCH (s:Subject {{code: "{subject.code}"}}), (a:AlternativeName {{name: "{alternative_name}"}})
                MERGE (s)-[:ALTERNATIVE_NAME]->(a)
            """
        )

100%|██████████| 79427/79427 [44:22<00:00, 29.83it/s]  


In [6]:
import psycopg2
# DDL for a (non-unique) expression index over the "name" property of the
# gnd."AlternativeName" table.
alternative_name_index_sql = "CREATE INDEX IF NOT EXISTS alternativename_name ON gnd.\"AlternativeName\"(ag_catalog.agtype_access_operator(properties, '\"name\"'::agtype))"

# Reuse the connection held by the AGEGraph wrapper and commit the current
# transaction before issuing the DDL.
conn: psycopg2.extensions.connection = graph.connection
conn.commit()

with conn.cursor() as cur:
    cur.execute(alternative_name_index_sql)

# Make the new index permanent.
conn.commit()

In [7]:
# Create RELATED edges between subjects.
# The source vertex is looked up by `code`, which the unique index on Subject
# guarantees to identify a single vertex. Looking it up by `name` would attach
# the edges to every subject that happens to share that name.
# The target is still matched by `name`, because each Subject.related entry is
# compared against the stored `name` property (assumes "Related Subjects"
# lists subject names - TODO confirm against the dataset).
for subject in tqdm(subjects):
    for related_subject in subject.related:
        graph.query(
            f"""
            MATCH (s1:Subject {{code: "{subject.code}"}}), (s2:Subject {{name: "{related_subject}"}})
                MERGE (s1)-[:RELATED]->(s2)
            """
        )

100%|██████████| 79427/79427 [18:33<00:00, 71.31it/s] 
