In [54]:
import json
from langchain_community.graphs.age_graph import AGEGraph
import os
from langchain_core.documents import Document
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
import psycopg2
import getpass
import os
from langchain_openai import OpenAIEmbeddings
import itertools
from langchain.embeddings import CacheBackedEmbeddings


database = {
    "database": "postgres",
    "user": "postgres",
    "password": "password",
    "host": "localhost",
    "port": "5432"
}

In [55]:
graph = AGEGraph(graph_name="gnd", conf=database)

In [56]:
connection = psycopg2.connect(
    dbname=database["database"],
    user=database["user"],
    password=database["password"],
    host=database["host"],
    port=database["port"]
)

In [57]:
documents = []

for dirs, _, f in os.walk("llms4subjects/shared-task-datasets/TIBKAT/tib-core-subjects/data/train/"):
    for file in f:
        with open(os.path.join(dirs, file)) as f:
            data = json.load(f)["@graph"]
            subjects = []
            article = None
            document = Document(page_content="", metadata={})
            for d in data:
                if "@type" in d:
                    document = Document(
                        page_content=str(d["abstract"]).replace("\"", "'"),
                        metadata={
                            "subjects": []
                        },
                        id=d["@id"]
                    )
                    if type(d["title"]) == str:
                        document.metadata["title"] = d["title"].replace("\"", "'")
                    elif type(d["title"]) == list:
                        document.metadata["title"] = d["title"][0].replace("\"", "'")
                else:
                    if type(d["sameAs"]) == str:
                        subjects.append(d["sameAs"].replace("\"", "'"))
                    elif type(d["sameAs"]) == list:
                        for s in d["sameAs"]:
                            subjects.append(s.replace("\"", "'"))
            document.metadata["subjects"] = subjects
            documents.append(document)

len(documents)

41902

In [58]:
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAPI token: ")

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

In [60]:
vector_store = PGVector(
    embeddings=embeddings,
    collection_name="tibkat",
    connection=f"postgresql+psycopg2://{database['user']}:{database['password']}@{database['host']}:{database['port']}/{database['database']}",
    use_jsonb=True,
)

for batch in itertools.batched(documents, 100):
    print(vector_store.add_documents(documents=batch))

['https://www.tib.eu/de/suchen/id/TIBKAT%3A1831634066', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A730014142', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A1831640422', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A1831649594', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A1831649578', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A1831649640', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A730034739', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A73000435X', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A1831652579', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A730012433', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A730019101', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A1831632381', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A1831654482', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A1831633353', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A1831654512', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A1831652595', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A1831636344', 'https://www.tib.eu/de/suchen/id/TIBKAT%3A730033961', 'https://www.ti