In [1]:
import os
from langchain_google_genai import GoogleGenerativeAI
from dotenv import load_dotenv
from langchain_neo4j import Neo4jGraph

# Load settings from a .env file into the process environment; a later cell
# reads GEMINI_API_KEY and GEMINI_BASE_URL through os.getenv.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

True

In [2]:
# Handle on the "neo4j" database, used by every query cell below.
# NOTE(review): no URL or credentials are passed here, so they presumably come
# from environment variables loaded in the first cell — confirm.
graph = Neo4jGraph(database="neo4j")

In [38]:
# Export the whole database as GraphML through APOC. The file argument is null
# and stream is true, so the export is expected back in the result row (the
# 'data' key is read from output[0] in a later cell) rather than in a file.
output = graph.query("""CALL apoc.export.graphml.all(
  null,
  {stream:true, useTypes:true}
)""")

In [40]:
# Inspect which fields the export call returned in its first (only used) row.
list(output[0].keys())

['file',
 'source',
 'format',
 'nodes',
 'relationships',
 'properties',
 'time',
 'rows',
 'batchSize',
 'batches',
 'done',
 'data']

In [42]:
# Sanity check: show the format reported by the export procedure.
print(output[0]["format"])

graphml


In [29]:
# Persist the streamed export to disk so it can be opened in an external tool.
# The payload is GraphML (XML), so write it as UTF-8 explicitly instead of
# relying on the platform's default text encoding.
# NOTE(review): the content is GraphML but the file is named ".gephi" — confirm
# the extension is intentional.
with open("../data/course-data/courses.gephi", "w", encoding="utf-8") as f:
    f.write(str(output[0]['data']))

In [None]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI

# Chat model used for graph extraction. It is addressed through the
# OpenAI-style client; the API key and base URL are taken from the environment.
llm = ChatOpenAI(
    model_name="models/gemini-2.0-flash",
    temperature=0,
    base_url=os.getenv("GEMINI_BASE_URL"),
    api_key=os.getenv("GEMINI_API_KEY"),
)
# Alternative client kept for reference:
# llm2 = GoogleGenerativeAI(model="models/gemini-2.5-flash-preview-04-17", google_api_key=os.getenv("GEMINI_API_KEY"))

# Schema handed to the transformer.
#
# Node labels:
#   Degree       e.g. Bachelor of Arts
#   Certificate  e.g. Minor, Post-bac Cert.
#   Abbreviation e.g. "BA", "PhD", "MPH", "CTAHR"
#
# Relationship types (source -> target):
#   BELONGS_TO        Course -> Subject
#   PART_OF           Subject -> College
#   OFFERS            Program -> Degree/Certificate
#   HAS_ABBREV        Degree/Certificate/College/Subject -> Abbreviation
#   REQUIRES          Course -> Course (prereq)
#   CROSS_LISTED_AS   Course <-> Course
#   REPEATABLE_UP_TO  Course -> Course (max repeats)
#
# Node properties:
#   title, description, credits, course_number, metadata
#       for Course, Subject, Program, College
#   full_name   for Degree/Certificate
#   code        for Abbreviation, e.g. "BA", "JD", "CTAHR"
#
# Relationship properties:
#   min_grade        PRE: requirements
#   max_repeats      how many times repeatable
#   cross_list_code  e.g. "ES 450" <-> "WGSS 450"
llm_transformer = LLMGraphTransformer(
    llm=llm,
    allowed_nodes=[
        "Course", "Subject", "Program", "College",
        "Degree", "Certificate", "Abbreviation",
    ],
    allowed_relationships=[
        "BELONGS_TO", "PART_OF", "OFFERS", "HAS_ABBREV",
        "REQUIRES", "CROSS_LISTED_AS", "REPEATABLE_UP_TO",
    ],
    node_properties=[
        "title", "description", "credits", "course_number", "metadata",
        "full_name",
        "code",
    ],
    relationship_properties=["min_grade", "max_repeats", "cross_list_code"],
)


In [None]:
# Smoke test: send one prompt to confirm the model endpoint and key work.
llm.invoke("who am i speaking to?")

In [None]:
from langchain_core.documents import Document

# Number of catalog lines packed into each Document.
LINES_PER_CHUNK = 100

# Read the catalog as text. JSON is UTF-8, so state the encoding explicitly
# instead of relying on the platform default.
with open("../data/course-data/catalog.json", "r", encoding="utf-8") as f:
    catalog = f.read()

lines = catalog.splitlines()

# Split the catalog into fixed-size line windows; the last window may be shorter.
chunks = [
    "\n".join(lines[i: i + LINES_PER_CHUNK])
    for i in range(0, len(lines), LINES_PER_CHUNK)
]

catalog_docs = [Document(page_content=chunk) for chunk in chunks]

with open("../data/course-data/abbreviation.txt", "r", encoding="utf-8") as f:
    abbreviations = f.read()

# Only the abbreviation list is converted synchronously here; the catalog
# chunks are not part of this call (previously: + catalog_docs[:3]).
documents = [Document(page_content=abbreviations)]
graph_documents = llm_transformer.convert_to_graph_documents(documents)

In [None]:
# Inspect what the transformer extracted from the abbreviation document.
print(graph_documents)

In [None]:
# Queue the catalog chunks from index 2 onward for asynchronous conversion.
# The async method is called without await, so this name holds a pending
# awaitable until something drives it on an event loop.
# NOTE(review): catalog_docs[0] and catalog_docs[1] are not converted by any
# visible cell — confirm the slice start is intended.
remaining_documents = catalog_docs[2:]
remaining_graph_documents = llm_transformer.aconvert_to_graph_documents(remaining_documents)

In [None]:
import nest_asyncio, asyncio

# Jupyter already runs an event loop; nest_asyncio patches it so that
# run_until_complete can be called from inside a cell.
nest_asyncio.apply()
loop = asyncio.get_event_loop()
# Drive the pending async conversion to completion. This must be the coroutine
# created by aconvert_to_graph_documents, not `graph_documents`, which is the
# already-computed list from the synchronous call and is not awaitable.
graph_docs = loop.run_until_complete(remaining_graph_documents)

In [None]:
# Inspect the result obtained from the event loop in the previous cell.
print(graph_docs)

In [None]:
# Write the converted graph documents into the Neo4j database.
# NOTE(review): only graph_docs is persisted; graph_documents (from the
# abbreviation file) is not passed to add_graph_documents in any visible cell —
# confirm whether that is intended.
graph.add_graph_documents(graph_docs)

In [None]:
# Spot-check the import by fetching up to 12 nodes of any label.
graph.query("MATCH (n) RETURN n LIMIT 12")

In [None]:
# Delete every node and, via DETACH, the relationships attached to them.
# Destructive: this empties the connected database, so run it only to reset.
graph.query("MATCH (n) DETACH DELETE n")