# **Install and Import Libraries**

> ##### **Add the OpenAI API key to the config/secrets.env file as follows:**

> ###### **OPENAI_API_KEY = "<api_key>"**


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from dotenv import load_dotenv
import os
import json
from neo4j import GraphDatabase
# change dir to the root of the project

# load config
load_dotenv("../config/config.env")

# load secrets
load_dotenv("../config/secrets.env")

from data_pipeline import *

# change import dir to src
import sys
sys.path.append('../src')
import llm_kg_retrieval

# **1. Scrape Website**
> Takes approximately 12 minutes to run.

In [None]:
# scrape_website is not defined in this notebook; presumably it is provided by
# the `from data_pipeline import *` star-import above
scrape_website()

# **2. Download all PDFs from links**

In [None]:
# download_documents is not defined in this notebook; presumably it is provided
# by the `from data_pipeline import *` star-import above
download_documents()

# **3. Extract HTML from PDFs**

In [None]:
# Only PDF and DOCX files are converted, so the number of converted files may be
# lower than the number of downloaded files
convert_files()

# **4. Extract Meeting Metadata from PDF with LLM**

In [None]:
# NOTE(review): `type` shadows the Python builtin of the same name for the rest of
# the notebook session; it is reused as the `type=` argument in the metadata
# extraction cell, so renaming it requires updating that cell as well
type = "agenda"
df = get_documents_dataframe(type=type)
# keep only the rows whose "body" column equals "Stadsfullmäktige"
df = df[df["body"] == "Stadsfullmäktige"]

In [None]:
# display the filtered dataframe for visual inspection
df

In [None]:
# Asynchronously extract meeting metadata for the filtered dataframe; limited to
# at most 150 calls per minute to stay within the OpenAI rate limits.
# Top-level `await` relies on the notebook's running event loop.
await extract_meeting_data(df=df, type=type)

# **5. Extract Proposals and Decisions**

## Note: the LLM output for this step is still not good

In [None]:
# extract_meeting_agenda is not defined in this notebook; presumably it is
# provided by the `from data_pipeline import *` star-import above
await extract_meeting_agenda()

# **6. Export JSON**

The following code constructs individual metadata JSON files for each organ and meeting so that they are easier to inspect and debug. It creates a folder called 'extracted_json' in the data directory, with one folder per organ and, inside it, one folder per meeting.

In [None]:
def construct_individual_json():
    """
    Constructs the individual json files for each meeting item and meeting metadata

    Reads the CSV referenced by the METADATA_FILE environment variable and, for
    every row, writes at most one file below EXTRACTED_JSON_PATH:

    * rows with a non-empty ``end_time`` are treated as meeting metadata and
      written to ``<body>/<meeting_date>/llm_meeting_metadata.json``;
    * otherwise, rows with a non-empty ``agenda_metadata`` are treated as
      meeting items and written to
      ``<body>/<meeting_date>/<doc_name up to the first dot>/llm_meeting_item.json``;
    * all other rows are skipped (they can still be picked up as attachments
      of an item through their ``parent_link``).

    The documents are built as Python dicts and serialised with ``json.dumps``
    so that quotes or backslashes inside text fields cannot corrupt the output,
    and empty list columns become ``[]`` instead of invalid JSON.

    Returns:
        None
    """
    def _parse_json_field(raw):
        # list-valued columns hold JSON text; an empty cell means "no entries"
        text = str(raw).strip()
        return json.loads(text) if text else []

    df = pd.read_csv(os.getenv("METADATA_FILE"))
    df.fillna("", inplace=True)

    # NOTE(review): files are written with the platform default encoding, exactly
    # like the reader in construct_aggregate_json; switch both to utf-8 together.
    for _, row in df.iterrows():
        if row['end_time'] != "":
            # folder shared by the metadata file and all items of this meeting
            meeting_dir = os.path.join(os.getenv("EXTRACTED_JSON_PATH"), row['body'], row['meeting_date'])
            os.makedirs(meeting_dir, exist_ok=True)

            metadata = {
                "meeting_date": str(row['meeting_date']),
                "start_time": str(row['start_time']),
                "meeting_reference": str(row['meeting_reference']),
                "end_time": str(row['end_time']),
                "meeting_location": str(row['meeting_location']),
                "participants": _parse_json_field(row['participants']),
                "substitutes": _parse_json_field(row['substitutes']),
                "additional_attendees": _parse_json_field(row['additional_attendees']),
                "signed_by": _parse_json_field(row['signed_by']),
                "adjusted_by": _parse_json_field(row['adjusted_by']),
                "adjustment_date": str(row['adjustment_date']),
                # meeting items are added when constructing the aggregate JSON file
                "meeting_items": [],
            }

            metadata_save_path = os.path.join(meeting_dir, "llm_meeting_metadata.json")
            with open(metadata_save_path, "w") as f:
                f.write(json.dumps(metadata, indent=4, ensure_ascii=False))

        elif row['agenda_metadata'] != "":
            # one folder per meeting item, named after the document without extension
            item_dir = os.path.join(
                os.getenv("EXTRACTED_JSON_PATH"),
                row['body'],
                row['meeting_date'],
                row['doc_name'].split(".")[0],
            )
            os.makedirs(item_dir, exist_ok=True)

            item = json.loads(row['agenda_metadata'])
            item['title'] = row['title']
            item['section'] = row['section']

            # attachments are the rows whose parent link points at this document
            attachments = df[df['parent_link'] == row['doc_link']]
            item['attachments'] = [
                {"title": attachment['title'], "link": attachment['doc_link']}
                for _, attachment in attachments.iterrows()
            ]

            item_save_path = os.path.join(item_dir, "llm_meeting_item.json")
            with open(item_save_path, "w") as f:
                f.write(json.dumps(item, indent=4, ensure_ascii=False))

In [None]:
# write the per-meeting metadata files and per-item JSON files
construct_individual_json()

In [None]:
def construct_aggregate_json(construct_from = "llm"): # construct_from = "llm" or "manual"
    """
    Constructs a single JSON out of all the meeting metadata and items

    Walks ``<EXTRACTED_JSON_PATH>/<body>/<meeting>/`` and, for every meeting
    folder containing ``<construct_from>_meeting_metadata.json``, loads that
    file and appends every ``<item folder>/<construct_from>_meeting_item.json``
    found below it to the metadata's ``meeting_items`` list. The result is
    written to ``<EXTRACTED_JSON_PATH>/<construct_from>_aggregate_data.json``.

    Directory entries are visited in name order so that repeated runs produce
    the same file.

    Args:
        construct_from : file-name prefix selecting the source, "llm" or "manual"

    Returns:
        None. Prints a message and returns early when EXTRACTED_JSON_PATH is
        unset or does not exist.
    """
    extracted_json_path = os.getenv("EXTRACTED_JSON_PATH")
    # os.path.exists(None) raises TypeError, so treat an unset variable as missing
    if not extracted_json_path or not os.path.exists(extracted_json_path):
        print("Extracted JSON path does not exist")
        return

    def _sorted_dirs(path):
        # os.scandir yields entries in arbitrary order; sort for reproducible output
        return sorted(
            (entry for entry in os.scandir(path) if entry.is_dir()),
            key=lambda entry: entry.name,
        )

    bodies = []
    for body in _sorted_dirs(extracted_json_path):
        meetings = []
        for meeting in _sorted_dirs(body.path):
            metadata_path = os.path.join(meeting.path, f"{construct_from}_meeting_metadata.json")
            if not os.path.exists(metadata_path):
                continue
            with open(metadata_path, "r") as f:
                metadata = json.load(f)

            # hand-written metadata may omit the list (or set it to null)
            meeting_items = metadata.get("meeting_items") or []
            metadata["meeting_items"] = meeting_items

            for item_dir in _sorted_dirs(meeting.path):
                item_path = os.path.join(item_dir.path, f"{construct_from}_meeting_item.json")
                if os.path.exists(item_path):
                    with open(item_path, "r") as f:
                        meeting_items.append(json.load(f))
            meetings.append(metadata)

        bodies.append({
            "name": body.name,
            "meetings": meetings
        })

    aggregate_json = {"body": bodies}
    with open(os.path.join(extracted_json_path, f"{construct_from}_aggregate_data.json"), "w") as f:
        f.write(json.dumps(aggregate_json, indent=4, ensure_ascii=False))

In [None]:
# merge the per-meeting files into <construct_from>_aggregate_data.json
construct_aggregate_json() # construct_from = "llm" or "manual"

In [None]:
# Sanity check on the generated folder structure.
# NOTE(review): the pattern "/*/*/*" matches every entry three levels below
# EXTRACTED_JSON_PATH, i.e. the entries *inside* each <body>/<meeting> folder
# (item folders and the meeting metadata file), so the count is not the number
# of meetings — confirm which number is intended.
import glob
len(glob.glob(os.getenv("EXTRACTED_JSON_PATH") + "/*/*/*"))

# **7. Create a Knowledge Graph from JSON**

In [None]:
# setup cohere api for embedding
import cohere
from tqdm import tqdm

In [None]:
def generate_embeddings(texts):
    """
    Generates embeddings for the input texts

    Args:
        texts : iterable of strings to embed

    Returns:
        The ``embeddings`` attribute of the Cohere embed response for ``texts``
        (model 'embed-multilingual-v3.0', input type "search_document"), or an
        empty list when ``texts`` is empty.
    """
    texts = list(texts)
    # Callers in this notebook pass empty lists (e.g. meeting items without
    # attachments); there is nothing to embed, so skip the API round trip.
    if not texts:
        return []

    co = cohere.Client(os.getenv("COHERE_API_KEY"))
    response = co.embed(texts=texts, model='embed-multilingual-v3.0', input_type="search_document")
    return response.embeddings

In [None]:
# Function to execute Cypher queries
def execute_cypher_queries(driver, data):
    """
    Executes Cypher queries to create a knowledge graph in Neo4j

    All existing nodes and relationships are deleted first. The graph is then
    rebuilt from ``data``: Body -> Meeting (HOSTED), Person -> Meeting
    (ATTENDED, SUBSTITUTE_ATTENDEE, ADDITIONAL_ATTENDEE, SIGNED, ADJUSTED),
    Meeting -> MeetingItem (HAS_ITEM), Person -> MeetingItem (PREPARED,
    PROPOSED) and MeetingItem -> Attachment (HAS_ATTACHMENT). Embeddings from
    ``generate_embeddings`` are stored on bodies, meetings, items and
    attachments.

    Args:
        driver : neo4j driver
        data : JSON data (dict with a top-level "body" list, as written by
            construct_aggregate_json)

    Returns:
        None
    """

    with driver.session() as session:
        # Delete existing nodes and relationships
        print("Deleting existing nodes and relationships...")
        session.run("MATCH (n) DETACH DELETE n")

        bodies = data.get("body", [])
        # only request embeddings when there is something to embed
        body_embeddings = generate_embeddings([body.get("name", "") for body in bodies]) if bodies else []
        for i, body in enumerate(tqdm(bodies, position=0, desc="Creating bodies")):
            # Merge Body
            body_name = body.get("name", "")
            session.run("""
                MERGE (b:Body {name: $body_name})
                SET b.name_embedding = $name_embedding
                """,
                body_name=body_name,
                name_embedding=body_embeddings[i])

            # Process meetings
            meetings = body.get("meetings", [])
            meeting_embeddings = (
                generate_embeddings([meeting.get("meeting_location", "") for meeting in meetings])
                if meetings else []
            )
            for j, meeting in enumerate(tqdm(meetings, position=1, leave=False, desc="Creating meetings")):
                # Merge Meeting; coalesce keeps MERGE from failing on JSON nulls
                meeting_id = session.run("""
                    MERGE (m:Meeting {
                    meeting_date: coalesce($meeting_date, ''),
                    start_time: coalesce($start_time, ''),
                    meeting_reference: coalesce($meeting_reference, ''),
                    end_time: coalesce($end_time, ''),
                    meeting_location: coalesce($meeting_location, '')
                    })
                    WITH m
                    MATCH (b:Body {name: $body_name})
                    MERGE (b)-[:HOSTED]->(m)
                    SET m.meeting_location_embedding = $meeting_location_embedding
                    RETURN id(m)
                    """,
                    meeting_date=meeting.get("meeting_date", ""),
                    start_time=meeting.get("start_time", ""),
                    meeting_reference=meeting.get("meeting_reference", ""),
                    end_time=meeting.get("end_time", ""),
                    meeting_location=meeting.get("meeting_location", ""),
                    body_name=body_name,
                    meeting_location_embedding=meeting_embeddings[j]
                    ).single()[0]

                # Process participants
                for person in meeting.get("participants", []):
                    session.run("""
                        MERGE (p:Person {fname: coalesce($fname, ''), lname: coalesce($lname, '')})
                        WITH p
                        MATCH (m:Meeting) WHERE id(m) = $meeting_id
                        MERGE (p)-[:ATTENDED {
                        role: coalesce($role, ''),
                        attendance: coalesce($attendance, '')
                        }]->(m)
                        """,
                        fname=person.get("fname", ""),
                        lname=person.get("lname", ""),
                        role=person.get("role", ""),
                        attendance=person.get("attendance", ""),
                        meeting_id=meeting_id)

                # Process Substitutes
                # NOTE(review): the substituted person is merged on a single `name`
                # property while every other Person node is keyed on fname/lname,
                # so these nodes never match the regular ones — confirm intent.
                for substitute in meeting.get("substitutes", []):
                    session.run("""
                        // Create or find the substitute node and connect to the meeting
                        MERGE (s:Person {fname: coalesce($fname, ''), lname: coalesce($lname, '')})
                        WITH s
                        MATCH (m:Meeting) WHERE id(m) = $meeting_id
                        MERGE (s)-[:SUBSTITUTE_ATTENDEE]->(m)
                        WITH s
                        // Only proceed if substituted_for is not an empty string
                        WHERE $substituted_for <> ''
                        // Create or find the substituted person node and create a relationship
                        MERGE (substituted:Person {name: $substituted_for})
                        MERGE (s)-[:SUBSTITUTED_FOR]->(substituted)
                        """,
                        fname=substitute.get("fname", ""),
                        lname=substitute.get("lname", ""),
                        substituted_for=substitute.get("substituted_for", ""),
                        meeting_id=meeting_id)

                # Process Additional Attendees
                for attendee in meeting.get("additional_attendees", []):
                    session.run("""
                        MERGE (a:Person {fname: coalesce($fname, ''), lname: coalesce($lname, '')})
                        WITH a
                        MATCH (m:Meeting) WHERE id(m) = $meeting_id
                        MERGE (a)-[:ADDITIONAL_ATTENDEE {
                            role: coalesce($role, '')
                        }]->(m)
                    """,
                        fname=attendee.get("fname", ""),
                        lname=attendee.get("lname", ""),
                        role=attendee.get("role", ""),
                        meeting_id=meeting_id)

                # Process Signatories
                for signatory in meeting.get("signed_by", []):
                    session.run("""
                        MERGE (s:Person {fname: coalesce($fname, ''), lname: coalesce($lname, '')})
                        WITH s
                        MATCH (m:Meeting) WHERE id(m) = $meeting_id
                        MERGE (s)-[:SIGNED]->(m)
                        """,
                        fname=signatory.get("fname", ""),
                        lname=signatory.get("lname", ""),
                        meeting_id=meeting_id)

                # Process Adjusters (plain "First [Middle ...] Last" strings)
                for adjuster in meeting.get("adjusted_by", []):
                    name_parts = adjuster.split(" ")
                    # the last token is the last name, everything before it the first
                    # name; a single-token name ends up in both fields
                    first_name = " ".join(name_parts[:-1]) if len(name_parts) > 2 else name_parts[0]
                    session.run("""
                        MERGE (a:Person {fname: coalesce($fname, ''), lname: coalesce($lname, '')})
                        WITH a
                        MATCH (m:Meeting) WHERE id(m) = $meeting_id
                        MERGE (a)-[:ADJUSTED]->(m)
                        """,
                        fname=first_name,
                        lname=name_parts[-1],
                        meeting_id=meeting_id)

                # Process Meeting Items
                for item in meeting.get("meeting_items", []):
                    # one request per item: title, context and decision, in that order
                    item_embeddings = generate_embeddings([
                        item.get("title", ""),
                        item.get("context", ""),
                        item.get("decision", "")
                    ])
                    item_id = session.run("""
                        MERGE (i:MeetingItem {
                            title: coalesce($title, ''),
                            section: coalesce($section, ''),
                            context: coalesce($context, ''),
                            decision: coalesce($decision, '')
                        })
                        WITH i
                        MATCH (m:Meeting) WHERE id(m) = $meeting_id
                        MERGE (m)-[:HAS_ITEM]->(i)
                        SET i.title_embedding = $title_embedding,
                            i.context_embedding = $context_embedding,
                            i.decision_embedding = $decision_embedding
                        RETURN id(i)
                        """,
                        title=item.get("title", ""),
                        section=item.get("section", ""),
                        meeting_id=meeting_id,
                        context=item.get("context", ""),
                        decision=item.get("decision", ""),
                        title_embedding=item_embeddings[0],
                        context_embedding=item_embeddings[1],
                        decision_embedding=item_embeddings[2]
                        ).single()[0]

                    # Process Preparers and Proposers similarly inside Meeting Items
                    for preparer in item.get("prepared_by", []):
                        session.run("""
                            MERGE (p:Person {fname: coalesce($fname, ''), lname: coalesce($lname, '')})
                            WITH p
                            MATCH (i:MeetingItem) WHERE id(i) = $item_id
                            MERGE (p)-[:PREPARED]->(i)
                            """,
                            fname=preparer.get("fname", ""),
                            lname=preparer.get("lname", ""),
                            item_id=item_id)

                    for proposer in item.get("proposal_by", []):
                        session.run("""
                            MERGE (p:Person {fname: coalesce($fname, ''), lname: coalesce($lname, '')})
                            WITH p
                            MATCH (i:MeetingItem) WHERE id(i) = $item_id
                            MERGE (p)-[:PROPOSED]->(i)
                            """,
                            fname=proposer.get("fname", ""),
                            lname=proposer.get("lname", ""),
                            item_id=item_id)

                    # Process Meeting Item Attachments
                    attachments = item.get("attachments", [])
                    attachment_embeddings = (
                        generate_embeddings([attachment.get("title", "") for attachment in attachments])
                        if attachments else []
                    )
                    for k, attachment in enumerate(attachments):
                        session.run("""
                            MERGE (a:Attachment {link: coalesce($link, ''), title: coalesce($title, '')})
                            WITH a
                            MATCH (i:MeetingItem) WHERE id(i) = $item_id
                            MERGE (i)-[:HAS_ATTACHMENT]->(a)
                            SET a.title_embedding = $title_embedding
                            """,
                            link=attachment.get("link", ""),
                            title=attachment.get("title", ""),
                            title_embedding=attachment_embeddings[k],
                            item_id=item_id)

def create_embeddings_index(driver, dimensions=1024, similarity_function="cosine"):
    """
    Creates vector indexes for the embeddings in Neo4j

    Every vector index currently present in the database is dropped first, then
    one index per embedding property is created.

    Args:
        driver : neo4j driver
        dimensions : vector size of the stored embeddings; it has to match the
            output size of the embedding model used to populate the graph
        similarity_function : similarity function of the indexes

    Returns:
        None
    """
    print("Creating vector indexes...")

    # (index name, node label, embedding property)
    # NOTE(review): nothing in this notebook sets `proposal_embedding` on
    # MeetingItem nodes, so that index stays empty — confirm it is still wanted.
    index_specs = [
        ("body_name_embedding", "Body", "name_embedding"),
        ("meeting_location_embedding", "Meeting", "meeting_location_embedding"),
        ("item_title_embedding", "MeetingItem", "title_embedding"),
        ("item_context_embedding", "MeetingItem", "context_embedding"),
        ("item_proposal_embedding", "MeetingItem", "proposal_embedding"),
        ("item_decision_embedding", "MeetingItem", "decision_embedding"),
        ("attachment_title_embedding", "Attachment", "title_embedding"),
    ]

    # shared OPTIONS clause appended to every CREATE statement
    options = f"""OPTIONS {{indexConfig: {{
                    `vector.dimensions`: {int(dimensions)},
                    `vector.similarity_function`: '{similarity_function}'}}}}"""

    with driver.session() as session:
        # collect the names first so the listing is fully consumed before dropping
        existing_names = [record["name"] for record in session.run("SHOW VECTOR INDEXES YIELD name")]
        for name in existing_names:
            session.run(f"DROP INDEX `{name}`")

        # create vector index for each embedding
        for index_name, label, embedding_property in index_specs:
            session.run(f"""
                CREATE VECTOR INDEX `{index_name}` IF NOT EXISTS
                FOR (n:{label}) ON (n.{embedding_property})
                {options}
            """)

def post_process_knowledge_graph(driver):
    """
    Post-processes the knowledge graph in Neo4j

    Rewrites the ``meeting_date`` property of Meeting nodes from a
    dot-separated string into a Neo4j datetime. Only nodes whose
    ``meeting_date`` is still a string are touched.

    Args:
        driver : neo4j driver

    Returns:
        None
    """
    # The three dot-separated parts are read as day, month, year (dd.mm.yyyy).
    date_conversion_query = """
            MATCH (m:Meeting)
            WHERE toString(m.meeting_date) = m.meeting_date
            WITH m, 
                split(m.meeting_date, '.') AS dateParts
            WITH m, 
                toInteger(dateParts[0]) AS day, 
                toInteger(dateParts[1]) AS month, 
                toInteger(dateParts[2]) AS year
            SET m.meeting_date = datetime({ year: year, month: month, day: day })
        """

    print("Post-processing knowledge graph...")
    with driver.session() as session:
        session.run(date_conversion_query)

        # TODO: Add logic to merge duplicate Person nodes with interchanged fname and lname, and also with variation in spelling of names  

def get_index_info(driver):
    """
    Lists the vector indexes in Neo4j as a markdown table

    Args:
        driver : neo4j driver

    Returns:
        Markdown text with the name, labelsOrTypes and properties of every
        vector index
    """
    with driver.session() as session:
        index_listing = session.run("""
                        SHOW VECTOR INDEXES YIELD name, labelsOrTypes, properties
                            """)
        # convert while the session is still open
        index_frame = index_listing.to_df()
        return index_frame.to_markdown()


def create_knowledge_graph(constuct_from = "llm"): # construct_from = "llm" or "manual"
    """
    Builds the Neo4j knowledge graph from the aggregate JSON file

    Loads ``<EXTRACTED_JSON_PATH>/<constuct_from>_aggregate_data.json``,
    connects to Neo4j using NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD,
    rebuilds the graph with execute_cypher_queries (which deletes the existing
    graph first) and recreates the vector indexes.

    Args:
        constuct_from : file-name prefix selecting the source, "llm" or
            "manual" (the parameter name keeps its original spelling because
            callers pass it by keyword)

    Returns:
        None
    """
    # Load JSON data
    aggregate_json_path = os.path.join(os.getenv("EXTRACTED_JSON_PATH"), f"{constuct_from}_aggregate_data.json")

    with open(aggregate_json_path, "r") as f:
        data = json.load(f)

    # Neo4j connection details
    uri = os.getenv("NEO4J_URI")
    username = os.getenv("NEO4J_USERNAME")
    password = os.getenv("NEO4J_PASSWORD")

    # Connect to Neo4j; the context manager closes the driver (and its
    # connection pool) even when one of the steps below raises
    with GraphDatabase.driver(uri, auth=(username, password)) as driver:
        # Execute Cypher queries
        execute_cypher_queries(driver, data)

        # Create embeddings index
        create_embeddings_index(driver)

By default, the knowledge graph is constructed from the LLM-extracted data. To construct it from manually created JSON data instead, add the data manually as follows:

1. Create a folder called 'extracted_json' in the data directory.
2. Inside the 'extracted_json' folder, create a JSON file named 'manual_aggregate_data.json' (the name the loader looks for when called with "manual") and add the data. The data should follow the schema defined in data/schema/schema.json. You can find an example of dummy JSON data in data/schema/schema_example.json.

In [None]:
# WARNING: rebuilding deletes every existing node and relationship in the target database.
# The keyword is spelled `constuct_from` to match the function signature.
create_knowledge_graph(constuct_from = "llm") # construct_from = "llm" or "manual"

# **8. Test data retrieval from Knowledge Graph with LLM**

In [None]:
# natural-language question passed to the retrieval processor in the cells below
prompt = "How Many Meetings are there?"

In [None]:
# instantiate the LLM query processor with the same Neo4j connection settings
# that are used to build the graph
processor = llm_kg_retrieval.KnowledgeGraphRAG(
                        url=os.getenv("NEO4J_URI"),
                        username=os.getenv("NEO4J_USERNAME"),
                        password=os.getenv("NEO4J_PASSWORD"),
                    )


In [None]:
# get response from LLM; process_prompt returns three values, of which only
# the first (the answer) is used here
response, _, _= processor.process_prompt(prompt)
print("Response:", response)