# **Install and Import Libraries**

> ##### **Add the OpenAI API key to the `config/secrets.env` file as follows:**

> ###### **`OPENAI_API_KEY = "<api_key>"`**


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from dotenv import load_dotenv
import os
import json
from neo4j import GraphDatabase

# load config
load_dotenv("../config/config.env")

# load secrets
load_dotenv("../config/secrets.env")

from data_pipeline import *

# **1. Scrape Website**

In [None]:
# Step 1: run the scraper and keep whatever it returns in `df`.
# `scrape_website` is not defined in this notebook; presumably it is provided
# by the `from data_pipeline import *` star import — verify there.
df = scrape_website()

In [None]:
df

# **2. Download all PDFs from links**

In [None]:
# Step 2: download the documents. The call takes no arguments, so it does not
# receive the `df` from step 1 directly; `df` is overwritten with the result.
# NOTE(review): presumably the helper reads the scrape results from disk — confirm.
df = download_documents()

In [None]:
df

# **3. Extract HTML from PDFs**

In [None]:
# Step 3: convert the downloaded PDFs to HTML. Any return value is discarded,
# so this step is run purely for its side effects.
convert_pdf_to_html()

# **4. Extract Meeting Metadata from PDF with LLM**

In [None]:
# Step 4: extract meeting metadata and keep the result in `df`
# (replaces the value from the previous steps).
df = extract_meeting_metadata()

In [None]:
df

# **5. Extract Proposals and Decisions**

> **Note:** this step is still not working as expected.

In [None]:
# Step 5: extract proposals and decisions.
# NOTE(review): the meaning of the argument `10` is not visible in this
# notebook (possibly a limit on the number of documents) — confirm in
# `extract_meeting_agenda`.
df = extract_meeting_agenda(10)

# **6. Export JSON**

In [None]:
def _parse_json_field(value):
    """Return a JSON-encoded cell as a Python object; blank cells become []."""
    if value is None:
        return []
    if isinstance(value, str):
        value = value.strip()
        return json.loads(value) if value else []
    # already a decoded list/dict (e.g. the frame was not round-tripped via CSV)
    return value


def construct_individual_json(df):
    """
    Constructs the individual json files for each meeting item and meeting metadata.

    For every row of ``df``:

    * a row with a non-empty ``meeting_end_time`` is treated as a meeting
      protocol and written to
      ``<EXTRACTED_JSON_PATH>/<verksamhetsorgan>/<meeting_date>/llm_meeting_metadata.json``;
    * otherwise, a row with a non-empty ``agenda_metadata`` is treated as a
      meeting item and written to
      ``<EXTRACTED_JSON_PATH>/<verksamhetsorgan>/<meeting_date>/<doc name>/llm_meeting_item.json``
      together with its attachments (rows whose ``parent_link`` equals the
      item's ``doc_link``);
    * all other rows are skipped.

    Args:
        df: DataFrame of document metadata with missing values already replaced
            by ``""``. List-valued columns (``members``, ``substitutes``, ...)
            and ``agenda_metadata`` are expected to hold JSON text.

    Raises:
        json.JSONDecodeError: if a non-empty JSON column holds invalid JSON.
        TypeError: if ``EXTRACTED_JSON_PATH`` is not set and ``df`` has rows.
    """
    base_path = os.getenv("EXTRACTED_JSON_PATH")

    for _, row in df.iterrows():
        # directory for this document's JSON output
        save_path = os.path.join(base_path, row['verksamhetsorgan'], row['meeting_date'], row['doc_name'].split(".")[0])

        if row['meeting_end_time'] != "":
            # the metadata file lives one level up, in the meeting directory
            meeting_dir = os.path.dirname(save_path)
            os.makedirs(meeting_dir, exist_ok=True)
            metadata_save_path = os.path.join(meeting_dir, "llm_meeting_metadata.json")

            # Build a dict and let json.dumps do the escaping: interpolating the
            # values into a JSON template breaks on quotes, backslashes and
            # newlines, and on blank list cells.
            metadata = {
                "meetingDate": str(row['meeting_date']),
                "startTime": str(row['meeting_time']),
                "meetingReference": str(row['meeting_reference']),
                "endTime": str(row['meeting_end_time']),
                "meetingPlace": str(row['meeting_place']),
                "members": _parse_json_field(row['members']),
                "substitutes": _parse_json_field(row['substitutes']),
                "additionalAttendees": _parse_json_field(row['additional_attendees']),
                "protocolSignatories": _parse_json_field(row['protocol_signatories']),
                "adjustedBy": _parse_json_field(row['protocol_adjusters']),
                "adjustmentDate": str(row['protocol_adjustment_date']),
                # meeting items are added when constructing the aggregate JSON file
                "meetingItems": [],
            }

            # ensure_ascii=False emits raw non-ASCII characters, so the file
            # encoding must be fixed explicitly instead of left to the locale
            with open(metadata_save_path, "w", encoding="utf-8") as f:
                f.write(json.dumps(metadata, indent=4, ensure_ascii=False))

        elif row['agenda_metadata'] != "":
            os.makedirs(save_path, exist_ok=True)
            item_save_path = os.path.join(save_path, "llm_meeting_item.json")

            item = json.loads(row['agenda_metadata'])
            item['rubrik'] = row['rubrik']
            item['section'] = row['section']

            # attachments are the rows whose parent link points at this document
            attachments = df[df['parent_link'] == row['doc_link']]
            item['attachments'] = [
                {"rubrik": attachment['rubrik'], "link": attachment['doc_link']}
                for _, attachment in attachments.iterrows()
            ]

            with open(item_save_path, "w", encoding="utf-8") as f:
                f.write(json.dumps(item, indent=4, ensure_ascii=False))

In [None]:
# Reload the persisted metadata table, replace missing values with empty
# strings (the export function compares cells against ""), then write the
# per-meeting and per-item JSON files.
df = pd.read_csv("../data/metadata.csv", index_col=0).fillna("")
construct_individual_json(df)

In [None]:
def construct_aggregate_json(extracted_json_path, construct_from = "llm"): # construct_from = "llm" or "manual"
    """
    Constructs a single JSON out of all the meeting metadata and items.

    Walks ``<extracted_json_path>/<organ>/<meeting>/`` and, for every meeting
    directory containing ``<construct_from>_meeting_metadata.json``, appends
    each ``<item dir>/<construct_from>_meeting_item.json`` to that metadata's
    ``meetingItems`` list. The result is written to
    ``<extracted_json_path>/<construct_from>_aggregate_data.json``.

    Directory entries are visited in name order so the output is reproducible
    between runs (``os.scandir`` itself yields entries in arbitrary order).

    Args:
        extracted_json_path: root directory of the extracted JSON files.
        construct_from: file-name prefix selecting which files to aggregate.

    Returns:
        None. If ``extracted_json_path`` does not exist a message is printed
        and nothing is written.
    """
    if not os.path.exists(extracted_json_path):
        print("Extracted JSON path does not exist")
        return

    metadata_name = f"{construct_from}_meeting_metadata.json"
    item_name = f"{construct_from}_meeting_item.json"

    def _read_json(path):
        # files are written with ensure_ascii=False, so decode explicitly as UTF-8
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _subdirs(path):
        # close the scandir iterator promptly and return directories in name order
        with os.scandir(path) as entries:
            return sorted((e for e in entries if e.is_dir()), key=lambda e: e.name)

    aggregate_json = {'verksamhetsorgan': []}
    for organ in _subdirs(extracted_json_path):
        aggregate_meeting = []
        for meeting in _subdirs(organ.path):
            metadata_path = os.path.join(meeting.path, metadata_name)
            if not os.path.exists(metadata_path):
                continue
            metadata = _read_json(metadata_path)
            # tolerate metadata files that were saved without the placeholder list
            meeting_items = metadata.setdefault('meetingItems', [])
            for item_dir in _subdirs(meeting.path):
                item_path = os.path.join(item_dir.path, item_name)
                if os.path.exists(item_path):
                    meeting_items.append(_read_json(item_path))
            aggregate_meeting.append(metadata)
        aggregate_json['verksamhetsorgan'].append({
            "name": organ.name,
            "meetings": aggregate_meeting
        })

    aggregate_path = os.path.join(extracted_json_path, f"{construct_from}_aggregate_data.json")
    with open(aggregate_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(aggregate_json, indent=4, ensure_ascii=False))

In [None]:
# Merge the per-meeting files under EXTRACTED_JSON_PATH into
# <EXTRACTED_JSON_PATH>/llm_aggregate_data.json.
construct_aggregate_json(os.getenv("EXTRACTED_JSON_PATH"), construct_from = "llm") # construct_from = "llm" or "manual"

In [None]:

# Sanity check for number of folders created: counts the entries three levels
# below EXTRACTED_JSON_PATH. Should correspond to number of meetings with
# metadata extracted with llm.
# NOTE(review): the pattern matches files as well as folders at that depth;
# construct_individual_json writes llm_meeting_metadata.json at exactly this
# level, so those files are included in the count — confirm this is intended.
import glob
len(glob.glob(os.getenv("EXTRACTED_JSON_PATH") + "/*/*/*"))

# **7. Create a Knowledge Graph from JSON**

In [None]:
# Function to execute Cypher queries
# (list key on a meeting item, relationship type for Person -> MeetingItem)
_ITEM_PERSON_RELATIONSHIPS = (
    ("preparers", "PREPARED"),
    ("proposers", "PROPOSED"),
    ("signatories", "SIGNED"),
    ("adjusters", "ADJUSTED"),
)


def _text(value):
    """Map a missing or null JSON value to '' (MERGE rejects null properties)."""
    return "" if value is None else value


def _link_people_to_item(session, people, relationship, item_id):
    """MERGE each person and a (:Person)-[:<relationship>]->(:MeetingItem) edge."""
    # Cypher cannot take a relationship type as a query parameter, so it is
    # interpolated here; it only ever comes from _ITEM_PERSON_RELATIONSHIPS.
    query = f"""
        MERGE (p:Person {{name: coalesce($name, '')}})
        WITH p
        MATCH (i:MeetingItem) WHERE id(i) = $itemId
        MERGE (p)-[:{relationship}]->(i)
    """
    for person in people:
        session.run(query, name=_text(person.get("name")), itemId=item_id)


# Function to execute Cypher queries
def execute_cypher_queries(driver, data):
    """
    Load the aggregate meeting JSON into Neo4j using idempotent MERGE queries.

    Graph written (all nodes and relationships are MERGEd, so re-running does
    not duplicate them):

    * ``(:Organ)-[:HOSTED]->(:Meeting)``
    * ``(:Person)-[:ATTENDED {role, attendance}]->(:Meeting)``
    * ``(:Person)-[:SUBSTITUTE_ATTENDEE]->(:Meeting)`` and, when
      ``substitutedFor`` is non-empty, ``(:Person)-[:SUBSTITUTED_FOR]->(:Person)``
    * ``(:Person)-[:ADDITIONAL_ATTENDEE {role}]->(:Meeting)``
    * ``(:Meeting)-[:HAS_ITEM]->(:MeetingItem)``
    * ``(:Person)-[:PREPARED|PROPOSED|SIGNED|ADJUSTED]->(:MeetingItem)``
    * ``(:MeetingItem)-[:HAS_ATTACHMENT]->(:Attachment)``

    Missing or ``null`` scalar values are stored as ``''`` and missing or
    ``null`` lists are treated as empty.

    NOTE(review): the meeting-level ``protocolSignatories`` and ``adjustedBy``
    keys are not read here; only item-level ``signatories`` / ``adjusters`` are.
    NOTE(review): nodes are addressed with Cypher ``id()``, which Neo4j 5
    deprecates in favour of ``elementId()`` — revisit when upgrading the server.

    Args:
        driver: object exposing ``session()`` as a context manager whose
            ``run(query, **params)`` result has ``single()`` (a Neo4j driver).
        data: dict with a ``verksamhetsorgan`` list, as produced by
            ``construct_aggregate_json``.
    """
    with driver.session() as session:
        for organ in data.get("verksamhetsorgan") or []:
            organ_name = _text(organ.get("name"))

            # Merge Organ
            session.run("""
                MERGE (o:Organ {name: $name})
            """, name=organ_name)

            # Process meetings
            for meeting in organ.get("meetings") or []:
                # Merge Meeting and link it to its organ in one round trip
                meeting_id = session.run("""
                    MERGE (m:Meeting {
                      meetingDate: $meetingDate,
                      startTime: $startTime,
                      meetingReference: $meetingReference,
                      endTime: $endTime,
                      meetingPlace: $meetingPlace
                    })
                    WITH m
                    MATCH (o:Organ {name: $organName})
                    MERGE (o)-[:HOSTED]->(m)
                    RETURN id(m)
                """, meetingDate=_text(meeting.get("meetingDate")),
                     startTime=_text(meeting.get("startTime")),
                     meetingReference=_text(meeting.get("meetingReference")),
                     endTime=_text(meeting.get("endTime")),
                     meetingPlace=_text(meeting.get("meetingPlace")),
                     organName=organ_name).single()[0]

                # Process Members
                for person in meeting.get("members") or []:
                    session.run("""
                        MERGE (p:Person {name: $name})
                        WITH p
                        MATCH (m:Meeting) WHERE id(m) = $meetingId
                        MERGE (p)-[:ATTENDED {
                            role: $role,
                            attendance: coalesce($attendance, '')
                        }]->(m)
                    """, name=_text(person.get("name")),
                         role=_text(person.get("role")),
                         attendance=_text(person.get("attendance")),
                         meetingId=meeting_id)

                # Process Substitutes
                for substitute in meeting.get("substitutes") or []:
                    session.run("""
                        // Create or find the substitute node and connect to the meeting
                        MERGE (s:Person {name: $name})
                        WITH s
                        MATCH (m:Meeting) WHERE id(m) = $meetingId
                        MERGE (s)-[:SUBSTITUTE_ATTENDEE]->(m)
                        WITH s
                        // Only proceed if substitutedFor is not an empty string
                        WHERE $substitutedFor <> ''
                        // Create or find the substituted person node and create a relationship
                        MERGE (substituted:Person {name: $substitutedFor})
                        MERGE (s)-[:SUBSTITUTED_FOR]->(substituted)
                    """, name=_text(substitute.get("name")),
                         substitutedFor=_text(substitute.get("substitutedFor")),
                         meetingId=meeting_id)

                # Process Additional Attendees
                for attendee in meeting.get("additionalAttendees") or []:
                    session.run("""
                        MERGE (a:Person {name: $name})
                        WITH a
                        MATCH (m:Meeting) WHERE id(m) = $meetingId
                        MERGE (a)-[:ADDITIONAL_ATTENDEE {
                            role: coalesce($role, '')
                        }]->(m)
                    """, name=_text(attendee.get("name")),
                         role=_text(attendee.get("role")),
                         meetingId=meeting_id)

                # Process Meeting Items
                for item in meeting.get("meetingItems") or []:
                    item_id = session.run("""
                        MERGE (i:MeetingItem {
                            rubrik: coalesce($rubrik, ''),
                            section: coalesce($section, ''),
                            protocolContext: coalesce($protocolContext, ''),
                            beslut: coalesce($beslut, '')
                        })
                        RETURN id(i)
                    """, rubrik=_text(item.get("rubrik")),
                         section=_text(item.get("section")),
                         protocolContext=_text(item.get("protocolContext")),
                         beslut=_text(item.get("beslut"))).single()[0]

                    # Link Meeting to Meeting Item
                    session.run("""
                        MATCH (m:Meeting) WHERE id(m) = $meetingId
                        MATCH (i:MeetingItem) WHERE id(i) = $itemId
                        MERGE (m)-[:HAS_ITEM]->(i)
                    """, meetingId=meeting_id, itemId=item_id)

                    # Preparers, proposers, signatories and adjusters all follow
                    # the same Person -> MeetingItem pattern
                    for list_key, relationship in _ITEM_PERSON_RELATIONSHIPS:
                        _link_people_to_item(session, item.get(list_key) or [], relationship, item_id)

                    # Process Meeting Item Attachments
                    for attachment in item.get("attachments") or []:
                        session.run("""
                            MERGE (a:Attachment {link: coalesce($link, ''), rubrik: coalesce($rubrik, '')})
                            WITH a
                            MATCH (i:MeetingItem) WHERE id(i) = $itemId
                            MERGE (i)-[:HAS_ATTACHMENT]->(a)
                        """, link=_text(attachment.get("link")),
                             rubrik=_text(attachment.get("rubrik")),
                             itemId=item_id)

def create_knowledge_graph(constuct_from = "llm"): # "llm" or "manual"
    """
    Load ``<EXTRACTED_JSON_PATH>/<constuct_from>_aggregate_data.json`` into Neo4j.

    Connection details are read from the ``NEO4J_URI``, ``NEO4J_USERNAME`` and
    ``NEO4J_PASSWORD`` environment variables.

    Args:
        constuct_from: file-name prefix of the aggregate file to load. The
            parameter name is misspelled; it is kept as-is because callers
            pass it by keyword.
    """
    # Load JSON data (written with ensure_ascii=False, so read it as UTF-8)
    aggregate_json_path = os.path.join(os.getenv("EXTRACTED_JSON_PATH"), f"{constuct_from}_aggregate_data.json")

    with open(aggregate_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Neo4j connection details
    uri = os.getenv("NEO4J_URI")
    username = os.getenv("NEO4J_USERNAME")
    password = os.getenv("NEO4J_PASSWORD")

    # Connect to Neo4j
    driver = GraphDatabase.driver(uri, auth=(username, password))
    try:
        # Execute Cypher queries
        execute_cypher_queries(driver, data)
    finally:
        # Always release the connection pool, even if a query fails
        driver.close()

In [None]:
# Load llm_aggregate_data.json into Neo4j.
# Note: the keyword really is spelled `constuct_from` in the function signature.
create_knowledge_graph(constuct_from = "llm") # construct_from = "llm" or "manual"

# **8. Test Knowledge Graph Query with LLM**

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import GraphCypherQAChain
from langchain.graphs import Neo4jGraph

In [None]:
# Graph wrapper pointed at the Neo4j instance named by the NEO4J_* environment
# variables; indexing os.environ raises KeyError if any of them is unset.
graph = Neo4jGraph(
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"],
)

In [None]:
graph.schema

In [None]:
# Build a question-answering chain over the graph and ask one sample question.
# return_intermediate_steps=True is requested so the result carries more than
# the final answer.
chain = GraphCypherQAChain.from_llm(
    llm=ChatOpenAI(temperature=0, model='gpt-4-1106-preview'),
    graph=graph,
    verbose=True,
    validate_cypher=True,
    return_intermediate_steps=True,
)
result = chain("what organs is Håkan Braskén associated with")

In [None]:
result