# **Install and Import Libraries**

> ##### **Make sure the `secrets.env` file is in the `config` folder. An example can be found in the `config/secrets_example.env` file.**


In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import warnings
from pathlib import Path

from dotenv import load_dotenv

# Environment files, relative to the notebook's working directory:
# general configuration first, then secrets.
ENV_FILES = ("../config/config.env", "../config/secrets.env")

for env_file in ENV_FILES:
    # load_dotenv does not raise when the file is missing, so surface that
    # here instead of failing later on an unset environment variable.
    if not Path(env_file).is_file():
        warnings.warn(f"Environment file not found: {env_file}")
    load_dotenv(env_file)

In [None]:
from data_pipeline import *
import chatbot.llm_kg_retrieval as llm_kg_retrieval

# **1. Scrape Website**
> Takes approximately 12 minutes to run.

> One can possibly use asynchronous functions to speed up this process.

In [None]:
# Run the website scraper (not defined in this notebook; presumably provided by
# `from data_pipeline import *`). Long-running: roughly 12 minutes per the note above.
scrape_website()

# **2. Download all meeting documents from the scraped links**

> One can possibly use asynchronous functions to speed up this process.

In [None]:
# Download the meeting documents from the scraped links.
# overwrite=False presumably skips files that already exist locally — confirm in data_pipeline.
download_documents(overwrite=False)

# **3. Extract HTML and text from PDFs**

In [None]:
# Only PDF and DOCX files are converted, so the number of converted files
# may be smaller than the number of downloaded files.
# First pass: XHTML output (add_ids_to_tags=True requests ids on the tags).
convert_files(output_type="xhtml", overwrite=True, add_ids_to_tags=True)
# Second pass: plain-text output. Both passes are called with overwrite=True.
convert_files(output_type="text", overwrite=True)

# **4. Extract Meeting Metadata**

In [None]:
# Build the dataframe of meeting-metadata documents. It can be filtered first
# if metadata should only be extracted for specific documents.
# The dataframe also carries the columns is_manual_metadata_extracted and
# is_llm_metadata_extracted, which show whether the data has already been
# extracted manually and/or with the LLM.
metadata_df = get_documents_dataframe(type="metadata")

In [None]:
import time

# Submit the metadata extraction as a batch job (OpenAI rate limits are taken
# into account; the limit is defined in the config file). Only the first of
# the two returned ids is needed for metadata.
metadata_batch_id, _ = extract_meeting_data_batch(
    df=metadata_df,
    type="metadata",
    overwrite_data=True,
)
time.sleep(3)

# Poll once per second until check_batch_status hands back an output id.
print("Status for batch metadata extraction:")
metadata_output_id = None
while True:
    metadata_output_id = check_batch_status(metadata_batch_id)
    time.sleep(1)
    if metadata_output_id is not None:
        break

# Fetch the output that belongs to the returned output id.
metadata_output_jsonl = retrieve_batch_output(metadata_output_id)

In [None]:
# Save the metadata batch results for the documents in metadata_df.
# The helper is awaited, so this cell relies on IPython's top-level await support.
await save_metadata_llm_batch_results(metadata_output_jsonl, metadata_df)

# **5. Extract Meeting Agenda**

In [None]:
# Build the dataframe of meeting-agenda documents (same helper as for the metadata, with type="agenda").
agenda_df = get_documents_dataframe(type="agenda")

In [None]:
# Disabled alternative, kept for reference: asynchronously extract the meeting
# agenda (taking into account OpenAI rate limits; limit defined in config file).
# NOTE(review): the disabled call passes the builtin `type` as the `type`
# argument; it presumably should be type="agenda" — confirm before re-enabling.
# await extract_meeting_data(df=agenda_df, type=type)

In [None]:
# Batch-extract the meeting agenda and its references.
import time


def _wait_for_batch_output(batch_id, label, poll_interval=1):
    """Poll a batch until its output id is available, then fetch the output.

    Args:
        batch_id: Id of the batch to poll, as returned by
            extract_meeting_data_batch.
        label: Name of the extraction, used in the printed status header.
        poll_interval: Seconds to wait between two status checks.

    Returns:
        A tuple (output_id, output_jsonl): the id returned by
        check_batch_status and the result of retrieve_batch_output for it.
    """
    print(f"Status for batch {label} extraction:")
    while True:
        output_id = check_batch_status(batch_id)
        if output_id is not None:
            return output_id, retrieve_batch_output(output_id)
        # No output id yet: wait before asking again.
        time.sleep(poll_interval)


# Submit both batches; the call returns the agenda and the references batch ids.
agenda_batch_id, references_batch_id = extract_meeting_data_batch(df=agenda_df, type="agenda")
time.sleep(3)

# Wait for the agenda batch first, then for the references batch.
agenda_output_id, agenda_output_jsonl = _wait_for_batch_output(agenda_batch_id, "agenda")
references_output_id, references_output_jsonl = _wait_for_batch_output(references_batch_id, "references")

In [None]:
# Save the agenda batch results for the documents in agenda_df, passing the
# references batch output along as references_jsonl.
await save_agenda_llm_batch_results(agenda_output_jsonl, agenda_df, references_jsonl=references_output_jsonl)

# Create an HTML preview of the extracted agenda data (saved in the notebooks folder).
create_agenda_html(agenda_df)

# **6. Export JSON**

In [None]:
# Build the aggregate JSON export from the LLM-extracted data.
construct_aggregate_json(construct_from="llm") # construct_from = "llm" or "manual"

# **7. Create a Knowledge Graph from JSON**

In [None]:
# Caution: this will completely overwrite the current knowledge graph, if any.
# Here the graph is built from the LLM-extracted data.
create_knowledge_graph(construct_from = "llm") # construct_from = "llm" or "manual"

By default, the knowledge graph is constructed from LLM-extracted data. To construct it from manually created JSON data instead, add the data as follows:

1. Manually create JSON files with the extracted data inside the respective folders in the `data/protocols` folder, and name each file `manual_meeting_metadata.json` or `manual_meeting_agenda.json` depending on the document type. The folder structure is `<body>`/`<meeting_date>`/`<document>`. Put the JSON file inside the `<document>` folder.

2. Execute the `construct_aggregate_json(construct_from="manual")` function. This will fail if the created JSON does not follow the schema defined in `data/schema/schema.json`.

3. Execute the `create_knowledge_graph(construct_from="manual")` function.

# **8. Test data retrieval from Knowledge Graph with LLM**

In [None]:
# Example natural-language question used to test retrieval from the knowledge graph.
prompt = "Are there any errands involving cats?"

In [None]:
# Instantiate the LLM query processor with the Neo4j connection settings
# read from environment variables.
# `os` is imported explicitly here: the notebook never imports it itself, so
# it would otherwise only be available if `from data_pipeline import *`
# happened to expose it.
import os

processor = llm_kg_retrieval.KnowledgeGraphRAG(
    url=os.getenv("NEO4J_URI"),
    username=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD"),
)

In [None]:
# Get the response from the LLM. process_prompt's result is unpacked into
# three values; only the first (the response) is used here.
response, _, _= processor.process_prompt(prompt)
print("Response:", response)