# **Install and Import Libraries**

> ##### **Add the OpenAI API key to the config/secrets.env file as follows:**

> ###### **OPENAI_API_KEY = "<api_key>"**


In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to local modules take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from dotenv import load_dotenv
import os

# Load non-secret settings into the process environment. The path is relative
# to the notebook's working directory. By default load_dotenv does not
# override variables that are already set in the environment.
# load config
load_dotenv("../config/config.env")

# Load secrets from a separate env file.
# load secrets
load_dotenv("../config/secrets.env")

# Imported only after both env files are loaded.
# NOTE(review): presumably data_pipeline reads environment variables at import
# time, which would make this ordering significant - confirm.
# The step functions called in the cells below are not defined in this
# notebook, so they presumably come from this star import.
from data_pipeline import *

# **1. Scrape Website**

In [None]:
# Run the scraping step and keep its return value in df.
# scrape_website is not defined in this notebook; presumably it comes from the
# data_pipeline star import.
df = scrape_website()

In [None]:
# Bare expression as the last line of the cell: the notebook renders the current df.
df

# **2. Download all PDFs from links**

In [None]:
# Run the download step and rebind df to its return value.
# download_documents takes no arguments here, so it does not receive the
# previous df directly; presumably it reads its input from disk or
# configuration - confirm in data_pipeline.
df = download_documents()

In [None]:
# Bare expression as the last line of the cell: the notebook renders the current df.
df

# **3. Convert PDFs to HTML**

In [None]:
# Run the PDF-to-HTML conversion step. Its return value (if any) is not
# assigned to a name, so its results are presumably written to disk - confirm
# in data_pipeline.
convert_pdf_to_html()

# **4. Extract Meeting Metadata from PDF with LLM**

In [None]:
# Run the meeting-metadata extraction step and rebind df to its return value.
# extract_meeting_metadata is not defined in this notebook; presumably it
# comes from the data_pipeline star import.
df = extract_meeting_metadata()

In [None]:
# Bare expression as the last line of the cell: the notebook renders the current df.
df

# **5. Extract Proposals and Decisions**

## Note: this step is still not working as expected

In [None]:
# Run the agenda (proposals and decisions) extraction step and rebind df to
# its return value.
# NOTE(review): the meaning of the positional argument 10 is not visible from
# this notebook (possibly a limit on the number of documents processed) -
# confirm against data_pipeline.extract_meeting_agenda.
df = extract_meeting_agenda(10)

# **6. Export JSON**

In [None]:
import pandas as pd

# Re-read the metadata table from disk, using the first CSV column as the
# index, and replace every missing value with an empty string so later cells
# can test cells against "".
metadata_csv = "../data/metadata.csv"
df = pd.read_csv(metadata_csv, index_col=0).fillna("")

In [None]:
# Export one JSON file per qualifying row of `df` below EXTRACTED_JSON_PATH:
#   <root>/<verksamhetsorgan>/<meeting_date>/<doc_name up to the first ".">/
# - rows with a non-empty meeting_end_time -> llm_meeting_metadata.json
# - otherwise, rows with a non-empty agenda_metadata -> llm_meeting_item.json
# - rows with neither are skipped.
import html  # not used in this cell; kept in case other cells rely on it
import json

extracted_json_path = os.getenv("EXTRACTED_JSON_PATH")
if extracted_json_path is None:
    # Fail early with a clear message instead of a TypeError from os.path.join.
    raise RuntimeError("EXTRACTED_JSON_PATH is not set in the environment")


def _json_field(raw):
    """Parse a CSV cell holding JSON text; a blank cell becomes an empty list.

    Blank cells occur because missing values were replaced with "" earlier.
    Raises json.JSONDecodeError if a non-blank cell is not valid JSON.
    """
    text = str(raw).strip()
    return json.loads(text) if text else []


def _write_json(path, payload):
    """Write *payload* to *path* as indented UTF-8 JSON, creating parent folders."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # Explicit UTF-8: ensure_ascii=False emits non-ASCII characters verbatim,
    # so the platform default encoding must not be relied upon.
    with open(path, "w", encoding="utf-8") as f:
        f.write(json.dumps(payload, indent=4, ensure_ascii=False))


for _, row in df.iterrows():
    # Folder that receives this row's JSON file.
    # NOTE(review): split(".")[0] truncates document names that contain more
    # than one dot; kept unchanged so existing output paths stay the same.
    save_path = os.path.join(
        extracted_json_path,
        row['verksamhetsorgan'],
        row['meeting_date'],
        row['doc_name'].split(".")[0],
    )

    if row['meeting_end_time'] != "":
        # Build the document as a dict so json.dumps escapes quotes and
        # backslashes in the values. The list-like columns are expected to
        # hold JSON text (presumably arrays from the LLM extraction step).
        meeting_metadata = {
            "meetingDate": str(row['meeting_date']),
            "startTime": str(row['meeting_time']),
            "meetingReference": str(row['meeting_reference']),
            "endTime": str(row['meeting_end_time']),
            "meetingPlace": str(row['meeting_place']),
            "members": _json_field(row['members']),
            "substitutes": _json_field(row['substitutes']),
            "additionalAttendees": _json_field(row['additional_attendees']),
            "protocolSignatories": _json_field(row['protocol_signatories']),
            "adjustedBy": _json_field(row['protocol_adjusters']),
            "adjustmentDate": str(row['protocol_adjustment_date']),
            # meeting items will be added when constructing the aggregate JSON file
            "meetingItems": [],
        }
        _write_json(
            os.path.join(save_path, "llm_meeting_metadata.json"),
            meeting_metadata,
        )

    elif row['agenda_metadata'] != "":
        item = json.loads(row['agenda_metadata'])
        item['rubrik'] = row['rubrik']
        item['section'] = row['section']

        # Attachments are the rows whose parent_link points at this document.
        attachments = df[df['parent_link'] == row['doc_link']]
        item['attachments'] = [
            {"rubrik": attachment['rubrik'], "link": attachment['doc_link']}
            for _, attachment in attachments.iterrows()
        ]

        _write_json(os.path.join(save_path, "llm_meeting_item.json"), item)


In [None]:

# Sanity check: count the entries three levels below the export root
# (<verksamhetsorgan>/<meeting_date>/<document>). The export cell creates one
# such folder per row it writes a JSON file for, so the number should equal the
# count of rows with LLM-extracted meeting metadata or agenda metadata.
import glob

meeting_dir_pattern = extracted_json_path + "/*/*/*"
meeting_dirs = glob.glob(meeting_dir_pattern)
len(meeting_dirs)