# **Install and Import Libraries**

In [None]:
!pip install pymupdf
!pip install docx2pdf
!pip install mammoth

# !pip install openai

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import time

# **1. Scrape Website**

In [None]:
def fetch_and_parse(url, timeout=30):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Args:
        url: URL of the page to fetch.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the (recursive) scrape indefinitely.

    Returns:
        A ``BeautifulSoup`` document built with the built-in ``html.parser``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an HTTP
            error status, so an error page is never parsed as if it were data.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def get_header_keys(header_row):
    """Return the stripped text of every ``<th>`` cell found in ``header_row``.

    A falsy ``header_row`` (for example ``None`` when the table has no header
    row) produces an empty list.
    """
    if not header_row:
        return []
    return [th.get_text(strip=True) for th in header_row.find_all('th')]

def scrape_table(table, base_url):
    """Convert an HTML table into a list of row dicts, following nested links.

    Each data row becomes a dict keyed by the column header text (or
    ``Cell_<index>`` when the cell has no matching header). A cell without
    links is stored as its stripped text. A cell with links is stored as a
    list with one entry per recognised link:

    * href containing ``docid``: ``{'doc_url', 'title'}``
    * any other href containing ``id``: the linked page is fetched and its
      first table scraped recursively, giving
      ``{'url', 'title', 'nested_data'}``

    Links matching neither rule are dropped.

    Args:
        table: Parsed table element exposing ``find`` / ``find_all``.
        base_url: Prefix prepended to every (site-relative) href.

    Returns:
        List of non-empty row dicts, in document order.
    """
    header_row = table.find('tr', class_='colheader')
    headers = get_header_keys(header_row)
    rows = table.find_all('tr')
    # Data rows start right after the header row, or at the top if there is none.
    data_row_start = rows.index(header_row) + 1 if header_row else 0

    data = []
    for row in rows[data_row_start:]:
        row_data = {}
        for index, cell in enumerate(row.find_all('td')):
            cell_key = headers[index] if index < len(headers) else f'Cell_{index}'
            links = cell.find_all('a', href=True)
            if not links:
                row_data[cell_key] = cell.get_text(strip=True)
                continue

            link_data = []
            for link in links:
                href = link['href']
                link_text = link.get_text(strip=True)
                if 'docid' in href:
                    # Direct document link: record it, do not follow it.
                    link_data.append({'doc_url': base_url + href, 'title': link_text})
                elif 'id' in href:
                    # 'id' also matches hrefs containing 'bid', so a separate
                    # 'bid' test is unnecessary.
                    nested_url = base_url + href
                    nested_tables = fetch_and_parse(nested_url).find_all('table')
                    # A linked page without any table yields no nested rows
                    # instead of raising IndexError and aborting the scrape.
                    nested_data = scrape_table(nested_tables[0], base_url) if nested_tables else []
                    link_data.append({'url': nested_url, 'title': link_text, 'nested_data': nested_data})
            row_data[cell_key] = link_data
        if row_data:
            data.append(row_data)
    return data

# Site root (prefix for the relative hrefs found in the tables) and the
# listing page where the scrape starts.
base_url = 'https://kungorelse.nykarleby.fi:8443'
start_url = 'https://kungorelse.nykarleby.fi:8443/ktwebbin/dbisa.dll/ktwebscr/pk_kokl_tweb.htm'

# Scrape the first table of the listing page; nested pages are followed
# recursively by scrape_table.
soup = fetch_and_parse(start_url)
tables = soup.find_all('table')
result = scrape_table(tables[0], base_url)

# Persist the scraped structure as indented UTF-8 JSON; ensure_ascii=False
# keeps non-ASCII characters unescaped in the file.
with open('/content/result1.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(result, ensure_ascii=False, indent=4))

# **2. Convert to DataFrame for Easier Access and Filtering**

In [None]:
# Load the scraped JSON file into a Python object to explore its structure.
# The scrape cell writes this file as UTF-8 with ensure_ascii=False, so decode
# it explicitly instead of relying on the platform's default encoding.
# NOTE(review): the scrape cell writes to /content/result1.json while this cell
# reads from the Drive folder; presumably the file is copied there manually --
# confirm.
with open("/content/drive/MyDrive/result1.json", "r", encoding="utf-8") as file:
    json_data = json.load(file)

In [None]:
columns = ['doc_link', 'rubrik', 'section', 'meeting_date', 'meeting_time', 'meeting_reference', 'verksamhetsorgan', 'parent_link']

# Collect one dict per document / attachment and build the frame once at the
# end; concatenating a one-row frame per record is quadratic in the row count.
records = []

for meeting in json_data:
  for doc in meeting['Datum'][0]['nested_data']:
    # Only rows whose 'Rubrik' cell holds a direct document link are kept.
    # Empty or text-only cells are skipped instead of raising on [0].
    rubrik_links = doc.get('Rubrik')
    if not (isinstance(rubrik_links, list) and rubrik_links
            and isinstance(rubrik_links[0], dict) and 'doc_url' in rubrik_links[0]):
      continue

    # Fields shared by the document and all of its attachments.
    # The 'Datum' title is split on a space into date and time; the
    # 'Verksamhetsorgan' text is split on ':' into organ name and reference.
    title_parts = meeting['Datum'][0]['title'].split(' ')
    organ_parts = meeting['Verksamhetsorgan'].split(":")
    meeting_fields = {
        'meeting_date': title_parts[0].strip(),
        'meeting_time': title_parts[1].strip(),
        'meeting_reference': organ_parts[1].strip(),
        'verksamhetsorgan': organ_parts[0].strip(),
    }

    section_number = doc.get('§')
    parent_row = {
        'doc_link': rubrik_links[0]['doc_url'],
        'rubrik': rubrik_links[0]['title'],
        'section': "" if not section_number else f"§ {section_number}",
        **meeting_fields,
        'parent_link': ""
    }
    records.append(parent_row)

    # Attachments are present only when the 'Bilagor' cell was scraped as a
    # list of links; plain-text cells such as '-' carry no attachments.
    attachment_links = doc.get('Bilagor')
    if not (isinstance(attachment_links, list) and attachment_links):
      continue
    for attachment in attachment_links[0].get('nested_data', []):
      cell_links = attachment.get('Cell_0')
      if not (isinstance(cell_links, list) and cell_links and 'doc_url' in cell_links[0]):
        continue
      attachment_row = {
          'doc_link': cell_links[0]['doc_url'],
          'rubrik': cell_links[0]['title'],
          'section': "",
          **meeting_fields,
          # Attachments point back to the document they belong to.
          'parent_link': parent_row['doc_link']
      }
      records.append(attachment_row)

df = pd.DataFrame(records, columns=columns)

# **3. Download all PDFs from links**

In [None]:
import pandas as pd
import requests
import os
from urllib.parse import unquote
import re

# Target directory for the downloaded documents; it is created (including any
# missing parent directories) if it does not exist yet. download_pdf below
# writes into this directory.
download_path='/content/drive/MyDrive/all_protocols/protocols'
os.makedirs(download_path, exist_ok=True)

def _filename_from_response(content_disposition, url):
    """Derive a local filename from a Content-Disposition value, else from the URL."""
    # Stop at a closing quote or ';' so surrounding quotes and trailing header
    # parameters do not end up in the filename.
    match = re.search(r'filename="?([^";]+)', content_disposition)
    if match:
        candidate = unquote(match.group(1)).strip()  # decode URL-encoded characters
    else:
        candidate = url.split('/')[-1]
    # The name comes from the remote server: keep only the final path
    # component so it cannot point outside the download directory.
    return os.path.basename(candidate.replace('\\', '/'))


# Function to download a document and return the filename
def download_pdf(url, timeout=60):
    """Download ``url`` into ``download_path`` and return the stored filename.

    Args:
        url: Document URL to fetch (redirects are followed).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The filename (not the full path) the content was saved under, or
        ``None`` when the request failed or no filename could be determined.
    """
    try:
        response = requests.get(url, allow_redirects=True, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes
    except requests.RequestException as e:
        print(f"Error downloading {url}: {str(e)}")
        return None

    filename = _filename_from_response(response.headers.get('content-disposition', ''), url)
    if not filename:
        print(f"Error downloading {url}: could not determine a filename")
        return None

    # Save the document in the download folder
    filepath = os.path.join(download_path, filename)
    with open(filepath, 'wb') as f:
        f.write(response.content)

    return filename

# Download every linked document and record the local filename in 'doc_name'
# (None for downloads that failed).
df['doc_name'] = df['doc_link'].apply(download_pdf)

# Put 'doc_name' first, followed by the original columns.
df = df[['doc_name', 'doc_link', 'rubrik', 'section', 'meeting_date', 'meeting_time','meeting_reference',
       'verksamhetsorgan', 'parent_link']]

# Save the updated DataFrame together with its index. The cells that reload
# this file call read_csv(..., index_col=0); writing without the index would
# make them consume 'doc_name' as the index, and row['doc_name'] would then
# raise KeyError.
df.to_csv('/content/drive/MyDrive/all_protocols/metadata.csv')


In [None]:
import pandas as pd
# Reload the metadata table from disk. index_col=0 uses the first CSV column as
# the row index, and missing values are replaced by empty strings.
df = pd.read_csv('/content/drive/MyDrive/all_protocols/metadata.csv', index_col=0).fillna("")
df.head()

# **4. Extract XHTML from PDFs**

In [None]:
import fitz
import os
from mammoth import convert_to_html

os.makedirs('/content/drive/MyDrive/all_protocols/protocols_html', exist_ok=True)

# Extraction flags are bit flags and must be combined with `|`. ANDing two
# different flags gives 0, which switches every option off. Images stay
# disabled simply by leaving TEXT_PRESERVE_IMAGES out.
xhtml_flags = fitz.TEXT_DEHYPHENATE | fitz.TEXT_PRESERVE_WHITESPACE

# NOTE(review): only rows 10-11 are converted; this looks like a sample slice
# used while developing -- widen it to convert every document.
for index, row in df[10:12].iterrows():
  filename = row['doc_name']
  file_rootname, file_extension = os.path.splitext(filename)
  file_extension = file_extension.lower()  # accept ".PDF" / ".DOCX" as well
  source_path = f'/content/drive/MyDrive/all_protocols/protocols/{filename}'

  if file_extension == ".pdf":
    with fitz.open(source_path) as doc:
      text = "".join(page.get_text("xhtml", flags=xhtml_flags) for page in doc)
  elif file_extension == ".docx":
    with open(source_path, 'rb') as docx_file:
      text = convert_to_html(docx_file).value
  else:
    # Without this branch the previous iteration's text (or an undefined
    # name) would be written under this file's name.
    print(f"Skipping {filename!r}: unsupported file type")
    continue

  with open(f'/content/drive/MyDrive/all_protocols/protocols_html/{file_rootname}.html', "wb") as file:
    file.write(text.encode('utf-8'))
  print(text)

# **4b. Extract Meeting Metadata from PDF with LLM**

In [None]:
import fitz
import json
import time
from openai import OpenAI
import multiprocessing

# Initialize the OpenAI client and assistant_id.
# The API key is read from the OPENAI_API_KEY environment variable so that no
# secret is stored in the notebook. A key that was ever committed in plain
# text should be treated as leaked and revoked.
import os

client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])
assistant_id = "asst_q3KCyxW9Ib7U1tinoHW7ynHy"

# Function to process text with LLM
def extract_data_with_llm(text, pdf_name, assistant_id, poll_interval=5):
    """Send ``text`` to the assistant and return its reply as a string.

    A new thread is created, ``text`` is posted as the user message, and a run
    of ``assistant_id`` is polled until it finishes.

    Args:
        text: Document text to send to the assistant.
        pdf_name: Name used only in progress messages.
        assistant_id: Identifier of the assistant to run.
        poll_interval: Seconds to wait between status polls.

    Returns:
        The text of the first message in the thread listing with Markdown
        JSON code-fence markers removed, or an empty string when the run ended
        without completing. An empty string is not valid JSON, so callers that
        retry on ``json.JSONDecodeError`` will retry.
    """
    thread = client.beta.threads.create()
    client.beta.threads.messages.create(
        thread_id=thread.id,
        role="user",
        content=text
    )
    run = client.beta.threads.runs.create(
        thread_id=thread.id,
        assistant_id=assistant_id
    )

    # Statuses after which the run will not reach 'completed' here. Waiting
    # only for 'completed' would poll forever on any of them.
    # 'requires_action' is included because this code never submits tool outputs.
    # NOTE(review): status names taken from the Assistants API run lifecycle --
    # confirm against the installed SDK version.
    unfinished_states = {'failed', 'cancelled', 'expired', 'incomplete', 'requires_action'}

    while run.status != 'completed':
        if run.status in unfinished_states:
            print(f'Run for {pdf_name} ended with status {run.status}')
            return ""
        # Sleep before polling so no time is wasted once the run has completed.
        time.sleep(poll_interval)
        run = client.beta.threads.runs.retrieve(
            thread_id=thread.id,
            run_id=run.id
        )
        print(f'Status for {pdf_name}:', run.status)

    # Retrieve messages from the thread
    messages = client.beta.threads.messages.list(
        thread_id=thread.id
    )
    return messages.data[0].content[0].text.value.replace('```json', '').replace('```', '')

# Function to process a single PDF
def process_pdf(pdf_name, max_attempts=3):
    """Extract a PDF's text and ask the LLM for its metadata as parsed JSON.

    Args:
        pdf_name: Filename inside the protocols download folder.
        max_attempts: How many LLM calls to make before giving up. Every
            response is parsed, so no call is made whose result is discarded.

    Returns:
        ``(pdf_name, data)`` where ``data`` is the decoded JSON value, or
        ``(pdf_name, None)`` when no attempt returned valid JSON. ``None`` is
        used as the failure marker because the cells consuming the results
        guard with ``is not None`` / truthiness checks.
    """
    print(f"Processing: {pdf_name}\n")

    # Open and read the PDF file
    with fitz.open(f'/content/drive/MyDrive/all_protocols/protocols/{pdf_name}') as doc:
        text = "".join(page.get_text("text", flags=~fitz.TEXT_PRESERVE_IMAGES) for page in doc)

    for attempt in range(1, max_attempts + 1):
        json_response = extract_data_with_llm(text, pdf_name, assistant_id)
        try:
            return pdf_name, json.loads(json_response)
        except json.JSONDecodeError:
            if attempt < max_attempts:
                print("Invalid JSON response. Retrying...")

    print(f"Failed to get valid JSON response for {pdf_name} after retries.")
    return pdf_name, None

# Keep only the documents whose title marks them as carrying meeting metadata.
filtered_df = df[df['rubrik'].isin([
    'Beslutande',
    'Sammanträdesuppgifter och deltagande',
    'Kokoustiedot ja osallistujat',
    'Vln:Beslutande',
    'Päättäjät',
])]

# Size of the worker pool.
num_processes = 2

# Run process_pdf over every selected document name in parallel; results is a
# list of (pdf_name, metadata) tuples in input order.
with multiprocessing.Pool(num_processes) as pool:
    results = pool.map(process_pdf, filtered_df['doc_name'])

# Write each non-None result into the 'metadata' column of the matching rows.
for pdf_name, metadata in results:
    if metadata is None:
        continue
    df.loc[df['doc_name'] == pdf_name, 'metadata'] = metadata

# Save or print the updated DataFrame
print(df)
# df.to_csv('updated_dataframe.csv')  # Optionally save to a CSV file


In [None]:
# Inspect the result format: show the second element (the metadata payload) of
# the first (pdf_name, metadata) tuple in results.
results[0][1]

In [None]:
# Add the extracted metadata to the dataframe, one column per field.
for pdf_name, metadata in results:
    # Failed extractions are reported with a non-dict placeholder; indexing a
    # string placeholder with a key would raise TypeError, so only non-empty
    # dicts are processed.
    if not isinstance(metadata, dict) or not metadata:
        continue

    # Rows belonging to this document (computed once instead of per column).
    row_mask = df['doc_name'] == pdf_name
    adjustment = metadata['protocolAdjustment']

    df.loc[row_mask, 'meeting_end_time'] = metadata['endTime']
    df.loc[row_mask, 'meeting_place'] = metadata['meetingPlace']
    # List-valued fields are stored as JSON strings so they fit in one cell.
    df.loc[row_mask, 'members'] = json.dumps(metadata['members'])
    # 'substitutes' is optional in the response; default to an empty list.
    df.loc[row_mask, 'substitutes'] = json.dumps(metadata.get('substitutes', []))
    df.loc[row_mask, 'additional_attendees'] = json.dumps(metadata['additionalAttendees'])
    df.loc[row_mask, 'protocol_signatories'] = json.dumps(metadata['protocolSignatories'])
    df.loc[row_mask, 'protocol_adjusters'] = json.dumps(adjustment['adjustedBy'])
    df.loc[row_mask, 'protocol_adjustment_date'] = adjustment['adjustmentDate']

In [None]:
# Re-select the rows whose title marks them as meeting-metadata documents.
filtered_df = df[df['rubrik'].isin([
    'Beslutande',
    'Sammanträdesuppgifter och deltagande',
    'Kokoustiedot ja osallistujat',
    'Vln:Beslutande',
    'Päättäjät',
])]

In [None]:
import pandas as pd
# Replace the in-memory df with the contents of the metadata CSV. index_col=0
# uses the first CSV column as the row index; missing values become "".
# NOTE(review): columns added to df in memory but not yet saved are lost by
# this reload, and the cell that saves df comes after this one -- confirm the
# intended run order.
df = pd.read_csv('/content/drive/MyDrive/all_protocols/metadata.csv', index_col=0).fillna("")
df.head()

In [None]:
# Write df, including its index, to the metadata CSV (overwriting the file).
# NOTE(review): the preceding cell reloads df from this same file; if the
# in-memory metadata columns are meant to be kept, this save presumably has to
# run before that reload -- confirm.
df.to_csv('/content/drive/MyDrive/all_protocols/metadata.csv')

# **5. Extract Proposals and Decisions**

In [None]:
# Document titles that are excluded, together with the meeting-metadata titles,
# when the agenda items are selected in the next cell. Titles are matched
# exactly, which is why near-identical spelling variants are listed separately.
non_agenda_titles = [
    'Sammanträdets laglighet och beslutsförhet',
    'Godkännande av föredragningslistan',
    'Val av protokolljusterare',
    'Sammanträdets laglighet och beslutförhet',
    'Sammanträdets konstituerande',
    'Kokouksen laillisuus ja päätösvaltaisuus',
    'Kahden pöytäkirjantarkastajan valinta',
    'Esityslistan hyväksyminen',
    'Val av protokolljusterare och protokollförare'
]

In [None]:
# Select the agenda items: rows whose title is neither a meeting-metadata title
# nor one of non_agenda_titles, that have no parent document (parent_link is
# empty), and that carry a section other than "" or "§ 0".
filtered_df = df.loc[
    ~df['rubrik'].isin([
        'Beslutande',
        'Sammanträdesuppgifter och deltagande',
        'Kokoustiedot ja osallistujat',
        'Vln:Beslutande',
        'Päättäjät',
        *non_agenda_titles,
    ])
    & df['parent_link'].eq("")
    & ~df['section'].isin(["", "§ 0"])
]

In [None]:
# Display the current contents of filtered_df.
filtered_df

In [None]:
# Restrict the selection to rows whose organ is exactly 'Stadsfullmäktige',
# then display the subset.
stadsfullmäktige_df = filtered_df.loc[filtered_df['verksamhetsorgan'].eq('Stadsfullmäktige')]
stadsfullmäktige_df

In [None]:
# List the distinct values of the 'verksamhetsorgan' column.
df.verksamhetsorgan.unique()

In [None]:
import fitz
import docx
from pprint import pprint
from tqdm import tqdm
import re

# Distinct organ names found in the metadata table.
organs = df['verksamhetsorgan'].unique()

# Alternation of the regex-escaped organ names: name1|name2|...
organ_alternation_group = '|'.join(map(re.escape, organs))

# A reference is "<organ> <d.m.yyyy>[,] <section>", where the section is
# written either as "§ N" or as "N §". Organ, date and section are captured
# as three groups.
reference_regex = (
    '(' + organ_alternation_group + ')'
    + r' (\d{1,2}\.\d{1,2}\.\d{4}),?'
    + r' (§ \d+|\d+ §)'
)

reference_compiled_regex = re.compile(reference_regex)

print(reference_compiled_regex)


# For every selected agenda item: list the references to organ decisions found
# in its document, then capture the text from the last occurrence of the
# item's own "<organ> <date>[,] <section>" heading to the end of the document.
for index, row in stadsfullmäktige_df.iterrows():
  # Escape every interpolated value so regex metacharacters in an organ name,
  # date or section are matched literally.
  organ = re.escape(row['verksamhetsorgan'])
  date = re.escape(row['meeting_date'])
  section = re.escape(row['section'])
  section_number = re.escape(row['section'].replace('§', '').strip())

  # The heading accepts both "§ N" and "N §". The tempered tail
  # `(?:(?!heading).)*$` forbids a further heading after the match, so the
  # match starts at the LAST heading in the text.
  heading = rf"(?:{organ} {date},? (?:{section}|{section_number} §))"
  match_section = rf"{heading}(?:(?!{heading}).)*$"

  filepath = f'/content/drive/MyDrive/all_protocols/protocols/{row["doc_name"]}'
  print(match_section)
  print(row['doc_link'], row['verksamhetsorgan'], row['meeting_date'], row['section'])

  try:
    with fitz.open(filepath) as doc:
      text = "".join(page.get_text("text", flags=~fitz.TEXT_PRESERVE_IMAGES) for page in doc)
  except Exception:
    # PyMuPDF could not read the file; fall back to reading it as a Word
    # document. `except Exception` (not a bare except) lets KeyboardInterrupt
    # and SystemExit propagate.
    text = '\n'.join(para.text for para in docx.Document(filepath).paragraphs)

  # Distinct references found in the text, sorted so the output is stable.
  references = {" ".join(groups) for groups in reference_compiled_regex.findall(text)}
  print("\n".join(sorted(references)))
  print("-"*100)

  # Find the match
  match = re.search(match_section, text, re.DOTALL)

  if match:
      print("Captured Text:", match.group(0))
  print("="*100)

In [None]:
import fitz

# Extract one sample PDF as XHTML, concatenating the per-page output into a
# single string; TEXT_PRESERVE_IMAGES is cleared from the flags.
with fitz.open('/content/ungdomsfullmaktige_protokoll_09_05_2023_16_15_162526.pdf') as doc:
    text = "".join(
        [page.get_text("xhtml", flags=~fitz.TEXT_PRESERVE_IMAGES) for page in doc]
    )

In [None]:
# Display the XHTML string held in `text`.
text