<a href="https://colab.research.google.com/github/TiagoIesbick/dashboard-etl/blob/main/read_minutes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [229]:
%%capture
!apt-get install -y poppler-utils
!apt-get install -y tesseract-ocr
!apt-get install -y tesseract-ocr-por
!pip install pdfplumber pytesseract pdf2image
import re
import pandas as pd
import pathlib
import pdfplumber
import pytesseract
from pdf2image import convert_from_path

In [230]:
# Mount Google Drive at /content/drive; the folder paths used by the
# processing cells further down all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [231]:
def extract_text_from_pdf(path):
    """Return the text of a PDF, falling back to OCR when no text is found.

    Each page is first read with pdfplumber. Only when the combined result is
    empty or whitespace-only are the pages rendered to images with pdf2image
    and recognised by Tesseract using the "por" language data.

    Args:
        path: Location of the PDF file.

    Returns:
        The page texts concatenated, each one followed by a newline.
    """
    chunks = []

    with pdfplumber.open(path) as document:
        for pdf_page in document.pages:
            extracted = pdf_page.extract_text()
            # Pages that yield None or an empty string contribute nothing.
            if extracted:
                chunks.append(extracted + "\n")

    # The OCR decision is made for the document as a whole, not per page.
    if not "".join(chunks).strip():
        chunks.extend(
            pytesseract.image_to_string(rendered_page, lang="por") + "\n"
            for rendered_page in convert_from_path(path)
        )

    return "".join(chunks)

def normalize_text(text):
    """Drop carriage returns and squeeze runs of spaces/tabs into one space.

    Newlines are left untouched, so the line structure of the text survives.
    """
    without_carriage_returns = text.replace('\r', '')
    return re.sub(r'[ \t]+', ' ', without_carriage_returns)

def extract_minute_number(text):
    """Find the first "<number>/<year>" minute identifier in the text.

    The identifier must follow an "ATA N." style label or the word "Ata"
    (case-insensitive). The number part is left-padded with zeros to at
    least two digits.

    Returns:
        A string such as "03/2025", or None when no identifier is present.
    """
    found = re.search(
        r'(?:ATA\s*N[¬∫o]\s*|Ata\s*)(\d+/\d{4})',
        text,
        re.IGNORECASE,
    )
    if found is None:
        return None

    # The captured group always contains exactly one slash.
    sequence, _, year = found.group(1).partition('/')
    return f"{sequence.zfill(2)}/{year}"

def extract_date(text):
    """Return the first dd/mm/yyyy-shaped token in the text, or None.

    Only the shape is checked (two digits, two digits, four digits); the
    value is not validated as a real calendar date.
    """
    hit = re.search(r'\b\d{2}/\d{2}/\d{4}\b', text)
    if hit is None:
        return None
    return hit.group(0)

def extract_agenda(text):
    """Extract the numbered agenda that follows a "Pauta:" heading.

    The agenda block runs from the heading up to a line that contains only
    the word "Reuniao" (optionally preceded by "a"). Inside the block, each
    item starts with "<digits>." at the beginning of a line and may span
    several lines; internal whitespace is collapsed to single spaces.

    Returns:
        The items as "<n>. <text>" joined by newlines, or None when the
        heading, the closing line, or any numbered item is missing.
    """
    heading = re.search(r'Pauta\s*:', text, re.IGNORECASE)
    if heading is None:
        return None

    remainder = text[heading.end():]
    narrative = re.search(
        r'\n\s*(?:a\s+)?Reuni[a√£]o\s*\n', remainder, re.IGNORECASE
    )
    if narrative is None:
        return None

    agenda_block = remainder[:narrative.start()].strip()

    # An item's body extends lazily to the next numbered line or end of block.
    item_re = re.compile(
        r'^\s*(\d+)\.\s+(.*?)(?=^\s*\d+\.\s+|\Z)',
        re.MULTILINE | re.DOTALL,
    )
    entries = item_re.findall(agenda_block)
    if not entries:
        return None

    whitespace_run = re.compile(r'\s+')
    return "\n".join(
        f"{number}. {whitespace_run.sub(' ', body).strip()}"
        for number, body in entries
    )

def remove_pdf_footers(text):
    """Delete page footers of the form "Ata <n>/<yyyy> ... / pg. <n>".

    Matching is case-insensitive and lazy, so each footer ends at the first
    "/ pg. <digits>" that follows it on the same line.
    """
    footer_re = re.compile(r'Ata\s+\d+/\d{4}.*?\/ pg\.\s*\d+', re.IGNORECASE)
    return footer_re.sub('', text)

def extract_ordem_do_dia(text):
    """Extract the heading of each "ITEM <n> -" entry as a renumbered list.

    Page footers are removed first. Every block that starts with
    "ITEM <n> -" is then flattened to one line and cut right after its
    earliest delimiter: a semicolon, a colon, or a period followed by
    whitespace (unless that whitespace is followed by two or more zeros).
    The original "ITEM <n> -" prefix is replaced by a sequential "1.", "2.", ...

    Returns:
        The headings joined by newlines, or None when no ITEM block is found.
    """
    body = remove_pdf_footers(text)

    # A block ends before the next ITEM, before "Nada mais havendo", or at EOF.
    item_blocks = re.findall(
        r'(ITEM\s+\d+\s*[-‚Äì].*?)(?=\s+ITEM\s+\d+\s*[-‚Äì]|\s+Nada mais havendo|\Z)',
        body,
        re.IGNORECASE | re.DOTALL,
    )
    if not item_blocks:
        return None

    prefix_re = re.compile(r'^ITEM\s+\d+\s*[-‚Äì]\s*', re.IGNORECASE)
    sentence_end_re = re.compile(r'\.\s(?!0{2,})')
    whitespace_run = re.compile(r'\s+')

    headings = []
    for position, raw_block in enumerate(item_blocks, start=1):
        flat = whitespace_run.sub(' ', raw_block).strip()

        # Gather the position of every delimiter that actually occurs.
        cut_points = [
            index for index in (flat.find(';'), flat.find(':')) if index != -1
        ]
        sentence_end = sentence_end_re.search(flat)
        if sentence_end:
            cut_points.append(sentence_end.start())

        # Keep the delimiter itself as the last character of the heading.
        heading = flat[:min(cut_points) + 1] if cut_points else flat
        heading = heading.strip()

        headings.append(f"{position}. {prefix_re.sub('', heading).strip()}")

    return "\n".join(headings)

# Lookup tables mapping Portuguese number words to their integer values.
UNITS = {
    "zero": 0, "um": 1, "uma": 1, "dois": 2, "duas": 2, "tr√™s": 3, "tres": 3,
    "quatro": 4, "cinco": 5, "seis": 6, "sete": 7, "oito": 8, "nove": 9
}

TEENS = {
    "dez": 10, "onze": 11, "doze": 12, "treze": 13, "quatorze": 14,
    "catorze": 14, "quinze": 15, "dezesseis": 16, "dezessete": 17,
    "dezoito": 18, "dezenove": 19
}

TENS = {
    "vinte": 20, "trinta": 30, "quarenta": 40, "cinquenta": 50,
    "sessenta": 60, "setenta": 70, "oitenta": 80, "noventa": 90
}

HUNDREDS = {
    "cem": 100, "cento": 100, "duzentos": 200, "trezentos": 300,
    "quatrocentos": 400, "quinhentos": 500, "seiscentos": 600,
    "setecentos": 700, "oitocentos": 800, "novecentos": 900
}

# Portuguese month names mapped to zero-padded month numbers.
MONTHS = {
    "janeiro": "01",
    "fevereiro": "02",
    "mar√ßo": "03",
    "marco": "03",
    "abril": "04",
    "maio": "05",
    "junho": "06",
    "julho": "07",
    "agosto": "08",
    "setembro": "09",
    "outubro": "10",
    "novembro": "11",
    "dezembro": "12",
}

def parse_written_number(text):
    """Convert a number written out in Portuguese words to an integer.

    Words are matched case-insensitively against UNITS, TEENS, TENS and
    HUNDREDS and summed. The word "mil" multiplies the value accumulated
    since the start (or since the previous "mil") by 1000; a "mil" with
    nothing before it counts as one thousand, so "mil novecentos" is 1900.
    Any word not found in the tables (such as the connective "e") is ignored.

    Args:
        text: The number in words, e.g. "dois mil e vinte e cinco".

    Returns:
        The parsed integer; 0 when no known number word is present.
    """
    total = 0    # value of the groups already closed by "mil"
    current = 0  # value accumulated since the last "mil"

    for word in text.lower().split():
        if word == "mil":
            # A bare "mil" means 1000, not 0 * 1000.
            total += (current or 1) * 1000
            current = 0
            continue

        for table in (UNITS, TEENS, TENS, HUNDREDS):
            if word in table:
                current += table[word]
                break

    return total + current

def extract_date_por_extenso(text):
    """Parse a date written in words ("Aos ... dias do mes de ... do ano de ...,").

    The day and year phrases are converted with parse_written_number and the
    month name is looked up (lower-cased) in MONTHS.

    Returns:
        The date as "dd/mm/<year>", or None when the phrase is not found or
        the month name is not in MONTHS.
    """
    written_date = re.search(
        r"Aos\s+(.*?)\s+dias\s+do\s+m[e√™]s\s+de\s+"
        r"(\w+)\s+do\s+ano\s+de\s+(.*?),",
        text,
        re.IGNORECASE | re.DOTALL,
    )
    if written_date is None:
        return None

    day_words, month_word, year_words = (
        part.strip() for part in written_date.groups()
    )

    day = parse_written_number(day_words)
    month = MONTHS.get(month_word.lower())
    year = parse_written_number(year_words)

    if not month:
        return None

    return f"{day:02d}/{month}/{year}"

def extract_minute_number_from_deliberative_council_minutes(text):
    """Return the "<number>/<year>" identifier of a deliberative council minute.

    The generic extract_minute_number is tried first. If it finds nothing,
    the bare number after an "ATA N." label is combined with the year taken
    from the written-out date in the first 500 characters of the text.

    Returns:
        A string such as "04/2025", or None when either the number or the
        written-out date cannot be found.
    """
    minute_number = extract_minute_number(text)
    if minute_number:
        return minute_number

    match_simple = re.search(r'ATA\s*N[¬∫o]?\s*(\d+)', text, re.IGNORECASE)

    # The date helper returns None when the phrase is absent; slicing that
    # result directly would raise TypeError, so check it before use.
    written_date = extract_date_por_extenso(text[:500])
    if not match_simple or not written_date:
        return None

    year = written_date[-4:]
    return f"{match_simple.group(1).zfill(2)}/{year}"

def extract_date_from_label(text):
    """Return the dd/mm/yyyy value that follows a "Data:" label, or None.

    The label is matched case-insensitively; whitespace between the colon
    and the date is optional.
    """
    labelled = re.search(r'Data:\s*(\d{2}/\d{2}/\d{4})', text, re.IGNORECASE)
    if labelled is None:
        return None
    return labelled.group(1)

def extract_minute_number_from_fiscal_council_minutes(text):
    """Return the "<number>/<year>" identifier of a fiscal council minute.

    The generic extract_minute_number is tried first. If it finds nothing,
    the bare number after an "ATA N." label is combined with the year taken
    from the "Data:" label in the text.

    Returns:
        A string such as "04/2025", or None when either the number or the
        labelled date cannot be found.
    """
    minute_number = extract_minute_number(text)
    if minute_number:
        return minute_number

    match_simple = re.search(r'ATA\s*N[¬∫o]?\s*(\d+)', text, re.IGNORECASE)

    # The label helper returns None when there is no "Data:" line; slicing
    # that result directly would raise TypeError, so check it before use.
    labelled_date = extract_date_from_label(text)
    if not match_simple or not labelled_date:
        return None

    year = labelled_date[-4:]
    return f"{match_simple.group(1).zfill(2)}/{year}"

def clean_sei_document(text):
    """Strip SEI print artefacts (headers, footers, page numbers) from text.

    Form feeds are turned into newlines, then every line matching one of the
    artefact patterns is dropped. Finally, runs of blank lines are collapsed
    to a single blank line and the result is stripped.

    Returns:
        The cleaned text.
    """
    artefact_patterns = (
        # Print header: "<date>, <time> SEI/PMPA - <id> - Ata"
        r'\d{2}/\d{2}/\d{4},\s*\d{2}:\d{2}\s+SEI\/PMPA\s*-\s*\d+\s*-\s*Ata',
        # Footer carrying the SEI URL
        r'https://sei[.,]procempa\.com\.br/sei/.*',
        # Older footer layout: "Ata <n>/<yyyy> ... SEI <process> ... pg. <n>"
        r'Ata\s+\d+/\d{4}.*SEI\s+\d{2}\.\d{2}\.\d+-\d+.*(?:pg|p√°g)\.?\s*\d+',
        # A line holding nothing but a 1-3 digit page number
        r'^\s*\d{1,3}\s*$',
    )
    artefact_res = [
        re.compile(source, re.IGNORECASE) for source in artefact_patterns
    ]

    def is_artefact(candidate):
        # Patterns are tested against the stripped form of the line.
        probe = candidate.strip()
        return any(regex.search(probe) for regex in artefact_res)

    # Form feeds come from OCR page breaks; treat them as line breaks.
    kept_lines = [
        candidate
        for candidate in text.replace('\f', '\n').split('\n')
        if not is_artefact(candidate)
    ]

    collapsed = re.sub(r'\n\s*\n+', '\n\n', "\n".join(kept_lines))
    return collapsed.strip()

def extract_pauta_section(text):
    """Extract the agenda between "Pauta:" and the "Informes e deliberacoes:" heading.

    The text is cleaned with clean_sei_document first. Within the section, a
    line starting with "<digits>.", "<letter>." or a bullet character opens a
    new item; other lines are appended to the current item, except lines that
    are entirely upper-case and do not contain 'PREVIMPA'. Items are then
    renumbered sequentially from 1.

    Returns:
        The items as "<n>. <text>" joined by single spaces (an empty string
        when the section holds no item), or None when the section is absent.
    """
    document = clean_sei_document(text)

    section_match = re.search(
        r'Pauta\s*:\s*(.*?)\s*Informes\s+e\s+delibera[c√ß][√µo]es\s*:',
        document,
        re.IGNORECASE | re.DOTALL,
    )
    if section_match is None:
        return None

    # Remove blank lines inside the section before walking it line by line.
    section = re.sub(r'\n\s*\n+', '\n', section_match.group(1))

    marker_re = re.compile(
        r'^\s*(?:\d+\.\s+|[a-z]\.\s+|[‚Ä¢\-\*]\s+)',
        re.IGNORECASE
    )

    items = []
    pending = None

    for raw_line in section.split('\n'):
        content = raw_line.strip()
        if not content:
            continue

        if marker_re.match(content):
            # A new marker closes the item collected so far.
            if pending:
                items.append(pending.strip())
            # Start the new item without its original marker.
            pending = marker_re.sub('', content)
            continue

        # Continuation line: skipped when fully upper-case, unless it
        # mentions PREVIMPA.
        if pending and (not content.isupper() or 'PREVIMPA' in content):
            pending += " " + content

    if pending:
        items.append(pending.strip())

    return " ".join(
        f"{position}. {item}" for position, item in enumerate(items, start=1)
    )


In [232]:
# Investment Committee: build one record (minute number, date, agenda) per
# PDF found directly inside the folder, then load the records into a DataFrame.
investment_committee_folder = pathlib.Path(r'/content/drive/MyDrive/Atas/ComiteÃÇ de Investimentos')

investment_committee_data = []

for minutes_file in investment_committee_folder.glob("*.pdf"):
  # Extract the text (OCR fallback happens inside the helper), then normalise whitespace.
  text = extract_text_from_pdf(minutes_file)
  text = normalize_text(text)

  # Each extractor returns None when its pattern is not found in the text.
  investment_committee_data.append({
      "Ata": extract_minute_number(text),
      "Data": extract_date(text),
      "Pauta": extract_agenda(text)
  })

investment_committee_df = pd.DataFrame(investment_committee_data)

In [233]:
# Deliberative Council: build one record (minute number, date, agenda) per
# PDF found directly inside the folder, then load the records into a DataFrame.
deliberative_council_folder = pathlib.Path(r'/content/drive/MyDrive/Atas/Conselho deliberativo')

deliberative_council_data = []

for minutes_file in deliberative_council_folder.glob("*.pdf"):
  # Extract the text (OCR fallback happens inside the helper), then normalise whitespace.
  text = extract_text_from_pdf(minutes_file)
  text = normalize_text(text)

  # The written-out date is searched only in the first 500 characters.
  deliberative_council_data.append({
      "Ata": extract_minute_number_from_deliberative_council_minutes(text),
      "Data": extract_date_por_extenso(text[:500]),
      "Pauta": extract_ordem_do_dia(text)
  })

deliberative_council_df = pd.DataFrame(deliberative_council_data)

In [234]:
# Fiscal Council: build one record (minute number, date, agenda) per PDF
# found directly inside the folder, then load the records into a DataFrame.
fiscal_council_folder = pathlib.Path(r'/content/drive/MyDrive/Atas/Conselho fiscal')

fiscal_council_data = []

for minutes_file in fiscal_council_folder.glob("*.pdf"):
  # Extract the text (OCR fallback happens inside the helper), then normalise whitespace.
  text = extract_text_from_pdf(minutes_file)
  text = normalize_text(text)

  # The date comes from the "Data:" label; the agenda from the "Pauta:" section.
  fiscal_council_data.append({
      "Ata": extract_minute_number_from_fiscal_council_minutes(text),
      "Data": extract_date_from_label(text),
      "Pauta": extract_pauta_section(text)
  })

fiscal_council_df = pd.DataFrame(fiscal_council_data)

In [235]:
# Parse 'Data' into datetimes so the sort is chronological rather than
# lexicographic, sort in place with a fresh index, then convert the column
# back to dd/mm/YYYY strings. The last line displays the frame.
investment_committee_df['Data'] = pd.to_datetime(investment_committee_df['Data'], format='%d/%m/%Y')
investment_committee_df.sort_values(by='Data', inplace=True, ignore_index=True)
investment_committee_df['Data'] = investment_committee_df['Data'].dt.strftime('%d/%m/%Y')
investment_committee_df

Unnamed: 0,Ata,Data,Pauta
0,01/2025,15/01/2025,1. Informes\n2. Convalida√ß√£o dos atos com pare...
1,02/2025,23/01/2025,1. Informes\n2. Relat√≥rio de Resultados de Inv...
2,03/2025,20/02/2025,1. Informes;\n2. Relat√≥rio de Resultados JAN/2...
3,04/2025,20/03/2025,1. Informes;\n2. Relat√≥rio de Resultados FEV/2...
4,05/2025,03/04/2025,1. Informes;\n2. An√°lise e Delibera√ß√£o compra ...
5,06/2025,17/04/2025,1. Informes;\n2. Relat√≥rio dos Resultados na C...
6,07/2025,15/05/2025,1. Informes;\n2. Relat√≥rio dos Resultados na C...
7,08/2025,18/06/2025,1. Informes;\n2. Relat√≥rio dos Resultados na C...
8,12/2025,26/09/2025,1. Informes;\n2. Relat√≥rio de Dilig√™ncia do 1¬∫...
9,10/2025,30/10/2025,1. Informes;\n2. Relat√≥rio dos Resultados na C...


In [236]:
# Parse 'Data' into datetimes so the sort is chronological rather than
# lexicographic, sort in place with a fresh index, then convert the column
# back to dd/mm/YYYY strings. The last line displays the frame.
deliberative_council_df['Data'] = pd.to_datetime(deliberative_council_df['Data'], format='%d/%m/%Y')
deliberative_council_df.sort_values(by='Data', inplace=True, ignore_index=True)
deliberative_council_df['Data'] = deliberative_council_df['Data'].dt.strftime('%d/%m/%Y')
deliberative_council_df

Unnamed: 0,Ata,Data,Pauta
0,01/2025,14/01/2025,1. Aprova√ß√£o da Ata da Sess√£o do dia 19/12/202...
1,02/2025,25/02/2025,1. Aprecia√ß√£o da ata da sess√£o do dia 14/01/20...
2,03/2025,18/03/2025,1. Aprecia√ß√£o da ata da sess√£o do dia 25/02/20...
3,01/2025,26/03/2025,1. 25.13.000001892-3 - Aprova√ß√£o da Ata da Ses...
4,04/2025,15/04/2025,1. 25.13.000002476-1 - Notifica√ß√£o Minist√©rio ...
5,05/2025,28/05/2025,1. Processo 25.13.000002550-4 ‚Äì Aprova√ß√£o da A...
6,06/2025,17/06/2025,1. Processo 25.13.000003479-1 ‚Äì Aprova√ß√£o da A...
7,07/2025,15/07/2025,1. Processo 25.13.000004015-5 ‚Äì Aprova√ß√£o da A...
8,08/2025,19/08/2025,1. Processo 25.13.000004742-7 ‚Äì Aprova√ß√£o da A...
9,09/2025,23/09/2025,1. Aprova√ß√£o da Ata da sess√£o do dia 19/08/202...


In [237]:
# Parse 'Data' into datetimes so the sort is chronological rather than
# lexicographic, sort in place with a fresh index, then convert the column
# back to dd/mm/YYYY strings. The last line displays the frame.
fiscal_council_df['Data'] = pd.to_datetime(fiscal_council_df['Data'], format='%d/%m/%Y')
fiscal_council_df.sort_values(by='Data', inplace=True, ignore_index=True)
fiscal_council_df['Data'] = fiscal_council_df['Data'].dt.strftime('%d/%m/%Y')
fiscal_council_df

Unnamed: 0,Ata,Data,Pauta
0,01/2025,14/01/2025,1. Regimento Interno do Conselho Fiscal; 2. Pl...
1,01/2025,31/01/2025,1. Delibera√ß√£o sobre altera√ß√µes no atual Regim...
2,02/2025,18/02/2025,1. Plano de Trabalho do Conselho Fiscal; 2. Pr...
3,03/2025,11/03/2025,1. Plano de Trabalho do Conselho Fiscal para o...
4,02/2025,24/03/2025,1. Delibera√ß√£o sobre o parecer das Demonstra√ß√µ...
5,04/2025,08/04/2025,1. Apresenta√ß√£o dos resultados dos investiment...
6,03/2025,29/04/2025,1. Parecer sobre os resultados dos investiment...
7,05/2025,13/05/2025,1. Despesas do PREVIMPA ‚Äì an√°lises dos contrat...
8,04/2025,30/05/2025,1. Apresenta√ß√£o COMPREV 2. Acompanhamento das ...
9,06/2025,13/06/2025,1. Despesas do PREVIMPA ‚Äì an√°lises dos contrat...


In [239]:
# Write the three frames to a single workbook, minutes.xlsx, one sheet per
# body, leaving out the DataFrame index column.
with pd.ExcelWriter("minutes.xlsx", engine="openpyxl") as writer:
  investment_committee_df.to_excel(writer, sheet_name="Comit√™ de Investimentos", index=False)
  deliberative_council_df.to_excel(writer, sheet_name="Conselho Deliberativo", index=False)
  fiscal_council_df.to_excel(writer, sheet_name="Conselho Fiscal", index=False)
