### 🧠 Deadline Manager Agent – EY AI Challenge

Modular notebook: OCR, date parsing, working-day arithmetic, an LLM agent for legal deadlines, and optional calendar integration.

In [7]:
# DEPENDENCIES: Some useful dependencies. They might not all be necessary.
!apt-get update && apt-get install -y tesseract-ocr
%pip install --upgrade pytesseract PyPDF2 pillow dateparser python-dateutil holidays transformers huggingface_hub[hf_xet]

Reading package lists... Done
E: Could not open lock file /var/lib/apt/lists/lock - open (13: Permission denied)
E: Unable to lock directory /var/lib/apt/lists/
W: Problem unlinking the file /var/cache/apt/pkgcache.bin - RemoveCaches (13: Permission denied)
W: Problem unlinking the file /var/cache/apt/srcpkgcache.bin - RemoveCaches (13: Permission denied)
Defaulting to user installation because normal site-packages is not writeable
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 KB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pillow
  Downloading pillow-11.2.1-cp310-cp310-manylinux_2_28_x86_64.whl (4.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting python-dateutil
  Downloading pyt

In [8]:
%pip install dateparser
%pip install holidays

# IMPORTS: Some useful libraries. They might not be necessary
import os
from datetime import datetime, timedelta
from dateparser.search import search_dates
import dateparser
from dateutil.relativedelta import relativedelta
import holidays
import pytesseract
from PIL import Image
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [None]:
# ...existing code...
def interpretar_obrigacao_fiscal(texto, ano_base=None):
    """Derive typical fiscal deadlines (IRS, TSU, IVA) from the month named in a text.

    The text is lower-cased and scanned for one of the phrases
    ``processar <month>``, ``referente a <month>`` or ``dmr de <month>``.
    The first month (in ``MESES_PT`` iteration order) with a matching phrase wins.

    NOTE(review): relies on a module-level ``MESES_PT`` mapping of month name to
    month number (1-12) that is not defined in the visible cells — confirm it
    is defined before this function is called.

    Args:
        texto: Free text that may mention a reference month.
        ano_base: Year of the reference month; defaults to the current year.

    Returns:
        A dict with the keys ``irs`` and ``tsu`` (day 20 of the following
        month), ``iva_entrega`` and ``iva_pagamento`` (days 10 and 15 of the
        second month after the reference month) and ``mes_base`` (the matched
        month name), or ``None`` when no phrase matches.
    """
    # Imported locally because the notebook's import cell only brings in
    # ``datetime`` and ``timedelta``; without this, ``date`` is undefined.
    from datetime import date

    ano_base = ano_base or datetime.now().year
    texto = texto.lower()

    for mes_nome, mes_num in MESES_PT.items():
        frases = (
            f"processar {mes_nome}",
            f"referente a {mes_nome}",
            f"dmr de {mes_nome}",
        )
        if not any(frase in texto for frase in frases):
            continue

        # Following month, rolling over to January of the next year after December.
        mes_seg = mes_num + 1 if mes_num < 12 else 1
        ano_seg = ano_base if mes_num < 12 else ano_base + 1
        # Two months ahead, with the same year roll-over (Nov -> Jan, Dec -> Feb).
        ano_iva = ano_base if mes_num + 2 <= 12 else ano_base + 1
        mes_iva = (mes_num + 2 - 1) % 12 + 1

        return {
            "irs": date(ano_seg, mes_seg, 20),
            "tsu": date(ano_seg, mes_seg, 20),
            "iva_entrega": date(ano_iva, mes_iva, 10),
            "iva_pagamento": date(ano_iva, mes_iva, 15),
            "mes_base": mes_nome
        }
    return None
# ...existing code...

### 🖼️ OCR & PDF Extraction
Functions to read text in images (Tesseract) and PDFs.

In [2]:
def extract_text_from_image(path):
    """Run Tesseract OCR (Portuguese language pack) on an image file.

    Args:
        path: Location of the image, passed straight to ``PIL.Image.open``.

    Returns:
        The text recognised by ``pytesseract.image_to_string`` with ``lang='por'``.
    """
    # The context manager closes the underlying file handle once OCR is done,
    # instead of leaving it open until garbage collection.
    with Image.open(path) as img:
        return pytesseract.image_to_string(img, lang='por')

def extract_text_from_pdf(path):
    """Return the text of every page of a PDF, joined with newlines.

    Pages for which ``extract_text()`` yields nothing contribute an empty
    string, so the number of separators still reflects the page count.
    """
    reader = PdfReader(path)
    page_texts = []
    for page in reader.pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)

### 🧠 Data extraction (NLU)
Extract the first date found in free text using `dateparser.search.search_dates`; ambiguous or relative dates are resolved towards the future.

In [3]:
def infer_deadline(text, base_date=None):
    """Return the first date that ``search_dates`` finds in ``text``.

    The search uses Portuguese and English, day-month-year ordering, a
    preference for future dates, and ``base_date`` (default: now) as the
    anchor for relative expressions.

    Returns:
        The datetime of the first match, or ``None`` when nothing is found.
    """
    reference = base_date if base_date else datetime.now()
    parser_settings = {
        'PREFER_DATES_FROM': 'future',
        'RELATIVE_BASE': reference,
        'DATE_ORDER': 'DMY',
    }
    matches = search_dates(text, languages=['pt', 'en'], settings=parser_settings)
    if not matches:
        return None
    # Each match is a (matched_substring, datetime) pair; only the date is needed.
    return matches[0][1]

### 📅 Work days calculation (PT)
Add work days to a date, excluding weekends and Portuguese holidays.

In [None]:
def add_working_days(start_date, days):
    """Advance ``start_date`` by ``days`` working days.

    A working day is a Monday-Friday that is not in ``holidays.Portugal()``.
    The start date itself is never counted; when ``days`` is zero or negative
    the start date is returned unchanged.
    """
    portuguese_holidays = holidays.Portugal()
    one_day = relativedelta(days=1)
    current = start_date
    remaining = days
    while remaining > 0:
        current = current + one_day
        is_weekend = current.weekday() >= 5
        if is_weekend or current in portuguese_holidays:
            continue  # not a working day: move on without counting it
        remaining -= 1
    return current

### 📅 Lógica fiscal (PT)
Fiscal rules that may appear.

In [None]:
def normalize_abbreviations(text):
    """Expand common Portuguese fiscal abbreviations into their full names.

    Each abbreviation (DMR, IVA, IRS, IES, TSU) is replaced wherever it
    appears as a whole word, regardless of case; the replacement text is
    always lower-case.

    Args:
        text: Input string to normalise.

    Returns:
        The text with every known abbreviation expanded.
    """
    # Imported locally: the notebook only imports ``re`` in a later cell, so
    # relying on the global name fails on a fresh top-to-bottom run.
    import re

    substitutions = {
        "dmr": "declaração mensal de remunerações",
        "iva": "imposto sobre o valor acrescentado",
        "irs": "imposto sobre o rendimento das pessoas singulares",
        "ies": "informação empresarial simplificada",
        "tsu": "taxa social única"
    }
    for abbr, full in substitutions.items():
        # re.escape keeps the pattern literal should an abbreviation ever
        # contain regex metacharacters.
        text = re.sub(rf"\b{re.escape(abbr)}\b", full, text, flags=re.IGNORECASE)
    return text

def interpret_fiscal_obligation(text, base_year=None):
    """Derive typical fiscal deadlines (IRS, TSU, IVA) from the month named in a text.

    The text is lower-cased and scanned for a set of fixed phrases that pair a
    verb or noun (``processar``, ``referente a``, ``dmr de``, ...) with a month
    name. The first month (in ``MESES_PT`` iteration order) with a matching
    phrase wins.

    NOTE(review): relies on a module-level ``MESES_PT`` mapping of month name to
    month number (1-12) that is not defined in the visible cells — confirm it
    is defined before this function is called.

    Args:
        text: Free text that may mention a reference month.
        base_year: Year of the reference month; defaults to the current year.

    Returns:
        A dict with the keys ``irs`` and ``tsu`` (day 20 of the following
        month), ``iva_entrega`` and ``iva_pagamento`` (days 10 and 15 of the
        second month after the reference month) and ``mes_base`` (the matched
        month name), or ``None`` when no phrase matches.
    """
    # Imported locally because the notebook's import cell only brings in
    # ``datetime`` and ``timedelta``; without this, ``date`` is undefined.
    from datetime import date

    base_year = base_year or datetime.now().year
    text = text.lower()

    # Phrase templates are loop-invariant, so they are built once here and
    # filled in with each month name below.
    templates = (
        "processar {}",
        "referente a {}",
        "dmr de {}",
        "entregar dmr de {}",
        "submeter dmr de {}",
        "pagamento do dmr de {}",
        "declaração de dmr do mês de {}",
        "envio da dmr de {}",
        "apresentação da dmr de {}",
        "prazo para dmr de {}",
        "liquidação do dmr de {}",
        "preencher dmr de {}",
    )

    for month_name, month_num in MESES_PT.items():
        if not any(template.format(month_name) in text for template in templates):
            continue

        # Following month, rolling over to January of the next year after December.
        next_month = month_num + 1 if month_num < 12 else 1
        next_year = base_year if month_num < 12 else base_year + 1
        # Two months ahead, with the same year roll-over (Nov -> Jan, Dec -> Feb).
        iva_year = base_year if month_num + 2 <= 12 else base_year + 1
        iva_month = (month_num + 2 - 1) % 12 + 1

        return {
            "irs": date(next_year, next_month, 20),
            "tsu": date(next_year, next_month, 20),
            "iva_entrega": date(iva_year, iva_month, 10),
            "iva_pagamento": date(iva_year, iva_month, 15),
            "mes_base": month_name
        }
    return None

### 🤖 Deadline Agent (LLM Free)
Uses a small open-source model (Flan-T5 small) to apply the following rules:
- Modelo 22: up to 31/jul
- IES: 15/apr (current and next year)
- Others: infer via NLP

In [11]:
%pip install torch

# Implementation using a simple LLM: load the tokenizer and the seq2seq model
# for the "google/flan-t5-small" checkpoint. Both names are module-level
# globals read by llm_generate(). AutoModelForSeq2SeqLM needs PyTorch; the
# recorded output of this cell shows an ImportError when it is not installed.

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model     = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

def llm_generate(prompt: str, max_length: int = 256) -> str:
    """Generate a completion for ``prompt`` with the module-level model.

    The prompt is tokenized with the global ``tokenizer``, decoded with
    4-beam search (early stopping enabled) up to ``max_length``, and the
    first returned sequence is converted back to text without special tokens.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    generated = model.generate(
        encoded.input_ids,
        num_beams=4,
        early_stopping=True,
        max_length=max_length,
    )
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

def agent_process(text, reference_date=None):
    """Ask the LLM for a deadline and parse its JSON answer.

    A few-shot prompt describing the "Modelo 22" and "IES" rules is built
    around ``text`` and sent to ``llm_generate``. The reply is expected to be
    a JSON object; its ``deadline`` value is parsed with ``dateparser.parse``.

    Args:
        text: The request or notification text to analyse.
        reference_date: Date the rules are evaluated against; defaults to now.

    Returns:
        ``{'deadline': datetime}`` on success. ``{'error': str}`` when the
        reply is not valid JSON, has no ``deadline`` key (for example when the
        model answers with ``notification_date``/``days_to_reply`` instead),
        or carries a ``deadline`` value that cannot be parsed as a date. The
        ``deadline`` value is a ``datetime`` object, so callers must format it
        before JSON-serialising the result.
    """

    ref = reference_date or datetime.now()

    prompt_with_examples = f"""
You are a Portuguese legal deadline assistant. Determine the deadline for the request below using these rules:
- "Modelo 22": due by {ref.year}-07-31
- "IES": due by {ref.year}-04-15 if before, else {ref.year+1}-04-15
- For other cases, identify the notification date if mentioned, and the number of working days or the specific legal procedure mentioned that implies a deadline calculation rule.
Reference date: {ref.strftime('%Y-%m-%d')}

Here are some examples:
Input: "Notificação recebida em 2024-01-15. Tem 10 dias úteis para responder."
Output: {{"notification_date": "2024-01-15", "days_to_reply": 10}}

Input: "O prazo para contestar é de 20 dias a contar da citação."
Output: {{"days_to_reply": 20, "procedure_type": "contestacao"}}

Input: "Modelo 22 - exercício 2023"
Output: {{"deadline": "{ref.year}-07-31"}}

Input: "{text}"
Return ONLY a JSON object with the keys "deadline" (ISO8601 date string) OR "notification_date" (ISO8601 date string) and "days_to_reply" (integer) OR "procedure_type" (string).
"""

    raw = llm_generate(prompt_with_examples)

    try:
        obj = json.loads(raw)
        deadline_text = obj['deadline']
        d = dateparser.parse(deadline_text)
    except Exception as e:
        # Best-effort by design: any malformed reply becomes an error payload
        # rather than an exception for the caller.
        return {'error': f'LLM parse error: {e} | raw: {raw}'}

    if d is None:
        # dateparser signals "could not parse" with None; report that as an
        # error instead of returning a success payload with no date in it.
        return {'error': f'LLM parse error: unparseable deadline {deadline_text!r} | raw: {raw}'}
    return {'deadline': d}

Defaulting to user installation because normal site-packages is not writeable
Collecting torch
^C
Note: you may need to restart the kernel to use updated packages.


ImportError: 
AutoModelForSeq2SeqLM requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.


In [None]:
# Implementation using Gemini LLM

def config_llm_gemini(temperature:int):
    """Placeholder for calling an LLM through the Gemini API.

    Not implemented yet: ``temperature`` is ignored and the literal string
    ``"llm"`` is returned where the LLM response should eventually go.

    Steps for students:
      - Go to https://aistudio.google.com/app/apikey and generate your Gemini API key.
      - Add the necessary packages to your requirements.txt:
            langchain
            langchain-google-genai
      - Run the following command to install them:
            !pip install -r requirements.txt
      - Follow the official integration guide for LangChain + Google Generative AI:
            https://python.langchain.com/docs/integrations/chat/google_generative_ai/
      - Pay attention to the request limits of the chosen model.
    """
    placeholder_response = "llm"  # should become the LLM response
    return placeholder_response

In [4]:
import re

def process_deadline_from_image_or_text(input_data, is_image=True, base_date=None):
    """Identify a legal deadline in an image (via OCR) or in plain text.

    Resolution order:
      1. An "N dia(s) útil/úteis" phrase: N working days are added to the
         first date found in the rest of the text, or to ``base_date`` when
         the text has no other date.
      2. Otherwise, the first date found in the text by ``infer_deadline``.
      3. Otherwise, the LLM fallback ``agent_process``.

    Args:
        input_data: Image path when ``is_image`` is true, else the text itself.
        is_image: Whether ``input_data`` must first go through OCR.
        base_date: Reference date for relative expressions; defaults to now.

    Returns:
        A dict with the keys ``data_limite`` (``YYYY-MM-DD`` string or
        ``None``), ``fonte`` (where the text came from), ``regra`` (rule
        applied), ``confiança`` (fixed heuristic score per rule) and ``texto``
        (whitespace-normalised input text).
    """
    # 1. Run OCR when the input is an image
    if is_image:
        text = extract_text_from_image(input_data)
        source = f"OCR de {input_data}"
    else:
        text = input_data
        source = "Texto fornecido"

    # 2. Basic clean-up: collapse every run of whitespace into a single space
    clean_text = " ".join(text.split())

    # 3. Reference date
    base = base_date or datetime.now()

    # 4. Look for "N working days"; the singular "dia útil" is accepted too
    match = re.search(r'(\d+)\s*dias?\s*(?:útil|úteis?)', clean_text, re.IGNORECASE)
    if match:
        dias_uteis = int(match.group(1))
        # Search for an explicit start date in the text *without* the matched
        # phrase, so the duration itself (e.g. "10 dias") cannot be picked up
        # as if it were a date.
        remaining_text = f"{clean_text[:match.start()]} {clean_text[match.end():]}".strip()
        explicit_start = infer_deadline(remaining_text, base_date=base) if remaining_text else None
        data_base = explicit_start or base
        data_limite = add_working_days(data_base, dias_uteis)
        regra = f"{dias_uteis} dias úteis a partir de {data_base.strftime('%d/%m/%Y')}"
        confidence = 0.95
    else:
        # 5. No working-day phrase: fall back to the first date in the text
        prazo_inferido = infer_deadline(clean_text, base_date=base)
        if prazo_inferido:
            data_limite = prazo_inferido
            regra = "Data explícita identificada"
            confidence = 0.8
        else:
            # 6. Last resort: let the LLM try. A missing or empty deadline
            # counts as "not identified" instead of an LLM-inferred result.
            agent_result = agent_process(clean_text, reference_date=base)
            data_limite = agent_result.get('deadline')
            if data_limite:
                regra = "Inferido por LLM"
                confidence = 0.7
            else:
                data_limite = None
                regra = "Não identificado"
                confidence = 0.0

    return {
        "data_limite": data_limite.strftime('%Y-%m-%d') if data_limite else None,
        "fonte": source,
        "regra": regra,
        "confiança": confidence,
        "texto": clean_text
    }


### 🔗 Calendar integration (Optional)
Function to create events in external calendar tool

In [None]:
# def create_calendar_event(summary, start, end, timezone='UTC'):
#     pass  # implementar conforme API desejada

### 🧪 Use case examples

In [None]:
# Exemplo OCR:
# img_text = extract_text_from_image('scan.png')
# print(infer_deadline(img_text))

# Exemplo agente:
# print(agent_process('Entregar Modelo 22'))
# print(agent_process('Enviar IES até dia 15 de abril'))

# Working days:
# base = datetime(2025,5,27)
# print(add_working_days(base,5))

NameError: name 'agent_process' is not defined