## Extração dos IDs e das URLs das fotos dos vereadores

In [None]:
%pip install requests bs4

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import json
from unidecode import unidecode


# Seconds to wait for the server on each request; without a timeout,
# requests blocks indefinitely when the server stops responding.
REQUEST_TIMEOUT = 30

# One dict per councillor page that exposes a name: {"nome", "id", "imagem_url"}.
vereadores = []

# Probe councillor ids 1..199; ids whose page has no name header are skipped.
for vereador_id in range(1, 200):
    url = f'http://www.camarasorocaba.sp.gov.br/vereador.html?id={vereador_id}'
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        # A single unreachable page must not discard everything scraped so far.
        print(f'Erro ao acessar a página do vereador {vereador_id}: {e}')
        continue

    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        # find() returns None when the element is absent, so the chained
        # attribute access raises AttributeError for pages without a name.
        nome_vereador = soup.find('div', {'class': 'page-header'}).find('h1').text
    except AttributeError:
        print(f'Erro ao obter o nome do vereador {vereador_id}')
        continue

    print(f'Vereador {vereador_id}: {nome_vereador}')
    vereadores.append({
        "nome": nome_vereador,
        "id": vereador_id,
        "imagem_url": f"http://syslegis.camarasorocaba.sp.gov.br:8383/syslegis/vereador/getConteudo/{vereador_id}"
    })

# Persist the collected list as UTF-8 JSON, keeping accented characters readable.
with open('../dados/vereadores/vereadores.json', 'w', encoding='utf-8') as f:
    json.dump(vereadores, f, ensure_ascii=False, indent=4)

## Extração dos IDs do ano de cada página de relatório

In [None]:
import json, requests
from bs4 import BeautifulSoup
from pathlib import Path


# JSON file holding the list of year entries (and their months) collected so far.
LINKS_FILE_PATH = '../dados/relatorios/links.json'

def extract_year_data(existing_data):
    """Append an entry to *existing_data* for every year not already listed.

    Downloads the public-files index page, walks every link inside its
    ``table table-striped`` table and, for each year label that is not yet
    present in *existing_data*, appends a dict with the keys ``url_id``,
    ``year``, ``processed``, ``extracted`` and ``months``.

    Args:
        existing_data: List of year dicts; it is mutated in place.

    Returns:
        None. If the page has no matching table, a message is printed and
        *existing_data* is left untouched.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    print('Carregando dados')
    url = 'http://www.camarasorocaba.sp.gov.br/arquivos_publicos.html?id=5e3f0dc905d7040f28b44e0e'
    # Without a timeout, requests waits forever on a stalled connection.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', {'class': 'table table-striped'})
    if table is None:
        # find() returns None when the page layout changed or an error page
        # was served; report it instead of failing with an AttributeError.
        print('Tabela de anos não encontrada na página. Nada a extrair.')
        return

    # Set of labels already stored, so each link is checked in O(1).
    known_years = {entry.get('year') for entry in existing_data}

    for link in table.find_all('a'):
        children = link.contents
        # The label is read from the third child node of the anchor
        # (presumably icon markup precedes it -- confirm against the page).
        # Text nodes are str subclasses; anything else cannot be a label.
        if len(children) < 3 or not isinstance(children[2], str):
            print(f"Link sem rótulo de ano reconhecível: {link.get('href')}. Ignorando.")
            continue
        year_str = children[2].strip()

        if year_str in known_years:
            print(f"Já existe um dicionário para o ano {year_str}. Ignorando.")
            continue

        existing_data.append({'url_id': link.get('href'), 'year': year_str, 'processed': False, 'extracted': False, 'months': []})
        # Track the new label so a duplicated link on the same page is ignored.
        known_years.add(year_str)


# Load the stored year list, add any newly published years, and save it back.
links_file_path = Path(LINKS_FILE_PATH)

try:
    # Read-only mode is enough here; the file is reopened for writing below.
    with links_file_path.open("r", encoding="utf8") as f:
        data = json.load(f)
except (OSError, ValueError) as e:
    # OSError: missing/unreadable file. ValueError: invalid JSON or bad encoding.
    print(f"Erro ao ler arquivo JSON: {str(e)}")
    data = []
else:
    try:
        print('extraindo')
        extract_year_data(data)

        # Serialise before opening the file for writing: opening with "w"
        # truncates it, so a serialisation error must happen first.
        payload = json.dumps(data, ensure_ascii=False, indent=4)
        with links_file_path.open("w", encoding="utf8") as f:
            f.write(payload)
    except Exception as e:
        # Top-level boundary of the cell: report scraping/writing failures
        # with a message that does not blame the JSON read step.
        print(f"Erro ao extrair ou gravar os dados dos anos: {str(e)}")
        data = []

## Extração dos IDs do relatório de cada mês do ano

In [None]:
import json
from pathlib import Path
import requests
from bs4 import BeautifulSoup

# Page that lists the public files; each stored ``url_id`` is appended to it.
BASE_URL = 'http://www.camarasorocaba.sp.gov.br/arquivos_publicos.html'
# JSON file holding the list of year entries (and their months) collected so far.
LINKS_FILE_PATH = '../dados/relatorios/links.json'

def scrape_monthly_reports(year_data, base_url):
    """Append the monthly report links of one year to ``year_data['months']``.

    Does nothing when ``year_data['extracted']`` is truthy. Otherwise the page
    at ``base_url + year_data['url_id']`` is downloaded and, for every link in
    its ``table table-striped`` table whose label is not yet stored, a month
    dict (``url_id``, ``month``, ``processed``, ``extracted``, ``report_path``,
    ``file_extension``) is appended. This function never sets ``extracted``.

    Args:
        year_data: Year dict with ``extracted``, ``url_id`` and ``months``
            keys; ``months`` is mutated in place.
        base_url: URL prefix to which ``year_data['url_id']`` is appended.

    Returns:
        None. If the page has no matching table, a message is printed and
        *year_data* is left untouched.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    if year_data["extracted"]:
        return

    # Without a timeout, requests waits forever on a stalled connection.
    response = requests.get(f"{base_url}{year_data['url_id']}", timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'class': 'table table-striped'})
    if table is None:
        # find() returns None when the page layout changed or an error page
        # was served; skip this year instead of raising AttributeError.
        print(f"Tabela de meses não encontrada para {year_data['url_id']}. Ignorando.")
        return

    # Set of labels already stored, so each link is checked in O(1).
    known_months = {m.get("month") for m in year_data['months']}

    for link in table.find_all('a'):
        children = link.contents
        # The label is read from the third child node of the anchor
        # (presumably icon markup precedes it -- confirm against the page).
        # Text nodes are str subclasses; anything else cannot be a label.
        if len(children) < 3 or not isinstance(children[2], str):
            print(f"Link sem rótulo de mês reconhecível: {link.get('href')}. Ignorando.")
            continue
        month = children[2].strip()

        if month in known_months:
            continue

        year_data['months'].append({
            "url_id": link.get('href'),
            "month": month,
            "processed": False,
            "extracted": False,
            "report_path": "",
            "file_extension": ""
        })
        # Track the new label so a duplicated link on the same page is ignored.
        known_months.add(month)

# Build the path outside the try block so it is always bound for the
# write-back below, whatever happens while reading.
links_file_path = Path(LINKS_FILE_PATH)

try:
    # Read-only mode is enough here; the file is reopened for writing below.
    with links_file_path.open("r", encoding="utf8") as f:
        dados = json.load(f)
except (OSError, ValueError) as e:
    # OSError: missing/unreadable file. ValueError: invalid JSON or bad encoding.
    print(f"Erro ao ler arquivo JSON: {str(e)}")
    dados = []

for year_data in dados:
    try:
        scrape_monthly_reports(year_data, BASE_URL)
    except Exception as e:
        # Top-level boundary of the cell: report the failing year and keep
        # going so one bad page does not block the remaining years.
        print(f"Erro ao extrair os meses do ano {year_data.get('year')}: {str(e)}")
        continue

    # Save after every year so progress survives a later failure.
    with links_file_path.open("w", encoding="utf8") as f:
        json.dump(dados, f, ensure_ascii=False, indent=4)
