<a href="https://colab.research.google.com/github/Rehzende/sorocabatransparente/blob/main/notebooks/extracao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install "camelot-py[cv]"
%pip install camelot-py[base]
%pip install "PyPDF2<3.0"
%pip install pillow
%pip install unidecode
%pip install html5lib
%apt install ghostscript python3-tk - y


In [None]:
import camelot
import json
import os
import requests
from pathlib import Path
from bs4 import BeautifulSoup
from unidecode import unidecode
import urllib.request
from operator import index
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from unicodedata import normalize


In [None]:
# Path of the JSON file holding the report links; this cell loads it,
# updates it while processing and writes it back.
LINKS_FILE_PATH = '../dados/relatorios/links.json'


def _download_report(url, file_name):
    """Download *url* to *file_name* and append the extension for its type.

    The extension is taken from the subtype of the response's Content-Type
    header. Returns "pdf" or "html"; raises ValueError for anything else
    (including a missing header) so the caller never works with an undefined
    or stale extension.
    """
    _, headers = urllib.request.urlretrieve(url, file_name)
    content_type = headers.get("Content-Type") or ""
    subtype = content_type.split(";")[0].strip().rpartition("/")[2].lower()
    if subtype not in ("pdf", "html"):
        raise ValueError(f"Tipo de conteúdo não suportado: {content_type!r}")
    # os.replace overwrites a file left behind by a previous run on every OS.
    os.replace(file_name, f"{file_name}.{subtype}")
    return subtype


def process_monthly_reports(year_data):
    """Download and extract every pending monthly report of one year.

    Args:
        year_data: dict with the keys "year", "extracted" and "months".
            Each month is a dict with "month", "url_id" and "extracted".
            The dict is updated in place.

    For each month whose "extracted" flag is falsy, the report is downloaded
    under ../dados/relatorios/pdf/<year>/<month>/, converted to JSON under
    ../dados/relatorios/json/<year>/<month>/, and the month entry receives
    "extracted", "report_path" and "file_extension".

    year_data["extracted"] ends up True only when there is at least one month
    and every month is flagged as extracted. Nothing is done when the year is
    already flagged as extracted. Returns None.
    """
    print(year_data["extracted"])
    if year_data["extracted"]:
        return

    months = year_data['months']
    for m in months:
        # Best-effort batch: one failing month must not stop the others, so
        # any error is reported and the month simply stays "not extracted".
        try:
            if m["extracted"]:
                continue
            url = m["url_id"]
            year = unidecode(year_data['year'].lower())
            month = unidecode(m['month'])

            pdf_dir = f"../dados/relatorios/pdf/{year}/{month}/"
            os.makedirs(pdf_dir, exist_ok=True)
            file_name = f"{pdf_dir}{month}"
            file_extension = _download_report(url, file_name)

            json_dir = f"../dados/relatorios/json/{year}/{month}/"
            os.makedirs(json_dir, exist_ok=True)
            report_json_name = f"{json_dir}{month}.json"

            extracted = extrac_pdf_data(file_name, report_json_name, file_extension)
            m['extracted'] = extracted
            m['report_path'] = report_json_name
            m['file_extension'] = file_extension

            # extrac_pdf_data already prints its own error when it fails.
            if extracted:
                print(
                    f"O PDF/HTML do Ano {year} e do mês {month} foi processado com sucesso.")
        except Exception as e:
            error = {
                "message": f"Erro ao processar arquivo PDF/HTML: ",
                "error":   f"{str(e)}"
            }
            print(error)

    # Derive the year flag from the months themselves, so an early failure is
    # never masked by a later success and an empty year does not crash.
    year_extracted = bool(months) and all(m.get("extracted") for m in months)
    year_data['extracted'] = year_extracted
    if year_extracted:
        print(
            f"Os Pdf's do Ano {year_data['year'] } foram processado com sucesso.")
    else:
        print(
            f"Nem todos os arquivos do Ano {year_data['year']} foram processados.")


def extrac_pdf_data(file_path, report_json_name, file_extension):
    """Extract the expense table of a downloaded report and save it as JSON.

    Args:
        file_path: path of the downloaded report WITHOUT its extension.
        report_json_name: path of the JSON file to write.
        file_extension: "pdf" (read with camelot) or "html" (read with
            pandas.read_html).

    Returns:
        True when the JSON file was written; False when anything failed, in
        which case the error is printed instead of raised.
    """
    try:
        source = f'{file_path}.{file_extension}'
        if file_extension == "pdf":
            tables = camelot.read_pdf(source, encoding='utf-8')
            df = tables[0].df
            # Skip the first 2 rows of the PDF table.
            temp_df = df.iloc[2:]
        elif file_extension == "html":
            df = pd.read_html(source)[0]
            # Skip the first 7 rows of the HTML table.
            temp_df = df.iloc[7:]
        else:
            raise ValueError(f"Extensão não suportada: {file_extension!r}")

        if temp_df.empty:
            raise ValueError("Nenhuma linha de dados encontrada no relatório")

        # NOTE(review): the previous code evaluated
        # `temp_df.rename(columns=temp_df.iloc[0]).drop(temp_df.index[0])`
        # and discarded the result, so the first row of the slice was never
        # removed. The rows are kept as before; confirm whether that row is a
        # header that should be dropped.
        temp_df = temp_df.replace(r'\n', ' ', regex=True)
        # Blank or whitespace-only cells become 0.0.
        temp_df = temp_df.replace(r'^\s*$', 0.0, regex=True)
        # Drop the last 2 rows; copy so the column assignment below does not
        # act on a view of the original frame.
        temp_df = temp_df[:-2].copy()

        columns = ['Vereador', 'Aluguel Impressora',
                   'Material Escritório', 'Postagem', 'Combustivel', 'Total']
        if len(df.columns) != 6:
            columns.append('Reembolso')
        temp_df.columns = columns

        temp_df.reset_index(drop=True, inplace=True)
        temp_df.to_json(report_json_name, orient='index')
        return True
    except Exception as e:
        # Best-effort: the caller records the False result for this month.
        error = {
            "message": f"Erro ao ler arquivo PDF/HTML: ",
            "error":   f"{str(e)}"
        }
        print(error)
        return False


# Load the links file; on any failure report it and fall back to an empty
# list, which turns the processing loop below into a no-op.
try:
    links_file_path = Path(LINKS_FILE_PATH)
    dados = json.loads(links_file_path.read_text(encoding="utf8"))
except Exception as e:
    print(f"Erro ao ler arquivo JSON: {str(e)}")
    dados = []

for year_data in dados:
    process_monthly_reports(year_data)

    # Rewrite the whole file after each year so the updated flags are saved
    # before the next year is processed.
    with open(links_file_path, "w", encoding="utf8") as out:
        json.dump(dados, out, ensure_ascii=False, indent=4)


In [None]:
from PIL import Image, ImageDraw, ImageFont


def put_expenses_values(template, aluguel_impressora, combustivel, mat_escritorio, postagem, reembolso, total):
    """Write the six expense amounts onto *template* and return it.

    Each amount is drawn as " R$ <value>" at x=1100 in an 80pt "arial.ttf"
    font. The first five are filled with "#fff", the total with "#000".
    The image is modified in place.
    """
    value_font = ImageFont.truetype("arial.ttf", 80)
    canvas = ImageDraw.Draw(template)

    # (y position, amount, fill colour), in drawing order from top to bottom.
    rows = (
        (1060, aluguel_impressora, "#fff"),
        (1260, combustivel, "#fff"),
        (1460, mat_escritorio, "#fff"),
        (1660, postagem, "#fff"),
        (1860, reembolso, "#fff"),
        (2060, total, "#000"),
    )
    for y, amount, colour in rows:
        canvas.text((1100, y), f" R$ {amount}", font=value_font, fill=colour)

    return template


def put_photo_vereador(img, image_name):
    """Paste the photo stored at *image_name* onto *img* and return *img*.

    The photo is resized to 947x1433 with LANCZOS resampling and pasted with
    its top-left corner at (1800, 800). *img* is modified in place.
    """
    target_size = (947, 1433)
    top_left = (1800, 800)
    portrait = Image.open(image_name).resize(target_size, Image.LANCZOS)
    img.paste(portrait, top_left)
    return img


def main(nome_vereador, image_name, aluguel_impressora, combustivel, mat_escritorio, postagem, reembolso, total,
         periodo="Janeiro/2022"):
    """Render one expense card and save it as a JPEG.

    Opens "despesas_template.jpg", pastes the photo found at *image_name*,
    writes the name, the period label and the expense amounts, and saves the
    result to ../dados/processados/<name>.jpg, where <name> is
    *nome_vereador* lower-cased with spaces replaced by underscores.

    Args:
        nome_vereador: name written on the card and used for the file name.
        image_name: path of the photo to paste.
        aluguel_impressora, combustivel, mat_escritorio, postagem, reembolso,
            total: amounts forwarded to put_expenses_values.
        periodo: label written at the top of the card. It used to be
            hard-coded; the default keeps the previous output.

    Returns None.
    """
    fonte_nome = ImageFont.truetype("arial.ttf", 80)
    fonte_periodo = ImageFont.truetype("montserrat.ttf", 120)

    img = Image.open("despesas_template.jpg")
    img = put_photo_vereador(img, image_name)

    draw = ImageDraw.Draw(img)
    draw.text((140, 800), nome_vereador, font=fonte_nome, fill="#fff")
    draw.text((420, 530), periodo, font=fonte_periodo, fill="#fff")

    img = put_expenses_values(
        img, aluguel_impressora, combustivel, mat_escritorio, postagem, reembolso, total)

    file_name = nome_vereador.lower().replace(" ", "_")
    img.save(f"../dados/processados/{file_name}.jpg")


## Download das imagens dos vereadores

In [None]:

# from unidecode import unidecode

# with open("../dados/vereadores/vereadores.json", "r", encoding="utf8") as f:
#     dados_vereadores = json.load(f)

# for vereador in dados_vereadores:
#     url = vereador["imagem_url"]
#     imagem_vereador = unidecode(vereador['nome'].lower().replace(" ", "_"))
#     # nome do arquivo a ser salvo
#     nome_arquivo = f"../dados/vereadores/imagens/{imagem_vereador}.jpg"
#     urllib.request.urlretrieve(url, nome_arquivo)
# import urllib.request

In [None]:
from unidecode import unidecode

# Loaded here but not used in this cell.
with open("../dados/vereadores/vereadores.json", "r", encoding="utf8") as f:
    dados_vereadores = json.load(f)

# One entry per row of an extracted report.
with open('file.json', encoding="utf8") as f:
    vereadores = json.load(f)

for valores in vereadores.values():
    imagem_vereador = unidecode(valores['Vereador'].lower().replace(" ", "_"))
    # Keyword arguments keep every amount on its own line of the card: the
    # previous positional call passed Material Escritório, Postagem and
    # Combustivel in the wrong slots of main().
    main(
        nome_vereador=valores["Vereador"],
        image_name=f"../dados/vereadores/imagens/{imagem_vereador}.jpg",
        aluguel_impressora=valores["Aluguel Impressora"],
        combustivel=valores["Combustivel"],
        mat_escritorio=valores["Material Escrit\u00f3rio"],
        postagem=valores["Postagem"],
        # Rows without a 'Reembolso' column are drawn with 0.0.
        reembolso=valores.get("Reembolso", 0.0),
        total=valores["Total"],
    )


In [None]:
%pip install tabula-py
import tabula
df = tabula.read_pdf("http://www.camarasorocaba.sp.gov.br:8082/publicFiles/file/63dd5e8cee6582a5bd1416a9", pages='all')