### Import libraries

In [1]:
import pandas as pd

import camelot

import pdfplumber

import os

import re

import numpy as np

### Define Function to scroll through each file in the folder

In [2]:
def process_all_pdfs_in_folder(source_path, sink_path, desired_disease):
    year_df = []
    for filename in os.listdir(source_path):
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(source_path, filename)
            print(f"Processando o arquivo: {pdf_path}")
            week_df = extract_and_process_tables(pdf_path, desired_disease, filename)
            year_df.append(week_df)
            
    if year_df:
        concatenated_df = pd.concat(year_df, ignore_index=True)

        pdf_filename = source_path.rsplit('/', 1)[-1]
        print("Salvando")
        csv_filename = os.path.splitext(pdf_filename)[0] + desired_disease + '.csv'
        output_path = os.path.join(sink_path, csv_filename)
        concatenated_df.to_csv(output_path, index=False, encoding='utf-8-sig')
        print(f"Arquivo salvo: {csv_filename}")
            

### Define function to scroll through each page/table in selected file

In [72]:
def extract_and_process_tables(pdf_path, desired_disease, filename):
    all_dfs = []
    pdf = pdfplumber.open(pdf_path)

    for page_num in range(len(pdf.pages)):
        try:
            tables = camelot.read_pdf(pdf_path, pages=str(page_num + 1), flavor='stream')

            if tables:
                df = tables[0].df
                page = pdf.pages[page_num]
                table = page.extract_table()
                df_header = pd.DataFrame(table)
                                
                if (not df_header.empty 
                    and df_header.shape[0] > 1
                    and df_header.shape[1] > 1
                    and df_header.iloc[0, 1] is not None
                    and df_header.iloc[0, 5] is not None 
                    and df_header.iloc[0, 1] is not None
                    and desired_disease in df_header.iloc[0, 5]):
                    
                    first = None
                    

                    for n in range(4, 15):
                        cell_value = dfx.iloc[n, 0]
                        if cell_value  == 'Aguascalientes':
                            first = n
                            break
                            
                    df_all = process_table(df, df_header, filename, first)
                    
                        
                    if not df_all.empty:
                        all_dfs.append(df_all)

                        print(f"Processando a página {page_num + 1} com a tabela desejada.")

        except Exception as e:
            print(f"Erro ao processar a página {page_num + 1}: {e}")

    pdf.close()

    if all_dfs:
        final_df = pd.concat(all_dfs, ignore_index=True)
    else:
        final_df = pd.DataFrame()

    return final_df

### Define function to transform selected table in readable data

In [59]:
def process_table(df, df_header, filename, first):
    
    
    if df.empty or df_header.empty:
        print("DataFrame ou cabeçalho vazio.")
        return pd.DataFrame() 
    
    weeknumber = filename[3:5]

    df = df.iloc[first:].reset_index(drop=True)
    df.columns = df.iloc[0]
    df = df[:]


    df = df.loc[:, (df != '').any(axis=0)]

    df.dropna(axis=1, how='all')

    col_names = [f'Coluna{i+1}' for i in range(len(df.columns))]
    df.columns = col_names

    for col in col_names[1:len(df.columns)]:
        df[col] = df[col].str.replace(' ', '')

    first_col_name = df.columns[1]
    df = df[~df[first_col_name].str.match(r'^(TOTAL|FUENTE.*|§FUENTE.*)')]
    df = df[~df['Coluna1'].str.startswith(('TOTAL', '&','FUENTE','§FUENTE'))]


    indices_para_converter = range(1, len(df.columns))
    for idx in indices_para_converter:
        if idx < len(df.columns):  # Verifica se o índice está dentro dos limites
            df.iloc[:, idx] = pd.to_numeric(df.iloc[:, idx], errors='coerce').fillna(0).astype(int)

    num_col = len(df.columns)


    df1 = df.iloc[:, 0:5].copy()
    if len(df_header) > 1 and len(df_header.columns) > 1:
        df1['disease'] = df_header.iloc[0, 1].replace('\n', ' ')
        df1['year'] = '2015'
    else:
        df1['disease'] = ''
        df1['year'] = ''


    df1['MF'] = df1.get('Coluna3', 0) + df1.get('Coluna4', 0)  

    df1['Acum_Year'] = '2014'
    df1['week'] = weeknumber


    novos_nomes = [
        'ENTIDAD FEDERATIVA',
        'Sem.',
        'M',
        'F',
        'Acum.',
    ]


    df1.columns = novos_nomes + list(df1.columns[5:])
    column_order = [0, 1, 2, 3, 7, 4, 5, 6, 8, 9]
    df1 = df1.iloc[:, column_order]

    df2 = df.iloc[:, [0] + list(range(5, 9))].copy()

    if len(df_header) > 0 and len(df_header.columns) > 5:
        df2['disease'] = df_header.iloc[0, 5].replace('\n', ' ')
        df2['year'] = '2015'
    else:
        df2['disease'] = ''
        df2['year'] = ''


    df2['MF'] = df2.get('Coluna7', 0) + df2.get('Coluna8', 0)

    df2['Acum_Year'] = '2014'

    df2['week'] = weeknumber




    df2.columns = novos_nomes + list(df2.columns[5:])
    column_order = [0, 1, 2, 3, 7, 4, 5, 6, 8, 9]
    df2 = df2.iloc[:, column_order]

    
    df_all = pd.concat([df1, df2], axis=0, ignore_index=True)
    

    return df_all

### Process selected year/disease and save into selected folder

In [73]:

source_path = '/home/pirata/Documents/projects/epidemic_database/files/2015'
sink_path = '/home/pirata/Documents/projects/epidemic_database/bases/2015/Chikungunya'
desired_disease = 'Chikungunya'
process_all_pdfs_in_folder(source_path, sink_path, desired_disease)

Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem49.pdf
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem36.pdf




Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem48.pdf
Erro ao processar a página 2: index 5 is out of bounds for axis 0 with size 2
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem44.pdf
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem01.pdf
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem50.pdf
Processando a página 36 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem27.pdf
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem08.pdf
Erro ao processar a página 2: index 5 is out of bounds for axis 0 with size 3
Processando a página 40 com a tabela desejada



Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem30.pdf
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem05.pdf
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem26.pdf
Processando a página 38 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem22.pdf




Erro ao processar a página 8: index 5 is out of bounds for axis 0 with size 4
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem51.pdf
Processando a página 36 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem45.pdf
Processando a página 36 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem02.pdf
Processando a página 39 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem12.pdf
Processando a página 36 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem43.pdf
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem15.pdf
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/



Erro ao processar a página 5: index 5 is out of bounds for axis 0 with size 4
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem37.pdf
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem32.pdf
Erro ao processar a página 8: index 5 is out of bounds for axis 0 with size 3
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem03.pdf
Erro ao processar a página 4: index 5 is out of bounds for axis 0 with size 3
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem11.pdf
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem35.pdf
Processando a página 40 com a tabela desejada.
Processando o 



Erro ao processar a página 5: index 5 is out of bounds for axis 0 with size 2




Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem46.pdf




Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem16.pdf
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem24.pdf
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem39.pdf
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem14.pdf
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem38.pdf
Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem23.pdf
Erro ao processar a página 3: index 5 is out of bounds for axis 0 with size 3
Processando a página 36 com a tabela desejada.
Processando o arquivo: /home/



Processando a página 40 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem21.pdf
Processando a página 36 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015/sem04.pdf
Processando a página 40 com a tabela desejada.
Salvando
Arquivo salvo: 2015Chikungunya.csv


### Test function/ analysis

In [62]:
# filename = 'sem49.pdf'

# pdf_path = '/home/pirata/Documents/projects/epidemic_database/files/2015_teste/sem49.pdf'

# page_num = 39

# pdf = pdfplumber.open(pdf_path)

# tables = camelot.read_pdf(pdf_path, pages=str(page_num + 1), flavor='stream')

# # if (not df_header.empty 
# #                     and df_header.shape[0] > 1  # Verifica se há pelo menos 2 linhas
# #                     and df_header.shape[1] > 1  # Verifica se há pelo menos 2 colunas
# #                     and df_header.iloc[0, 1] is not None
# #                     and desired_disease in df_header.iloc[0, 5]):
# #     print('é')
    

# if tables:
#     dfx = tables[0].df
#     page = pdf.pages[page_num]
#     table = page.extract_table()
#     df_header = pd.DataFrame(table)

In [71]:
# first = None

# for n in range(4, 15):
#     cell_value = dfx.iloc[n, 0]
#     if cell_value  == 'Aguascalientes':
#         first = n
#         break
        
        
# print(n)

5


In [67]:

# weeknumber = filename[3:5]

# df = dfx.iloc[first:].reset_index(drop=True)
# df.columns = df.iloc[0]
# df = df[:]

# # df.replace('', np.nan, inplace=True)

# df = df.loc[:, (df != '').any(axis=0)]

# df.dropna(axis=1, how='all')

# col_names = [f'Coluna{i+1}' for i in range(len(df.columns))]
# df.columns = col_names

# for col in col_names[1:len(df.columns)]:
#     df[col] = df[col].str.replace(' ', '')

# first_col_name = df.columns[1]
# df = df[~df[first_col_name].str.match(r'^(TOTAL|FUENTE.*|§FUENTE.*)')]
# df = df[~df['Coluna1'].str.startswith(('TOTAL', '&','FUENTE','§FUENTE'))]


# indices_para_converter = range(1, len(df.columns))
# for idx in indices_para_converter:
#     if idx < len(df.columns):  # Verifica se o índice está dentro dos limites
#         df.iloc[:, idx] = pd.to_numeric(df.iloc[:, idx], errors='coerce').fillna(0).astype(int)
        
# num_col = len(df.columns)
        

# df1 = df.iloc[:, 0:5].copy()
# if len(df_header) > 1 and len(df_header.columns) > 1:
#     df1['disease'] = df_header.iloc[0, 1].replace('\n', ' ')
#     df1['year'] = '2015'
# else:
#     df1['disease'] = ''
#     df1['year'] = ''
    
    
# df1['MF'] = df1.get('Coluna3', 0) + df1.get('Coluna4', 0)  

# df1['Acum_Year'] = '2014'
# df1['week'] = weeknumber


# novos_nomes = [
#     'ENTIDAD FEDERATIVA',
#     'Sem.',
#     'M',
#     'F',
#     'Acum.',
# ]


# df1.columns = novos_nomes + list(df1.columns[5:])
# column_order = [0, 1, 2, 3, 7, 4, 5, 6, 8, 9]
# df1 = df1.iloc[:, column_order]

# df2 = df.iloc[:, [0] + list(range(5, 9))].copy()

# if len(df_header) > 0 and len(df_header.columns) > 5:
#     df2['disease'] = df_header.iloc[0, 5].replace('\n', ' ')
#     df2['year'] = '2015'
# else:
#     df2['disease'] = ''
#     df2['year'] = ''
    
    
# df2['MF'] = df2.get('Coluna7', 0) + df2.get('Coluna8', 0)

# df2['Acum_Year'] = '2014'

# df2['week'] = weeknumber




# df2.columns = novos_nomes + list(df2.columns[5:])
# column_order = [0, 1, 2, 3, 7, 4, 5, 6, 8, 9]
# df2 = df2.iloc[:, column_order]



# df2

Unnamed: 0,ENTIDAD FEDERATIVA,Sem.,M,F,MF,Acum.,disease,year,Acum_Year,week
0,Aguascalientes,0,0,1,1,0,§Enfermedad por Virus Chikungunya CIE-10ª REV....,2016,2015,49
1,Baja California,0,0,0,0,0,§Enfermedad por Virus Chikungunya CIE-10ª REV....,2016,2015,49
2,Baja California Sur,1,57,91,148,0,§Enfermedad por Virus Chikungunya CIE-10ª REV....,2016,2015,49
3,Campeche,1,86,165,251,0,§Enfermedad por Virus Chikungunya CIE-10ª REV....,2016,2015,49
4,Coahuila,0,10,13,23,0,§Enfermedad por Virus Chikungunya CIE-10ª REV....,2016,2015,49
5,Colima,1,366,615,981,0,§Enfermedad por Virus Chikungunya CIE-10ª REV....,2016,2015,49
6,Chiapas,4,252,430,682,42,§Enfermedad por Virus Chikungunya CIE-10ª REV....,2016,2015,49
7,Chihuahua,0,1,0,1,0,§Enfermedad por Virus Chikungunya CIE-10ª REV....,2016,2015,49
8,Distrito Federal,0,0,0,0,0,§Enfermedad por Virus Chikungunya CIE-10ª REV....,2016,2015,49
9,Durango,1,1,2,3,0,§Enfermedad por Virus Chikungunya CIE-10ª REV....,2016,2015,49


In [58]:
# len(df.columns)

14

In [7]:
# display(re.search(r'-SE(\d+)\.pdf', filename).group(1))

'51'

In [65]:
# df_header.iloc[0, 4]

'Fiebre Manchada\nCIE-10ª REV.\nA77.0'

In [43]:
# desired_disease = 'Leptospirosis'

# desired_disease in df_header.iloc[0, 5]

True

In [62]:
# len(dfx.columns)

16

In [63]:
# df_header

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,ENTIDAD\nFEDERATIVA,Tracoma\nCIE-10ª REV.\nA71,,,,§Enfermedad por Virus\nChikungunya\nCIE-10ª RE...,,,,Leishmaniasis Visceral\nCIE-10ª REV.\nB55.0,,,
1,,2015,,,2014,2015,,,2014,2015,,,2014
2,,Sem.,Acum.,,Acum.,Sem.,Acum.,,Acum.,Sem.,Acum.,,Acum.
3,,,M,F,,,M,F,,,M,F,
4,Aguascalientes,,- -\n- -\n- -\n- -\n- -\n- -\n16 17\n- -\n- -\...,,-\n2\n-\n-\n-\n-\n60\n-\n-\n-\n-\n-\n-\n-\n2\n...,-\n-\n1\n1\n-\n1\n4\n-\n-\n1\n-\n6\n-\n7\n1\n3...,- 1\n- -\n57 91\n86 165\n10 13\n366 615\n252 4...,,-\n-\n-\n-\n-\n-\n42\n-\n-\n-\n-\n-\n-\n-\n-\n...,-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n1...,- -\n- -\n- -\n- -\n- -\n- -\n- -\n- -\n- -\n-...,,-\n-\n-\n-\n-\n-\n1\n-\n-\n-\n-\n-\n-\n-\n-\n-...
5,TOTAL -,,19 21,,71,195,3 979 7 415,,42,1,- 1,,1


In [64]:
# dfx

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,FEDERATIVA,,,,,,,,,,,,,,
1,,,2015,,2014,,2015,,2014,,2015,,,,2014
2,,,Acum.,,,,Acum.,,,,,Acum.,,,
3,,Sem.,,,Acum.,Sem.,,,Acum.,Sem.,,,,,Acum.
4,,,M,F,,,M,F,,,M,,F,,
5,Aguascalientes,-,-,-,-,-,-,1,-,-,,-,,-,-
6,Baja California,-,-,-,2,-,-,-,-,-,,-,,-,-
7,Baja California Sur,-,-,-,-,1,57,91,-,-,,-,,-,-
8,Campeche,-,-,-,-,1,86,165,-,-,,-,,-,-
9,Coahuila,-,-,-,-,-,10,13,-,-,,-,,-,-
