### Importing libraries

In [1]:
import pandas as pd

import camelot

import pdfplumber

import os

### One Table analysis

In [170]:
# Ler a tabela da página 43
tables = camelot.read_pdf(pdf_path, pages='43', flavor='stream')  # Use 'lattice' se a tabela tiver bordas

# Assumindo que há uma única tabela na página 43
df = tables[0].df  # df é um DataFrame do pandas

# Remover linhas anteriores ao cabeçalho
df = df.iloc[4:].reset_index(drop=True)  # Começar a partir da linha 6 (index 5)

# Definir a primeira linha dos dados como cabeçalho
df.columns = df.iloc[0]
df = df[1:]

# Renomear as colunas se necessário
col_names = [f'Coluna{i+1}' for i in range(len(df.columns))]
df.columns = col_names

In [171]:
df.head()

Unnamed: 0,Coluna1,Coluna2,Coluna3,Coluna4,Coluna5,Coluna6,Coluna7,Coluna8,Coluna9,Coluna10,Coluna11,Coluna12,Coluna13
1,Aguascalientes,-,4,3,-,-,1,2,13,1,13,14,37
2,Baja California,75,3 362,3 353,8 899,-,274,165,305,-,74,36,157
3,Baja California Sur,58,1 320,1 495,10 521,-,40,15,46,-,6,8,11
4,Campeche,95,3 430,4 508,459,-,8,2,7,-,20,17,43
5,Coahuila,10,881,912,4 262,-,12,19,53,-,55,42,120


In [142]:
column_names = df.columns

for col in column_names[1:13]:
    df[col] = df[col].str.replace(' ', '')

In [143]:
df.head()

Unnamed: 0,Coluna1,Coluna2,Coluna3,Coluna4,Coluna5,Coluna6,Coluna7,Coluna8,Coluna9,Coluna10,Coluna11,Coluna12,Coluna13
1,Aguascalientes,-,4,3,-,-,1,2,13,1,13,14,37
2,Baja California,75,3362,3353,8899,-,274,165,305,-,74,36,157
3,Baja California Sur,58,1320,1495,10521,-,40,15,46,-,6,8,11
4,Campeche,95,3430,4508,459,-,8,2,7,-,20,17,43
5,Coahuila,10,881,912,4262,-,12,19,53,-,55,42,120


In [144]:
# Vamos assumir que a primeira coluna é '0' (você pode precisar ajustar isso com base no nome da coluna real)
first_col_name = df.columns[0]

# Remova linhas onde a primeira coluna é 'TOTAL' ou começa com 'FUENTE'
df = df[~df[first_col_name].str.match(r'^(TOTAL|FUENTE.*)')]

In [145]:
df.head()

Unnamed: 0,Coluna1,Coluna2,Coluna3,Coluna4,Coluna5,Coluna6,Coluna7,Coluna8,Coluna9,Coluna10,Coluna11,Coluna12,Coluna13
1,Aguascalientes,-,4,3,-,-,1,2,13,1,13,14,37
2,Baja California,75,3362,3353,8899,-,274,165,305,-,74,36,157
3,Baja California Sur,58,1320,1495,10521,-,40,15,46,-,6,8,11
4,Campeche,95,3430,4508,459,-,8,2,7,-,20,17,43
5,Coahuila,10,881,912,4262,-,12,19,53,-,55,42,120


In [146]:

indices_para_converter = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

for idx in indices_para_converter:
    df.iloc[:, idx] = pd.to_numeric(df.iloc[:, idx], errors='coerce').fillna(0).astype(int)

In [147]:
pdf = pdfplumber.open(pdf_path)

page = pdf.pages[42]

table = page.extract_table()

df_header = pd.DataFrame(table)

In [148]:
df1 = df.iloc[:,0:5]

df1['disease'] = df_header.iloc[0,1].replace('\n',' ')

df1['year'] = df_header.iloc[1,1].replace('\n',' ')

df1['MF'] = df1['Coluna3'] + df1['Coluna4']

df1['Acum_Year'] = df_header.iloc[1,4].replace('\n',' ')

df1.head()

Unnamed: 0,Coluna1,Coluna2,Coluna3,Coluna4,Coluna5,disease,year,MF,Acum_Year
1,Aguascalientes,0,4,3,0,Síndrome Febril CIE-10ª REV. R50,2015,7,2014
2,Baja California,75,3362,3353,8899,Síndrome Febril CIE-10ª REV. R50,2015,6715,2014
3,Baja California Sur,58,1320,1495,10521,Síndrome Febril CIE-10ª REV. R50,2015,2815,2014
4,Campeche,95,3430,4508,459,Síndrome Febril CIE-10ª REV. R50,2015,7938,2014
5,Coahuila,10,881,912,4262,Síndrome Febril CIE-10ª REV. R50,2015,1793,2014


In [149]:

novos_nomes = [df_header.iloc[0,0].replace('\n',' '),
              df_header.iloc[2,1].replace('\n',' '),
              df_header.iloc[3,2].replace('\n',' '),
              df_header.iloc[3,3].replace('\n',' '),
              df_header.iloc[2,4].replace('\n',' ')]

df1.columns = novos_nomes + list(df1.columns[5:])

In [150]:
df1.head()

Unnamed: 0,ENTIDAD FEDERATIVA,Sem.,M,F,Acum.,disease,year,MF,Acum_Year
1,Aguascalientes,0,4,3,0,Síndrome Febril CIE-10ª REV. R50,2015,7,2014
2,Baja California,75,3362,3353,8899,Síndrome Febril CIE-10ª REV. R50,2015,6715,2014
3,Baja California Sur,58,1320,1495,10521,Síndrome Febril CIE-10ª REV. R50,2015,2815,2014
4,Campeche,95,3430,4508,459,Síndrome Febril CIE-10ª REV. R50,2015,7938,2014
5,Coahuila,10,881,912,4262,Síndrome Febril CIE-10ª REV. R50,2015,1793,2014


In [151]:
column_order = [0, 1, 2, 3, 7, 4, 8, 5, 6]

df1 = df1.iloc[:,column_order]

df1.head()

Unnamed: 0,ENTIDAD FEDERATIVA,Sem.,M,F,MF,Acum.,Acum_Year,disease,year
1,Aguascalientes,0,4,3,7,0,2014,Síndrome Febril CIE-10ª REV. R50,2015
2,Baja California,75,3362,3353,6715,8899,2014,Síndrome Febril CIE-10ª REV. R50,2015
3,Baja California Sur,58,1320,1495,2815,10521,2014,Síndrome Febril CIE-10ª REV. R50,2015
4,Campeche,95,3430,4508,7938,459,2014,Síndrome Febril CIE-10ª REV. R50,2015
5,Coahuila,10,881,912,1793,4262,2014,Síndrome Febril CIE-10ª REV. R50,2015


In [153]:

df2 = df.iloc[:, [0] + list(range(5, 9))].copy()

df2['disease'] = df_header.iloc[0,5].replace('\n',' ')

df2['year'] = df_header.iloc[1,4].replace('\n',' ')

df2['MF'] = df2['Coluna7'] + df2['Coluna8']

df2['Acum_Year'] = df_header.iloc[1,8].replace('\n',' ')

df2.head()

Unnamed: 0,Coluna1,Coluna6,Coluna7,Coluna8,Coluna9,disease,year,MF,Acum_Year
1,Aguascalientes,0,1,2,13,Efectos del Calor y de la Luz CIE-10ª REV. T67,2014,3,2014
2,Baja California,0,274,165,305,Efectos del Calor y de la Luz CIE-10ª REV. T67,2014,439,2014
3,Baja California Sur,0,40,15,46,Efectos del Calor y de la Luz CIE-10ª REV. T67,2014,55,2014
4,Campeche,0,8,2,7,Efectos del Calor y de la Luz CIE-10ª REV. T67,2014,10,2014
5,Coahuila,0,12,19,53,Efectos del Calor y de la Luz CIE-10ª REV. T67,2014,31,2014


In [154]:

novos_nomes = [df_header.iloc[0,0].replace('\n',' '),
              df_header.iloc[2,1].replace('\n',' '),
              df_header.iloc[3,2].replace('\n',' '),
              df_header.iloc[3,3].replace('\n',' '),
              df_header.iloc[2,4].replace('\n',' ')]

df2.columns = novos_nomes + list(df2.columns[5:])

In [155]:
df2.head()

Unnamed: 0,ENTIDAD FEDERATIVA,Sem.,M,F,Acum.,disease,year,MF,Acum_Year
1,Aguascalientes,0,1,2,13,Efectos del Calor y de la Luz CIE-10ª REV. T67,2014,3,2014
2,Baja California,0,274,165,305,Efectos del Calor y de la Luz CIE-10ª REV. T67,2014,439,2014
3,Baja California Sur,0,40,15,46,Efectos del Calor y de la Luz CIE-10ª REV. T67,2014,55,2014
4,Campeche,0,8,2,7,Efectos del Calor y de la Luz CIE-10ª REV. T67,2014,10,2014
5,Coahuila,0,12,19,53,Efectos del Calor y de la Luz CIE-10ª REV. T67,2014,31,2014


In [156]:
column_order = [0, 1, 2, 3, 7, 4, 8, 5, 6]

df2 = df2.iloc[:,column_order]

df2.head()

Unnamed: 0,ENTIDAD FEDERATIVA,Sem.,M,F,MF,Acum.,Acum_Year,disease,year
1,Aguascalientes,0,1,2,3,13,2014,Efectos del Calor y de la Luz CIE-10ª REV. T67,2014
2,Baja California,0,274,165,439,305,2014,Efectos del Calor y de la Luz CIE-10ª REV. T67,2014
3,Baja California Sur,0,40,15,55,46,2014,Efectos del Calor y de la Luz CIE-10ª REV. T67,2014
4,Campeche,0,8,2,10,7,2014,Efectos del Calor y de la Luz CIE-10ª REV. T67,2014
5,Coahuila,0,12,19,31,53,2014,Efectos del Calor y de la Luz CIE-10ª REV. T67,2014


In [162]:
df3 = df.iloc[:, [0] + list(range(9, 13))].copy()

df3['disease'] = df_header.iloc[0,9].replace('\n',' ')

df3['year'] = df_header.iloc[1,9].replace('\n',' ')

df3['MF'] = df3['Coluna11'] + df3['Coluna12']

df3['Acum_Year'] = df_header.iloc[1,12].replace('\n',' ')

df3.head()

Unnamed: 0,Coluna1,Coluna10,Coluna11,Coluna12,Coluna13,disease,year,MF,Acum_Year
1,Aguascalientes,1,13,14,37,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015,27,2014
2,Baja California,0,74,36,157,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015,110,2014
3,Baja California Sur,0,6,8,11,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015,14,2014
4,Campeche,0,20,17,43,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015,37,2014
5,Coahuila,0,55,42,120,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015,97,2014


In [163]:

novos_nomes = [df_header.iloc[0,0].replace('\n',' '),
              df_header.iloc[2,1].replace('\n',' '),
              df_header.iloc[3,2].replace('\n',' '),
              df_header.iloc[3,3].replace('\n',' '),
              df_header.iloc[2,4].replace('\n',' ')]

df3.columns = novos_nomes + list(df3.columns[5:])

In [164]:
df3.head()

Unnamed: 0,ENTIDAD FEDERATIVA,Sem.,M,F,Acum.,disease,year,MF,Acum_Year
1,Aguascalientes,1,13,14,37,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015,27,2014
2,Baja California,0,74,36,157,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015,110,2014
3,Baja California Sur,0,6,8,11,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015,14,2014
4,Campeche,0,20,17,43,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015,37,2014
5,Coahuila,0,55,42,120,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015,97,2014


In [165]:
column_order = [0, 1, 2, 3, 7, 4, 8, 5, 6]

df3 = df3.iloc[:,column_order]

df3.head()

Unnamed: 0,ENTIDAD FEDERATIVA,Sem.,M,F,MF,Acum.,Acum_Year,disease,year
1,Aguascalientes,1,13,14,27,37,2014,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015
2,Baja California,0,74,36,110,157,2014,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015
3,Baja California Sur,0,6,8,14,11,2014,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015
4,Campeche,0,20,17,37,43,2014,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015
5,Coahuila,0,55,42,97,120,2014,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015


In [166]:
df_all = pd.concat([df1, df2, df3], axis=0, ignore_index=True)

In [167]:
df_all

Unnamed: 0,ENTIDAD FEDERATIVA,Sem.,M,F,MF,Acum.,Acum_Year,disease,year
0,Aguascalientes,0,4,3,7,0,2014,Síndrome Febril CIE-10ª REV. R50,2015
1,Baja California,75,3362,3353,6715,8899,2014,Síndrome Febril CIE-10ª REV. R50,2015
2,Baja California Sur,58,1320,1495,2815,10521,2014,Síndrome Febril CIE-10ª REV. R50,2015
3,Campeche,95,3430,4508,7938,459,2014,Síndrome Febril CIE-10ª REV. R50,2015
4,Coahuila,10,881,912,1793,4262,2014,Síndrome Febril CIE-10ª REV. R50,2015
...,...,...,...,...,...,...,...,...,...
91,Tamaulipas,0,105,96,201,173,2014,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015
92,Tlaxcala,0,11,7,18,28,2014,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015
93,Veracruz,0,54,75,129,152,2014,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015
94,Yucatán,0,25,20,45,42,2014,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015


In [41]:
print(year_df)

[     ENTIDAD FEDERATIVA Sem.   M   F   MF Acum. Acum_Year  \
0        Aguascalientes    0   0   0    0     0      2014   
1       Baja California    8  87  74  161   502      2014   
2   Baja California Sur    6   5  11   16    91      2014   
3              Campeche   17  26  18   44    66      2014   
4              Coahuila   76  83  65  148   149      2014   
..                  ...  ...  ..  ..  ...   ...       ...   
91           Tamaulipas    4   2   5    7     0      2014   
92             Tlaxcala    0   0   0    0     0      2014   
93             Veracruz    1   0   1    1     0      2014   
94              Yucatán    0   0   0    0     2      2014   
95            Zacatecas    2   1   1    2     0      2014   

                                            disease  year  
0                  Síndrome Febril CIE-10ª REV. R50  2015  
1                  Síndrome Febril CIE-10ª REV. R50  2015  
2                  Síndrome Febril CIE-10ª REV. R50  2015  
3                  Síndrom

In [39]:

folder_path = '/home/pirata/Documents/projects/epidemic_database/files/2015'
desired_disease = 'Síndrome Febril CIE-10ª REV. R50'
process_all_pdfs_in_folder(folder_path, desired_disease)

Processando a página 43 com a tabela desejada.


Unnamed: 0,ENTIDAD FEDERATIVA,Sem.,M,F,MF,Acum.,Acum_Year,disease,year
0,Aguascalientes,0,4,3,7,0,2014,Síndrome Febril CIE-10ª REV. R50,2015
1,Baja California,75,3362,3353,6715,8899,2014,Síndrome Febril CIE-10ª REV. R50,2015
2,Baja California Sur,58,1320,1495,2815,10521,2014,Síndrome Febril CIE-10ª REV. R50,2015
3,Campeche,95,3430,4508,7938,459,2014,Síndrome Febril CIE-10ª REV. R50,2015
4,Coahuila,10,881,912,1793,4262,2014,Síndrome Febril CIE-10ª REV. R50,2015
...,...,...,...,...,...,...,...,...,...
91,Tamaulipas,0,105,96,201,173,2014,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015
92,Tlaxcala,0,11,7,18,28,2014,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015
93,Veracruz,0,54,75,129,152,2014,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015
94,Yucatán,0,25,20,45,42,2014,§Enfermedad Febril Exantemática CIE-10ª REV. U97,2015


### Header Analysis

In [3]:
pdf_path = '/home/pirata/Documents/projects/epidemic_database/files/2015_teste/sem01.pdf'


In [36]:
tables = camelot.read_pdf(pdf_path, pages=str(41), flavor='stream')


pdf = pdfplumber.open(pdf_path)
dfx = tables[0].df
page = pdf.pages[40]
table = page.extract_table()
df_header = pd.DataFrame(table)

df_header

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,ENTIDAD\nFEDERATIVA,Síndrome Febril\nCIE-10ª REV.\nR50,,,Efectos del Calor y de la Luz\nCIE-10ª REV.\nT67,,,Enfermedad Febril Exantemática\nCIE-10ª REV.\nU97,,,
1,,2014,,,2014,,,2015,,,2014
2,,Sem.,Acum.,,Sem.,Acum.,,Sem.,Acum.,,Acum.
3,,,M,F,,M,F,,M,F,
4,Aguascalientes,,626 588\n4 142 3 942\n4 682 5 234\n1 321 1 237...,,-\n-\n-\n-\n-\n-\n1\n-\n-\n-\n-\n1\n-\n-\n2\n1...,7 7\n134 91\n25 17\n3 3\n26 27\n48 36\n60 76\n...,,-\n-\n1\n-\n-\n-\n-\n-\n-\n-\n-\n-\n1\n-\n-\n-...,- -\n- -\n- 1\n- -\n- -\n- -\n- -\n- -\n- -\n-...,,-
5,TOTAL 1 687,,122 669 133 011,,9,1 289 1 365,,4,1 3,,-


In [15]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,FEDERATIVA,,,,,,,,,,,,,,
1,,,2014,,,,2014,,,,2015,,,,2014
2,,,Acum.,,,,Acum.,,,,,Acum.,,,
3,,Sem.,,,Sem.,,,,Sem.,,,,,,Acum.
4,,,M,F,,,M,F,,,M,,F,,
5,Aguascalientes,-,626,588,,-,7,7,,-,,-,,-,-
6,Baja California,2,4 142,3 942,,-,134,91,,-,,-,,-,-
7,Baja California Sur,10,4 682,5 234,,-,25,17,,1,,-,,1,-
8,Campeche,21,1 321,1 237,,-,3,3,,-,,-,,-,-
9,Coahuila,82,2 215,2 233,,-,26,27,,-,,-,,-,-


In [9]:
pdf = pdfplumber.open(pdf_path)

page = pdf.pages[40]

table = page.extract_table()

df_header = pd.DataFrame(table)

df_header

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,ENTIDAD\nFEDERATIVA,Síndrome Febril\nCIE-10ª REV.\nR50,,,Efectos del Calor y de la Luz\nCIE-10ª REV.\nT67,,,Enfermedad Febril Exantemática\nCIE-10ª REV.\nU97,,,
1,,2014,,,2014,,,2015,,,2014
2,,Sem.,Acum.,,Sem.,Acum.,,Sem.,Acum.,,Acum.
3,,,M,F,,M,F,,M,F,
4,Aguascalientes,,626 588\n4 142 3 942\n4 682 5 234\n1 321 1 237...,,-\n-\n-\n-\n-\n-\n1\n-\n-\n-\n-\n1\n-\n-\n2\n1...,7 7\n134 91\n25 17\n3 3\n26 27\n48 36\n60 76\n...,,-\n-\n1\n-\n-\n-\n-\n-\n-\n-\n-\n-\n1\n-\n-\n-...,- -\n- -\n- 1\n- -\n- -\n- -\n- -\n- -\n- -\n-...,,-
5,TOTAL 1 687,,122 669 133 011,,9,1 289 1 365,,4,1 3,,-


### All tables one file function

In [29]:
def process_table_13_columns(df, df_header):
    df = df.iloc[4:].reset_index(drop=True)
    df.columns = df.iloc[0]
    df = df[1:]

    col_names = [f'Coluna{i+1}' for i in range(len(df.columns))]
    df.columns = col_names

    for col in col_names[1:13]:
        df[col] = df[col].str.replace(' ', '')

    first_col_name = df.columns[0]
    df = df[~df[first_col_name].str.match(r'^(TOTAL|FUENTE.*)')]

    indices_para_converter = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    for idx in indices_para_converter:
        df.iloc[:, idx] = pd.to_numeric(df.iloc[:, idx], errors='coerce').fillna(0).astype(int)

    df1 = df.iloc[:, 0:5].copy()
    if df_header.shape[0] > 0 and df_header.shape[1] > 1:
        df1['disease'] = df_header.iloc[0, 1].replace('\n', ' ')
    if df_header.shape[0] > 1 and df_header.shape[1] > 1:
        df1['year'] = df_header.iloc[1, 1].replace('\n', ' ')
    if df_header.shape[0] > 1 and df_header.shape[1] > 4:
        df1['Acum_Year'] = df_header.iloc[1, 4].replace('\n', ' ')

    if df.shape[1] > 4:
        df1['MF'] = df1['Coluna3'] + df1['Coluna4']

    novos_nomes = [
        df_header.iloc[0, 0].replace('\n', ' '),
        df_header.iloc[2, 1].replace('\n', ' '),
        df_header.iloc[3, 2].replace('\n', ' '),
        df_header.iloc[3, 3].replace('\n', ' '),
        df_header.iloc[2, 4].replace('\n', ' ')
    ]
    df1.columns = novos_nomes + list(df1.columns[5:])
    column_order = [0, 1, 2, 3, 7, 4, 8, 5, 6]
    df1 = df1.iloc[:, column_order]

    df2 = df.iloc[:, [0] + list(range(5, 9))].copy()
    if df_header.shape[0] > 0 and df_header.shape[1] > 5:
        df2['disease'] = df_header.iloc[0, 5].replace('\n', ' ')
    if df_header.shape[0] > 1 and df_header.shape[1] > 4:
        df2['year'] = df_header.iloc[1, 4].replace('\n', ' ')
    if df_header.shape[0] > 1 and df_header.shape[1] > 8:
        df2['Acum_Year'] = df_header.iloc[1, 8].replace('\n', ' ')
    if df2.shape[1] > 7 and df2.shape[1] > 8:
        df2['MF'] = df2['Coluna7'] + df2['Coluna8']
        df2.columns = novos_nomes + list(df2.columns[5:])
        df2 = df2.iloc[:, column_order]

    df3 = df.iloc[:, [0] + list(range(9, 13))].copy()
    if df_header.shape[0] > 0 and df_header.shape[1] > 9:
        df3['disease'] = df_header.iloc[0, 9].replace('\n', ' ')
    if df_header.shape[0] > 1 and df_header.shape[1] > 9:
        df3['year'] = df_header.iloc[1, 9].replace('\n', ' ')
    if df_header.shape[0] > 1 and df_header.shape[1] > 12:
        df3['Acum_Year'] = df_header.iloc[1, 12].replace('\n', ' ')
    if df3.shape[1] > 11 and df3.shape[1] > 12:
        df3['MF'] = df3['Coluna11'] + df3['Coluna12']
        df3.columns = novos_nomes + list(df3.columns[5:])
        df3 = df3.iloc[:, column_order]

    df_all = pd.concat([df1, df2, df3], axis=0, ignore_index=True)

    return df_all

def process_table_9_columns(df, df_header):
    df = df.iloc[4:].reset_index(drop=True)
    df.columns = df.iloc[0]
    df = df[1:]

    col_names = [f'Coluna{i+1}' for i in range(len(df.columns))]
    df.columns = col_names

    for col in col_names[1:9]:
        df[col] = df[col].str.replace(' ', '')

    first_col_name = df.columns[0]
    df = df[~df[first_col_name].str.match(r'^(TOTAL|FUENTE.*)')]

    indices_para_converter = [1, 2, 3, 4, 5, 6, 7, 8]
    for idx in indices_para_converter:
        df.iloc[:, idx] = pd.to_numeric(df.iloc[:, idx], errors='coerce').fillna(0).astype(int)

    df1 = df.iloc[:, 0:5].copy()
    if df_header.shape[0] > 0 and df_header.shape[1] > 1:
        df1['disease'] = df_header.iloc[0, 1].replace('\n', ' ')
    if df_header.shape[0] > 1 and df_header.shape[1] > 1:
        df1['year'] = df_header.iloc[1, 1].replace('\n', ' ')
    if df_header.shape[0] > 1 and df_header.shape[1] > 4:
        df1['Acum_Year'] = df_header.iloc[1, 4].replace('\n', ' ')

    if df.shape[1] > 4:
        df1['MF'] = df1['Coluna3'] + df1['Coluna4']

    novos_nomes = [
        df_header.iloc[0, 0].replace('\n', ' '),
        df_header.iloc[2, 1].replace('\n', ' '),
        df_header.iloc[3, 2].replace('\n', ' '),
        df_header.iloc[3, 3].replace('\n', ' '),
        df_header.iloc[2, 4].replace('\n', ' ')
    ]
    df1.columns = novos_nomes + list(df1.columns[5:])
    column_order = [0, 1, 2, 3, 7, 4, 8, 5, 6]
    df1 = df1.iloc[:, column_order]

    df2 = df.iloc[:, [0] + list(range(5, 9))].copy()
    if df_header.shape[0] > 0 and df_header.shape[1] > 5:
        df2['disease'] = df_header.iloc[0, 5].replace('\n', ' ')
    if df_header.shape[0] > 1 and df_header.shape[1] > 4:
        df2['year'] = df_header.iloc[1, 4].replace('\n', ' ')
    if df_header.shape[0] > 1 and df_header.shape[1] > 8:
        df2['Acum_Year'] = df_header.iloc[1, 8].replace('\n', ' ')
    if df2.shape[1] > 7 and df2.shape[1] > 8:
        df2['MF'] = df2['Coluna7'] + df2['Coluna8']
        df2.columns = novos_nomes + list(df2.columns[5:])
        df2 = df2.iloc[:, column_order]

    df_all = pd.concat([df1, df2], axis=0, ignore_index=True)

    return df_all

def process_pdf_tables(pdf_path, output_dir):
    all_dfs = []
    pdf = pdfplumber.open(pdf_path)
    
    for page_num in range(len(pdf.pages)):
        df_all = None
        try:
            tables = camelot.read_pdf(pdf_path, pages=str(page_num + 1), flavor='stream')
            
            if tables:
                df = tables[0].df
                page = pdf.pages[page_num]
                table = page.extract_table()
                df_header = pd.DataFrame(table)
                
                num_columns = len(df.columns)
                
                if num_columns >= 13:
                    df_all = process_table_13_columns(df, df_header)
                elif num_columns == 9:
                    df_all = process_table_9_columns(df, df_header)
                else:
                    print(f"Página {page_num + 1} tem um número inesperado de colunas: {num_columns}")

                if df_all is not None:
                    all_dfs.append(df_all)

        except Exception as e:
            print(f"Erro ao processar a página {page_num + 1}: {e}")
    
    pdf.close()
    
    final_df = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
    
    # Extrai o nome do arquivo PDF sem a extensão
    pdf_filename = os.path.basename(pdf_path)
    csv_filename = os.path.splitext(pdf_filename)[0] + '.csv'
    
    # Gera o caminho completo para salvar o arquivo CSV
    output_path = os.path.join(output_dir, csv_filename)
    
    # Salva o DataFrame final em um arquivo CSV
    final_df.to_csv(output_path, index=False, encoding='utf-8-sig')
    
    return final_df

In [30]:
# Exemplo de uso da função
# pdf_path = "caminho/para/o/seu/arquivo.pdf"
final_df = process_pdf_tables(pdf_path, output_dir)
print(resultado)

Página 1 tem um número inesperado de colunas: 2
Página 2 tem um número inesperado de colunas: 1
Página 3 tem um número inesperado de colunas: 2
Página 4 tem um número inesperado de colunas: 1
Página 5 tem um número inesperado de colunas: 3
Página 6 tem um número inesperado de colunas: 1
Página 7 tem um número inesperado de colunas: 5
Página 8 tem um número inesperado de colunas: 2
Página 9 tem um número inesperado de colunas: 8
Página 10 tem um número inesperado de colunas: 11
Página 14 tem um número inesperado de colunas: 11
Página 15 tem um número inesperado de colunas: 10
Página 17 tem um número inesperado de colunas: 11
Página 27 tem um número inesperado de colunas: 12
Erro ao processar a página 28: 'NoneType' object has no attribute 'replace'
Erro ao processar a página 31: 'NoneType' object has no attribute 'replace'
Erro ao processar a página 32: 'NoneType' object has no attribute 'replace'
Página 41 tem um número inesperado de colunas: 11
Página 42 tem um número inesperado de co

### All tables all files in folder function

In [17]:
def process_all_pdfs_in_folder(source_path, sink_path):
    year_df = []
    for filename in os.listdir(source_path):
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(source_path, filename)
            print(f"Processando o arquivo: {pdf_path}")
            week_df = extract_and_process_tables(pdf_path, filename)
            year_df.append(week_df)
            
    if year_df:
        concatenated_df = pd.concat(year_df, ignore_index=True)

        pdf_filename = source_path.rsplit('/', 1)[-1]
        print("Salvando")
        csv_filename = os.path.splitext(pdf_filename)[0] + '.csv'
        output_path = os.path.join(sink_path, csv_filename)
        concatenated_df.to_csv(output_path, index=False, encoding='utf-8-sig',mode='w')
        print(f"Arquivo salvo: {csv_filename}")

In [36]:
def extract_and_process_tables(pdf_path, filename):
    all_dfs = []
    pdf = pdfplumber.open(pdf_path)

    for page_num in range(len(pdf.pages)):
        try:
            tables = camelot.read_pdf(pdf_path, pages=str(page_num + 1), flavor='stream')

            if tables:
                df = tables[0].df
                page = pdf.pages[page_num]
                table = page.extract_table()
                df_header = pd.DataFrame(table)
                
                if (not df_header.empty 
                    and df_header.shape[0] > 1  # Verifica se há pelo menos 2 linhas
                    and df_header.shape[1] > 1  # Verifica se há pelo menos 2 colunas
                    and df_header.iloc[0, 1] is not None 
                    and isinstance(df_header.iloc[0, 1], str)):
                    
                    num_columns = len(df.columns)
                    
                    if filename[3:5] == '01':
                        
                        if num_columns == 9:
#                             df_all = process_table_9(df, df_header, filename)
                            df_all = pd.DataFrame()
                            print('9 colunas')
                            
                        elif num_columns == 11:
                            df_all = process_table_11(df, df_header, filename)
                            
                        elif num_columns == 13:
                            df_all = process_table_13(df, df_header, filename)
                            
                        else:
                            df_all = pd.DataFrame()
                        
                    else:
                        
                        if num_columns == 9:
#                             df_all = process_table_9(df, df_header, filename)
                            df_all = pd.DataFrame()
                            print('9 colunas')
                            
                        elif num_columns == 11:
                            df_all = process_table_11(df, df_header, filename)
                            
                        elif num_columns == 13:
                            df_all = process_table_13(df, df_header, filename)
                            
                        else:
                            df_all = pd.DataFrame()
                    
                        
                    if not df_all.empty:
                        all_dfs.append(df_all)
#                         print(all_dfs)
                        print(f"Processando a página {page_num + 1} com a tabela desejada.")

        except Exception as e:
            print(f"Erro ao processar a página {page_num + 1}: {e}")

    pdf.close()

    final_df = pd.concat(all_dfs, ignore_index=True) #if all_dfs else pd.DataFrame()


    return final_df

In [2]:
# Defina a função para processar as tabelas do PDF
def process_table_13(df, df_header, filename):
    if df.empty or df_header.empty:
        print("DataFrame ou cabeçalho vazio.")
        return pd.DataFrame()  # Retorna um DataFrame vazio se não houver dados
    
    weeknumber = filename[3:5]

    df = df.iloc[4:].reset_index(drop=True)
    df.columns = df.iloc[0]
    df = df[1:]

    col_names = [f'Coluna{i+1}' for i in range(len(df.columns))]
    df.columns = col_names

    for col in col_names[1:13]:
        df[col] = df[col].str.replace(' ', '')

    first_col_name = df.columns[0]
    df = df[~df[first_col_name].str.match(r'^(TOTAL|FUENTE.*)')]

    indices_para_converter = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    for idx in indices_para_converter:
        if idx < len(df.columns):  # Verifica se o índice está dentro dos limites
            df.iloc[:, idx] = pd.to_numeric(df.iloc[:, idx], errors='coerce').fillna(0).astype(int)

    df1 = df.iloc[:, 0:5].copy()
    if len(df_header) > 1 and len(df_header.columns) > 1:
        df1['disease'] = df_header.iloc[0, 1].replace('\n', ' ')
        df1['year'] = df_header.iloc[1, 1].replace('\n', ' ')
    else:
        df1['disease'] = ''
        df1['year'] = ''

    df1['MF'] = df1.get('Coluna3', 0) + df1.get('Coluna4', 0)
    df1['Acum_Year'] = df_header.iloc[1, 4].replace('\n', ' ') if len(df_header) > 1 and len(df_header.columns) > 4 else ''
    
    df1['week'] = weeknumber

    novos_nomes = [
        df_header.iloc[0, 0].replace('\n', ' ') if len(df_header) > 0 and len(df_header.columns) > 0 else '',
        df_header.iloc[2, 1].replace('\n', ' ') if len(df_header) > 2 and len(df_header.columns) > 1 else '',
        df_header.iloc[3, 2].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 2 else '',
        df_header.iloc[3, 3].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 3 else '',
        df_header.iloc[2, 4].replace('\n', ' ') if len(df_header) > 2 and len(df_header.columns) > 4 else ''
    ]
    
    df1.columns = novos_nomes + list(df1.columns[5:])
    column_order = [0, 1, 2, 3, 7, 4, 8, 5, 6, 9]
    df1 = df1.iloc[:, column_order]

    df2 = df.iloc[:, [0] + list(range(5, 9))].copy()
    if len(df_header) > 0 and len(df_header.columns) > 5:
        df2['disease'] = df_header.iloc[0, 5].replace('\n', ' ')
        df2['year'] = df_header.iloc[1, 4].replace('\n', ' ')
    else:
        df2['disease'] = ''
        df2['year'] = ''

    df2['MF'] = df2.get('Coluna7', 0) + df2.get('Coluna8', 0)
    df2['Acum_Year'] = df_header.iloc[1, 8].replace('\n', ' ') if len(df_header) > 1 and len(df_header.columns) > 8 else ''
    
    df2['week'] = weeknumber

    df2.columns = novos_nomes + list(df2.columns[5:])
    df2 = df2.iloc[:, column_order]

    df3 = df.iloc[:, [0] + list(range(9, 13))].copy()
    if len(df_header) > 0 and len(df_header.columns) > 9:
        df3['disease'] = df_header.iloc[0, 9].replace('\n', ' ')
        df3['year'] = df_header.iloc[1, 9].replace('\n', ' ')
    else:
        df3['disease'] = ''
        df3['year'] = ''

    df3['MF'] = df3.get('Coluna11', 0) + df3.get('Coluna12', 0)
    df3['Acum_Year'] = df_header.iloc[1, 12].replace('\n', ' ') if len(df_header) > 1 and len(df_header.columns) > 12 else ''
    
    df3['week'] = weeknumber

    df3.columns = novos_nomes + list(df3.columns[5:])
    df3 = df3.iloc[:, column_order]

    df_all = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
    

    return df_all

In [3]:
# Defina a função para processar as tabelas do PDF
def process_table_11(df, df_header, filename):
    if df.empty or df_header.empty:
        print("DataFrame ou cabeçalho vazio.")
        return pd.DataFrame()  # Retorna um DataFrame vazio se não houver dados
    
    weeknumber = filename[3:5]

    df = df.iloc[4:].reset_index(drop=True)
    df.columns = df.iloc[0]
    df = df[1:]

    col_names = [f'Coluna{i+1}' for i in range(len(df.columns))]
    df.columns = col_names

    for col in col_names[1:9]:
        df[col] = df[col].str.replace(' ', '')

    first_col_name = df.columns[0]
    df = df[~df[first_col_name].str.match(r'^(TOTAL|FUENTE.*)')]

    indices_para_converter = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    for idx in indices_para_converter:
        if idx < len(df.columns):  # Verifica se o índice está dentro dos limites
            df.iloc[:, idx] = pd.to_numeric(df.iloc[:, idx], errors='coerce').fillna(0).astype(int)

    df1 = df.iloc[:, 0:5].copy()
    if len(df_header) > 1 and len(df_header.columns) > 1:
        df1['disease'] = df_header.iloc[0, 1].replace('\n', ' ')
        df1['year'] = df_header.iloc[1, 1].replace('\n', ' ')
    else:
        df1['disease'] = ''
        df1['year'] = ''

    df1['MF'] = df1.get('Coluna3', 0) + df1.get('Coluna4', 0)
    df1['Acum_Year'] = int(df_header.iloc[1, 1].replace('\n', ' ')) - 1

    df1['week'] = weeknumber

    novos_nomes = [
        df_header.iloc[0, 0].replace('\n', ' ') if len(df_header) > 0 and len(df_header.columns) > 0 else '',
        df_header.iloc[2, 1].replace('\n', ' ') if len(df_header) > 2 and len(df_header.columns) > 1 else '',
        df_header.iloc[3, 2].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 2 else '',
        df_header.iloc[3, 3].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 3 else '',
        'Acum.'
    ]

    df1.columns = novos_nomes + list(df1.columns[5:])
    column_order = [0, 1, 2, 3, 7, 4, 8, 5, 6, 9]
    df1 = df1.iloc[:, column_order]

    df2 = df.iloc[:, [0] + list(range(5, 9))].copy()
    if len(df_header) > 0 and len(df_header.columns) > 5:
        df2['disease'] = df_header.iloc[0, 4].replace('\n', ' ')
        df2['year'] = df_header.iloc[1, 4].replace('\n', ' ')
    else:
        df2['disease'] = ''
        df2['year'] = ''

    df2['MF'] = df2.get('Coluna7', 0) + df2.get('Coluna8', 0)
    df2['Acum_Year'] = int(df_header.iloc[1, 4].replace('\n', ' ')) - 1

    df2['week'] = weeknumber

    df2.columns = novos_nomes + list(df2.columns[5:])
    df2 = df2.iloc[:, column_order]

    df3 = df.iloc[:, [0] + list(range(8, 14))].copy()
    if len(df_header) > 0 and len(df_header.columns) > 9:
        df3['disease'] = df_header.iloc[0, 7].replace('\n', ' ')
        df3['year'] = df_header.iloc[1, 7].replace('\n', ' ')
    else:
        df3['disease'] = ''
        df3['year'] = ''

    df3['Coluna12'] = pd.to_numeric(df3['Coluna12'], errors='coerce').fillna(0).astype(int)
    df3['Coluna14'] = pd.to_numeric(df3['Coluna14'], errors='coerce').fillna(0).astype(int)

    df3['MF'] = df3.get('Coluna12', 0) + df3.get('Coluna14', 0)
    df3['Acum_Year'] = int(df_header.iloc[1, 7].replace('\n', ' ')) - 1

    df3['week'] = weeknumber

    df3 = df3.drop(['Coluna11','Coluna13'],axis = 1)

    column_order = [0, 2, 3, 4, 1, 5, 6, 7, 8, 9]

    df3 = df3.iloc[:, column_order]

    df3.columns = novos_nomes + list(df3.columns[5:])

    column_order = [0, 1, 2, 3, 7, 4, 8, 5, 6, 9]
    df3 = df3.iloc[:, column_order]

    df_all = pd.concat([df1, df2, df3], axis=0, ignore_index=True)

    return df_all

In [94]:
# Defina a função para processar as tabelas do PDF
def process_table_10(df, df_header, filename):
    if df.empty or df_header.empty:
        print("DataFrame ou cabeçalho vazio.")
        return pd.DataFrame()  # Retorna um DataFrame vazio se não houver dados
    
    weeknumber = filename[3:5]

    df = df.iloc[6:].reset_index(drop=True)
    df.columns = df.iloc[0]
    # df = df[:]

    col_names = [f'Coluna{i+1}' for i in range(len(df.columns))]
    df.columns = col_names

    for col in col_names[1:10]:
        df[col] = df[col].str.replace(' ', '')

    first_col_name = df.columns[0]
    df = df[~df[first_col_name].str.match(r'^(TOTAL|FUENTE.*)')]

    indices_para_converter = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    for idx in indices_para_converter:
        if idx < len(df.columns):  # Verifica se o índice está dentro dos limites
            df.iloc[:, idx] = pd.to_numeric(df.iloc[:, idx], errors='coerce').fillna(0).astype(int)

    df = df.drop(['Coluna3','Coluna5'],axis = 1)

    df1 = df.iloc[:, 0:5].copy()
    if len(df_header) > 1 and len(df_header.columns) > 1:
        df1['disease'] = df_header.iloc[0, 1].replace('\n', ' ')
        df1['year'] = df_header.iloc[1, 1].replace('\n', ' ')
    else:
        df1['disease'] = ''
        df1['year'] = ''

    df1['MF'] = df1.get('Coluna4', 0) + df1.get('Coluna6', 0)
    df1['Acum_Year'] = int(df_header.iloc[1, 1].replace('\n', ' ')) - 1
    df1['En_Estudio'] = ''


    df1['week'] = weeknumber

    novos_nomes = [
        df_header.iloc[0, 0].replace('\n', ' ') if len(df_header) > 0 and len(df_header.columns) > 0 else '',
        df_header.iloc[3, 1].replace('\n', ' ') if len(df_header) > 2 and len(df_header.columns) > 1 else '',
        df_header.iloc[4, 2].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 2 else '',
        df_header.iloc[4, 3].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 3 else '',
        'Acum.'
    ]

    df1.columns = novos_nomes + list(df1.columns[5:])
    column_order = [0, 1, 2, 3, 7, 4, 8, 5, 6, 9]
    df1 = df1.iloc[:, column_order]

    df2 = df.iloc[:, [0] + list(range(5, 10))].copy()
    if len(df_header) > 0 and len(df_header.columns) > 5:
        df2['disease'] = df_header.iloc[0, 5].replace('\n', ' ')
        df2['year'] = df_header.iloc[1, 5].replace('\n', ' ')
    else:
        df2['disease'] = ''
        df2['year'] = ''

    df2['MF'] = df2.get('Coluna10', 0) + df2.get('Coluna11', 0)
    df2['Acum_Year'] = int(df_header.iloc[1, 5].replace('\n', ' ')) - 1

    df2['week'] = weeknumber

    column_order = [0, 2, 3, 4, 8, 5, 9, 6, 7, 10, 1]

    df2 = df2.iloc[:, column_order]

    novos_nomes = [
        df_header.iloc[0, 0].replace('\n', ' ') if len(df_header) > 0 and len(df_header.columns) > 0 else '',
        df_header.iloc[3, 1].replace('\n', ' ') if len(df_header) > 2 and len(df_header.columns) > 1 else '',
        df_header.iloc[4, 2].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 2 else '',
        df_header.iloc[4, 3].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 3 else '',
        'MF',
        'Acum.'
    ]

    df2.columns = novos_nomes + list(df2.columns[6:10]) + ['En_Estudio']


    df_all = pd.concat([df1, df2], axis=0, ignore_index=True)

    return df_all

In [37]:
source_path = '/home/pirata/Documents/projects/epidemic_database/files/2015_teste'
sink_path = '/home/pirata/Documents/projects/epidemic_database/bases/'
process_all_pdfs_in_folder(source_path, sink_path)

Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2015_teste/sem01.pdf
Erro ao processar a página 7: 'NoneType' object has no attribute 'replace'
Erro ao processar a página 8: 'NoneType' object has no attribute 'replace'
Erro ao processar a página 12: 'NoneType' object has no attribute 'replace'
Erro ao processar a página 14: 'NoneType' object has no attribute 'replace'
Erro ao processar a página 15: 'NoneType' object has no attribute 'replace'
Processando a página 19 com a tabela desejada.
Processando a página 20 com a tabela desejada.
Processando a página 21 com a tabela desejada.
9 colunas
Processando a página 23 com a tabela desejada.
Processando a página 24 com a tabela desejada.
Erro ao processar a página 25: 'NoneType' object has no attribute 'replace'
9 colunas
Processando a página 27 com a tabela desejada.
Erro ao processar a página 34: 'NoneType' object has no attribute 'replace'
Erro ao processar a página 37: 'NoneType' object has no attribute 'r

In [129]:
pdf_path = '/home/pirata/Documents/projects/epidemic_database/files/2015_teste/sem43.pdf'

desired_disease = 'Fiebre por Dengue'

pdf = pdfplumber.open(pdf_path)

page_num = 30

tables = camelot.read_pdf(pdf_path, pages=str(page_num + 1), flavor='stream')

if tables:
    dfx = tables[0].df
    page = pdf.pages[page_num]
    table = page.extract_table()
    df_header = pd.DataFrame(table)

In [127]:
if (not df_header.empty 
                    and df_header.shape[0] > 1  # Verifica se há pelo menos 2 linhas
                    and df_header.shape[1] > 1  # Verifica se há pelo menos 2 colunas
                    and df_header.iloc[0, 1] is not None 
                    and isinstance(df_header.iloc[0, 1], str)
                    and desired_disease in df_header.iloc[0, 1]):
#                     and df_header.iloc[0, 1].str.contains('Dengue', regex=False).any()):   
                    
                    num_columns = len(df.columns)
                    print('sim')
else:
    print('nao')
                    

sim


In [123]:
df_header.iloc[0, 1]

'Otras Rickettsiosis\nCIE-10ª REV.\nA75.9'

In [131]:
filename = 'sem43.pdf'

weeknumber = filename[3:5]

df = dfx.iloc[6:].reset_index(drop=True)
df.columns = df.iloc[0]
# df = df[:]

col_names = [f'Coluna{i+1}' for i in range(len(df.columns))]
df.columns = col_names

for col in col_names[1:10]:
    df[col] = df[col].str.replace(' ', '')

first_col_name = df.columns[0]
df = df[~df[first_col_name].str.match(r'^(TOTAL|FUENTE.*)')]

indices_para_converter = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for idx in indices_para_converter:
    if idx < len(df.columns):  # Verifica se o índice está dentro dos limites
        df.iloc[:, idx] = pd.to_numeric(df.iloc[:, idx], errors='coerce').fillna(0).astype(int)
        
df = df.drop(['Coluna3','Coluna5'],axis = 1)

df1 = df.iloc[:, 0:5].copy()
if len(df_header) > 1 and len(df_header.columns) > 1:
    df1['disease'] = df_header.iloc[0, 1].replace('\n', ' ')
    df1['year'] = df_header.iloc[1, 1].replace('\n', ' ')
else:
    df1['disease'] = ''
    df1['year'] = ''

df1['MF'] = df1.get('Coluna4', 0) + df1.get('Coluna6', 0)
df1['Acum_Year'] = int(df_header.iloc[1, 1].replace('\n', ' ')) - 1
df1['week'] = weeknumber
df1['En_Estudio'] = ''

novos_nomes = [
    df_header.iloc[0, 0].replace('\n', ' ') if len(df_header) > 0 and len(df_header.columns) > 0 else '',
    df_header.iloc[3, 1].replace('\n', ' ') if len(df_header) > 2 and len(df_header.columns) > 1 else '',
    df_header.iloc[4, 2].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 2 else '',
    df_header.iloc[4, 3].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 3 else '',
    'Acum.'
]

df1.columns = novos_nomes + list(df1.columns[5:])
column_order = [0, 1, 2, 3, 7, 4, 8, 5, 6, 9, 10]
df1 = df1.iloc[:, column_order]

df2 = df.iloc[:, [0] + list(range(5, 10))].copy()
if len(df_header) > 0 and len(df_header.columns) > 5:
    df2['disease'] = df_header.iloc[0, 5].replace('\n', ' ')
    df2['year'] = df_header.iloc[1, 5].replace('\n', ' ')
else:
    df2['disease'] = ''
    df2['year'] = ''

df2['MF'] = df2.get('Coluna10', 0) + df2.get('Coluna11', 0)
df2['Acum_Year'] = int(df_header.iloc[1, 5].replace('\n', ' ')) - 1

df2['week'] = weeknumber

column_order = [0, 2, 3, 4, 8, 5, 9, 6, 7, 10, 1]

df2 = df2.iloc[:, column_order]

novos_nomes = [
    df_header.iloc[0, 0].replace('\n', ' ') if len(df_header) > 0 and len(df_header.columns) > 0 else '',
    df_header.iloc[3, 1].replace('\n', ' ') if len(df_header) > 2 and len(df_header.columns) > 1 else '',
    df_header.iloc[4, 2].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 2 else '',
    df_header.iloc[4, 3].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 3 else '',
    'MF',
    'Acum.'
]

df2.columns = novos_nomes + list(df2.columns[6:10]) + ['En_Estudio']


df_all = pd.concat([df1, df2], axis=0, ignore_index=True)

# return df_all


df2

Unnamed: 0,Baja California,34,33,55,2,2.1,-,-.1,-.2,-.3
0,Baja California,34,33,55,2,2,-,-,-,-
1,Baja California Sur,58,90,169,2 979,4,-,2,1,77
2,Campeche,-,146,197,366,-,-,41,35,76
3,Coahuila,8,20,16,139,3,-,-,-,5
4,Colima,5,430,611,522,116,2,49,42,128
5,Chiapas,8,403,531,694,-,12,161,177,737
6,Chihuahua,5,3,7,-,-,-,-,1,-
7,Distrito Federal,-,-,-,-,-,-,-,-,-
8,Durango,5,45,74,35,-,-,-,-,1
9,Guanajuato,1,1,1,9,-,-,-,-,-


In [14]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,Aguascalientes,-,,-,,-,-,-,-,-,-,-
1,Baja California,-,,-,,-,-,-,-,-,-,-
2,Baja California Sur,3,,1,,2,7,-,1,1,-,-
3,Campeche,-,,-,,-,-,2,-,-,-,-
4,Coahuila,-,,-,,-,-,-,-,-,-,-
5,Colima,5,,2,,3,2,4,2,2,-,2
6,Chiapas,22,,11,,13,-,-,5,1,5,5
7,Chihuahua,-,,-,,-,-,-,-,-,-,-
8,Distrito Federal,-,,-,,-,-,-,-,-,-,-
9,Durango,-,,-,,-,-,-,-,-,-,-


In [130]:
df_header

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,ENTIDAD\nFEDERATIVA,§Fiebre por Dengue\nCIE-10ª REV.\nA90,,,,§Fiebre Hemorrágica por Dengue\nCIE-10ª REV.\nA91,,,,
1,,2015,,,2014,2015,,,,2014
2,,Confirmados,,,Confirmados\nAcum.,En Estudio\nAcum.,Confirmados,,,Confirmados\nAcum.
3,,Sem.,Acum.,,,,Sem.,Acum.,,
4,,,M,F,,,,M,F,
5,Aguascalientes 4\nBaja California 34\nBaja Cal...,,1 8\n33 55\n90 169\n146 197\n20 16\n430 611\n4...,,-\n2\n2 979\n366\n139\n522\n694\n-\n-\n35\n9\n...,-\n2\n4\n-\n3\n116\n-\n-\n-\n-\n-\n-\n-\n483\n...,-\n-\n-\n-\n-\n2\n12\n-\n-\n-\n-\n6\n-\n28\n-\...,1 -\n- -\n2 1\n41 35\n- -\n49 42\n161 177\n- 1...,,-\n-\n77\n76\n5\n128\n737\n-\n-\n1\n-\n516\n-\...
6,TOTAL 835,,6 283 9 440,,15 818,2 612,197,1 843 2 049,,5 951


In [173]:
import pdfplumber

pdf = pdfplumber.open(pdf_path)

page = pdf.pages[42]

table = page.extract_table()

df_header = pd.DataFrame(table)

In [174]:
df_header

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,ENTIDAD\nFEDERATIVA,Síndrome Febril\nCIE-10ª REV.\nR50,,,,Efectos del Calor y de la Luz\nCIE-10ª REV.\nT67,,,,§Enfermedad Febril\nExantemática\nCIE-10ª REV....,,,
1,,2015,,,2014,2015,,,2014,2015,,,2014
2,,Sem.,Acum.,,Acum.,Sem.,Acum.,,Acum.,Sem.,Acum.,,Acum.
3,,,M,F,,,M,F,,,M,F,
4,Aguascalientes,,4 3\n3 362 3 353\n1 320 1 495\n3 430 4 508\n88...,,-\n8 899\n10 521\n459\n4 262\n3 099\n9 942\n1 ...,-\n-\n-\n-\n-\n-\n4\n-\n-\n-\n-\n-\n-\n4\n1\n-...,1 2\n274 165\n40 15\n8 2\n12 19\n73 65\n318 47...,,13\n305\n46\n7\n53\n84\n135\n38\n56\n2\n8\n33\...,1\n-\n-\n-\n-\n-\n-\n-\n-\n-\n9\n-\n-\n-\n-\n-...,13 14\n74 36\n6 8\n20 17\n55 42\n6 4\n89 109\n...,,37\n157\n11\n43\n120\n16\n234\n101\n300\n44\n7...
5,TOTAL 4 025,,179 237 207 991,,234 741,18,2 429 2 268,,2 504,17,2 367 2 195,,4 464


In [35]:
df_header.iloc[0,0].replace('\n',' ')

'ENTIDAD FEDERATIVA'

In [37]:
df_header.iloc[0,1].replace('\n',' ')

'Síndrome Febril CIE-10ª REV. R50'

In [49]:
df_header.iloc[1,1].replace('\n',' ')

'2015'

In [57]:
df_header.iloc[2,1].replace('\n',' ')

'Sem.'

In [58]:
df_header.iloc[3,2].replace('\n',' ')

'M'

In [59]:
df_header.iloc[3,3].replace('\n',' ')

'F'

In [60]:
df_header.iloc[2,4].replace('\n',' ')

'Acum.'

In [65]:
df_header.iloc[1,4].replace('\n',' ')

'2014'

In [113]:
df_header.iloc[0,5].replace('\n',' ')

'Efectos del Calor y de la Luz CIE-10ª REV. T67'