In [2]:
import pandas as pd

import camelot

import pdfplumber

import os

import re

In [3]:
def process_all_pdfs_in_folder(source_path, sink_path, desired_disease):
    year_df = []
    for filename in os.listdir(source_path):
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(source_path, filename)
            print(f"Processando o arquivo: {pdf_path}")
            week_df = extract_and_process_tables(pdf_path, desired_disease, filename)
            year_df.append(week_df)
            
    if year_df:
        concatenated_df = pd.concat(year_df, ignore_index=True)

        pdf_filename = source_path.rsplit('/', 1)[-1]
        print("Salvando")
        csv_filename = os.path.splitext(pdf_filename)[0] + desired_disease + '.csv'
        output_path = os.path.join(sink_path, csv_filename)
        concatenated_df.to_csv(output_path, index=False, encoding='utf-8-sig')
        print(f"Arquivo salvo: {csv_filename}")
            

In [4]:
def extract_and_process_tables(pdf_path, desired_disease, filename):
    all_dfs = []
    pdf = pdfplumber.open(pdf_path)

    for page_num in range(len(pdf.pages)):
        try:
            tables = camelot.read_pdf(pdf_path, pages=str(page_num + 1), flavor='stream')

            if tables:
                df = tables[0].df
                page = pdf.pages[page_num]
                table = page.extract_table()
                df_header = pd.DataFrame(table)
                                
                if (not df_header.empty 
                    and df_header.shape[0] > 1  # Verifica se há pelo menos 2 linhas
                    and df_header.shape[1] > 1  # Verifica se há pelo menos 2 colunas
                    and df_header.iloc[0, 1] is not None 
                    and isinstance(df_header.iloc[0, 1], str)
                    and desired_disease in df_header.iloc[0, 1]):  
                    
                    first = None
                    

                    for n in range(4, 10):
                        if df.iloc[n, 0]:
                            first = n
                            break
                            
                    df_all = process_table(df, df_header, filename, first)
                    
                        
                    if not df_all.empty:
                        all_dfs.append(df_all)
#                         print(all_dfs)
                        print(f"Processando a página {page_num + 1} com a tabela desejada.")

        except Exception as e:
            print(f"Erro ao processar a página {page_num + 1}: {e}")

    pdf.close()

    if all_dfs:
        final_df = pd.concat(all_dfs, ignore_index=True)
    else:
        final_df = pd.DataFrame()

    # Salva o DataFrame final em um arquivo CSV
#     pdf_filename = os.path.basename(pdf_path)
#     csv_filename = os.path.splitext(pdf_filename)[0] + '_filtered.csv'
#     output_path = os.path.join('/home/pirata/Documents/projects/epidemic_database/bases', csv_filename)
#     final_df.to_csv(output_path, index=False, encoding='utf-8-sig')

    return final_df

In [9]:
# Defina a função para processar as tabelas do PDF
def process_table(df, df_header, filename, first):
    
    
    if df.empty or df_header.empty:
        print("DataFrame ou cabeçalho vazio.")
        return pd.DataFrame()  # Retorna um DataFrame vazio se não houver dados
    
    weeknumber = re.search(r'-SE(\d+)\.pdf', filename).group(1)

    df = df.iloc[first:].reset_index(drop=True)
    df.columns = df.iloc[0]
    df = df[:]

    col_names = [f'Coluna{i+1}' for i in range(len(df.columns))]
    df.columns = col_names

    for col in col_names[1:13]:
        df[col] = df[col].str.replace(' ', '')

    first_col_name = df.columns[0]
    df = df[~df[first_col_name].str.match(r'^(TOTAL|FUENTE.*|§FUENTE.*)')]

    if len(df.columns) == 16:

        df = df.drop(['Coluna13','Coluna15'],axis = 1)

    else:

        df = df.drop(['Coluna14'],axis = 1)


    indices_para_converter = range(1, 14)
    for idx in indices_para_converter:
        if idx < len(df.columns):  # Verifica se o índice está dentro dos limites
            df.iloc[:, idx] = pd.to_numeric(df.iloc[:, idx], errors='coerce').fillna(0).astype(int)

    num_col = len(df.columns)


    df1 = df.iloc[:, 0:5].copy()
    if len(df_header) > 1 and len(df_header.columns) > 1:
        df1['disease'] = df_header.iloc[0, 1].replace('\n', ' ')
        df1['year'] = df_header.iloc[1, 1].replace('\n', ' ')
    else:
        df1['disease'] = ''
        df1['year'] = ''


    df1['MF'] = df1.get('Coluna3', 0) + df1.get('Coluna4', 0)  


    df1['Acum_Year'] = int(df_header.iloc[1, 1].replace('\n', ' ')) - 1
    df1['week'] = weeknumber
    df1['En_Estudio'] = ''

    novos_nomes = [
        df_header.iloc[0, 0].replace('\n', ' ') if len(df_header) > 0 and len(df_header.columns) > 0 else '',
        df_header.iloc[3, 1].replace('\n', ' ') if len(df_header) > 2 and len(df_header.columns) > 1 else '',
        df_header.iloc[4, 2].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 2 else '',
        df_header.iloc[4, 3].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 3 else '',
        'Acum.'
    ]

    df1.columns = novos_nomes + list(df1.columns[5:])
    column_order = [0, 1, 2, 3, 7, 4, 8, 5, 6, 9, 10]
    df1 = df1.iloc[:, column_order]

    df2 = df.iloc[:, [0] + list(range(5, 10))].copy()
    if len(df_header) > 0 and len(df_header.columns) > 5:
        df2['disease'] = df_header.iloc[0, 5].replace('\n', ' ')
        df2['year'] = df_header.iloc[1, 5].replace('\n', ' ')
    else:
        df2['disease'] = ''
        df2['year'] = ''


    df2['MF'] = df2.get('Coluna8', 0) + df2.get('Coluna9', 0)

    df2['Acum_Year'] = int(df_header.iloc[1, 5].replace('\n', ' ')) - 1

    df2['week'] = weeknumber

    column_order = [0, 2, 3, 4, 8, 5, 9, 6, 7, 10, 1]

    df2 = df2.iloc[:, column_order]

    novos_nomes = [
        df_header.iloc[0, 0].replace('\n', ' ') if len(df_header) > 0 and len(df_header.columns) > 0 else '',
        df_header.iloc[3, 1].replace('\n', ' ') if len(df_header) > 2 and len(df_header.columns) > 1 else '',
        df_header.iloc[4, 2].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 2 else '',
        df_header.iloc[4, 3].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 3 else '',
        'MF',
        'Acum.'
    ]

    df2.columns = novos_nomes + list(df2.columns[6:10]) + ['En_Estudio']


    df3 = df.iloc[:, [0] + list(range(10, 14))].copy()
    if len(df_header) > 0 and len(df_header.columns) > 5:
        df3['disease'] = df_header.iloc[0, 10].replace('\n', ' ')
        df3['year'] = df_header.iloc[1, 10].replace('\n', ' ')
    else:
        df3['disease'] = ''
        df3['year'] = ''

    if len(df.columns) == 16:

        df3['MF'] = df3.get('Coluna12', 0) + df3.get('Coluna14', 0)

    else:

        df3['MF'] = df3.get('Coluna12', 0) + df3.get('Coluna13', 0)


    df3['Acum_Year'] = int(df_header.iloc[1, 10].replace('\n', ' ')) - 1

    df3['week'] = weeknumber

    df3['En_Estudio'] = ''

    novos_nomes = [
        df_header.iloc[0, 0].replace('\n', ' ') if len(df_header) > 0 and len(df_header.columns) > 0 else '',
        df_header.iloc[3, 1].replace('\n', ' ') if len(df_header) > 2 and len(df_header.columns) > 1 else '',
        df_header.iloc[4, 2].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 2 else '',
        df_header.iloc[4, 3].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 3 else '',
        'Acum.'
    ]

    df3.columns = novos_nomes + list(df3.columns[5:])

    column_order = [0, 1, 2, 3, 7, 4, 8, 5, 6, 9, 10]

    df3 = df3.iloc[:, column_order]
    
    
    df_all = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
    

    return df_all

In [10]:

source_path = '/home/pirata/Documents/projects/epidemic_database/files/2016/semanal'
sink_path = '/home/pirata/Documents/projects/epidemic_database/bases/2016/teste'
desired_disease = 'Dengue'
process_all_pdfs_in_folder(source_path, sink_path, desired_disease)

Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE8.pdf
Processando a página 27 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE25.pdf
Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE51.pdf




Erro ao processar a página 31: positional indexers are out-of-bounds
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE19.pdf
Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE21.pdf
Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE46.pdf




Processando a página 35 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE30.pdf




Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE26.pdf
Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE16.pdf
Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE50.pdf




Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE35.pdf




Erro ao processar a página 31: positional indexers are out-of-bounds
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE39.pdf




Erro ao processar a página 31: positional indexers are out-of-bounds
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE29.pdf




Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE6.pdf
Processando a página 35 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE33.pdf




Processando a página 35 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE7.pdf
Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE24.pdf
Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE20.pdf
Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE34.pdf




Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE47.pdf




Processando a página 35 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE45.pdf




Processando a página 35 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE2.pdf
Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE28.pdf
Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE38.pdf




Erro ao processar a página 31: positional indexers are out-of-bounds
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE37.pdf




Erro ao processar a página 35: positional indexers are out-of-bounds
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE43.pdf




Erro ao processar a página 31: positional indexers are out-of-bounds
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE17.pdf
Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE42.pdf




Erro ao processar a página 35: positional indexers are out-of-bounds
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE13.pdf




Erro ao processar a página 5: "['Coluna14'] not found in axis"




Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE41.pdf




Erro ao processar a página 31: positional indexers are out-of-bounds
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE4.pdf
Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE23.pdf
Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE48.pdf




Erro ao processar a página 35: positional indexers are out-of-bounds
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE12.pdf
Processando a página 27 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE32.pdf




Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE10.pdf
Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE11.pdf
Processando a página 31 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE3.pdf
Processando a página 27 com a tabela desejada.
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE52.pdf




Erro ao processar a página 31: positional indexers are out-of-bounds
Processando o arquivo: /home/pirata/Documents/projects/epidemic_database/files/2016/semanal/BOL-EPID-2016-SE15.pdf
Processando a página 31 com a tabela desejada.
Salvando
Arquivo salvo: semanalDengue.csv


In [22]:
filename = 'BOL-EPID-2016-SE8.pdf'

pdf_path = '/home/pirata/Documents/projects/epidemic_database/files/2016_teste/BOL-EPID-2016-SE8.pdf'

page_num = 26

pdf = pdfplumber.open(pdf_path)

tables = camelot.read_pdf(pdf_path, pages=str(page_num + 1), flavor='stream')

if tables:
    dfx = tables[0].df
    page = pdf.pages[page_num]
    table = page.extract_table()
    df_header = pd.DataFrame(table)

In [23]:
first = None

for n in range(4, 10):
    if dfx.iloc[n, 0]:
        first = n
        break
        
        
print(n)

7


In [24]:
weeknumber = re.search(r'-SE(\d+)\.pdf', filename).group(1)

df = dfx.iloc[7:].reset_index(drop=True)
df.columns = df.iloc[0]
df = df[:]

col_names = [f'Coluna{i+1}' for i in range(len(df.columns))]
df.columns = col_names

for col in col_names[1:13]:
    df[col] = df[col].str.replace(' ', '')

first_col_name = df.columns[0]
df = df[~df[first_col_name].str.match(r'^(TOTAL|FUENTE.*|§FUENTE.*)')]

if len(dfx.columns) == 16:
    
    df = df.drop(['Coluna13','Coluna15'],axis = 1)
    
else:
    
    df = df.drop(['Coluna14'],axis = 1)


indices_para_converter = range(1, 14)
for idx in indices_para_converter:
    if idx < len(df.columns):  # Verifica se o índice está dentro dos limites
        df.iloc[:, idx] = pd.to_numeric(df.iloc[:, idx], errors='coerce').fillna(0).astype(int)
        
num_col = len(df.columns)
        

df1 = df.iloc[:, 0:5].copy()
if len(df_header) > 1 and len(df_header.columns) > 1:
    df1['disease'] = df_header.iloc[0, 1].replace('\n', ' ')
    df1['year'] = df_header.iloc[1, 1].replace('\n', ' ')
else:
    df1['disease'] = ''
    df1['year'] = ''
    
    
df1['MF'] = df1.get('Coluna3', 0) + df1.get('Coluna4', 0)  


df1['Acum_Year'] = int(df_header.iloc[1, 1].replace('\n', ' ')) - 1
df1['week'] = weeknumber
df1['En_Estudio'] = ''

novos_nomes = [
    df_header.iloc[0, 0].replace('\n', ' ') if len(df_header) > 0 and len(df_header.columns) > 0 else '',
    df_header.iloc[3, 1].replace('\n', ' ') if len(df_header) > 2 and len(df_header.columns) > 1 else '',
    df_header.iloc[4, 2].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 2 else '',
    df_header.iloc[4, 3].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 3 else '',
    'Acum.'
]

df1.columns = novos_nomes + list(df1.columns[5:])
column_order = [0, 1, 2, 3, 7, 4, 8, 5, 6, 9, 10]
df1 = df1.iloc[:, column_order]

df2 = df.iloc[:, [0] + list(range(5, 10))].copy()
if len(df_header) > 0 and len(df_header.columns) > 5:
    df2['disease'] = df_header.iloc[0, 5].replace('\n', ' ')
    df2['year'] = df_header.iloc[1, 5].replace('\n', ' ')
else:
    df2['disease'] = ''
    df2['year'] = ''
    
    
df2['MF'] = df2.get('Coluna8', 0) + df2.get('Coluna9', 0)

df2['Acum_Year'] = int(df_header.iloc[1, 5].replace('\n', ' ')) - 1

df2['week'] = weeknumber

column_order = [0, 2, 3, 4, 8, 5, 9, 6, 7, 10, 1]

df2 = df2.iloc[:, column_order]

novos_nomes = [
    df_header.iloc[0, 0].replace('\n', ' ') if len(df_header) > 0 and len(df_header.columns) > 0 else '',
    df_header.iloc[3, 1].replace('\n', ' ') if len(df_header) > 2 and len(df_header.columns) > 1 else '',
    df_header.iloc[4, 2].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 2 else '',
    df_header.iloc[4, 3].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 3 else '',
    'MF',
    'Acum.'
]

df2.columns = novos_nomes + list(df2.columns[6:10]) + ['En_Estudio']


df3 = df.iloc[:, [0] + list(range(10, 14))].copy()
if len(df_header) > 0 and len(df_header.columns) > 5:
    df3['disease'] = df_header.iloc[0, 10].replace('\n', ' ')
    df3['year'] = df_header.iloc[1, 10].replace('\n', ' ')
else:
    df3['disease'] = ''
    df3['year'] = ''
    
if len(dfx.columns) == 16:
    
    df3['MF'] = df3.get('Coluna12', 0) + df3.get('Coluna14', 0)
    
else:
    
    df3['MF'] = df3.get('Coluna12', 0) + df3.get('Coluna13', 0)
    
    
df3['Acum_Year'] = int(df_header.iloc[1, 10].replace('\n', ' ')) - 1

df3['week'] = weeknumber

df3['En_Estudio'] = ''

novos_nomes = [
    df_header.iloc[0, 0].replace('\n', ' ') if len(df_header) > 0 and len(df_header.columns) > 0 else '',
    df_header.iloc[3, 1].replace('\n', ' ') if len(df_header) > 2 and len(df_header.columns) > 1 else '',
    df_header.iloc[4, 2].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 2 else '',
    df_header.iloc[4, 3].replace('\n', ' ') if len(df_header) > 3 and len(df_header.columns) > 3 else '',
    'Acum.'
]

df3.columns = novos_nomes + list(df3.columns[5:])

column_order = [0, 1, 2, 3, 7, 4, 8, 5, 6, 9, 10]

df3 = df3.iloc[:, column_order]


df3

Unnamed: 0,ENTIDAD FEDERATIVA,Sem.,M,F,MF,Acum.,Acum_Year,disease,year,week,En_Estudio
0,Aguascalientes,0,0,0,0,0,2015,§Infección por Virus Zika CIE-10ª REV. U06.9,2016,8,
1,Baja California,0,0,0,0,0,2015,§Infección por Virus Zika CIE-10ª REV. U06.9,2016,8,
2,Baja California Sur,0,0,0,0,0,2015,§Infección por Virus Zika CIE-10ª REV. U06.9,2016,8,
3,Campeche,0,0,0,0,0,2015,§Infección por Virus Zika CIE-10ª REV. U06.9,2016,8,
4,Coahuila,0,0,0,0,0,2015,§Infección por Virus Zika CIE-10ª REV. U06.9,2016,8,
5,Colima,0,0,0,0,0,2015,§Infección por Virus Zika CIE-10ª REV. U06.9,2016,8,
6,Chiapas,15,15,47,62,0,2015,§Infección por Virus Zika CIE-10ª REV. U06.9,2016,8,
7,Chihuahua,0,0,0,0,0,2015,§Infección por Virus Zika CIE-10ª REV. U06.9,2016,8,
8,Distrito Federal,0,0,0,0,0,2015,§Infección por Virus Zika CIE-10ª REV. U06.9,2016,8,
9,Durango,0,0,0,0,0,2015,§Infección por Virus Zika CIE-10ª REV. U06.9,2016,8,


In [16]:
display(re.search(r'-SE(\d+)\.pdf', filename).group(1))

'2'

In [62]:
len(dfx.columns)

16

In [14]:
df_header

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,ENTIDAD\nFEDERATIVA,§Fiebre por Dengue\nCIE-10ª REV.\nA90,,,,§Fiebre Hemorrágica por Dengue\nCIE-10ª REV.\nA91,,,,,§Infección por Virus Zika\nCIE-10ª REV.\nU06.9,,,
1,,2016,,,2015,2016,,,,2015,2016,,,2015
2,,Confirmados,,,Confirmados\nAcum.,En\nEstudio\nAcum.,Confirmados,,,Confirmados\nAcum.,Confirmados,,,Confirmados\nAcum.
3,,Sem.,Acum.,,,,Sem.,Acum.,,,Sem.,Acum.,,
4,,,M,F,,,,M,F,,,M,F,
5,Aguascalientes -\nBaja California -\nBaja Cali...,,- -\n- -\n- 1\n- -\n- -\n3 1\n6 9\n- -\n- -\n-...,,-\n-\n3\n-\n-\n5\n24\n-\n-\n-\n-\n32\n-\n2\n-\...,-\n-\n2\n-\n-\n5\n-\n-\n-\n-\n-\n-\n-\n16\n-\n...,-\n-\n-\n-\n-\n-\n6\n-\n-\n-\n-\n24\n-\n-\n-\n...,- -\n- -\n- -\n- -\n- -\n- -\n2 5\n- -\n- -\n-...,,-\n-\n1\n-\n-\n2\n6\n-\n-\n-\n-\n8\n-\n1\n-\n1...,-\n-\n-\n-\n-\n-\n3\n-\n-\n-\n-\n-\n-\n-\n-\n-...,- -\n- -\n- -\n- -\n- -\n- -\n1 2\n- -\n- -\n-...,,-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-...
6,TOTAL 88,,35 57,,109,88,46,26 24,,46,3,1 2,,-


In [58]:
dfx

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,,,2016,,2015,,2016,,,2015,,2016,,2015,
1,,,,,,En,,,,,,,,,
2,,,Confirmados,,Confirmados,,,Confirmados,,Confirmados,,Confirmados,,Confirmados,
3,,,,,,Estudio,,,,,,,,,
4,,,Acum.,,,,,Acum.,,,,Acum.,,,
5,,Sem.,,,Acum.,,Sem.,,,Acum.,Sem.,,,Acum.,
6,,,M,F,,Acum.,,M,F,,,M,F,,
7,Aguascalientes,-,-,-,-,-,-,-,-,-,-,-,-,,-
8,Baja California,1,1,-,1,-,-,-,-,-,-,-,-,,-
9,Baja California Sur,3,10,7,30,1,-,-,-,2,-,-,-,,-
