In [41]:
import pandas as pd
import requests
from tqdm.notebook import tqdm
from rarfile import RarFile
import numpy as np

# Download current data

In [143]:
today = pd.Timestamp.today()
n_days_to_look = 30
current_data_path = '../data/current_data_raw.rar'

# The archive name embeds a ddmmYYYY date stamp. Probe one candidate URL per
# day, starting from yesterday and walking back up to `n_days_to_look` days,
# and stop at the first one the server actually serves.
# NOTE(review): the upload folder in the URL is fixed to 2021/01; presumably
# releases uploaded in other months live under a different path — confirm.
for days in tqdm(range(1, n_days_to_look+1), total=n_days_to_look):
    date = (today - pd.Timedelta(f'{days}d')).strftime("%d%m%Y")
    url = f"http://deis.minsal.cl/wp-content/uploads/2021/01/DEFUNCIONES_FUENTE_DEIS_2016_2021_{date}.rar"
    # The timeout keeps a stalled connection from hanging the notebook forever.
    req = requests.get(url, allow_redirects=True, timeout=60)
    # `ok` is False for every 4xx/5xx status, not only for 404 "Not Found".
    if req.ok:
        print(date)
        break
else:
    # No candidate succeeded: fail loudly instead of saving an error page
    # under the .rar file name.
    raise RuntimeError(
        f"No DEIS file found for any of the last {n_days_to_look} days"
    )

# The context manager guarantees the file is flushed and closed.
with open(current_data_path, 'wb') as rar_file:
    rar_file.write(req.content)

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

28012021


# Download older data

In [13]:
older_data_path = '../data/older_data_raw.rar'

url = "https://repositoriodeis.minsal.cl/DatosAbiertos/Vitales/DEF_1990-2018.rar"
# The timeout keeps a stalled connection from hanging the notebook forever.
req = requests.get(url, allow_redirects=True, timeout=60)
# Abort on 4xx/5xx so an HTML error page is never saved as the archive.
req.raise_for_status()

# The context manager guarantees the file is flushed and closed.
with open(older_data_path, 'wb') as rar_file:
    rar_file.write(req.content)

# Preprocessing

In [151]:
def transform_age(df):
    """Return the ages in ``df`` converted to years, as a float array.

    ``EDAD_TIPO`` tells in which unit ``EDAD_CANT`` is expressed:

    * 1 -> value is kept as is (taken to be years already)
    * 2 -> months, divided by 12
    * 3 -> days, divided by 365
    * 4 -> hours, divided by 365 * 24
    * anything else -> unknown, replaced by the sentinel ``-1``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``EDAD_TIPO`` and ``EDAD_CANT``.

    Returns
    -------
    numpy.ndarray
        One float per row of ``df``. The input frame is left untouched.
    """
    edad_tipo = df['EDAD_TIPO'].to_numpy()
    # Work on a float copy: writing into the column's own buffer would mutate
    # `df` (or fail on a read-only array), and an integer buffer would
    # silently truncate fractional ages such as 6 months -> 0.
    edad = df['EDAD_CANT'].to_numpy(dtype=float, copy=True)

    idx_meses = edad_tipo == 2
    idx_dias = edad_tipo == 3
    idx_hrs = edad_tipo == 4
    idx_none = ~np.isin(edad_tipo, [1, 2, 3, 4])

    edad[idx_meses] = edad[idx_meses] / 12
    edad[idx_dias] = edad[idx_dias] / 365
    edad[idx_hrs] = edad[idx_hrs] / (365 * 24)
    edad[idx_none] = -1
    return edad

In [160]:
# The current deaths file is read without a header row; its column names come
# from the 'Nombre de la variable' column of the separate dictionary file.
current_data = pd.read_csv('../data/current_deis_defunciones.csv', encoding='latin1', sep=';', header=None)
current_data_col_dict = pd.read_csv('../data/dictionary_current_deis.csv', encoding='latin1', sep=';')
col_names = current_data_col_dict['Nombre de la variable'].values
current_data.columns = col_names

# Express every age in years, then keep only rows with -1 < age < 130
# (transform_age marks unknown age units with -1).
current_data['EDAD_CANT'] = transform_age(current_data)
# .copy() makes the filtered frame independent, so the column assignment below
# is not a chained assignment on a slice (SettingWithCopyWarning).
current_data = current_data.query('-1 < EDAD_CANT < 130').copy()

# NOTE(review): no explicit date format is given; confirm FECHA_DEF is
# unambiguous (e.g. ISO YYYY-MM-DD), otherwise pass `format=`.
current_data['FECHA_DEF'] = pd.to_datetime(current_data['FECHA_DEF'])

# Earliest death date in the current file; used to cut off the older data set.
first_date_current_data = current_data['FECHA_DEF'].min()

In [161]:
# The older deaths file is read with its first row as the header.
old_data = pd.read_csv('../data/old_deis_defunciones.csv', encoding='latin1', sep=';')

# Express every age in years, then keep only rows with -1 < age < 130
# (transform_age marks unknown age units with -1).
old_data['EDAD_CANT'] = transform_age(old_data)
# .copy() makes the filtered frame independent, so the column assignment below
# is not a chained assignment on a slice (SettingWithCopyWarning).
old_data = old_data.query('-1 < EDAD_CANT < 130').copy()

# NOTE(review): no explicit date format is given; confirm FECHA_DEF is
# unambiguous (e.g. ISO YYYY-MM-DD), otherwise pass `format=`.
old_data['FECHA_DEF'] = pd.to_datetime(old_data['FECHA_DEF'])

# Keep only deaths dated before the first date present in the current file,
# so the two sources do not overlap once concatenated.
old_data = old_data.query('FECHA_DEF < @first_date_current_data')

In [162]:
# Source column name -> name used in the consolidated output.
# Only the columns listed here are kept, in this order.
cols_dict = {
    'FECHA_DEF': 'fecha',
    'GLOSA_SEXO': 'sexo',
    'EDAD_CANT': 'edad',
    'GLOSA_COMUNA_RESIDENCIA': 'comuna',
    'GLOSA_REG_RES': 'region',
    'DIAG1': 'diag',
    'CAPITULO_DIAG1': 'capitulo_diag',
    'GLOSA_CAPITULO_DIAG1': 'glosa_capitulo_diag',
    'CODIGO_GRUPO_DIAG1': 'grupo_diag',
    'GLOSA_GRUPO_DIAG1': 'glosa_grupo_diag',
    'CODIGO_CATEGORIA_DIAG1': 'categoria_diag',
    'GLOSA_CATEGORIA_DIAG1': 'glosa_categoria_diag',
    'GLOSA_SUBCATEGORIA_DIAG1': 'glosa_subcategoria_diag',
}

# Rename, then select exactly the renamed columns; a KeyError here means one
# of the source columns above is missing from that frame.
current_data = current_data.rename(columns=cols_dict)[list(cols_dict.values())]
old_data = old_data.rename(columns=cols_dict)[list(cols_dict.values())]

# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it
# each frame keeps its own row labels, so `data` (and the index column written
# to the CSV) would contain duplicate labels.
data = pd.concat([current_data, old_data], ignore_index=True)
data.to_csv('../data/consolidated_data.csv')