In [24]:
import os

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [25]:
# Create the notebook's Spark session (or reuse one that already exists),
# using the "local[*]" master so everything runs inside this machine.
spark = (
    SparkSession.builder
    .appName("SragSilver")
    .master("local[*]")
    .getOrCreate()
)

In [26]:
# Raise the Spark log threshold to ERROR so lower-severity log lines
# do not flood the cell outputs.
spark.sparkContext.setLogLevel(logLevel="ERROR")

In [27]:
# The project root is taken to be the parent of the current working directory,
# so this cell depends on where the notebook kernel was started.
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))
data_path = os.path.join(base_dir, "data/bronze/")

# Load the semicolon-separated CSV data found at data_path. The first line of
# each file is used as the header and column types are inferred by Spark.
df = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv(data_path)
)

                                                                                

In [28]:
# Columns that are parsed into dates below, using the 'dd/MM/yyyy' pattern.
date_columns = [
    'DT_NOTIFIC', 'DT_SIN_PRI', 'DT_NASC', 'DT_UT_DOSE',
    'DT_VAC_MAE', 'DT_DOSEUNI', 'DT_1_DOSE', 'DT_2_DOSE',
    'DT_ANTIVIR', 'DT_INTERNA', 'DT_ENTUTI', 'DT_SAIDUTI',
    'DT_COLETA', 'DT_PCR', 'DT_RES', 'DT_EVOLUCA',
    'DT_ENCERRA', 'DT_DIGITA'
]

# Build every conversion first so they can be applied together.
# DataFrame.withColumn adds one projection per call, and the Spark docs warn
# that calling it in a loop can generate very large plans (performance issues
# and even StackOverflowException).
date_exprs = {
    name: F.to_date(F.col(name), 'dd/MM/yyyy')
    for name in date_columns
}

if hasattr(df, 'withColumns'):
    # DataFrame.withColumns (Spark 3.3+) replaces all columns in one projection.
    df = df.withColumns(date_exprs)
else:
    # Older Spark versions: fall back to one withColumn call per column.
    for name, expr in date_exprs.items():
        df = df.withColumn(name, expr)

In [33]:
# Columns that are cast to Spark's 'int' type below.
# 'PCR_RESUL' used to appear twice in this list; the duplicate was removed.
# NOTE(review): the second entry may have been meant to be a different
# column -- confirm against the dataset dictionary.
# NOTE(review): 'DT_VGM' and 'DT_RT_VGM' carry the same 'DT_' prefix as every
# entry of date_columns, and several names here look like free-text fields
# (e.g. '*_OUT', 'LOTE_ADIC'). Presumably a value that is not a valid integer
# ends up as null after the cast under Spark's default settings -- verify
# that this is intended for those columns.
int_columns = [
    'SEM_NOT', 'SEM_PRI', 'NU_IDADE_N', 'TP_IDADE', 'CS_GESTANT',
    'CS_RACA', 'CS_ESCOL_N', 'CO_PAIS', 'CS_ZONA', 'SURTO_SG',
    'NOSOCOMIAL', 'AVE_SUINO', 'FEBRE', 'TOSSE', 'GARGANTA', 'DISPNEIA',
    'DESC_RESP', 'SATURACAO', 'DIARREIA', 'VOMITO', 'OUTRO_SIN',
    'PUERPERA', 'FATOR_RISC', 'CARDIOPATI', 'HEMATOLOGI', 'SIND_DOWN',
    'HEPATICA', 'ASMA', 'DIABETES', 'NEUROLOGIC', 'PNEUMOPATI',
    'IMUNODEPRE', 'RENAL', 'OBESIDADE', 'OBES_IMC', 'OUT_MORBI',
    'VACINA', 'MAE_VAC', 'ANTIVIRAL', 'TP_ANTIVIR', 'HOSPITAL',
    'UTI', 'SUPORT_VEN', 'RAIOX_RES', 'AMOSTRA', 'TP_AMOSTRA',
    'PCR_RESUL', 'POS_PCRFLU', 'PCR_FLUASU',
    'FLUASU_OUT', 'PCR_FLUBLI', 'FLUBLI_OUT', 'POS_PCROUT', 'PCR_VSR',
    'PCR_PARA1', 'PCR_PARA2', 'PCR_PARA3', 'PCR_PARA4', 'PCR_ADENO',
    'PCR_BOCA', 'PCR_RINO', 'PCR_OUTRO', 'PCR_METAP', 'CLASSI_FIN',
    'CLASSI_OUT', 'CRITERIO', 'EVOLUCAO', 'HISTO_VGM', 'PAIS_VGM',
    'CO_PS_VGM', 'LO_PS_VGM', 'DT_VGM', 'DT_RT_VGM', 'PCR_SARS2', 'PAC_COCBO',
    'PAC_DSCBO', 'OUT_ANIM', 'DOR_ABD', 'FADIGA', 'PERD_OLFT', 'PERD_PALA',
    'TOMO_RES', 'TP_TES_AN', 'POS_AN_FLU', 'TP_FLU_AN', 'POS_AN_OUT',
    'AN_SARS2', 'AN_VSR', 'AN_PARA1', 'AN_PARA2', 'AN_PARA3', 'AN_ADENO',
    'AN_OUTRO', 'DS_AN_OUT', 'TP_AM_SOR', 'SOR_OUT', 'RES_IGG', 'RES_IGM',
    'RES_IGA', 'ESTRANG', 'VACINA_COV', 'TRAT_COV', 'TIPO_TRAT',
    'OUT_TRAT', 'CO_DETEC', 'VG_OMS', 'VG_OMSOUT', 'VG_LIN', 'VG_MET',
    'VG_METOUT', 'VG_DTRES', 'VG_ENC', 'VG_REINF', 'REINF', 'FAB_ADIC',
    'LOT_RE_BI', 'FAB_RE_BI', 'DOSE_ADIC', 'DOS_RE_BI', 'LOTE_ADIC', 'TABAG'
]

# Build every cast first so they can be applied together.
# DataFrame.withColumn adds one projection per call, and the Spark docs warn
# that calling it in a loop can generate very large plans (performance issues
# and even StackOverflowException) -- a real risk with this many columns.
int_exprs = {
    name: F.col(name).cast('int')
    for name in int_columns
}

if hasattr(df, 'withColumns'):
    # DataFrame.withColumns (Spark 3.3+) replaces all columns in one projection.
    df = df.withColumns(int_exprs)
else:
    # Older Spark versions: fall back to one withColumn call per column.
    for name, expr in int_exprs.items():
        df = df.withColumn(name, expr)

In [35]:
# Preview the transformed frame: print the first 20 rows as a text table,
# with long cell values truncated (these are the defaults of DataFrame.show).
df.show(n=20, truncate=True)

+----------+-------+----------+-------+---------+--------------------+----------+-------------------+----------+--------------------+----------+-------+----------+----------+--------+---------+----------+-------+----------+-------+-------+-----+--------------------+----------+--------------------+----------+-------+--------+----------+---------+-----+-----+--------+--------+---------+---------+--------+------+---------+-------------------+--------+----------+----------+----------+---------+--------+----+--------+----------+----------+----------+-----+---------+--------+---------+--------------------+------+----------+-------+----------+----------+----------+---------+---------+---------+----------+---------+----------+--------+----------+----------+--------------------+----------+--------------+----------+----+----------+----------+----------+---------+--------------------+----------+-------+----------+----------+-----------------+---------+----------+----------+----------+----------+-