In [672]:
# Required Libraries
import pandas as pd
import os
from imblearn.over_sampling import SMOTE

In [673]:
# Paths to the data

# SINAN DataSUS CSV files path (modify to match your file path)
sinan_path = os.path.expanduser('~/Desktop/DataSUS-Chikungunya-ML/source/csv/')

# Cleaned CSV files path (modify to match your file path)
cleaned_path = os.path.expanduser('~/Desktop/DataSUS-Chikungunya-ML/datasets/')

In [674]:
"""
Load the Chikungunya data from the CSV files
The CSV files are named CHIKBRYY.csv, where YY is the last two digits of the year
The files are stored in the ~/Downloads/dbc2csv/source/csv/ directory
The columns in the CSV files are not consistent across all years
We want to identify the columns that are common to all files
"""

# List of last two digits of years for which we have CSV files (2018 to 2024)
start_year = 18
end_year = 25
assert start_year < end_year, "Start year must be less than end year"
years = range(start_year, end_year + 1)

# Dictionary to store the columns for each file
file_columns = {}

# Loop through each year, build the filename, and read the CSV
for year in years:
    file_name = f'{sinan_path}CHIKBR{str(year)}.csv'
    try:
        df = pd.read_csv(file_name, low_memory=False)
        # Save the set of columns for this file
        file_columns[file_name] = set(df.columns)
    except Exception as e:
        print(f"Error loading {file_name}: {e}")

# Ensure we have loaded at least one file before proceeding
assert file_columns, "No files were loaded. Please check your file paths."

# Find common columns: the intersection of columns across all files
common_columns = set.intersection(*file_columns.values())
print("\nColumns common to all files:")
print(common_columns)

# Compute the union of all columns (all columns that appear in any file)
all_columns = set.union(*file_columns.values())

# For columns that are not common, print which files have them and which don't.
print("\nColumns that are not common among all files:")
for col in all_columns - common_columns:
    # Extract base name (e.g., CHIKBR21) from each file path
    files_with = [os.path.splitext(os.path.basename(fname))[0] 
                    for fname, cols in file_columns.items() if col in cols]
    files_without = [os.path.splitext(os.path.basename(fname))[0] 
                        for fname, cols in file_columns.items() if col not in cols]
    print(f"Column '{col}' is present in files: {files_with} and missing in files: {files_without}")



Columns common to all files:
{'DT_INVEST', 'COUFINF', 'VOMITO', 'HOSPITALIZ', 'PETEQUIA_N', 'DT_SIN_PRI', 'GRAV_EXTRE', 'GRAV_HEMAT', 'DT_PRNT', 'NU_ANO', 'DT_NOTIFIC', 'CEFALEIA', 'ALRM_HIPOT', 'RES_CHIKS2', 'CS_GESTANT', 'EVIDENCIA', 'SEM_PRI', 'ID_REGIONA', 'SG_UF_NOT', 'RENAL', 'RESUL_VI_N', 'CLASSI_FIN', 'GRAV_CONSC', 'GRAV_ENCH', 'LACO_N', 'CS_SEXO', 'COMUNINF', 'HISTOPA_N', 'DT_CHIK_S2', 'ALRM_HEPAT', 'SOROTIPO', 'ID_PAIS', 'HIPERTENSA', 'ACIDO_PEPT', 'DT_NS1', 'NU_IDADE_N', 'AUTO_IMUNE', 'NAUSEA', 'ID_MUNICIP', 'ALRM_PLAQ', 'RESUL_PRNT', 'PLASMATICO', 'TP_SISTEMA', 'DT_GRAV', 'DT_CHIK_S1', 'GRAV_MIOC', 'GRAV_INSUF', 'COMPLICA', 'ID_AGRAVO', 'ALRM_LIQ', 'LACO', 'IMUNOH_N', 'CONJUNTVIT', 'HEPATOPAT', 'ID_UNIDADE', 'GRAV_SANG', 'DT_INTERNA', 'GRAV_ORGAO', 'GRAV_MELEN', 'DT_OBITO', 'CLINC_CHIK', 'ID_MN_RESI', 'DIABETES', 'GRAV_TAQUI', 'RESUL_PCR_', 'DT_ENCERRA', 'GRAV_HIPOT', 'SEM_NOT', 'TP_NOT', 'ALRM_VOM', 'EXANTEMA', 'GENGIVO', 'UF', 'ARTRITE', 'MANI_HEMOR', 'SANGRAM', 'GRAV_CO

In [675]:
"""
Remove columns that are not common to all files
Create a new DataFrame with only the common columns
Concatenate all the DataFrames without the last {test_years} years into a single DataFrame called X_train
Concatenate the DataFrames from the last {test_years} years into a single DataFrame called X_test
"""

# Number of years to use for testing
test_years = 2
assert common_columns, "No common columns found. Please check your file paths."
assert len(years) > test_years, "At least {test_years + 1}  years of data are required."

# Load data for all years and save in X dataset
X = []
for year in years:
    file_name = f'{sinan_path}CHIKBR{str(year)}.csv'
    try:
        df = pd.read_csv(file_name, low_memory=False, usecols=common_columns)
        if "Unnamed: 0" in df.columns:
            df = df.drop("Unnamed: 0", axis=1)
        X.append(df)
    except Exception as e:
        print(f"Error loading {file_name}: {e}")
        
X = pd.concat(X, ignore_index=True)

# Display the shape of the dataset
print(f"\nShape of the dataset: {X.shape}")



Shape of the dataset: (1500209, 116)


In [676]:
# # Load the train and test data for each year, keeping only the common columns
# X_train = []
# X_test = []

# for year in years:
#     file_name = f'{sinan_path}CHIKBR{str(year)}.csv'
#     try:
#         df = pd.read_csv(file_name, usecols=common_columns, low_memory=False)
#         # Drop the "Unnamed: 0" column if present
#         if "Unnamed: 0" in df.columns:
#             df = df.drop("Unnamed: 0", axis=1)
#         if year < years[-test_years]:
#             X_train.append(df)
#         else:
#             X_test.append(df)
#     except Exception as e:
#         print(f"Error loading {file_name}: {e}")

# # Concatenate all the DataFrames into a single train and test DataFrame
# X_train = pd.concat(X_train, ignore_index=True)
# X_test = pd.concat(X_test, ignore_index=True)

# # Display the shape of the concatenated DataFrame
# print("\nShape of the concatenated X_train DataFrame: ", X_train.shape)
# print("Shape of the concatenated X_test DataFrame: ", X_test.shape)

In [677]:
"""
Filter the data to include only the rows where the pacient was hospitalized ("HOSPITALIZ" column is equal to 1 or to 1.0)
"""

print("\nUnique values in the 'HOSPITALIZ' column:")
print(X['HOSPITALIZ'].value_counts())

# Filter the data to include only the rows where the patient was hospitalized
X = X[(X['HOSPITALIZ'] == 1) | (X['HOSPITALIZ'] == 1.0)]

# Display the shape of the filtered DataFrame
print("\nShape of the filtered X DataFrame: ", X.shape)


Unique values in the 'HOSPITALIZ' column:
HOSPITALIZ
2.0    835678
2       73033
1.0     42802
9.0     30132
1        3629
9        2885
           1
Ø           1
J           1
ï           1
Name: count, dtype: int64

Shape of the filtered X DataFrame:  (42802, 116)


In [678]:
"""
Extract the target variable from the data (EVOLUCAO column)

1- cura
2- óbito pelo
agravo
3- óbito por outras
causas
4- óbito em
investigação
9- ignorado

Remove rows where the target variable is Nan, 3 (death by other causes), 4 (under investigation) or 9 (ignored)
Only keep rows where the target variable is 1 (cure) or 2 (death by the disease)
"""

##############################################################
# Should we drop rows where the target variable is 3 or 4?
# Dropping them results in a very unbalanced dataset
##############################################################

# Print the value counts for the 'EVOLUCAO' column in X
print("\nValues for 'EVOLUCAO' in X:")
print(X["EVOLUCAO"].value_counts(dropna=False))



Values for 'EVOLUCAO' in X:
EVOLUCAO
1.0    29833
NaN     7354
9.0     3186
3.0     1646
2.0      565
4.0      218
Name: count, dtype: int64


In [679]:
# Filter the data to include only the rows where the target variable is 1 or 2
X = X[(X['EVOLUCAO'] == 1.0) | (X['EVOLUCAO'] == 2.0) | (X['EVOLUCAO'] == 3.0) | (X['EVOLUCAO'] == 4.0)]

In [680]:
# Remove columns where all values are the same in X (constant columns)
constant_columns = [col for col in X.columns if X[col].nunique() == 1]

# Drop these constant columns from X
X = X.drop(columns=constant_columns)

print("\nDropped constant columns:", constant_columns)


Dropped constant columns: ['TP_NOT', 'ID_AGRAVO', 'ID_PAIS', 'HISTOPA_N', 'IMUNOH_N', 'HOSPITALIZ', 'TP_SISTEMA', 'NDUPLIC_N']


In [681]:
"""
Identify and remove columns with more than missing_values_threshold missing values in X
"""

missing_values_threshold = 0.01
assert 0.0 <= missing_values_threshold <= 1.0, "missing_values_threshold must be between 0 and 1"

# Compute the percentage of missing values in each column of X
missing_values = X.isnull().mean()

# Print the percentage of missing values in each column of X
print("\nPercentage of missing values in X:")
print(missing_values)


Percentage of missing values in X:
DT_NOTIFIC    0.0
SEM_NOT       0.0
NU_ANO        0.0
SG_UF_NOT     0.0
ID_MUNICIP    0.0
             ... 
PLASMATICO    1.0
EVIDENCIA     1.0
PLAQ_MENOR    1.0
CON_FHD       1.0
COMPLICA      1.0
Length: 108, dtype: float64


In [682]:
cols_to_drop = missing_values[missing_values > missing_values_threshold].index

X = X.drop(columns=cols_to_drop)
print(f"\nDropped columns (more than {missing_values_threshold * 100}% missing):")
print(list(cols_to_drop))

# Display the shape of the filtered DataFrame
print("\nShape of the filtered X DataFrame: ", X.shape)


Dropped columns (more than 1.0% missing):
['ID_REGIONA', 'ID_UNIDADE', 'CS_ESCOL_N', 'ID_RG_RESI', 'ID_OCUPA_N', 'DT_CHIK_S1', 'DT_CHIK_S2', 'DT_PRNT', 'RES_CHIKS1', 'RES_CHIKS2', 'RESUL_PRNT', 'DT_SORO', 'RESUL_SORO', 'DT_NS1', 'RESUL_NS1', 'DT_VIRAL', 'RESUL_VI_N', 'DT_PCR', 'RESUL_PCR_', 'SOROTIPO', 'DT_INTERNA', 'UF', 'MUNICIPIO', 'TPAUTOCTO', 'COUFINF', 'COPAISINF', 'COMUNINF', 'DOENCA_TRA', 'CLINC_CHIK', 'DT_OBITO', 'DT_ENCERRA', 'ALRM_HIPOT', 'ALRM_PLAQ', 'ALRM_VOM', 'ALRM_SANG', 'ALRM_HEMAT', 'ALRM_ABDOM', 'ALRM_LETAR', 'ALRM_HEPAT', 'ALRM_LIQ', 'DT_ALRM', 'GRAV_PULSO', 'GRAV_CONV', 'GRAV_ENCH', 'GRAV_INSUF', 'GRAV_TAQUI', 'GRAV_EXTRE', 'GRAV_HIPOT', 'GRAV_HEMAT', 'GRAV_MELEN', 'GRAV_METRO', 'GRAV_SANG', 'GRAV_AST', 'GRAV_MIOC', 'GRAV_CONSC', 'GRAV_ORGAO', 'DT_GRAV', 'MANI_HEMOR', 'EPISTAXE', 'GENGIVO', 'METRO', 'PETEQUIAS', 'HEMATURA', 'SANGRAM', 'LACO_N', 'PLASMATICO', 'EVIDENCIA', 'PLAQ_MENOR', 'CON_FHD', 'COMPLICA']

Shape of the filtered X DataFrame:  (32262, 38)


In [683]:
print("Columns in X: ", X.columns)

Columns in X:  Index(['DT_NOTIFIC', 'SEM_NOT', 'NU_ANO', 'SG_UF_NOT', 'ID_MUNICIP',
       'DT_SIN_PRI', 'SEM_PRI', 'NU_IDADE_N', 'CS_SEXO', 'CS_GESTANT',
       'CS_RACA', 'SG_UF', 'ID_MN_RESI', 'DT_INVEST', 'FEBRE', 'MIALGIA',
       'CEFALEIA', 'EXANTEMA', 'VOMITO', 'NAUSEA', 'DOR_COSTAS', 'CONJUNTVIT',
       'ARTRITE', 'ARTRALGIA', 'PETEQUIA_N', 'LEUCOPENIA', 'LACO', 'DOR_RETRO',
       'DIABETES', 'HEMATOLOG', 'HEPATOPAT', 'RENAL', 'HIPERTENSA',
       'ACIDO_PEPT', 'AUTO_IMUNE', 'CLASSI_FIN', 'CRITERIO', 'EVOLUCAO'],
      dtype='object')


Delete over 20% missing: 45 features left

Delete over 10% missing: 38 features left

Delete over 5% missing: 38 features left

Delete over 2% missing: 38 features left

Delete over 1% missing: 37 features left

In [684]:
X.head()

Unnamed: 0,DT_NOTIFIC,SEM_NOT,NU_ANO,SG_UF_NOT,ID_MUNICIP,DT_SIN_PRI,SEM_PRI,NU_IDADE_N,CS_SEXO,CS_GESTANT,...,DIABETES,HEMATOLOG,HEPATOPAT,RENAL,HIPERTENSA,ACIDO_PEPT,AUTO_IMUNE,CLASSI_FIN,CRITERIO,EVOLUCAO
35,2018-07-06,201827,2018,12,120040,2018-07-05,201827,4010.0,M,6.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,13.0,1.0,1.0
51,2018-06-29,201826,2018,12,120040,2018-06-24,201826,4029.0,F,5.0,...,1.0,2.0,2.0,2.0,1.0,2.0,2.0,13.0,1.0,1.0
93,2018-09-19,201838,2018,12,120040,2018-09-17,201838,4011.0,F,5.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,13.0,2.0,1.0
115,2018-09-24,201839,2018,12,120040,2018-08-24,201834,4005.0,F,6.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,13.0,1.0,1.0
141,2018-08-16,201833,2018,12,120040,2018-08-13,201833,4011.0,F,5.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,13.0,1.0,1.0


In [685]:
# Count values in column "SG_UF_NOT" in X
print("Values in column 'SG_UF_NOT' in X:\n")
print(X["SG_UF_NOT"].value_counts())

Values in column 'SG_UF_NOT' in X:

SG_UF_NOT
35    4568
31    4541
23    3138
33    2360
50    2191
29    1965
26    1506
21    1275
24    1205
25    1121
51    1078
17     926
52     795
22     745
41     716
27     700
11     623
15     610
32     483
28     394
13     381
43     304
53     259
14     150
16      92
42      80
12      56
Name: count, dtype: int64


In [686]:
# Dictionary definition of the Brazilian states

# Código   UF		Sigla
# 11	Rondônia	RO
# 12	Acre	AC
# 13	Amazonas	AM
# 14	Roraima	RR
# 15	Pará	PA
# 16	Amapá	AP
# 17	Tocantins	TO
# 21	Maranhão	MA
# 22	Piauí	PI
# 23	Ceará	CE
# 24	Rio Grande do Norte	RN
# 25	Paraíba	PB
# 26	Pernambuco	PE
# 27	Alagoas	AL
# 28	Sergipe	SE
# 29	Bahia	BA
# 31	Minas Gerais	MG
# 32	Espírito Santo	ES
# 33	Rio de Janeiro	RJ
# 35	São Paulo	SP
# 41	Paraná	PR
# 42	Santa Catarina	SC
# 43	Rio Grande do Sul (*)	RS
# 50	Mato Grosso do Sul	MS
# 51	Mato Grosso	MT
# 52	Goiás	GO
# 53	Distrito Federal	DF

In [687]:
# Map "SG_UF_NOT" value to the region of Brazil as a new one-hot column use these values:
# "Norte": [11, 12, 13, 14, 15, 16, 17],
# "Nordeste": [21, 22, 23, 24, 25, 26, 27, 28, 29],
# "Centro-Oeste": [50, 51, 52, 53],
# "Sudeste": [31, 32, 33, 35],
# "Sul": [41, 42, 43]

X["REGION_NORTH"] = X["SG_UF_NOT"].isin([11, 12, 13, 14, 15, 16, 17]).astype(int)
X["REGION_NORTHEAST"] = X["SG_UF_NOT"].isin([21, 22, 23, 24, 25, 26, 27, 28, 29]).astype(int)
X["REGION_MIDWEST"] = X["SG_UF_NOT"].isin([50, 51, 52, 53]).astype(int)
X["REGION_SOUTHEAST"] = X["SG_UF_NOT"].isin([31, 32, 33, 35]).astype(int)
X["REGION_SOUTH"] = X["SG_UF_NOT"].isin([41, 42, 43]).astype(int)

# Remove the "SG_UF_NOT" and "ID_MUNICIP" column from X
X = X.drop(columns=["SG_UF_NOT", "ID_MUNICIP"])

In [688]:
# Count values in each region
print("\nValues in each region in X:\n")
print("Region North: ", X["REGION_NORTH"].sum())
print("Region Northeast: ", X["REGION_NORTHEAST"].sum())
print("Region Midwest: ", X["REGION_MIDWEST"].sum())
print("Region Southeast: ", X["REGION_SOUTHEAST"].sum())
print("Region South: ", X["REGION_SOUTH"].sum())


Values in each region in X:

Region North:  2838
Region Northeast:  12049
Region Midwest:  4323
Region Southeast:  11952
Region South:  1100


In [689]:
# Remove "SG_UF" and "ID_MN_RESI" columns from X (not useful, redundant values with "SG_UF_NOT" and "ID_MUNICIP")
X = X.drop(columns=["SG_UF", "ID_MN_RESI"])

In [690]:
# Remove rows whose "DT_SIN_PRI" value is lower than 2017-07-01 (first symptoms reported before 2017-07-01 doesn't make sense to go to the hospital after 2018)
previous_year = start_year - 1
X = X[pd.to_datetime(X["DT_SIN_PRI"]) >= pd.to_datetime(f"20{previous_year}-07-01")]

# Save the difference in DAYS between "DT_NOTIFIC" and "DT_SIN_PRI" in a new column called "TIME_DIFF_DAYS"
X["TIME_DIFF_DAYS"] = (pd.to_datetime(X["DT_NOTIFIC"]) - pd.to_datetime(X["DT_SIN_PRI"])).dt.days

# Print min and max values of "TIME_DIFF_DAYS"
print("\nMin and max values of 'TIME_DIFF_DAYS' in X:")
print("Min: ", X["TIME_DIFF_DAYS"].min())
print("Max: ", X["TIME_DIFF_DAYS"].max())


Min and max values of 'TIME_DIFF_DAYS' in X:
Min:  0
Max:  1604


In [691]:
# Print number of rows whose "TIME_DIFF_DAYS" is greater than 180 days
print("\nNumber of rows whose 'TIME_DIFF_DAYS' is greater than 180 days: ", (X["TIME_DIFF_DAYS"] > 180).sum())

# Remove rows whose "TIME_DIFF_DAYS" is greater than 180 days
X = X[X["TIME_DIFF_DAYS"] <= 180]

# Remove "DT_SIN_PRI" and "SEM_PRI" columns from X (not useful, redundant values with "TIME_DIFF_DAYS")
X = X.drop(columns=["DT_SIN_PRI", "SEM_PRI"])


Number of rows whose 'TIME_DIFF_DAYS' is greater than 180 days:  94


In [692]:
# Use week and year info to create a new TIME column replacing DT_NOTIFIC, SEM_NOT and NU_ANO
print("Type of SEM_NOT column: ", X["SEM_NOT"].dtype)
print("Type of NU_ANO column: ", X["NU_ANO"].dtype)
print("Min value of SEM_NOT column: ", X["SEM_NOT"].min())
print("Max value of SEM_NOT column: ", X["SEM_NOT"].max())

# Get min value from "DT_NOTIFIC" column
print("Min value of DT_NOTIFIC column: ", X["DT_NOTIFIC"].min())

##########################################################################
# TO DO
# FIX WEEK CALCULATION FORMULA ERROR
# Calculate the week from "DT_NOTIFIC" columns starting in 2017-12-31
# start_notific = pd.to_datetime(X["DT_NOTIFIC"].min())

# X["TIME"] = X["DT_NOTIFIC"].apply(lambda x: (x - start_notific).days // 7)
##########################################################################

# Calculate the week and year from the SEM_NOT and NU_ANO columns
week = X["SEM_NOT"] % 100
year = X["NU_ANO"] - 2000 - start_year
X["TIME"] = year * 52 + week

# Remove DT_NOTIFIC, SEM_NOT and NU_ANO columns
X = X.drop(columns=["DT_NOTIFIC", "SEM_NOT", "NU_ANO"])


Type of SEM_NOT column:  int64
Type of NU_ANO column:  int64
Min value of SEM_NOT column:  201801
Max value of SEM_NOT column:  202509
Min value of DT_NOTIFIC column:  2017-12-31


In [693]:
# Check all values in column "CLASSI_FIN"
# 5.0 discarded
# 13.0 confirmed case of Chikungunya

# Map "CLASSI_FIN" value to 1 for confirmed cases and 0 for discarded cases
X["CLASSI_FIN"] = X["CLASSI_FIN"].map({13.0: 1, 5.0: 0})

# Rename "CLASSI_FIN" column to "CONFIRMED_CASE"
X = X.rename(columns={"CLASSI_FIN": "CONFIRMED_CASE"})
print("\nValues in 'CONFIRMED_CASE' column in X:")
print(X["CONFIRMED_CASE"].value_counts())


Values in 'CONFIRMED_CASE' column in X:
CONFIRMED_CASE
0.0    18045
1.0    13890
Name: count, dtype: int64


In [694]:
# Remove "DT_INVEST" column from X (not useful, redundant with "TIME" column)
X = X.drop(columns=["DT_INVEST"])

# Rename "NU_IDADE_N" column to "AGE" column
X = X.rename(columns={"NU_IDADE_N": "AGE"})

# Remove rows whose "AGE" is less than 1000 (undefined, user error) or greater than 4150 (unrealistic, over 150 years)
X = X[(X["AGE"] >= 1000) & (X["AGE"] <= 4150)]

# Modify "AGE" column to have the age in years
# If "AGE" is less than 4000, change to 0 years (hours, days or months old)
# Else, subtract 4000 from the value to get the age in years

X["AGE"] = X["AGE"].apply(lambda x: 0 if x < 4000 else x - 4000)

In [695]:
# Most common years are young people
X["AGE"].value_counts()

AGE
0.0      1508
1.0       708
11.0      587
5.0       560
8.0       555
         ... 
105.0       1
119.0       1
101.0       1
106.0       1
104.0       1
Name: count, Length: 105, dtype: int64

In [696]:
# Print "CS_SEXO" column values
print("Values in 'CS_SEXO' column: ", X["CS_SEXO"].value_counts())

# Keep rows whose "CS_SEXO" is "M" or "F"
X = X[(X["CS_SEXO"] == "M") | (X["CS_SEXO"] == "F")]

# Map "CS_SEXO" value to 1 if "F" and 0 if "M"
X["CS_SEXO"] = X["CS_SEXO"].map({"F": 1, "M": 0})

# Rename column to "GENDER"
X = X.rename(columns={"CS_SEXO": "GENDER"})

Values in 'CS_SEXO' column:  CS_SEXO
F    16921
M    15153
I       10
Name: count, dtype: int64


In [697]:
# Map "CS_GESTANT" value to 1 if 1,2 or 3 (trimester of pregnancy)
X["CS_GESTANT"] = X["CS_GESTANT"].map({1: 1, 2: 1, 3: 1, 4: 0, 5: 0, 6: 0, 9: 0})

# Rename column to "PREGNANT"
X = X.rename(columns={"CS_GESTANT": "PREGNANT"})
print("Values in 'PREGNANT' column: ", X["PREGNANT"].value_counts())

Values in 'PREGNANT' column:  PREGNANT
0.0    30945
1.0     1126
Name: count, dtype: int64


In [698]:
# Count values in "CS_RACA" column
print("Values in 'CS_RACA' column: ", X["CS_RACA"].value_counts())

##############################################################
# Bias question: does race influence the outcome of the disease?
##############################################################

# One-hot encode "CS_RACA" column, ignore value 9 (ignored)
X["WHITE"] = X["CS_RACA"].isin([1]).astype(int)
X["BLACK"] = X["CS_RACA"].isin([2]).astype(int)
X["YELLOW"] = X["CS_RACA"].isin([3]).astype(int)
X["BROWN"] = X["CS_RACA"].isin([4]).astype(int)
X["INDIGENOUS"] = X["CS_RACA"].isin([5]).astype(int)

# Remove "CS_RACA" column
X = X.drop(columns=["CS_RACA"])

Values in 'CS_RACA' column:  CS_RACA
4.0    18106
1.0     9048
9.0     3049
2.0     1400
3.0      256
5.0      215
Name: count, dtype: int64


In [699]:
symptoms_columns = ["FEBRE", "MIALGIA", "CEFALEIA", "EXANTEMA", "VOMITO", "NAUSEA", "DOR_COSTAS", "CONJUNTVIT", "ARTRITE", "ARTRALGIA", "PETEQUIA_N", "LEUCOPENIA", "LACO", "DOR_RETRO", "DIABETES", "HEMATOLOG", "HEPATOPAT", "RENAL", "HIPERTENSA", "ACIDO_PEPT", "AUTO_IMUNE"]

# Map symptoms columns to 1 if the value is 1 else 0 (no symptom)
for col in symptoms_columns:
    X[col] = X[col].map({1: 1, 2: 0, 9: 0})

In [700]:
# Train-test split
# Split the data into X_train, y_train, X_test, y_test
# Use the last {test_years} years for testing and the rest for training
max_train_week_value = (end_year - start_year - test_years) * 52
print(max_train_week_value)
print(X["TIME"].max())

X_train = X[X["TIME"] <= max_train_week_value]
X_test = X[X["TIME"] > max_train_week_value]

# Remove the 'EVOLUCAO' column from X_train and save it in y_train
y_train = X_train.pop("EVOLUCAO")

# Remove the 'EVOLUCAO' column from X_test and save it in y_test
y_test = X_test.pop("EVOLUCAO")

# Change the target variable to 0 for cure and 1 for death
y_train = y_train.map({1: 0, 2: 1, 3:1, 4:1})
y_test = y_test.map({1: 0, 2: 1, 3:1, 4:1})

260
373


In [701]:
# Values of the y_train and y_test
print("\nValues for 'EVOLUCAO' in y_train:")
print(y_train.value_counts(dropna=True))

print("\nValues for 'EVOLUCAO' in y_test:")
print(y_test.value_counts(dropna=True))


Values for 'EVOLUCAO' in y_train:
EVOLUCAO
0    15080
1     1153
Name: count, dtype: int64

Values for 'EVOLUCAO' in y_test:
EVOLUCAO
0    14608
1     1233
Name: count, dtype: int64


In [702]:
X_train.head()

Unnamed: 0,AGE,GENDER,PREGNANT,FEBRE,MIALGIA,CEFALEIA,EXANTEMA,VOMITO,NAUSEA,DOR_COSTAS,...,REGION_MIDWEST,REGION_SOUTHEAST,REGION_SOUTH,TIME_DIFF_DAYS,TIME,WHITE,BLACK,YELLOW,BROWN,INDIGENOUS
35,10.0,0,0.0,1,1,1,1,0,1,1,...,0,0,0,1,27,0,0,0,1,0
51,29.0,1,0.0,1,1,1,1,0,1,1,...,0,0,0,5,26,0,0,0,1,0
93,11.0,1,0.0,1,1,1,0,0,0,0,...,0,0,0,2,38,0,0,0,1,0
115,5.0,1,0.0,1,0,1,0,0,1,0,...,0,0,0,31,39,0,0,0,1,0
141,11.0,1,0.0,1,1,1,0,0,0,0,...,0,0,0,3,33,0,0,0,1,0


In [703]:
"""
Save the filtered data to a new CSV file in the cleaned_path directory
"""

# Save the filtered data to a new CSV file in the cleaned_path directory
X_train.to_csv(f'{cleaned_path}X_train.csv', index=False)
y_train.to_csv(f'{cleaned_path}y_train.csv', index=False)

X_test.to_csv(f'{cleaned_path}X_test.csv', index=False)
y_test.to_csv(f'{cleaned_path}y_test.csv', index=False)