In [16]:
# Required Libraries
import pandas as pd
import os
from imblearn.over_sampling import SMOTE

In [17]:
# Paths to the data

# SINAN DataSUS CSV files path (modify to match your file path)
sinan_path = os.path.expanduser('~/Desktop/DataSUS-Chikungunya-ML/source/csv/')

# Cleaned CSV files path (modify to match your file path)
cleaned_path = os.path.expanduser('~/Desktop/DataSUS-Chikungunya-ML/datasets/')

In [18]:
"""
Load the Chikungunya data from the CSV files
The CSV files are named CHIKBRYY.csv, where YY is the last two digits of the year
The files are stored in the ~/Downloads/dbc2csv/source/csv/ directory
The columns in the CSV files are not consistent across all years
We want to identify the columns that are common to all files
"""

# List of last two digits of years for which we have CSV files (2018 to 2024)
start_year = 18
end_year = 24
assert start_year < end_year, "Start year must be less than end year"
years = range(start_year, end_year + 1)

# Dictionary to store the columns for each file
file_columns = {}

# Loop through each year, build the filename, and read the CSV
for year in years:
    file_name = f'{sinan_path}CHIKBR{str(year)}.csv'
    try:
        df = pd.read_csv(file_name, low_memory=False)
        # Save the set of columns for this file
        file_columns[file_name] = set(df.columns)
    except Exception as e:
        print(f"Error loading {file_name}: {e}")

# Ensure we have loaded at least one file before proceeding
assert file_columns, "No files were loaded. Please check your file paths."

# Find common columns: the intersection of columns across all files
common_columns = set.intersection(*file_columns.values())
print("\nColumns common to all files:")
print(common_columns)

# Compute the union of all columns (all columns that appear in any file)
all_columns = set.union(*file_columns.values())

# For columns that are not common, print which files have them and which don't.
print("\nColumns that are not common among all files:")
for col in all_columns - common_columns:
    # Extract base name (e.g., CHIKBR21) from each file path
    files_with = [os.path.splitext(os.path.basename(fname))[0] 
                    for fname, cols in file_columns.items() if col in cols]
    files_without = [os.path.splitext(os.path.basename(fname))[0] 
                        for fname, cols in file_columns.items() if col not in cols]
    print(f"Column '{col}' is present in files: {files_with} and missing in files: {files_without}")



Columns common to all files:
{'DT_INVEST', 'COUFINF', 'VOMITO', 'HOSPITALIZ', 'PETEQUIA_N', 'GRAV_EXTRE', 'DT_SIN_PRI', 'GRAV_HEMAT', 'DT_PRNT', 'NU_ANO', 'DT_NOTIFIC', 'CEFALEIA', 'ALRM_HIPOT', 'RES_CHIKS2', 'CS_GESTANT', 'EVIDENCIA', 'SEM_PRI', 'ID_REGIONA', 'SG_UF_NOT', 'RENAL', 'RESUL_VI_N', 'CLASSI_FIN', 'GRAV_CONSC', 'GRAV_ENCH', 'LACO_N', 'CS_SEXO', 'COMUNINF', 'HISTOPA_N', 'DT_CHIK_S2', 'ALRM_HEPAT', 'SOROTIPO', 'ID_PAIS', 'HIPERTENSA', 'ACIDO_PEPT', 'DT_NS1', 'NU_IDADE_N', 'AUTO_IMUNE', 'NAUSEA', 'ID_MUNICIP', 'ALRM_PLAQ', 'RESUL_PRNT', 'PLASMATICO', 'TP_SISTEMA', 'DT_GRAV', 'DT_CHIK_S1', 'GRAV_MIOC', 'GRAV_INSUF', 'COMPLICA', 'ID_AGRAVO', 'ALRM_LIQ', 'LACO', 'IMUNOH_N', 'CONJUNTVIT', 'HEPATOPAT', 'ID_UNIDADE', 'GRAV_SANG', 'DT_INTERNA', 'GRAV_ORGAO', 'GRAV_MELEN', 'DT_OBITO', 'CLINC_CHIK', 'ID_MN_RESI', 'DIABETES', 'GRAV_TAQUI', 'RESUL_PCR_', 'DT_ENCERRA', 'GRAV_HIPOT', 'SEM_NOT', 'TP_NOT', 'ALRM_VOM', 'EXANTEMA', 'GENGIVO', 'UF', 'ARTRITE', 'MANI_HEMOR', 'SANGRAM', 'GRAV_CO

In [19]:
"""
Remove columns that are not common to all files
Create a new DataFrame with only the common columns
Concatenate all the DataFrames without the last {test_years} years into a single DataFrame called X_train
Concatenate the DataFrames from the last {test_years} years into a single DataFrame called X_test
"""

# Number of years to use for testing
test_years = 2
assert common_columns, "No common columns found. Please check your file paths."
assert len(years) > test_years, "At least {test_years + 1}  years of data are required."

# Load the train and test data for each year, keeping only the common columns
X_train = []
X_test = []

for year in years:
    file_name = f'{sinan_path}CHIKBR{str(year)}.csv'
    try:
        df = pd.read_csv(file_name, usecols=common_columns, low_memory=False)
        # Drop the "Unnamed: 0" column if present
        if "Unnamed: 0" in df.columns:
            df = df.drop("Unnamed: 0", axis=1)
        if year < years[-test_years]:
            X_train.append(df)
        else:
            X_test.append(df)
    except Exception as e:
        print(f"Error loading {file_name}: {e}")

# Concatenate all the DataFrames into a single train and test DataFrame
X_train = pd.concat(X_train, ignore_index=True)
X_test = pd.concat(X_test, ignore_index=True)

# Display the shape of the concatenated DataFrame
print("\nShape of the concatenated X_train DataFrame: ", X_train.shape)
print("Shape of the concatenated X_test DataFrame: ", X_test.shape)



Shape of the concatenated X_train DataFrame:  (802287, 116)
Shape of the concatenated X_test DataFrame:  (650214, 116)


In [20]:
"""
Filter the data to include only the rows where the pacient was hospitalized ("HOSPITALIZ" column is equal to 1 or to 1.0)
"""

print("\nUnique values in the 'HOSPITALIZ' column:")
print(X_train['HOSPITALIZ'].value_counts())

# Filter the data to include only the rows where the patient was hospitalized
X_train = X_train[(X_train['HOSPITALIZ'] == 1) | (X_train['HOSPITALIZ'] == 1.0)]
X_test = X_test[(X_test['HOSPITALIZ'] == 1) | (X_test['HOSPITALIZ'] == 1.0)]

# Display the shape of the filtered DataFrame
print("\nShape of the filtered X_train DataFrame: ", X_train.shape)
print("Shape of the filtered X_test DataFrame: ", X_test.shape)


Unique values in the 'HOSPITALIZ' column:
HOSPITALIZ
2.0    393418
2       73033
1.0     21844
9.0     15658
1        3629
9        2885
           1
Ø           1
J           1
ï           1
Name: count, dtype: int64

Shape of the filtered X_train DataFrame:  (21844, 116)
Shape of the filtered X_test DataFrame:  (18939, 116)


In [21]:
"""
Extract the target variable from the data (EVOLUCAO column)

1- cura
2- óbito pelo
agravo
3- óbito por outras
causas
4- óbito em
investigação
9- ignorado

Remove rows where the target variable is Nan, 3 (death by other causes), 4 (under investigation) or 9 (ignored)
Only keep rows where the target variable is 1 (cure) or 2 (death by the disease)
"""

##############################################################
# Should we drop rows where the target variable is 3 or 4?
# Dropping them results in a very unbalanced dataset
##############################################################

# Print the value counts for the 'EVOLUCAO' column in X_train
print("\nValues for 'EVOLUCAO' in X_train:")
print(X_train["EVOLUCAO"].value_counts(dropna=False))

# Print the value counts for the 'EVOLUCAO' column in X_test
print("\nValues for 'EVOLUCAO' in X_test:")
print(X_test["EVOLUCAO"].value_counts(dropna=False))



Values for 'EVOLUCAO' in X_train:
EVOLUCAO
1.0    15203
NaN     3865
9.0     1587
3.0      858
2.0      251
4.0       80
Name: count, dtype: int64

Values for 'EVOLUCAO' in X_test:
EVOLUCAO
1.0    13453
NaN     2780
9.0     1540
3.0      761
2.0      296
4.0      109
Name: count, dtype: int64


In [22]:
# Filter the data to include only the rows where the target variable is 1 or 2
X_train = X_train[(X_train['EVOLUCAO'] == 1.0) | (X_train['EVOLUCAO'] == 2.0) | (X_train['EVOLUCAO'] == 3.0) | (X_train['EVOLUCAO'] == 4.0)]
X_test = X_test[(X_test['EVOLUCAO'] == 1.0) | (X_test['EVOLUCAO'] == 2.0) | (X_test['EVOLUCAO'] == 3.0) | (X_test['EVOLUCAO'] == 4.0)]

# Remove the 'EVOLUCAO' column from X_train and save it in y_train
y_train = X_train.pop("EVOLUCAO")

# Remove the 'EVOLUCAO' column from X_test and save it in y_test
y_test = X_test.pop("EVOLUCAO")

# Change the target variable to 0 for cure and 1 for death
y_train = y_train.map({1: 0, 2: 1, 3:1, 4:1})
y_test = y_test.map({1: 0, 2: 1, 3:1, 4:1})

In [23]:
# Remove columns where all values are the same in X_train and X_test (constant columns)
constant_columns = [col for col in X_train.columns if X_train[col].nunique() == 1]

# Drop these constant columns from both X_train and X_test
X_train = X_train.drop(columns=constant_columns)
X_test = X_test.drop(columns=constant_columns, errors='ignore')

print("\nDropped constant columns:", constant_columns)


Dropped constant columns: ['TP_NOT', 'ID_AGRAVO', 'ID_PAIS', 'HISTOPA_N', 'IMUNOH_N', 'HOSPITALIZ', 'TP_SISTEMA', 'NDUPLIC_N']


In [24]:
"""
Identify and remove columns with more than missing_values_threshold missing values in X_train and X_test
"""

missing_values_threshold = 0.20
assert 0.0 <= missing_values_threshold <= 1.0, "missing_values_threshold must be between 0 and 1"

# Compute the percentage of missing values in each column of X_train
missing_values_train = X_train.isnull().mean()
missing_values_test = X_test.isnull().mean()
missing_values_mean = (missing_values_train + missing_values_test) / 2

# Print the percentage of missing values in each column of X_train
print("\nPercentage of missing values in X_train:")
print(missing_values_train)

# Print the percentage of missing values in each column of X_test
print("\nPercentage of missing values in X_test:")
print(missing_values_test)



Percentage of missing values in X_train:
DT_NOTIFIC    0.0
SEM_NOT       0.0
NU_ANO        0.0
SG_UF_NOT     0.0
ID_MUNICIP    0.0
             ... 
PLASMATICO    1.0
EVIDENCIA     1.0
PLAQ_MENOR    1.0
CON_FHD       1.0
COMPLICA      1.0
Length: 107, dtype: float64

Percentage of missing values in X_test:
DT_NOTIFIC    0.0
SEM_NOT       0.0
NU_ANO        0.0
SG_UF_NOT     0.0
ID_MUNICIP    0.0
             ... 
PLASMATICO    1.0
EVIDENCIA     1.0
PLAQ_MENOR    1.0
CON_FHD       1.0
COMPLICA      1.0
Length: 107, dtype: float64


In [25]:
cols_to_drop = missing_values_train[missing_values_mean > missing_values_threshold].index

X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=cols_to_drop)
print(f"\nDropped columns (more than {missing_values_threshold * 100}% missing):")
print(list(cols_to_drop))

# Display the shape of the filtered DataFrame
print("\nShape of the filtered X_train DataFrame: ", X_train.shape)
print("Shape of the filtered X_test DataFrame: ", X_test.shape)


Dropped columns (more than 20.0% missing):
['ID_OCUPA_N', 'DT_CHIK_S1', 'DT_CHIK_S2', 'DT_PRNT', 'RES_CHIKS1', 'RES_CHIKS2', 'RESUL_PRNT', 'DT_SORO', 'RESUL_SORO', 'DT_NS1', 'RESUL_NS1', 'DT_VIRAL', 'RESUL_VI_N', 'DT_PCR', 'RESUL_PCR_', 'SOROTIPO', 'TPAUTOCTO', 'COUFINF', 'COPAISINF', 'COMUNINF', 'DOENCA_TRA', 'CLINC_CHIK', 'DT_OBITO', 'ALRM_HIPOT', 'ALRM_PLAQ', 'ALRM_VOM', 'ALRM_SANG', 'ALRM_HEMAT', 'ALRM_ABDOM', 'ALRM_LETAR', 'ALRM_HEPAT', 'ALRM_LIQ', 'DT_ALRM', 'GRAV_PULSO', 'GRAV_CONV', 'GRAV_ENCH', 'GRAV_INSUF', 'GRAV_TAQUI', 'GRAV_EXTRE', 'GRAV_HIPOT', 'GRAV_HEMAT', 'GRAV_MELEN', 'GRAV_METRO', 'GRAV_SANG', 'GRAV_AST', 'GRAV_MIOC', 'GRAV_CONSC', 'GRAV_ORGAO', 'DT_GRAV', 'MANI_HEMOR', 'EPISTAXE', 'GENGIVO', 'METRO', 'PETEQUIAS', 'HEMATURA', 'SANGRAM', 'LACO_N', 'PLASMATICO', 'EVIDENCIA', 'PLAQ_MENOR', 'CON_FHD', 'COMPLICA']

Shape of the filtered X_train DataFrame:  (16392, 45)
Shape of the filtered X_test DataFrame:  (14619, 45)


In [26]:
# Values of the y_train and y_test
print("\nValues for 'EVOLUCAO' in y_train:")
print(y_train.value_counts(dropna=True))

print("\nValues for 'EVOLUCAO' in y_test:")
print(y_test.value_counts(dropna=True))


Values for 'EVOLUCAO' in y_train:
EVOLUCAO
0    15203
1     1189
Name: count, dtype: int64

Values for 'EVOLUCAO' in y_test:
EVOLUCAO
0    13453
1     1166
Name: count, dtype: int64


In [27]:
print("Columns in X_train: ", X_train.columns)

Columns in X_train:  Index(['DT_NOTIFIC', 'SEM_NOT', 'NU_ANO', 'SG_UF_NOT', 'ID_MUNICIP',
       'ID_REGIONA', 'ID_UNIDADE', 'DT_SIN_PRI', 'SEM_PRI', 'NU_IDADE_N',
       'CS_SEXO', 'CS_GESTANT', 'CS_RACA', 'CS_ESCOL_N', 'SG_UF', 'ID_MN_RESI',
       'ID_RG_RESI', 'DT_INVEST', 'FEBRE', 'MIALGIA', 'CEFALEIA', 'EXANTEMA',
       'VOMITO', 'NAUSEA', 'DOR_COSTAS', 'CONJUNTVIT', 'ARTRITE', 'ARTRALGIA',
       'PETEQUIA_N', 'LEUCOPENIA', 'LACO', 'DOR_RETRO', 'DIABETES',
       'HEMATOLOG', 'HEPATOPAT', 'RENAL', 'HIPERTENSA', 'ACIDO_PEPT',
       'AUTO_IMUNE', 'DT_INTERNA', 'UF', 'MUNICIPIO', 'CLASSI_FIN', 'CRITERIO',
       'DT_ENCERRA'],
      dtype='object')


In [28]:
# # Not using SMOTE yet, need to clean the dataset columns first
# # Set SMOTE sampling strategy:
# # We want the minority class to be 10% of the total training data.
# # Let r be the ratio minority/majority after resampling, then minority fraction = r/(1+r).
# # For a 10% minority fraction, r = 0.10 / (1 - 0.10) ≈ 0.1111.
# smote = SMOTE(sampling_strategy=0.1111, random_state=42)

# # Apply SMOTE oversampling to the training data only
# X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# # Print the new distribution for y_train
# print("\nNew y_train distribution after SMOTE oversampling:")
# print(y_train_smote.value_counts())


In [29]:
"""
Save the filtered data to a new CSV file in the cleaned_path directory
"""

# Save the filtered data to a new CSV file in the cleaned_path directory
X_train.to_csv(f'{cleaned_path}X_train.csv', index=False)
y_train.to_csv(f'{cleaned_path}y_train.csv', index=False)

X_test.to_csv(f'{cleaned_path}X_test.csv', index=False)
y_test.to_csv(f'{cleaned_path}y_test.csv', index=False)