<a href="https://colab.research.google.com/github/OseiasBeu/Cardiovascular-diseases-risk-project/blob/main/ETL_Cardiovascular_diseases_risk_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Engineering

- Fazer o balanceamento dos dados
- Fazer o encoding das variáveis categóricas
- Separação em X e y
- Separação de Datasets para treino e para teste

In [31]:
import pandas as pd
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
import requests
import zipfile
import io
import os
import numpy as np


In [32]:
# URL of the ZIP archive that holds the dataset
url = "https://github.com/OseiasBeu/Cardiovascular-diseases-risk-project/raw/main/dataset.zip"

# Directory the archive contents are extracted into
diretorio_destino = '/content/dataset/'

# Download the archive. Fail fast on an HTTP error instead of handing an error
# page to zipfile, and do not hang forever on a stalled connection.
response = requests.get(url, timeout=60)
response.raise_for_status()
zip_file = io.BytesIO(response.content)

# Create the destination directory if needed (no-op when it already exists)
os.makedirs(diretorio_destino, exist_ok=True)

# Extract every member of the in-memory archive into the destination directory
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(diretorio_destino)



In [33]:
# Load the cleaned CSV that the archive extraction placed under /content/dataset/dataset/.
data = pd.read_csv('/content/dataset/dataset/CVD_cleaned.csv')

## Encoding

In [34]:
# Show column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   General_Health                308854 non-null  object 
 1   Checkup                       308854 non-null  object 
 2   Exercise                      308854 non-null  object 
 3   Heart_Disease                 308854 non-null  object 
 4   Skin_Cancer                   308854 non-null  object 
 5   Other_Cancer                  308854 non-null  object 
 6   Depression                    308854 non-null  object 
 7   Diabetes                      308854 non-null  object 
 8   Arthritis                     308854 non-null  object 
 9   Sex                           308854 non-null  object 
 10  Age_Category                  308854 non-null  object 
 11  Height_(cm)                   308854 non-null  float64
 12  Weight_(kg)                   308854 non-nul

In [35]:
# Same summary, restricted to the object-dtype (text) columns that still need encoding.
data.select_dtypes(include='object').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   General_Health   308854 non-null  object
 1   Checkup          308854 non-null  object
 2   Exercise         308854 non-null  object
 3   Heart_Disease    308854 non-null  object
 4   Skin_Cancer      308854 non-null  object
 5   Other_Cancer     308854 non-null  object
 6   Depression       308854 non-null  object
 7   Diabetes         308854 non-null  object
 8   Arthritis        308854 non-null  object
 9   Sex              308854 non-null  object
 10  Age_Category     308854 non-null  object
 11  Smoking_History  308854 non-null  object
dtypes: object(12)
memory usage: 28.3+ MB


In [36]:
# Class distribution of the target column.
data['Heart_Disease'].value_counts()

No     283883
Yes     24971
Name: Heart_Disease, dtype: int64

In [37]:
# Restrict the exploration to the object-dtype (text) columns.
categorical_data = data.select_dtypes(include='object')
# Names of the categorical columns only (this used to list every column of `data`).
categorical_column_names = list(categorical_data.columns)
# Distinct values per categorical column, computed once and reused below.
distinct_values = {
    col: list(categorical_data[col].unique()) for col in categorical_column_names
}
# Collects the columns that have exactly two distinct values.
arr = []

print(f'Lista de variáveis categóricas: {categorical_column_names}')
print(f'Número de variáveis categóricas: {len(categorical_column_names)}')
print()
print()
print('-----------------------------------------------------------------------')

# Report every categorical column and remember the two-valued ones.
for col, values in distinct_values.items():
    print(f'{col} Valores distintos: {values}')
    print(f'{col} tem {len(values)} valores únicos')
    if len(values) == 2:
        arr.append(col)
    print()
    print()
print(arr)
# Repeat the report for the two-valued columns only.
for col in arr:
    values = distinct_values[col]
    print(f'{col} Valores distintos: {values}')
    print(f'{col} tem {len(values)} valores únicos')
    print()
    print()

Lista de variáveis categóricas: ['General_Health', 'Checkup', 'Exercise', 'Heart_Disease', 'Skin_Cancer', 'Other_Cancer', 'Depression', 'Diabetes', 'Arthritis', 'Sex', 'Age_Category', 'Smoking_History']
Número de variáveis categóricas: 12


-----------------------------------------------------------------------
General_Health Valores distintos: ['Poor', 'Very Good', 'Good', 'Fair', 'Excellent']
General_Health tem 5 valores únicos


Checkup Valores distintos: ['Within the past 2 years', 'Within the past year', '5 or more years ago', 'Within the past 5 years', 'Never']
Checkup tem 5 valores únicos


Exercise Valores distintos: ['No', 'Yes']
Exercise tem 2 valores únicos


Heart_Disease Valores distintos: ['No', 'Yes']
Heart_Disease tem 2 valores únicos


Skin_Cancer Valores distintos: ['No', 'Yes']
Skin_Cancer tem 2 valores únicos


Other_Cancer Valores distintos: ['No', 'Yes']
Other_Cancer tem 2 valores únicos


Depression Valores distintos: ['No', 'Yes']
Depression tem 2 valores únicos

### Tratando os campos booleanos

In [38]:
# Lookup tables that turn the two-valued text columns into 0/1 codes.
# NOTE(review): a missing value (NaN) is encoded as 1 in both tables; confirm
# that this imputation is intended.
mapper_hd = {'No': 0, 'Yes': 1, np.nan: 1}

mapper_sex = {'Female': 0, 'Male': 1, np.nan: 1}


In [39]:
# Lookup table to apply to each two-valued text column.
binary_mappers = {
    'Heart_Disease': mapper_hd,
    'Sex': mapper_sex,
    'Exercise': mapper_hd,
    'Skin_Cancer': mapper_hd,
    'Other_Cancer': mapper_hd,
    'Depression': mapper_hd,
    'Arthritis': mapper_hd,
    'Smoking_History': mapper_hd,
}

for col, mapper in binary_mappers.items():
    # Skip columns that are already numeric: on a re-run of this cell the 0/1
    # codes are not keys of the lookup table and would be turned into NaN.
    if pd.api.types.is_numeric_dtype(data[col]):
        continue
    data[col] = data[col].map(mapper, na_action=None)

In [40]:
# List the object-dtype columns that remain after the binary columns were encoded.
data.select_dtypes(include='object').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   General_Health  308854 non-null  object
 1   Checkup         308854 non-null  object
 2   Diabetes        308854 non-null  object
 3   Age_Category    308854 non-null  object
dtypes: object(4)
memory usage: 9.4+ MB


### Tratando o campo Checkup

In [41]:
# Frequency of each answer in the Checkup column.
data['Checkup'].value_counts()

Within the past year       239371
Within the past 2 years     37213
Within the past 5 years     17442
5 or more years ago         13421
Never                        1407
Name: Checkup, dtype: int64

In [42]:
# Short label for each answer of the original Checkup column.
checkup_labels = {
    'Within the past year': '<1',
    'Within the past 2 years': '<2',
    'Within the past 5 years': '<5',
    '5 or more years ago': '>5',
    'Never': 'Never',
}
# One vectorized lookup instead of five boolean-mask assignments. Answers that
# are not in the table are left as NaN, as with the mask-based version.
data['last_Checkup_years'] = data['Checkup'].map(checkup_labels)

### Tratando campo de diabetes

In [43]:
# Frequency of each answer in the Diabetes column.
data['Diabetes'].value_counts()

No                                            259141
Yes                                            40171
No, pre-diabetes or borderline diabetes         6896
Yes, but female told only during pregnancy      2646
Name: Diabetes, dtype: int64

In [44]:
# Compact label for each answer of the original Diabetes column.
# The label spellings are kept exactly as they were, because the one-hot
# encoding step later turns them into column names.
diabetes_labels = {
    'No': 'No',
    'Yes': 'Yes',
    'No, pre-diabetes or borderline diabetes': 'Pre_diabetes_boderline',
    'Yes, but female told only during pregnancy': 'Yes_scovered_in_pregnancy',
}
# One vectorized lookup instead of four boolean-mask assignments. Answers that
# are not in the table are left as NaN, as with the mask-based version.
data['diabetes?'] = data['Diabetes'].map(diabetes_labels)

### Tratando campo Age:

In [45]:
# Frequency of each age bracket.
data['Age_Category'].value_counts()

65-69    33434
60-64    32418
70-74    31103
55-59    28054
50-54    25097
80+      22271
40-44    21595
45-49    20968
75-79    20705
35-39    20606
18-24    18681
30-34    18428
25-29    15494
Name: Age_Category, dtype: int64

 - Esse campo não vai precisar ser tratado

## Removendo campos antigos

In [46]:
# Remove the two source columns now that their relabelled versions exist.
data = data.drop(['Diabetes', 'Checkup'], axis=1)

## Fazendo o encoding

In [47]:
# Names of the object-dtype columns that still have to be one-hot encoded.
data.select_dtypes(include='object').columns

Index(['General_Health', 'Age_Category', 'last_Checkup_years', 'diabetes?'], dtype='object')

In [48]:
# Text columns to expand into indicator columns
categorical_cols = ['General_Health', 'Age_Category', 'last_Checkup_years', 'diabetes?']

# Build one indicator column per category value
one_hot = pd.get_dummies(data[categorical_cols])
# Remove the original text columns, then attach the indicator columns by index
data = data.drop(columns=categorical_cols)
df = data.join(one_hot)

In [49]:
# Inspect the encoded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 42 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   Exercise                             308854 non-null  int64  
 1   Heart_Disease                        308854 non-null  int64  
 2   Skin_Cancer                          308854 non-null  int64  
 3   Other_Cancer                         308854 non-null  int64  
 4   Depression                           308854 non-null  int64  
 5   Arthritis                            308854 non-null  int64  
 6   Sex                                  308854 non-null  int64  
 7   Height_(cm)                          308854 non-null  float64
 8   Weight_(kg)                          308854 non-null  float64
 9   BMI                                  308854 non-null  float64
 10  Smoking_History                      308854 non-null  int64  
 11  Alcohol_Consu

## Balanceamento

In [50]:
# Show the encoded frame's summary again before splitting and balancing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 42 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   Exercise                             308854 non-null  int64  
 1   Heart_Disease                        308854 non-null  int64  
 2   Skin_Cancer                          308854 non-null  int64  
 3   Other_Cancer                         308854 non-null  int64  
 4   Depression                           308854 non-null  int64  
 5   Arthritis                            308854 non-null  int64  
 6   Sex                                  308854 non-null  int64  
 7   Height_(cm)                          308854 non-null  float64
 8   Weight_(kg)                          308854 non-null  float64
 9   BMI                                  308854 non-null  float64
 10  Smoking_History                      308854 non-null  int64  
 11  Alcohol_Consu

In [55]:
# Separate the target column (y) from the predictors (X)
y = df['Heart_Disease']
X = df.drop(columns='Heart_Disease')

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Class counts of the training split before balancing
print("Contagem de classes antes do balanceamento:", Counter(y_train))

# Randomly oversample the training split to even out the class counts
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

# Class counts of the training split after balancing
print("Contagem de classes após o balanceamento:", Counter(y_resampled))

Contagem de classes antes do balanceamento: Counter({0: 227109, 1: 19974})
Contagem de classes após o balanceamento: Counter({0: 227109, 1: 227109})


In [56]:
# The balanced rows are used for training only. Splitting the oversampled data
# again would place copies of the same minority-class rows in both the training
# and the evaluation sets (data leakage) and inflate the scores, so evaluation
# uses the untouched hold-out split that was created before oversampling.
X_train_r, y_train_r = X_resampled, y_resampled
X_test_r, y_test_r = X_test, y_test

- Salvar os arquivos de resample gerados para fazer um teste com os modelos

In [57]:
# The output paths are relative, so make sure the `dataset` folder exists in
# the current working directory before writing to it.
os.makedirs('dataset', exist_ok=True)

# Balanced splits and their output files (file names kept exactly as before).
resampled_outputs = {
    'dataset/X_train_r_cdr.csv': X_train_r,
    'dataset/X_test_r_cdr.csv': X_test_r,
    'dataset/y_train_r_cdr.csv': y_train_r,
    'dataset/y_teste_r_cdr.csv': y_test_r,
}
# Write each split as a ';'-separated CSV file.
for path, frame in resampled_outputs.items():
    frame.to_csv(path, sep=';')

## Sem Balanceamento

In [58]:
# Separate the target column (y) from the predictors (X)
y = df['Heart_Disease']
X = df.drop(columns='Heart_Disease')

# Hold out 20% of the rows for testing; no balancing is applied here
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [59]:
# The output paths are relative, so make sure the `dataset` folder exists in
# the current working directory before writing to it.
os.makedirs('dataset', exist_ok=True)

# Unbalanced splits and their output files (file names kept exactly as before).
unbalanced_outputs = {
    'dataset/X_train_cdr.csv': X_train,
    'dataset/X_test_cdr.csv': X_test,
    'dataset/y_train_cdr.csv': y_train,
    'dataset/y_teste_cdr.csv': y_test,
}
# Write each split as a ';'-separated CSV file.
for path, frame in unbalanced_outputs.items():
    frame.to_csv(path, sep=';')