# Pré-Processamento
O objetivo deste notebook é realizar as mesmas etapas de pré-processamento feitas no artigo "[An Empirical and Statistical Analysis of Fetal Health Classification Using Different Machine Learning Algorithm](/An_Empirical_and_Statistical_Analysis_of_Fetal_Health_Classification_Using_Diffe.pdf)".

## Importação de bibliotecas e configuração de notebook

In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler as min_max_scaler
from ucimlrepo import fetch_ucirepo as repo
from imblearn.over_sampling import SMOTE as smote

## Carregamento do dataset

In [3]:
# Fetch dataset id 193 from the UCI ML repository and unpack its feature table
# and its target table.
cardiotocography = repo(id=193)
X, y = cardiotocography.data.features, cardiotocography.data.targets

In [21]:
# Remove the 'CLASS' target column so it is not carried into the modelling table.
# `y` is rebuilt from the fetched dataset instead of from itself, so the cell is
# idempotent: re-running it no longer raises KeyError because 'CLASS' was
# already dropped by a previous run.
y = cardiotocography.data.targets.drop(columns=['CLASS'])

In [22]:
# Place the feature columns and the target column(s) side by side in one table.
df = pd.concat(objs=[X, y], axis='columns')

In [23]:
# Print the column names, non-null counts and dtypes of the combined table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 22 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   LB        2126 non-null   int64  
 1   AC        2126 non-null   float64
 2   FM        2126 non-null   float64
 3   UC        2126 non-null   float64
 4   DL        2126 non-null   float64
 5   DS        2126 non-null   float64
 6   DP        2126 non-null   float64
 7   ASTV      2126 non-null   int64  
 8   MSTV      2126 non-null   float64
 9   ALTV      2126 non-null   int64  
 10  MLTV      2126 non-null   float64
 11  Width     2126 non-null   int64  
 12  Min       2126 non-null   int64  
 13  Max       2126 non-null   int64  
 14  Nmax      2126 non-null   int64  
 15  Nzeros    2126 non-null   int64  
 16  Mode      2126 non-null   int64  
 17  Mean      2126 non-null   int64  
 18  Median    2126 non-null   int64  
 19  Variance  2126 non-null   int64  
 20  Tendency  2126 non-null   int6

In [24]:
# Show the first five records, transposed so that every column is listed as a row.
df.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
LB,120.0,132.0,133.0,134.0,132.0
AC,0.0,0.006,0.003,0.003,0.007
FM,0.0,0.0,0.0,0.0,0.0
UC,0.0,0.006,0.008,0.008,0.008
DL,0.0,0.003,0.003,0.003,0.0
DS,0.0,0.0,0.0,0.0,0.0
DP,0.0,0.0,0.0,0.0,0.0
ASTV,73.0,17.0,16.0,16.0,16.0
MSTV,0.5,2.1,2.1,2.4,2.4
ALTV,43.0,0.0,0.0,0.0,0.0


In [25]:
# Summary statistics for every column except the last one (the target, which
# was appended after the features). The wanted statistics are picked as rows
# of describe() and then transposed, giving one row per feature.
df.iloc[:, :-1].describe().loc[['mean', 'std', 'min', '25%', '50%', '75%', 'max']].T

Unnamed: 0,mean,std,min,25%,50%,75%,max
LB,133.303857,9.840844,106.0,126.0,133.0,140.0,160.0
AC,0.003178,0.003866,0.0,0.0,0.002,0.006,0.019
FM,0.009481,0.046666,0.0,0.0,0.0,0.003,0.481
UC,0.004366,0.002946,0.0,0.002,0.004,0.007,0.015
DL,0.001889,0.00296,0.0,0.0,0.0,0.003,0.015
DS,3e-06,5.7e-05,0.0,0.0,0.0,0.0,0.001
DP,0.000159,0.00059,0.0,0.0,0.0,0.0,0.005
ASTV,46.990122,17.192814,12.0,32.0,49.0,61.0,87.0
MSTV,1.332785,0.883241,0.2,0.7,1.2,1.7,7.0
ALTV,9.84666,18.39688,0.0,0.0,0.0,11.0,91.0


## Funções

## Distribuições das variáveis

## Balanceamento de dados

In [26]:
# SMOTE oversampler: 5 nearest neighbours, 'not majority' sampling strategy and
# a fixed seed so that the generated synthetic samples are reproducible.
sm = smote(
    k_neighbors=5,
    random_state=42,
    sampling_strategy='not majority',
)

In [27]:
# Resample features and targets with the SMOTE instance `sm`, then join the
# resampled features and targets column-wise into a single balanced table.
X_balanced, y_balanced = sm.fit_resample(X, y)
df_balanced = pd.concat(objs=[X_balanced, y_balanced], axis='columns')

In [28]:
# (rows, columns) of the balanced table, to check how many rows resampling produced.
df_balanced.shape

(4965, 22)

In [30]:
# Same summary statistics as for the original table, now on the balanced data:
# all columns but the last (the target), one row per feature.
df_balanced.iloc[:, :-1].describe().loc[['mean', 'std', 'min', '25%', '50%', '75%', 'max']].T

Unnamed: 0,mean,std,min,25%,50%,75%,max
LB,135.100705,9.843473,106.0,128.0,134.0,143.0,160.0
AC,0.001552,0.002958,0.0,0.0,0.0,0.002,0.019
FM,0.012083,0.049934,0.0,0.0,0.00077,0.004,0.481
UC,0.003701,0.003093,0.0,0.000825,0.003359,0.006,0.015
DL,0.002021,0.003177,0.0,0.0,0.0,0.003,0.015
DS,1.2e-05,0.000101,0.0,0.0,0.0,0.0,0.001
DP,0.000474,0.000987,0.0,0.0,0.0,0.0,0.005
ASTV,56.221349,16.465612,12.0,46.0,60.0,66.0,87.0
MSTV,1.21944,0.990519,0.2,0.4,0.9,1.8,7.0
ALTV,18.815509,25.475894,0.0,0.0,4.0,32.0,91.0


In [31]:
# Per-feature summary statistics before and after balancing (last column, the
# target, excluded in both tables).
describe = df.iloc[:, :-1].describe().loc[['mean', 'std', 'min', '25%', '50%', '75%', 'max']].T
describe_balanced = (
    df_balanced.iloc[:, :-1]
    .describe()
    .loc[['mean', 'std', 'min', '25%', '50%', '75%', 'max']]
    .T
)
# Absolute change of each statistic caused by the oversampling step.
(describe - describe_balanced).abs()

Unnamed: 0,mean,std,min,25%,50%,75%,max
LB,1.796848,0.002629,0.0,2.0,1.0,3.0,0.0
AC,0.001626,0.000908,0.0,0.0,0.002,0.004,0.0
FM,0.002603,0.003268,0.0,0.0,0.00077,0.001,0.0
UC,0.000666,0.000147,0.0,0.001175,0.000641,0.001,0.0
DL,0.000131,0.000216,0.0,0.0,0.0,0.0,0.0
DS,8e-06,4.3e-05,0.0,0.0,0.0,0.0,0.0
DP,0.000315,0.000397,0.0,0.0,0.0,0.0,0.0
ASTV,9.231227,0.727202,0.0,14.0,11.0,5.0,0.0
MSTV,0.113345,0.107277,0.0,0.3,0.3,0.1,0.0
ALTV,8.968848,7.079014,0.0,0.0,4.0,21.0,0.0


## Normalização

In [33]:
# Min-max scaler; feature_range=(0, 1) is the library default, spelled out here
# so the target range of the normalisation is visible.
scaler = min_max_scaler(feature_range=(0, 1))

In [34]:
# Learn each feature's min/max from the balanced features and rescale those
# same features with the fitted scaler.
X_scaled = scaler.fit(X_balanced).transform(X_balanced)

In [39]:
# Peek at the first three scaled rows.
X_scaled[:3]

array([[0.25925926, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.81333333, 0.04411765, 0.47252747,
        0.04733728, 0.34463277, 0.11009174, 0.03448276, 0.11111111,
        0.        , 0.47244094, 0.58715596, 0.40366972, 0.27137546,
        1.        ],
       [0.48148148, 0.31578947, 0.        , 0.4       , 0.2       ,
        0.        , 0.        , 0.06666667, 0.27941176, 0.        ,
        0.20512821, 0.71751412, 0.16513761, 0.65517241, 0.33333333,
        0.1       , 0.63779528, 0.57798165, 0.57798165, 0.04460967,
        0.5       ],
       [0.5       , 0.15789474, 0.        , 0.53333333, 0.2       ,
        0.        , 0.        , 0.05333333, 0.27941176, 0.        ,
        0.2642998 , 0.71751412, 0.16513761, 0.65517241, 0.27777778,
        0.1       , 0.63779528, 0.56880734, 0.55963303, 0.04832714,
        0.5       ]])

## Salvamento

In [40]:
# Wrap the scaled array in a DataFrame, labelling the columns with every
# column name of the balanced table except the last one (the target).
df_scaled = pd.DataFrame(X_scaled, columns=df_balanced.columns[:-1])

In [42]:
# Final table: scaled features with the balanced targets appended as the last column(s).
result = pd.concat(objs=[df_scaled, y_balanced], axis='columns')

In [44]:
# Write the pre-processed table to disk; the row index is not written.
result.to_csv(path_or_buf='CTG_pre_processado.csv', index=False)