In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data mining

## A. Pré-processamento do conjunto de dados

### Limpeza e correções

In [2]:
# Load the raw dataset from fetal_health.csv into `df` and preview its first rows
df = pd.read_csv('fetal_health.csv')
df.head()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [3]:
# Print column names, non-null counts and dtypes to check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 22 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   baseline value                                          2126 non-null   float64
 1   accelerations                                           2126 non-null   float64
 2   fetal_movement                                          2126 non-null   float64
 3   uterine_contractions                                    2126 non-null   float64
 4   light_decelerations                                     2126 non-null   float64
 5   severe_decelerations                                    2126 non-null   float64
 6   prolongued_decelerations                                2126 non-null   float64
 7   abnormal_short_term_variability                         2126 non-null   float64
 8   mean_value_of_short_term_variability  

O dataset não apresenta valores nulos e todas as colunas estão em formato numérico.

### Balanceamento

In [4]:
# Classification target variable: fetal_health
# normalize=True returns the relative frequency of each class instead of raw counts

df['fetal_health'].value_counts(normalize=True)

1.0    0.778457
2.0    0.138758
3.0    0.082785
Name: fetal_health, dtype: float64

In [5]:
# Absolute number of rows in each fetal_health class
df['fetal_health'].value_counts()

1.0    1655
2.0     295
3.0     176
Name: fetal_health, dtype: int64

In [6]:
# The target classes are imbalanced -> classes 1 and 2 are reduced to 250 rows each (undersampling)
# in order to narrow the gap between their row counts and that of class 3.

In [7]:
#Numero final de elementos
# Number of rows to keep for each of the over-represented classes (1 and 2)
n = 250

# Fixed seed so that "Restart Kernel -> Run All" always draws the same sample;
# without it every run produces a different balanced dataset.
seed = 42
rng = np.random.default_rng(seed)

# Row labels of the two classes that will be undersampled
index_1 = df.index[df['fetal_health'] == 1].to_numpy()
index_2 = df.index[df['fetal_health'] == 2].to_numpy()

print('Numero de casos 1 (antes do undersampling): ', len(index_1))
print('Numero de casos 2 (antes do undersampling): ', len(index_2))

# Pick the row labels to keep WITHOUT replacement. The previous call,
# np.random.choice(index, n), defaults to replace=True, which can select the
# same row several times: the "undersampled" classes then contain duplicated
# rows and fewer than n distinct samples.
# min() keeps the draw valid when a class has fewer than n rows
# (replace=False raises if more items are requested than available).
u_index_1 = list(rng.choice(index_1, size=min(n, len(index_1)), replace=False))
u_index_2 = list(rng.choice(index_2, size=min(n, len(index_2)), replace=False))

print('Numero de casos 1 (após undersampling): ', len(u_index_1))
print('Numero de casos 2 (após undersampling): ', len(u_index_2))

# Rebuild the data frame: sampled rows of classes 1 and 2 plus every row of
# class 3. The sampled labels already belong to a single class, so they can be
# looked up directly on df. ignore_index=True renumbers the rows from 0.
df = pd.concat([df.loc[u_index_1],
                df.loc[u_index_2],
                df[df['fetal_health'] == 3]], ignore_index=True)

print('\nNumero de casos para cada classe (balanceado):')
display(df['fetal_health'].value_counts())

Numero de casos 1 (antes do undersampling):  1655
Numero de casos 2 (antes do undersampling):  295
Numero de casos 1 (após undersampling):  250
Numero de casos 2 (após undersampling):  250

Numero de casos para cada classe (balanceado):


2.0    250
1.0    250
3.0    176
Name: fetal_health, dtype: int64

## B. Processamento do conjunto de dados

### Identificando a correlação das características (features)

In [8]:
# Absolute correlation between every column and the target, ordered from the
# most to the least correlated (the sign is dropped, only the strength matters)
(
    df.corr()['fetal_health']
      .abs()
      .sort_values(ascending=False)
)

fetal_health                                              1.000000
abnormal_short_term_variability                           0.528437
accelerations                                             0.499608
prolongued_decelerations                                  0.473383
histogram_mode                                            0.380240
mean_value_of_long_term_variability                       0.377682
histogram_mean                                            0.359498
histogram_median                                          0.358398
percentage_of_time_with_abnormal_long_term_variability    0.336112
histogram_variance                                        0.278569
histogram_tendency                                        0.238695
uterine_contractions                                      0.158036
severe_decelerations                                      0.133534
light_decelerations                                       0.119829
fetal_movement                                            0.10

### Filtrando o conjunto de dados

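Como o conjunto de dados conta com muitas colunas, será feita uma redução para as colunas mais correlacionadas (em valor absoluto) com a variável alvo `fetal_health`. Os valores da tabela abaixo foram registrados em outra execução e por isso diferem da saída exibida acima, já que a amostragem do undersampling é aleatória. As colunas escolhidas foram: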

Column|Value
-----------|------------------
abnormal_short_term_variability|                           0.553641
prolongued_decelerations|                                  0.490576
accelerations|                                             0.460534
histogram_mode|                                            0.397466
histogram_mean|                                           0.393664
histogram_median|                                          0.370974
mean_value_of_long_term_variability|                       0.368310
percentage_of_time_with_abnormal_long_term_variability|    0.321691
histogram_variance|                                        0.309877
histogram_tendency|                                        0.213464
light_decelerations|                                       0.195942

In [12]:
# Keep only the selected feature columns plus the target; .copy() detaches the
# result from the original frame so later edits do not act on a view.
df = df.loc[:, [
    # variability and deceleration/acceleration features
    'abnormal_short_term_variability',
    'prolongued_decelerations',
    'accelerations',
    # histogram location features
    'histogram_mode',
    'histogram_mean',
    'histogram_median',
    # long-term variability features
    'mean_value_of_long_term_variability',
    'percentage_of_time_with_abnormal_long_term_variability',
    # remaining histogram and deceleration features
    'histogram_variance',
    'histogram_tendency',
    'light_decelerations',
    # classification target
    'fetal_health',
]].copy()

### Salvando os resultados

In [10]:
# Write the current frame to fetal_health_balanced.csv; index=False leaves the row index out of the file
df.to_csv('fetal_health_balanced.csv', index=False)