In [1]:
# standard library
import os

#Computação científica
import numpy as np

#análise de dados
import pandas as pd

#visualização
import matplotlib.pyplot as plt

#machine learning
import sklearn

# feature engineering
from sklearn.impute import SimpleImputer
from feature_engine.imputation import (
    AddMissingIndicator)
from feature_engine.transformation import YeoJohnsonTransformer
from feature_engine.encoding import  RareLabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


# Introdução

## Motivação


## Objetivos


# Parâmetros

Definição de alguns parâmetros que serão usados ao longo de todo o notebook

In [2]:
# Seed passed to every random_state argument.
# Reusing the same seed keeps the results reproducible.
seed = 0

In [3]:
# Show estimators as diagrams and make transformers return pandas DataFrames.
sklearn.set_config(display='diagram', transform_output="pandas")

# Download Dataset

In [4]:
# The dataset is available at the following URL:
# https://www.kaggle.com/competitions/porto-seguro-safe-driver-prediction/data

# Data directory: taken from the PORTO_SEGURO_DATA_DIR environment variable when it is set,
# otherwise the original local path is used so that existing setups keep working.
path = os.environ.get(
    'PORTO_SEGURO_DATA_DIR',
    '/home/rodolfo/Insync/rodolfopcruz2@gmail.com/Google Drive/Estudo/Python_Projects/datasets/porto-seguro-safe-driver-prediction/',
)
# `path` is concatenated with file names below, so guarantee a trailing separator.
path = os.path.join(path, '')

# Training data
train = pd.read_csv(path + 'x_train.csv')


# Feature Engineering

- Missing Values
- Distribuição não normal
- Remoção de labels raras nas variáveis categóricas
- Encoding
- Normalização

In [5]:
# Group the columns by the tag embedded in each column name.
cat_features = [column for column in train.columns if 'cat' in column]
bin_features = [column for column in train.columns if 'bin' in column]

# Numeric features: every column that is neither categorical, binary, the target nor the id.
num_features = [
    column
    for column in train.columns
    if not ('cat' in column or 'bin' in column or column in ('target', 'id'))
]


## Missing Values

In [6]:
# Missing values are stored as -1.
# Swap -1 for NaN so that missing entries are easy to identify.
train = train.replace(to_replace=-1, value=np.nan)
# test = test.replace(to_replace=-1, value=np.nan)

In [7]:
# Separate the input data from the output data.

# Expected response (original author's note: whether or not the customer requested insurance)
y_train = train.loc[:, 'target']
train = train.drop(columns=['target'])

### Features Binárias

- Nas colunas com baixa proporção de missing values eles serão substituídos pelo valor mais comum;
- Nas colunas com elevada proporção de missing values eles serão substituídos por uma string para identificar o valor ausente;


In [8]:
# Data types of the binary features
train.loc[:, bin_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416648 entries, 0 to 416647
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   ps_ind_06_bin   416648 non-null  int64
 1   ps_ind_07_bin   416648 non-null  int64
 2   ps_ind_08_bin   416648 non-null  int64
 3   ps_ind_09_bin   416648 non-null  int64
 4   ps_ind_10_bin   416648 non-null  int64
 5   ps_ind_11_bin   416648 non-null  int64
 6   ps_ind_12_bin   416648 non-null  int64
 7   ps_ind_13_bin   416648 non-null  int64
 8   ps_ind_16_bin   416648 non-null  int64
 9   ps_ind_17_bin   416648 non-null  int64
 10  ps_ind_18_bin   416648 non-null  int64
 11  ps_calc_15_bin  416648 non-null  int64
 12  ps_calc_16_bin  416648 non-null  int64
 13  ps_calc_17_bin  416648 non-null  int64
 14  ps_calc_18_bin  416648 non-null  int64
 15  ps_calc_19_bin  416648 non-null  int64
 16  ps_calc_20_bin  416648 non-null  int64
dtypes: int64(17)
memory usage: 54.0 MB


In [9]:
# Cast the binary features to the object dtype
train[bin_features] = train.loc[:, bin_features].astype(object)
# test[bin_features] = test.loc[:, bin_features].astype(object)

train.loc[:, bin_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416648 entries, 0 to 416647
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   ps_ind_06_bin   416648 non-null  object
 1   ps_ind_07_bin   416648 non-null  object
 2   ps_ind_08_bin   416648 non-null  object
 3   ps_ind_09_bin   416648 non-null  object
 4   ps_ind_10_bin   416648 non-null  object
 5   ps_ind_11_bin   416648 non-null  object
 6   ps_ind_12_bin   416648 non-null  object
 7   ps_ind_13_bin   416648 non-null  object
 8   ps_ind_16_bin   416648 non-null  object
 9   ps_ind_17_bin   416648 non-null  object
 10  ps_ind_18_bin   416648 non-null  object
 11  ps_calc_15_bin  416648 non-null  object
 12  ps_calc_16_bin  416648 non-null  object
 13  ps_calc_17_bin  416648 non-null  object
 14  ps_calc_18_bin  416648 non-null  object
 15  ps_calc_19_bin  416648 non-null  object
 16  ps_calc_20_bin  416648 non-null  object
dtypes: object(17)
memory usage: 5

In [10]:
# Count the missing values in each binary feature
train[bin_features].isna().sum()

ps_ind_06_bin     0
ps_ind_07_bin     0
ps_ind_08_bin     0
ps_ind_09_bin     0
ps_ind_10_bin     0
ps_ind_11_bin     0
ps_ind_12_bin     0
ps_ind_13_bin     0
ps_ind_16_bin     0
ps_ind_17_bin     0
ps_ind_18_bin     0
ps_calc_15_bin    0
ps_calc_16_bin    0
ps_calc_17_bin    0
ps_calc_18_bin    0
ps_calc_19_bin    0
ps_calc_20_bin    0
dtype: int64

In [11]:
# Missing values in the binary features are replaced by the most frequent value of each column.

bin_imputer_most_frequent = SimpleImputer(strategy='most_frequent')
bin_imputer_most_frequent.fit(train.loc[:, bin_features])  # fit on the training data


In [12]:
# Use the fitted imputer to transform the data

imputed_bin = bin_imputer_most_frequent.transform(train[bin_features])
train[bin_features] = imputed_bin
# test[bin_features] = bin_imputer_most_frequent.transform(test[bin_features])

In [13]:
# Data types after the transformation
train.loc[:, bin_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416648 entries, 0 to 416647
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   ps_ind_06_bin   416648 non-null  object
 1   ps_ind_07_bin   416648 non-null  object
 2   ps_ind_08_bin   416648 non-null  object
 3   ps_ind_09_bin   416648 non-null  object
 4   ps_ind_10_bin   416648 non-null  object
 5   ps_ind_11_bin   416648 non-null  object
 6   ps_ind_12_bin   416648 non-null  object
 7   ps_ind_13_bin   416648 non-null  object
 8   ps_ind_16_bin   416648 non-null  object
 9   ps_ind_17_bin   416648 non-null  object
 10  ps_ind_18_bin   416648 non-null  object
 11  ps_calc_15_bin  416648 non-null  object
 12  ps_calc_16_bin  416648 non-null  object
 13  ps_calc_17_bin  416648 non-null  object
 14  ps_calc_18_bin  416648 non-null  object
 15  ps_calc_19_bin  416648 non-null  object
 16  ps_calc_20_bin  416648 non-null  object
dtypes: object(17)
memory usage: 5

In [14]:
# Parameters and fitted attributes held by the imputer
vars(bin_imputer_most_frequent)

{'missing_values': nan,
 'add_indicator': False,
 'keep_empty_features': False,
 'strategy': 'most_frequent',
 'fill_value': None,
 'copy': True,
 'feature_names_in_': array(['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin',
        'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin',
        'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin',
        'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin',
        'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin'], dtype=object),
 'n_features_in_': 17,
 '_fit_dtype': dtype('O'),
 'indicator_': None,
 'statistics_': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0], dtype=object)}

### Features Categóricas

Duas estratégias para substituição dos missing values:

1) Features com proporção de ausentes inferior ao threshold estipulado:

        Substituição por valor mais comum

2) Features com proporção de ausentes superior ao threshold estipulado:
        
        Substituição dos valores ausentes pela string missing

In [15]:
# Display the list of categorical feature names
cat_features

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [16]:
# Proportion of missing values per categorical feature, largest first
train.loc[:, cat_features].isna().mean().sort_values(ascending=False)


ps_car_03_cat    0.690893
ps_car_05_cat    0.447349
ps_car_07_cat    0.019422
ps_ind_05_cat    0.009838
ps_car_09_cat    0.000943
ps_ind_02_cat    0.000365
ps_car_01_cat    0.000190
ps_ind_04_cat    0.000154
ps_car_02_cat    0.000010
ps_car_04_cat    0.000000
ps_car_06_cat    0.000000
ps_car_08_cat    0.000000
ps_car_10_cat    0.000000
ps_car_11_cat    0.000000
dtype: float64

In [17]:
# Data types of the categorical features
train.loc[:, cat_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416648 entries, 0 to 416647
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ps_ind_02_cat  416496 non-null  float64
 1   ps_ind_04_cat  416584 non-null  float64
 2   ps_ind_05_cat  412549 non-null  float64
 3   ps_car_01_cat  416569 non-null  float64
 4   ps_car_02_cat  416644 non-null  float64
 5   ps_car_03_cat  128789 non-null  float64
 6   ps_car_04_cat  416648 non-null  int64  
 7   ps_car_05_cat  230261 non-null  float64
 8   ps_car_06_cat  416648 non-null  int64  
 9   ps_car_07_cat  408556 non-null  float64
 10  ps_car_08_cat  416648 non-null  int64  
 11  ps_car_09_cat  416255 non-null  float64
 12  ps_car_10_cat  416648 non-null  int64  
 13  ps_car_11_cat  416648 non-null  int64  
dtypes: float64(9), int64(5)
memory usage: 44.5 MB


In [18]:
# Cast the categorical features to the object dtype
train[cat_features] = train.loc[:, cat_features].astype('object')
# test[cat_features] = test.loc[:, cat_features].astype('object')

In [19]:
# Features whose proportion of missing values is above this threshold get the 'missing' label;
# the remaining ones are imputed with the most frequent value.
missing_threshold = 1/100

# Proportion of missing values per categorical feature, computed once and reused for both lists.
cat_missing_share = train[cat_features].isna().mean()
cat_above_threshold = cat_missing_share > missing_threshold

# Categorical features with a proportion of missing values above the threshold
cat_features_muitos_na = cat_missing_share[cat_above_threshold].index.to_list()

# Categorical features with a proportion of missing values at or below the threshold.
# Taking the complement of the mask guarantees that a feature sitting exactly on the
# threshold is not left out of both lists (the previous strict `<` dropped it).
cat_features_poucos_na = cat_missing_share[~cat_above_threshold].index.to_list()

In [20]:
# Imputer for the categorical features with a high proportion of missing values
cat_imputer_muitos_na = SimpleImputer(fill_value='missing', strategy='constant')
cat_imputer_muitos_na.fit(train.loc[:, cat_features_muitos_na])

In [21]:
# Imputer for the categorical features with a low proportion of missing values
cat_imputer_poucos_na = SimpleImputer(strategy='most_frequent')
cat_imputer_poucos_na.fit(train.loc[:, cat_features_poucos_na])

In [22]:
# Transform the training data: features with many missing values
imputed_muitos_na = cat_imputer_muitos_na.transform(train[cat_features_muitos_na])
train[cat_features_muitos_na] = imputed_muitos_na
# test[cat_features_muitos_na] = cat_imputer_muitos_na.transform(test[cat_features_muitos_na])  # transform the test data

# Transform the training data: features with few missing values
imputed_poucos_na = cat_imputer_poucos_na.transform(train[cat_features_poucos_na])
train[cat_features_poucos_na] = imputed_poucos_na
# test[cat_features_poucos_na] = cat_imputer_poucos_na.transform(test[cat_features_poucos_na])  # transform the test data

# Proportion of missing values left in each categorical feature
train.loc[:, cat_features].isna().mean()

ps_ind_02_cat    0.0
ps_ind_04_cat    0.0
ps_ind_05_cat    0.0
ps_car_01_cat    0.0
ps_car_02_cat    0.0
ps_car_03_cat    0.0
ps_car_04_cat    0.0
ps_car_05_cat    0.0
ps_car_06_cat    0.0
ps_car_07_cat    0.0
ps_car_08_cat    0.0
ps_car_09_cat    0.0
ps_car_10_cat    0.0
ps_car_11_cat    0.0
dtype: float64

In [23]:
# Parameters and fitted attributes held by the imputer

vars(cat_imputer_muitos_na)

{'missing_values': nan,
 'add_indicator': False,
 'keep_empty_features': False,
 'strategy': 'constant',
 'fill_value': 'missing',
 'copy': True,
 'feature_names_in_': array(['ps_car_03_cat', 'ps_car_05_cat', 'ps_car_07_cat'], dtype=object),
 'n_features_in_': 3,
 '_fit_dtype': dtype('O'),
 'indicator_': None,
 'statistics_': array(['missing', 'missing', 'missing'], dtype=object)}

In [24]:
# Parameters and fitted attributes held by the imputer

vars(cat_imputer_poucos_na)

{'missing_values': nan,
 'add_indicator': False,
 'keep_empty_features': False,
 'strategy': 'most_frequent',
 'fill_value': None,
 'copy': True,
 'feature_names_in_': array(['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat',
        'ps_car_02_cat', 'ps_car_04_cat', 'ps_car_06_cat', 'ps_car_08_cat',
        'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat'], dtype=object),
 'n_features_in_': 11,
 '_fit_dtype': dtype('O'),
 'indicator_': None,
 'statistics_': array([1.0, 0.0, 0.0, 11.0, 1.0, 0, 11, 1, 2.0, 1, 104], dtype=object)}

### Features Numéricas

In [25]:
# Proportion of missing values in the numeric features
train.loc[:, num_features].isna().mean()

ps_ind_01     0.000000
ps_ind_03     0.000000
ps_ind_14     0.000000
ps_ind_15     0.000000
ps_reg_01     0.000000
ps_reg_02     0.000000
ps_reg_03     0.181170
ps_car_11     0.000010
ps_car_12     0.000002
ps_car_13     0.000000
ps_car_14     0.071874
ps_car_15     0.000000
ps_calc_01    0.000000
ps_calc_02    0.000000
ps_calc_03    0.000000
ps_calc_04    0.000000
ps_calc_05    0.000000
ps_calc_06    0.000000
ps_calc_07    0.000000
ps_calc_08    0.000000
ps_calc_09    0.000000
ps_calc_10    0.000000
ps_calc_11    0.000000
ps_calc_12    0.000000
ps_calc_13    0.000000
ps_calc_14    0.000000
dtype: float64

In [26]:
# Threshold on the proportion of missing values used to split the numeric features
missing_threshold = 1/100

# Proportion of missing values per numeric feature, computed once and reused for both lists.
num_missing_share = train[num_features].isna().mean()
num_above_threshold = num_missing_share > missing_threshold

# Numeric features with many missing values (proportion above the threshold)
num_features_muitos_na = num_missing_share[num_above_threshold].index.to_list()

# Numeric features with few missing values (proportion at or below the threshold).
# Taking the complement of the mask guarantees that a feature sitting exactly on the
# threshold is not left out of both lists (the previous strict `<` dropped it).
num_features_poucos_na = num_missing_share[~num_above_threshold].index.to_list()

In [27]:
# Missing-value indicator for the numeric features with many missing values

missing_indicator = AddMissingIndicator(variables=num_features_muitos_na)
missing_indicator.fit(train)  # fit on the training data



In [28]:
# Transform the training data with the fitted missing indicator
train = missing_indicator.transform(train)
# test = missing_indicator.transform(test)

In [29]:
# Every missing value in the numeric features is replaced by the column mean.
num_imputer = SimpleImputer(strategy='mean')
num_imputer.fit(train.loc[:, num_features])  # fit on the training data

In [None]:
# Parameters and fitted attributes held by the imputer
vars(num_imputer)

{'missing_values': nan,
 'add_indicator': False,
 'keep_empty_features': False,
 'strategy': 'mean',
 'fill_value': None,
 'copy': True,
 'feature_names_in_': array(['ps_ind_01', 'ps_ind_03', 'ps_ind_14', 'ps_ind_15', 'ps_reg_01',
        'ps_reg_02', 'ps_reg_03', 'ps_car_11', 'ps_car_12', 'ps_car_13',
        'ps_car_14', 'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03',
        'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 'ps_calc_07',
        'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
        'ps_calc_12', 'ps_calc_13', 'ps_calc_14'], dtype=object),
 'n_features_in_': 26,
 '_fit_dtype': dtype('float64'),
 'indicator_': None,
 'statistics_': array([1.90239003, 4.42646311, 0.01235095, 7.29857818, 0.61073736,
        0.43818091, 0.89334173, 2.34634364, 0.37999363, 0.81323001,
        0.37469046, 3.06557526, 0.4491398 , 0.44935629, 0.44986991,
        2.37176946, 1.88546447, 7.68685797, 3.00647069, 9.22792861,
        2.33979762, 8.43378823, 5.44559676, 1.4419846 , 2.8704

In [31]:
# Transform the training data with the fitted imputer
imputed_num = num_imputer.transform(train[num_features])
train[num_features] = imputed_num
# test[num_features] = num_imputer.transform(test[num_features])

# Proportion of missing values left in each numeric feature
train.loc[:, num_features].isna().mean()

ps_ind_01     0.0
ps_ind_03     0.0
ps_ind_14     0.0
ps_ind_15     0.0
ps_reg_01     0.0
ps_reg_02     0.0
ps_reg_03     0.0
ps_car_11     0.0
ps_car_12     0.0
ps_car_13     0.0
ps_car_14     0.0
ps_car_15     0.0
ps_calc_01    0.0
ps_calc_02    0.0
ps_calc_03    0.0
ps_calc_04    0.0
ps_calc_05    0.0
ps_calc_06    0.0
ps_calc_07    0.0
ps_calc_08    0.0
ps_calc_09    0.0
ps_calc_10    0.0
ps_calc_11    0.0
ps_calc_12    0.0
ps_calc_13    0.0
ps_calc_14    0.0
dtype: float64

## Transformações

- Aplicação da transformação de Yeo-Johnson às seguintes variáveis:

    - ps_reg_03
    - ps_car_12
    - ps_car_13
    - ps_car_14
    - ps_car_15
    - ps_reg_02


In [32]:
# Features selected during the exploratory analysis
continuous_numerical_variables = [
    'ps_reg_03',
    'ps_car_12',
    'ps_car_13',
    'ps_car_14',
    'ps_car_15',
    'ps_reg_02',
]
yeo_transformer = YeoJohnsonTransformer(variables=continuous_numerical_variables)

In [33]:
# Fit on the training data, then transform it
train = yeo_transformer.fit(train).transform(train)
# test = yeo_transformer.transform(test)

In [None]:
# Parameters that the transformer will use (contents of lambda_dict_)
yeo_transformer.lambda_dict_

{'ps_reg_03': np.float64(-1.5666809349235482),
 'ps_car_12': np.float64(-5.9282444714177185),
 'ps_car_13': np.float64(-2.536653261055529),
 'ps_car_14': np.float64(-3.205470228477302),
 'ps_car_15': np.float64(4.087523800482025),
 'ps_reg_02': np.float64(-1.3937305248976124)}

## Labels Raras

Agrupar labels que aparecem em pequena proporção em cada feature categórica

In [35]:
rare_threshold = 1/100
# Every label that appears in a proportion below rare_threshold will be grouped into a single label.

# Identify the features that contain at least one rare label.
# A label is treated as rare when its proportion is strictly below the threshold, so that a
# label sitting exactly on the threshold is not flagged. This is meant to match the rule of
# RareLabelEncoder(tol=rare_threshold), which per the feature_engine documentation groups
# categories with frequency lower than tol.
rare_labels = [
    feature
    for feature in cat_features
    if (train[feature].value_counts() / len(train) < rare_threshold).any()
]

In [36]:
# Display the features flagged as containing rare labels
rare_labels

['ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_04_cat',
 'ps_car_06_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [37]:
# Encoder that groups the rare labels of the flagged features
rare_encoder = RareLabelEncoder(
    tol=rare_threshold,
    n_categories=1,
    variables=rare_labels,
)

rare_encoder.fit(train)



In [38]:
# Transform the training data with the fitted rare-label encoder
train = rare_encoder.transform(train)
# test = rare_encoder.transform(test)

In [39]:
# Check that the new label was created
train['ps_car_04_cat'].value_counts().div(len(train))

ps_car_04_cat
0       0.834167
1       0.054130
2       0.039907
8       0.034518
9       0.032051
Rare    0.005227
Name: count, dtype: float64

## Encoder

In [40]:
# The encoder is applied to the categorical features
cat_features

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [41]:
# One-hot encoder
# Cast to string so that every value in a column has the same type
train[cat_features] = train.loc[:, cat_features].astype(str)
# test[cat_features] = test.loc[:, cat_features].astype(str)

enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
enc.fit(train.loc[:, cat_features])

In [42]:
# Transform the training data with the fitted one-hot encoder
train_one_hot_encoded_features = enc.transform(train.loc[:, cat_features])
# test_one_hot_encoded_features = enc.transform(test.loc[:, cat_features])


## Scaling

In [43]:
# Summary statistics of the numeric features
train.loc[:, num_features].describe()

Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_14,ps_ind_15,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_11,ps_car_12,ps_car_13,...,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14
count,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,...,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0
mean,1.90239,4.426463,0.012351,7.298578,0.610737,0.237317,0.392051,2.346344,0.142834,0.302034,...,1.885464,7.686858,3.006471,9.227929,2.339798,8.433788,5.445597,1.441985,2.870442,7.538011
std,1.985387,2.697667,0.127135,3.546144,0.287767,0.150321,0.05591,0.832347,0.005911,0.024074,...,1.134474,1.33444,1.414637,1.459645,1.248057,2.905236,2.332877,1.203898,1.692596,2.74424
min,0.0,0.0,0.0,0.0,0.0,0.0,0.056752,0.0,0.072813,0.187749,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,5.0,0.4,0.161,0.351435,2.0,0.135598,0.286978,...,1.0,7.0,2.0,8.0,1.0,6.0,4.0,1.0,2.0,6.0
50%,1.0,4.0,0.0,7.0,0.7,0.219745,0.403496,3.0,0.143054,0.301003,...,2.0,8.0,3.0,9.0,2.0,8.0,5.0,1.0,3.0,7.0
75%,3.0,6.0,0.0,10.0,0.9,0.344822,0.422603,3.0,0.145734,0.317474,...,3.0,9.0,4.0,10.0,3.0,10.0,7.0,2.0,4.0,9.0
max,7.0,11.0,4.0,13.0,0.9,0.546653,0.587615,3.0,0.167359,0.386528,...,6.0,10.0,9.0,12.0,7.0,25.0,19.0,10.0,13.0,23.0


In [44]:
# Scaler for the numeric features, created with default settings
scaler = StandardScaler()

In [45]:
# Fit on the training data
scaler.fit(train.loc[:, num_features])

In [46]:
# Transform the training data with the fitted scaler
scaled_num = scaler.transform(train[num_features])
train[num_features] = scaled_num
# test[num_features] = scaler.transform(test[num_features])


In [47]:
# Summary statistics of the numeric features after scaling
train.loc[:, num_features].describe()

Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_14,ps_ind_15,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_11,ps_car_12,ps_car_13,...,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14
count,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,...,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0,416648.0
mean,3.588117e-17,4.5294870000000003e-17,-1.21423e-17,7.299022e-17,5.5083740000000004e-17,-6.783998e-17,3.40704e-15,3.536956e-16,1.037423e-14,9.782054000000001e-17,...,1.109008e-16,3.204407e-16,-1.6235210000000003e-17,5.243529e-16,2.506907e-17,-2.783861e-16,8.786112000000001e-17,5.1024940000000005e-17,-1.282957e-16,1.494935e-16
std,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,...,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001
min,-0.9581972,-1.640851,-0.0971481,-2.058176,-2.122338,-1.578729,-5.99708,-2.818951,-11.84658,-4.747169,...,-1.661975,-5.760367,-2.125263,-4.951847,-1.874754,-2.902965,-2.334286,-1.197765,-1.695883,-2.746851
25%,-0.9581972,-0.8994685,-0.0971481,-0.6481917,-0.7323214,-0.5076867,-0.7264554,-0.4161052,-1.224309,-0.6253995,...,-0.7805079,-0.5147167,-0.7114703,-0.8412527,-1.073508,-0.8377258,-0.6196633,-0.3671284,-0.5142651,-0.5604511
50%,-0.4545165,-0.1580861,-0.0971481,-0.08419809,0.3101915,-0.1168956,0.2047083,0.7853177,0.03726058,-0.04282862,...,0.1009593,0.234662,-0.004574106,-0.1561537,-0.2722616,-0.1493127,-0.1910076,-0.3671284,0.07654393,-0.196051
75%,0.552845,0.5832962,-0.0971481,0.7617923,1.0052,0.7151676,0.546453,0.7853177,0.4905591,0.6413456,...,0.9824265,0.9840406,0.7023221,0.5289454,0.5289847,0.5391003,0.6663038,0.4635078,0.667353,0.532749
max,2.567568,2.436752,31.3654,1.607783,1.0052,2.057837,3.497809,0.7853177,4.149262,3.509699,...,3.626828,1.733419,4.236803,1.899143,3.73397,5.702199,5.810172,7.108598,5.984635,5.634349
