## Técnicas de Aprendizado de Máquina: Scaling

### Bibliotecas, dados e pré-processamento

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
from utils.churn_modelling_utils import import_data, initial_preprocess, num_cat_cols

In [2]:
df = import_data()
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10002 non-null  int64  
 1   Geography        10001 non-null  object 
 2   Gender           10002 non-null  object 
 3   Age              10001 non-null  float64
 4   Tenure           10002 non-null  int64  
 5   Balance          10002 non-null  float64
 6   NumOfProducts    10002 non-null  int64  
 7   HasCrCard        10001 non-null  float64
 8   IsActiveMember   10001 non-null  float64
 9   EstimatedSalary  10002 non-null  float64
 10  Exited           10002 non-null  int64  
dtypes: float64(5), int64(4), object(2)
memory usage: 859.7+ KB


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


In [3]:
X_train, X_test, y_train, y_test = initial_preprocess(df)

### Scaling

In [4]:
from scaler import Scaler

In [5]:
num_cols, _ = num_cat_cols(X_train)

#### MinMaxScale

$$x_{i,scaled} = \frac{x_{i} - x_{min}}{x_{max} - x_{min}}$$

In [6]:
scaler = Scaler("minmax")
X_train, X_test = scaler.scale(X_train, X_test)
X_train[num_cols].head()

Unnamed: 0,Gender_Female,Tenure,Geography_Germany,IsActiveMember,Geography_France,CreditScore,Balance,NumOfProducts,Gender_Male,HasCrCard,Geography_Spain,Age,EstimatedSalary
0,1.0,1.0,1.0,1.0,0.0,0.31,0.488063,0.333333,0.0,1.0,0.0,0.271429,0.263437
1,0.0,0.3,0.0,1.0,1.0,0.418,0.562377,0.0,1.0,1.0,0.0,0.142857,0.077986
2,1.0,0.8,0.0,1.0,1.0,0.616,0.0,0.333333,0.0,0.0,0.0,0.157143,0.65231
3,0.0,0.7,0.0,0.0,1.0,0.466,0.0,0.333333,1.0,1.0,0.0,0.128571,0.256394
4,0.0,0.3,0.0,1.0,1.0,0.84,0.61797,0.333333,1.0,1.0,0.0,0.185714,0.930353


#### StandardScale

$$x_{i,scaled} = \frac{x_{i} - \bar{x}}{s_{x}}$$

In [7]:
scaler = Scaler("standard")
X_train, X_test = scaler.scale(X_train, X_test)
X_train[num_cols].head()

Unnamed: 0,Gender_Female,Tenure,Geography_Germany,IsActiveMember,Geography_France,CreditScore,Balance,NumOfProducts,Gender_Male,HasCrCard,Geography_Spain,Age,EstimatedSalary
0,1.09816,1.728965,1.732773,0.97202,-1.004384,-1.503588,0.74203,0.8161,-1.09816,0.644674,-0.574223,-0.188549,-0.825014
1,-0.910614,-0.694312,-0.57711,0.97202,0.995635,-0.943643,1.040751,-0.909029,0.910614,0.644674,-0.574223,-1.044471,-1.469472
2,1.09816,1.0366,-0.57711,0.97202,0.995635,0.082922,-1.219821,0.8161,-1.09816,-1.551171,-0.574223,-0.949369,0.52635
3,-0.910614,0.690418,-0.57711,-1.028786,0.995635,-0.694779,-1.219821,0.8161,0.910614,0.644674,-0.574223,-1.139574,-0.849489
4,-0.910614,-0.694312,-0.57711,0.97202,0.995635,1.24429,1.264216,0.8161,0.910614,0.644674,-0.574223,-0.759164,1.49257


#### RobustScale

$$x_{i,scaled} = \frac{x_{i} - x_{median}}{IQR}$$

IQR = 3rd quartile - 1st quartile = 75th quantile - 25th quantile

In [8]:
scaler = Scaler("robust")
X_train, X_test = scaler.scale(X_train, X_test)
X_train[num_cols].head()

Unnamed: 0,Gender_Female,Tenure,Geography_Germany,IsActiveMember,Geography_France,CreditScore,Balance,NumOfProducts,Gender_Male,HasCrCard,Geography_Spain,Age,EstimatedSalary
0,1.0,1.0,2.309882,0.0,-1.0,-1.105263,0.200529,1.0,-1.0,0.0,0.0,0.0,-0.48496
1,0.0,-0.4,0.0,0.0,0.0,-0.699248,0.346797,0.0,0.0,0.0,0.0,-0.75,-0.862727
2,1.0,0.6,0.0,0.0,0.0,0.045113,-0.760081,1.0,-1.0,-1.0,0.0,-0.666667,0.307181
3,0.0,0.4,0.0,-1.0,0.0,-0.518797,-0.760081,1.0,0.0,0.0,0.0,-0.833333,-0.499307
4,0.0,-0.4,0.0,0.0,0.0,0.887218,0.456215,1.0,0.0,0.0,0.0,-0.5,0.873559
