## Técnicas de Aprendizado de Máquina: Scaling

### Bibliotecas, dados e pré-processamento

In [1]:
import sys
sys.path.append('../')
from utils.churn_modelling_utils import (
    import_data, 
    preprocess_without_scale, 
    num_cat_cols
)

In [2]:
df = import_data()
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10002 non-null  int64  
 1   Geography        10001 non-null  object 
 2   Gender           10002 non-null  object 
 3   Age              10001 non-null  float64
 4   Tenure           10002 non-null  int64  
 5   Balance          10002 non-null  float64
 6   NumOfProducts    10002 non-null  int64  
 7   HasCrCard        10001 non-null  float64
 8   IsActiveMember   10001 non-null  float64
 9   EstimatedSalary  10002 non-null  float64
 10  Exited           10002 non-null  int64  
dtypes: float64(5), int64(4), object(2)
memory usage: 859.7+ KB


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


In [3]:
X_train, X_test, y_train, y_test = preprocess_without_scale(df)

#### Dados antes do scaling

In [4]:
X_train.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,743,36.0,8,92716.96,1,1.0,1.0,33693.78,False,False,True,True,False
1,596,54.0,1,123544.0,1,1.0,1.0,120314.75,False,True,False,False,True
2,775,42.0,6,133970.22,2,0.0,1.0,187839.9,True,False,False,False,True
3,850,57.0,8,126776.3,2,1.0,1.0,132298.49,False,False,True,True,False
4,776,37.0,2,103769.22,2,1.0,0.0,194099.12,False,True,False,True,False


In [5]:
X_test.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,735,28.0,5,160454.15,2,0.0,1.0,114957.22,False,True,False,False,True
1,523,40.0,2,102967.41,1,1.0,0.0,128702.1,True,False,False,True,False
2,706,42.0,8,95386.82,1,1.0,1.0,75732.25,False,False,True,True,False
3,791,36.0,6,111168.97,1,1.0,1.0,189969.91,True,False,False,False,True
4,706,38.0,5,163034.82,2,1.0,1.0,135662.17,False,True,False,False,True


### Scaling

In [6]:
from scaler import Scaler

In [7]:
num_cols, _ = num_cat_cols(X_train)

#### MinMaxScale

$$x_{i,scaled} = \frac{x_{i} - x_{min}}{x_{max} - x_{min}}$$

In [8]:
scaler = Scaler("minmax")
X_train_min_max, X_test_min_max = scaler.scale(X_train, X_test, num_cols)

In [9]:
X_train_min_max[num_cols].head()

Unnamed: 0,IsActiveMember,Geography_Germany,Age,Geography_France,Gender_Female,NumOfProducts,Gender_Male,HasCrCard,Geography_Spain,CreditScore,EstimatedSalary,Balance,Tenure
0,1.0,0.0,0.257143,0.0,1.0,0.0,0.0,1.0,1.0,0.786,0.168427,0.388934,0.8
1,1.0,1.0,0.514286,0.0,0.0,0.0,1.0,1.0,0.0,0.492,0.601573,0.518249,0.1
2,1.0,0.0,0.342857,1.0,0.0,0.333333,1.0,0.0,0.0,0.85,0.939231,0.561985,0.6
3,1.0,0.0,0.557143,0.0,1.0,0.333333,0.0,1.0,1.0,1.0,0.661498,0.531808,0.8
4,0.0,1.0,0.271429,0.0,1.0,0.333333,0.0,1.0,0.0,0.852,0.97053,0.435296,0.2


In [10]:
X_test_min_max[num_cols].head()

Unnamed: 0,IsActiveMember,Geography_Germany,Age,Geography_France,Gender_Female,NumOfProducts,Gender_Male,HasCrCard,Geography_Spain,CreditScore,EstimatedSalary,Balance,Tenure
0,1.0,1.0,0.142857,0.0,0.0,0.333333,1.0,0.0,0.0,0.77,0.574783,0.673081,0.5
1,0.0,0.0,0.314286,1.0,1.0,0.0,0.0,1.0,0.0,0.346,0.643514,0.431933,0.2
2,1.0,0.0,0.342857,0.0,1.0,0.0,0.0,1.0,1.0,0.712,0.37864,0.400133,0.8
3,1.0,0.0,0.257143,1.0,0.0,0.0,1.0,1.0,0.0,0.882,0.949882,0.466337,0.6
4,1.0,1.0,0.285714,0.0,0.0,0.333333,1.0,1.0,0.0,0.712,0.678318,0.683907,0.5


#### StandardScale

$$x_{i,scaled} = \frac{x_{i} - \bar{x}}{s_{x}}$$

In [11]:
scaler = Scaler("standard")
X_train_std, X_test_std = scaler.scale(X_train, X_test, num_cols)

In [12]:
X_train_std[num_cols].head()

Unnamed: 0,IsActiveMember,Geography_Germany,Age,Geography_France,Gender_Female,NumOfProducts,Gender_Male,HasCrCard,Geography_Spain,CreditScore,EstimatedSalary,Balance,Tenure
0,0.971048,-0.577302,-0.279644,-1.004886,1.095395,-0.914791,-1.095395,0.643897,1.743235,0.960907,-1.156867,0.260781,1.0343
1,0.971048,1.732195,1.427828,-1.004886,-0.912913,-0.914791,0.912913,0.643897,-0.573646,-0.561344,0.350935,0.754247,-1.384509
2,0.971048,-0.577302,0.289513,0.995137,-0.912913,0.815118,0.912913,-1.553042,-0.573646,1.292281,1.526339,0.921145,0.343212
3,0.971048,-0.577302,1.712407,-1.004886,1.095395,0.815118,-1.095395,0.643897,1.743235,2.06894,0.559535,0.805988,1.0343
4,-1.029816,1.732195,-0.184784,-1.004886,1.095395,0.815118,-1.095395,0.643897,-0.573646,1.302637,1.635293,0.437701,-1.038964


In [13]:
X_test_std[num_cols].head()

Unnamed: 0,IsActiveMember,Geography_Germany,Age,Geography_France,Gender_Female,NumOfProducts,Gender_Male,HasCrCard,Geography_Spain,CreditScore,EstimatedSalary,Balance,Tenure
0,0.971048,1.732195,-1.038521,-1.004886,-0.912913,0.815118,0.912913,-1.553042,-0.573646,0.878063,0.257677,1.345088,-0.002332
1,-1.029816,-0.577302,0.099794,0.995137,1.095395,-0.914791,-1.095395,0.643897,-0.573646,-1.317292,0.496933,0.424866,-1.038964
2,0.971048,-0.577302,0.289513,-1.004886,1.095395,-0.914791,-1.095395,0.643897,1.743235,0.577755,-0.425108,0.303519,1.0343
3,0.971048,-0.577302,-0.279644,0.995137,-0.912913,-0.914791,0.912913,0.643897,-0.573646,1.457968,1.563416,0.556153,0.343212
4,0.971048,1.732195,-0.089925,-1.004886,-0.912913,0.815118,0.912913,0.643897,-0.573646,0.577755,0.618087,1.386398,-0.002332


#### RobustScale

$$x_{i,scaled} = \frac{x_{i} - x_{median}}{IQR}$$

IQR = 3rd quartile - 1st quartile = 75th quantile - 25th quantile

In [14]:
scaler = Scaler("robust")
X_train_robust, X_test_robust = scaler.scale(X_train, X_test, num_cols)

In [15]:
X_train_robust[num_cols].head()

Unnamed: 0,IsActiveMember,Geography_Germany,Age,Geography_France,Gender_Female,NumOfProducts,Gender_Male,HasCrCard,Geography_Spain,CreditScore,EstimatedSalary,Balance,Tenure
0,0.0,0.0,-0.083333,-1.0,1.0,0.0,-1.0,0.0,2.316881,0.686567,-0.675673,-0.035155,0.6
1,0.0,2.309497,1.416667,-1.0,0.0,0.0,0.0,0.0,0.0,-0.410448,0.207213,0.206128,-0.8
2,0.0,0.0,0.416667,0.0,0.0,1.0,0.0,-1.0,0.0,0.925373,0.895464,0.287733,0.2
3,0.0,0.0,1.666667,-1.0,1.0,1.0,-1.0,0.0,2.316881,1.485075,0.329357,0.231427,0.6
4,-1.0,2.309497,0.0,-1.0,1.0,1.0,-1.0,0.0,0.0,0.932836,0.959261,0.051351,-0.6


In [16]:
X_test_robust[num_cols].head()

Unnamed: 0,IsActiveMember,Geography_Germany,Age,Geography_France,Gender_Female,NumOfProducts,Gender_Male,HasCrCard,Geography_Spain,CreditScore,EstimatedSalary,Balance,Tenure
0,0.0,2.309497,-0.75,-1.0,0.0,1.0,0.0,-1.0,0.0,0.626866,0.152606,0.495023,0.0
1,-1.0,0.0,0.25,0.0,1.0,0.0,-1.0,0.0,0.0,-0.955224,0.292701,0.045075,-0.6
2,0.0,0.0,0.416667,-1.0,1.0,0.0,-1.0,0.0,2.316881,0.410448,-0.247195,-0.014258,0.6
3,0.0,0.0,-0.083333,0.0,0.0,0.0,0.0,0.0,0.0,1.044776,0.917174,0.109269,0.2
4,0.0,2.309497,0.083333,-1.0,0.0,1.0,0.0,0.0,0.0,0.410448,0.363642,0.515221,0.0
