## Técnicas de Aprendizado de Máquina: Scaling

### Bibliotecas, dados e pré-processamento

In [1]:
import sys
from pathlib import Path

# Make the parent directory importable so the shared `utils` package resolves.
# The path is resolved to an absolute one and added only once, so re-running
# this cell does not keep appending duplicate entries to sys.path, and the
# entry stays valid even if the working directory changes later on.
_parent_dir = str(Path("..").resolve())
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

from utils.churn_modelling_utils import import_data, initial_preprocess, num_cat_cols

In [2]:
# Load the raw dataset through the project helper and inspect it.
df = import_data()
# The info() summary below reports 10002 rows; Geography, Age, HasCrCard and
# IsActiveMember each have a single missing value (10001 non-null).
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10002 non-null  int64  
 1   Geography        10001 non-null  object 
 2   Gender           10002 non-null  object 
 3   Age              10001 non-null  float64
 4   Tenure           10002 non-null  int64  
 5   Balance          10002 non-null  float64
 6   NumOfProducts    10002 non-null  int64  
 7   HasCrCard        10001 non-null  float64
 8   IsActiveMember   10001 non-null  float64
 9   EstimatedSalary  10002 non-null  float64
 10  Exited           10002 non-null  int64  
dtypes: float64(5), int64(4), object(2)
memory usage: 859.7+ KB


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


In [3]:
# Produce train/test features and targets with the project helper.
# The feature previews in the following cells show Geography and Gender
# already expanded into one-hot indicator columns.
# NOTE(review): how the missing values reported by df.info() are handled is
# not visible from this notebook; check initial_preprocess.
X_train, X_test, y_train, y_test = initial_preprocess(df)

#### Dados antes do scaling

In [4]:
# Unscaled training features: the baseline for the scaled previews further down.
X_train.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,514,41.0,7,0.0,2,1.0,1.0,3756.65,True,False,False,True,False
1,620,43.0,2,0.0,2,1.0,0.0,20670.1,False,False,True,True,False
2,597,24.0,1,103219.47,1,1.0,0.0,60420.07,True,False,False,True,False
3,546,27.0,8,0.0,2,1.0,1.0,14858.1,True,False,False,True,False
4,568,32.0,7,169399.6,1,1.0,0.0,61936.22,False,False,True,True,False


In [5]:
# Unscaled test features, shown for comparison with the scaled versions below.
X_test.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,611,35.0,10,0.0,1,1.0,1.0,23598.23,True,False,False,False,True
1,675,69.0,1,0.0,2,1.0,0.0,157097.09,True,False,False,False,True
2,637,36.0,9,166939.88,1,1.0,1.0,72504.76,True,False,False,True,False
3,553,44.0,4,0.0,1,1.0,0.0,10789.3,False,False,True,False,True
4,841,33.0,7,154969.79,2,1.0,1.0,99505.75,False,True,False,True,False


### Scaling

In [6]:
from scaler import Scaler

In [7]:
# Column names used to slice the scaled frames in the preview cells; the
# second return value is discarded.
# NOTE(review): the previews below show that num_cols also contains the
# one-hot indicator columns (Gender_*, Geography_*), so they are displayed
# (and apparently scaled) like numeric features; confirm this is intended.
# NOTE(review): the column order differs from X_train's own order; presumably
# it is built from an unordered collection; verify it is stable across runs.
num_cols, _ = num_cat_cols(X_train)

#### MinMaxScale

$$x_{i,scaled} = \frac{x_{i} - x_{min}}{x_{max} - x_{min}}$$

In [8]:
# Min-max scaling: both splits are transformed by a single scale() call.
# NOTE(review): presumably the scaler is fitted on X_train only and then
# applied to X_test; confirm in the scaler module.
scaler = Scaler("minmax")
X_train_min_max, X_test_min_max = scaler.scale(X_train, X_test)

In [9]:
# Min-max-scaled training preview: every value shown lies in [0, 1], and the
# indicator columns appear as 0.0 / 1.0.
X_train_min_max[num_cols].head()

Unnamed: 0,Gender_Female,Geography_Spain,CreditScore,Tenure,EstimatedSalary,IsActiveMember,Geography_France,Gender_Male,Geography_Germany,HasCrCard,NumOfProducts,Age,Balance
0,1.0,0.0,0.328,0.7,0.018342,1.0,1.0,0.0,0.0,1.0,0.333333,0.328571,0.0
1,1.0,1.0,0.54,0.2,0.10295,0.0,0.0,0.0,0.0,1.0,0.333333,0.357143,0.0
2,1.0,0.0,0.494,0.1,0.301797,0.0,1.0,0.0,0.0,1.0,0.0,0.085714,0.4114
3,1.0,0.0,0.392,0.8,0.073876,1.0,1.0,0.0,0.0,1.0,0.333333,0.128571,0.0
4,1.0,1.0,0.436,0.7,0.309382,0.0,0.0,0.0,0.0,1.0,0.0,0.2,0.675173


In [10]:
# Min-max-scaled test preview; the rows shown also fall inside [0, 1].
X_test_min_max[num_cols].head()

Unnamed: 0,Gender_Female,Geography_Spain,CreditScore,Tenure,EstimatedSalary,IsActiveMember,Geography_France,Gender_Male,Geography_Germany,HasCrCard,NumOfProducts,Age,Balance
0,0.0,0.0,0.522,1.0,0.117598,1.0,1.0,1.0,0.0,1.0,0.0,0.242857,0.0
1,0.0,0.0,0.65,0.1,0.785418,0.0,1.0,1.0,0.0,1.0,0.333333,0.728571,0.0
2,1.0,0.0,0.574,0.9,0.36225,1.0,1.0,0.0,0.0,1.0,0.0,0.257143,0.665369
3,0.0,1.0,0.406,0.4,0.053522,0.0,0.0,1.0,0.0,1.0,0.0,0.371429,0.0
4,1.0,0.0,0.982,0.7,0.497321,1.0,0.0,0.0,1.0,1.0,0.333333,0.214286,0.61766


#### StandardScale

$$x_{i,scaled} = \frac{x_{i} - \bar{x}}{s_{x}}$$

In [11]:
# Standard scaling (centre on the mean, divide by the standard deviation).
# Note that `scaler` is rebound here, replacing the min-max instance above.
# NOTE(review): presumably fitted on X_train only and then applied to X_test;
# confirm in the scaler module.
scaler = Scaler("standard")
X_train_std, X_test_std = scaler.scale(X_train, X_test)

In [12]:
# Standardized training preview. The indicator columns are standardized as
# well (e.g. Gender_Female shows 1.087149 instead of 1).
X_train_std[num_cols].head()

Unnamed: 0,Gender_Female,Geography_Spain,CreditScore,Tenure,EstimatedSalary,IsActiveMember,Geography_France,Gender_Male,Geography_Germany,HasCrCard,NumOfProducts,Age,Balance
0,1.087149,-0.573838,-1.412599,0.688627,-1.662388,0.973723,1.005389,-1.087149,-0.585,0.639433,0.807034,0.199088,-1.233772
1,1.087149,1.742651,-0.314003,-1.034556,-1.368849,-1.026986,-0.99464,-1.087149,-0.585,0.639433,0.807034,0.390004,-1.233772
2,1.087149,-0.573838,-0.552377,-1.379193,-0.678975,-1.026986,1.005389,-1.087149,-0.585,0.639433,-0.904357,-1.423696,0.427363
3,1.087149,-0.573838,-1.080947,1.033264,-1.469718,0.973723,1.005389,-1.087149,-0.585,0.639433,0.807034,-1.137322,-1.233772
4,1.087149,1.742651,-0.852937,0.688627,-0.652661,-1.026986,-0.99464,-1.087149,-0.585,0.639433,-0.904357,-0.660033,1.492414


In [13]:
# Standardized test preview; constant-valued columns in the shown rows carry
# the same scaled values as in the training preview (e.g. HasCrCard 0.639433).
X_test_std[num_cols].head()

Unnamed: 0,Gender_Female,Geography_Spain,CreditScore,Tenure,EstimatedSalary,IsActiveMember,Geography_France,Gender_Male,Geography_Germany,HasCrCard,NumOfProducts,Age,Balance
0,-0.919837,-0.573838,-0.40728,1.722537,-1.31803,0.973723,1.005389,0.919837,-0.585,0.639433,-0.904357,-0.373659,-1.233772
1,-0.919837,-0.573838,0.256024,-1.379193,0.998888,-1.026986,1.005389,0.919837,-0.585,0.639433,0.807034,2.871908,-1.233772
2,1.087149,-0.573838,-0.137813,1.377901,-0.469241,0.973723,1.005389,-1.087149,-0.585,0.639433,-0.904357,-0.278201,1.45283
3,-0.919837,1.742651,-1.008399,-0.345283,-1.540333,-1.026986,-0.99464,0.919837,-0.585,0.639433,-0.904357,0.485461,-1.233772
4,1.087149,-0.573838,1.976468,0.688627,-0.00063,0.973723,-0.99464,-1.087149,1.709403,0.639433,0.807034,-0.564575,1.260192


#### RobustScale

$$x_{i,scaled} = \frac{x_{i} - x_{median}}{IQR}$$

IQR = 3rd quartile - 1st quartile = 75th percentile - 25th percentile

In [14]:
# Robust scaling (centre on the median, divide by the interquartile range).
# Note that `scaler` is rebound again, replacing the standard-scaling instance.
# NOTE(review): presumably fitted on X_train only and then applied to X_test;
# confirm in the scaler module.
scaler = Scaler("robust")
X_train_robust, X_test_robust = scaler.scale(X_train, X_test)

In [15]:
# Robust-scaled training preview.
# NOTE(review): Geography_Spain maps 1 -> 2.316489 while the other indicator
# columns map to 0 / +-1. That value equals the gap between its two
# standard-scaled values (1.742651 - (-0.573838)), which looks like a fallback
# to the standard deviation when the IQR is zero; confirm in the scaler module.
X_train_robust[num_cols].head()

Unnamed: 0,Gender_Female,Geography_Spain,CreditScore,Tenure,EstimatedSalary,IsActiveMember,Geography_France,Gender_Male,Geography_Germany,HasCrCard,NumOfProducts,Age,Balance
0,1.0,0.0,-1.037594,0.333333,-0.962692,0.0,1.0,-1.0,0.0,0.0,1.0,0.333333,-0.761244
1,1.0,2.316489,-0.240602,-0.5,-0.791579,-1.0,0.0,-1.0,0.0,0.0,1.0,0.5,-0.761244
2,1.0,0.0,-0.413534,-0.666667,-0.389429,-1.0,1.0,-1.0,0.0,0.0,0.0,-1.083333,0.048089
3,1.0,0.0,-0.796992,0.5,-0.850379,0.0,1.0,-1.0,0.0,0.0,1.0,-0.833333,-0.761244
4,1.0,2.316489,-0.631579,0.333333,-0.37409,-1.0,0.0,-1.0,0.0,0.0,0.0,-0.416667,0.567


In [16]:
# Robust-scaled test preview; Geography_Spain shows the same 2.316489 value
# for Spanish customers as in the training preview.
X_test_robust[num_cols].head()

Unnamed: 0,Gender_Female,Geography_Spain,CreditScore,Tenure,EstimatedSalary,IsActiveMember,Geography_France,Gender_Male,Geography_Germany,HasCrCard,NumOfProducts,Age,Balance
0,0.0,0.0,-0.308271,0.833333,-0.761955,0.0,1.0,0.0,0.0,0.0,0.0,-0.166667,-0.761244
1,0.0,0.0,0.172932,-0.666667,0.588651,-1.0,1.0,0.0,0.0,0.0,1.0,2.666667,-0.761244
2,1.0,0.0,-0.112782,0.666667,-0.267169,0.0,1.0,-1.0,0.0,0.0,0.0,-0.083333,0.547714
3,0.0,2.316489,-0.744361,-0.166667,-0.891543,-1.0,0.0,0.0,0.0,0.0,0.0,0.583333,-0.761244
4,1.0,0.0,1.421053,0.333333,0.006,0.0,0.0,-1.0,1.0,0.0,1.0,-0.333333,0.453858
