## Machine Learning Tecniques: Scaling

### Libraries, data and preprocessing

In [1]:
from churn_modelling_utils import (
    import_data, 
    preprocess_without_scale, 
    num_cat_cols
)

In [2]:
df = import_data()
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10002 non-null  int64  
 1   Geography        10001 non-null  object 
 2   Gender           10002 non-null  object 
 3   Age              10001 non-null  float64
 4   Tenure           10002 non-null  int64  
 5   Balance          10002 non-null  float64
 6   NumOfProducts    10002 non-null  int64  
 7   HasCrCard        10001 non-null  float64
 8   IsActiveMember   10001 non-null  float64
 9   EstimatedSalary  10002 non-null  float64
 10  Exited           10002 non-null  int64  
dtypes: float64(5), int64(4), object(2)
memory usage: 859.7+ KB


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


In [3]:
X_train, X_test, y_train, y_test = preprocess_without_scale(df)

#### Data before scaling

In [4]:
X_train.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,619,42.0,2,0.0,1,1.0,1.0,101348.88,True,False,False,True,False
1,608,41.0,1,83807.86,1,0.0,1.0,112542.58,False,False,True,True,False
2,502,42.0,8,159660.8,3,1.0,0.0,113931.57,True,False,False,True,False
3,699,39.0,1,0.0,2,0.0,0.0,93826.63,True,False,False,True,False
4,850,43.0,2,125510.82,1,1.0,1.0,79084.1,False,False,True,True,False


In [5]:
X_test.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,707,32.0,9,0.0,2,1.0,0.0,126475.79,False,False,True,False,True
1,590,37.0,1,0.0,2,0.0,0.0,133535.99,False,False,True,False,True
2,603,57.0,6,105000.85,2,1.0,1.0,87412.24,False,False,True,False,True
3,615,45.0,5,0.0,2,1.0,1.0,164886.64,True,False,False,False,True
4,634,36.0,1,69518.95,1,1.0,0.0,116238.39,True,False,False,True,False


### Scaling

In [6]:
from scaler import Scaler

In [7]:
num_cols, _ = num_cat_cols(X_train)

#### MinMaxScale

$$x_{i,scaled} = \frac{x_{i} - x_{min}}{x_{max} - x_{min}}$$

In [8]:
scaler = Scaler("minmax")
X_train_min_max, X_test_min_max = scaler.scale(X_train, X_test, num_cols)

In [9]:
X_train_min_max[num_cols].head()

Unnamed: 0,Balance,NumOfProducts,Geography_Spain,Age,Geography_Germany,HasCrCard,Gender_Female,EstimatedSalary,Gender_Male,IsActiveMember,CreditScore,Tenure,Geography_France
0,0.0,0.0,0.0,0.324324,0.0,1.0,1.0,0.506735,0.0,1.0,0.538,0.2,1.0
1,0.334031,0.0,1.0,0.310811,0.0,0.0,1.0,0.562709,0.0,1.0,0.516,0.1,0.0
2,0.636357,0.666667,0.0,0.324324,0.0,1.0,1.0,0.569654,0.0,0.0,0.304,0.8,1.0
3,0.0,0.333333,0.0,0.283784,0.0,0.0,1.0,0.46912,0.0,0.0,0.698,0.1,1.0
4,0.500246,0.0,1.0,0.337838,0.0,1.0,1.0,0.3954,0.0,1.0,1.0,0.2,0.0


In [10]:
X_test_min_max[num_cols].head()

Unnamed: 0,Balance,NumOfProducts,Geography_Spain,Age,Geography_Germany,HasCrCard,Gender_Female,EstimatedSalary,Gender_Male,IsActiveMember,CreditScore,Tenure,Geography_France
0,0.0,0.333333,1.0,0.189189,0.0,1.0,0.0,0.632381,1.0,0.0,0.714,0.9,0.0
1,0.0,0.333333,1.0,0.256757,0.0,0.0,0.0,0.667686,1.0,0.0,0.48,0.1,0.0
2,0.4185,0.333333,1.0,0.527027,0.0,1.0,0.0,0.437045,1.0,1.0,0.506,0.6,0.0
3,0.0,0.333333,0.0,0.364865,0.0,1.0,0.0,0.824454,1.0,1.0,0.53,0.5,1.0
4,0.27708,0.0,0.0,0.243243,0.0,1.0,1.0,0.58119,0.0,0.0,0.568,0.1,1.0


#### StandardScale

$$x_{i,scaled} = \frac{x_{i} - \bar{x}}{s_{x}}$$

In [11]:
scaler = Scaler("standard")
X_train_std, X_test_std = scaler.scale(X_train, X_test, num_cols)

In [12]:
X_train_std[num_cols].head()

Unnamed: 0,Balance,NumOfProducts,Geography_Spain,Age,Geography_Germany,HasCrCard,Gender_Female,EstimatedSalary,Gender_Male,IsActiveMember,CreditScore,Tenure,Geography_France
0,-1.231136,-0.905678,-0.57634,0.291282,-0.576532,0.652064,1.089889,0.019822,-1.089889,0.974942,-0.322101,-1.044697,0.997628
1,0.112282,-0.905678,1.735087,0.196126,-0.576532,-1.533593,1.089889,0.214414,-1.089889,0.974942,-0.435693,-1.391412,-1.002378
2,1.328184,2.524539,-0.57634,0.291282,-0.576532,0.652064,1.089889,0.238561,-1.089889,-1.025702,-1.530301,1.035596,0.997628
3,-1.231136,0.80943,-0.57634,0.005815,-0.576532,-1.533593,1.089889,-0.110946,-1.089889,-1.025702,0.504019,-1.391412,0.997628
4,0.780769,-0.905678,1.735087,0.386437,-0.576532,0.652064,1.089889,-0.367231,-1.089889,0.974942,2.06332,-1.044697,-1.002378


In [13]:
X_test_std[num_cols].head()

Unnamed: 0,Balance,NumOfProducts,Geography_Spain,Age,Geography_Germany,HasCrCard,Gender_Female,EstimatedSalary,Gender_Male,IsActiveMember,CreditScore,Tenure,Geography_France
0,-1.231136,0.80943,1.735087,-0.660273,-0.576532,0.652064,-0.917524,0.45663,0.917524,-1.025702,0.58663,1.382312,-1.002378
1,-1.231136,0.80943,1.735087,-0.184496,-0.576532,-1.533593,-0.917524,0.579366,0.917524,-1.025702,-0.62157,-1.391412,-1.002378
2,0.452,0.80943,1.735087,1.718614,-0.576532,0.652064,-0.917524,-0.222454,0.917524,0.974942,-0.487325,0.342165,-1.002378
3,-1.231136,0.80943,-0.57634,0.576748,-0.576532,0.652064,-0.917524,1.124368,0.917524,0.974942,-0.363407,-0.00455,0.997628
4,-0.116766,-0.905678,-0.57634,-0.279651,-0.576532,0.652064,1.089889,0.278663,-1.089889,-1.025702,-0.167204,-1.391412,0.997628


#### RobustScale

$$x_{i,scaled} = \frac{x_{i} - x_{median}}{IQR}$$

IQR = 3rd quartile - 1st quartile = 75th quantile - 25th quantile

In [14]:
scaler = Scaler("robust")
X_train_robust, X_test_robust = scaler.scale(X_train, X_test, num_cols)

In [15]:
X_train_robust[num_cols].head()

Unnamed: 0,Balance,NumOfProducts,Geography_Spain,Age,Geography_Germany,HasCrCard,Gender_Female,EstimatedSalary,Gender_Male,IsActiveMember,CreditScore,Tenure,Geography_France
0,-0.764044,0.0,0.0,0.416667,0.0,0.0,1.0,0.01065,-1.0,0.0,-0.238806,-0.75,0.0
1,-0.108394,0.0,2.311427,0.333333,0.0,-1.0,1.0,0.12476,-1.0,0.0,-0.320896,-1.0,-1.0
2,0.485024,2.0,0.0,0.416667,0.0,0.0,1.0,0.13892,-1.0,-1.0,-1.11194,0.75,0.0
3,-0.764044,1.0,0.0,0.166667,0.0,-1.0,1.0,-0.066032,-1.0,-1.0,0.358209,-1.0,0.0
4,0.21786,0.0,2.311427,0.5,0.0,0.0,1.0,-0.216319,-1.0,0.0,1.485075,-0.75,-1.0


In [16]:
X_test_robust[num_cols].head()

Unnamed: 0,Balance,NumOfProducts,Geography_Spain,Age,Geography_Germany,HasCrCard,Gender_Female,EstimatedSalary,Gender_Male,IsActiveMember,CreditScore,Tenure,Geography_France
0,-0.764044,1.0,2.311427,-0.416667,0.0,0.0,0.0,0.266797,0.0,-1.0,0.41791,1.0,-1.0
1,-0.764044,1.0,2.311427,0.0,0.0,-1.0,0.0,0.338769,0.0,-1.0,-0.455224,-1.0,-1.0
2,0.057405,1.0,2.311427,1.666667,0.0,0.0,0.0,-0.131421,0.0,0.0,-0.358209,0.25,-1.0
3,-0.764044,1.0,0.0,0.666667,0.0,0.0,0.0,0.658361,0.0,0.0,-0.268657,0.0,0.0
4,-0.22018,0.0,0.0,-0.083333,0.0,0.0,1.0,0.162436,-1.0,-1.0,-0.126866,-1.0,0.0
