### Import Dependencies

In [13]:
import os
import seaborn as sns
import numpy as np  # alias
import pandas as pd  # alias
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

### 2. Important Concepts

#### 2.1 Normalization vs Standardization

#### 2.1.1 What is Normalization?<br/>
Normalization is a scaling technique in which values are shifted and rescaled so that they end up ranging between 0 and 1. It is also known as Min–Max scaling.<br/>
#### 2.1.2 What is Standardization?<br/>
Standardization is another scaling technique where the values are centered around the mean with a unit standard deviation. This means that the mean of the attribute becomes zero and the resultant distribution has a unit standard deviation.

### 3. Basic Processing

In [14]:
# Read the processed churn CSV into `df` and preview its first ten rows.
encoded_csv_path = 'data/processed/ChurnModeling_Encoded.csv'
df = pd.read_csv(encoded_csv_path)
df.head(n=10)

Unnamed: 0,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins
0,France,Female,42.0,2,0.0,1,1,1,101348.88,1,1
1,Spain,Female,41.0,1,83807.86,1,0,1,112542.58,0,1
2,France,Female,42.0,8,159660.8,3,1,0,113931.57,1,0
3,France,Female,38.91,1,0.0,2,0,0,93826.63,0,2
4,Spain,Female,43.0,2,125510.82,1,1,1,79084.1,0,4
5,Spain,Male,44.0,8,113755.78,2,1,0,149756.71,1,1
6,France,Male,50.0,7,0.0,2,1,1,10062.8,0,4
7,Germany,Female,29.0,4,115046.74,4,1,0,119346.88,1,0
8,France,Male,44.0,4,142051.07,2,0,1,74940.5,0,0
9,France,Male,27.0,2,134603.88,1,1,1,71725.73,0,2


| Condition | Min-Max Scaling | Standardization (Z-score) |
|----------|----------------|---------------------------|
| Data has a known, fixed range | ✅ Yes | ❌ Not ideal |
| Data contains outliers | ❌ Very sensitive to outliers (min/max are set by the extremes) | ✅ Less sensitive than Min-Max, though mean and std are still affected |
| Data is normally distributed | ❌ Not necessary | ✅ Preferred |
| Data is not normally distributed (e.g., skewed) | ✅ If shape needs to be preserved | ✅ Often works well after log-transform |
| Model is distance-based (KNN, SVM) | ✅ Recommended | ✅ Also acceptable |
| Model is neural network | ✅ Recommended (bounded inputs) | ✅ Also recommended (zero-centered inputs help optimization) |
| Model is linear or uses regularization | ❌ Not ideal | ✅ Helps with convergence |
| Input features need bounded values (0–1) | ✅ Required | ❌ Not bounded |
| Applying PCA or LDA | ❌ May distort variance | ✅ Required (centering needed) |
| Want to preserve original distribution shape | ✅ Maintains feature shape | ✅ Maintains shape but centers data |
| Working with tree-based models | ❌ Not needed | ❌ Not needed |


In [16]:
# Continuous columns to standardize with a z-score (mean 0, unit variance).
column_need_to_be__scaled = ['Age', 'Tenure', 'Balance', 'EstimatedSalary']

for col in column_need_to_be__scaled:
    # A fresh scaler per column: each feature is fitted and scaled independently.
    standard_scalar = StandardScaler()
    # StandardScaler expects 2-D input of shape (n_samples, n_features).
    # reshape(-1, 1) infers the row count from the data instead of hard-coding
    # 10000, so the cell keeps working if the dataset size changes.
    # ravel() flattens the (n, 1) result back to 1-D before column assignment.
    df[col] = standard_scalar.fit_transform(df[col].values.reshape(-1, 1)).ravel()
df

Unnamed: 0,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins
0,France,Female,0.302983,-1.041760,-1.225848,1,1,1,0.021886,1,1
1,Spain,Female,0.204867,-1.387538,0.117350,1,0,1,0.216534,0,1
2,France,Female,0.302983,1.032908,1.333053,3,1,0,0.240687,1,0
3,France,Female,-0.000196,-1.387538,-1.225848,2,0,0,-0.108918,0,2
4,Spain,Female,0.401100,-1.041760,0.785728,1,1,1,-0.365276,0,4
...,...,...,...,...,...,...,...,...,...,...,...
9995,France,Male,0.008634,-0.004426,-1.225848,2,1,0,-0.066419,0,3
9996,France,Male,-0.383831,1.724464,-0.306379,1,1,1,0.027988,0,0
9997,France,Female,-0.285715,0.687130,-1.225848,1,0,1,-1.008643,1,2
9998,Germany,Male,0.302983,-0.695982,-0.022608,2,1,0,-0.125231,1,3


In [17]:
# Write the scaled frame to disk; index=False keeps the row index out of the file.
final_csv_path = 'data/processed/ChurnModeling_Final.csv'
df.to_csv(final_csv_path, index=False)