In [1]:
!pip install -r requirements.txt



### 1.Import Dependencies


In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

### 2. Basic Processing

In [3]:
df = pd.read_csv('EDA/data/processed/FeautureEncoding_Applied.csv')
df.head(5)

Unnamed: 0,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,42.0,2,0.0,1,1,1,101348.88,1,1,True,False,False,True,False
1,41.0,1,83807.86,1,0,1,112542.58,0,1,False,False,True,True,False
2,42.0,8,159660.8,3,1,0,113931.57,1,0,True,False,False,True,False
3,38.91,1,0.0,2,0,0,93826.63,0,2,True,False,False,True,False
4,43.0,2,125510.82,1,1,1,79084.1,0,4,False,False,True,True,False


### 3. Important Concepts

#### 2.1.1 What is Normalization?

Normalization is a scaling technique in which values are shifted and rescaled so that they end up ranging between 0 and 1. It is also known as Min-Max scaling.

#### 2.1.2 What is Standardization?

Standardization is another scaling technique where the values are centered around the mean with aunit standard deviation. This means that the mean of the attribute becomes zero and the resultant distibution has a unit standard deviation

| Condition | Min-Max Scaling | Standardization (Z-score) |
|-----------|----------------|------------------------|
| Data has a known, fixed range | ✅ Yes | ❌ Not ideal |
| Data contains outliers | ❌ Sensitive to outliers | ✅ More robust to outliers |
| Data is normally distributed | ❌ Not necessary | ✅ Preferred |
| Data is not normally distributed (e.g., skewed) | ✅ If shape needs to be preserved | ✅ Often works well after log-transform |
| Model is distance-based (KNN, SVM) | ✅ Recommended | ✅ Also acceptable |
| Model is neural network | ✅ Strongly recommended | ❌ May slow training |
| Model is linear or uses regularization | ❌ Not ideal | ✅ Helps with convergence |
| Input features need bounded values (0–1) | ✅ Required | ❌ Not bounded |
| Applying PCA or LDA | ❌ May distort variance | ✅ Required (centering needed) |
| Want to preserve original distribution shape | ✅ Maintains feature shape | ✅ Maintains shape but centers data |
| Working with tree-based models | ❌ Not needed | ❌ Not needed |

In [4]:
Columns_need_to_be_scaled = ['Age', 'Tenure', 'Balance', 'EstimatedSalary']

for col in Columns_need_to_be_scaled:
 scalar = StandardScaler()
 #scalar = MinMaxScaler()
df[col] = scalar.fit_transform(df[col].values.reshape(10000, 1))

df


Unnamed: 0,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,42.00,2,0.00,1,1,1,0.021886,1,1,True,False,False,True,False
1,41.00,1,83807.86,1,0,1,0.216534,0,1,False,False,True,True,False
2,42.00,8,159660.80,3,1,0,0.240687,1,0,True,False,False,True,False
3,38.91,1,0.00,2,0,0,-0.108918,0,2,True,False,False,True,False
4,43.00,2,125510.82,1,1,1,-0.365276,0,4,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,39.00,5,0.00,2,1,0,-0.066419,0,3,True,False,False,False,True
9996,35.00,10,57369.61,1,1,1,0.027988,0,0,True,False,False,False,True
9997,36.00,7,0.00,1,0,1,-1.008643,1,2,True,False,False,True,False
9998,42.00,3,75075.31,2,1,0,-0.125231,1,3,False,True,False,False,True


In [6]:
df.to_csv(
                  'EDA/data/processed/ChurnModelling_Final.csv',
                  index= False
                )