In [1]:
import numpy as np , pandas as pd , seaborn as sns , matplotlib.pyplot as plt

In [2]:
# Load the interim loan dataset; the path is relative to the notebook's working directory.
raw_data = pd.read_csv("../data/interim/loan.csv")

In [3]:
# Peek at 10 random rows. A fixed seed keeps the displayed sample identical
# across "Restart Kernel -> Run All" runs (42 matches the seed used for the
# train/validation/test split later in the notebook).
raw_data.sample(10, random_state=42)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
225,Male,Yes,0,Graduate,No,3250,0.0,170.0,360.0,1.0,Rural,N
222,Male,No,0,Graduate,No,2971,2791.0,144.0,360.0,1.0,Semiurban,Y
552,Male,Yes,1,Graduate,No,3333,3250.0,158.0,360.0,1.0,Urban,Y
339,Female,No,0,Graduate,No,4160,0.0,71.0,360.0,1.0,Semiurban,Y
73,Male,Yes,3+,Not Graduate,No,4755,0.0,95.0,,0.0,Semiurban,N
531,Male,Yes,3+,Graduate,No,4281,0.0,100.0,360.0,1.0,Urban,Y
410,Female,No,1,Not Graduate,Yes,3867,0.0,62.0,360.0,1.0,Semiurban,N
549,Male,Yes,0,Graduate,No,2785,2016.0,110.0,360.0,1.0,Rural,Y
6,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
529,Male,No,0,Not Graduate,No,6783,0.0,130.0,360.0,1.0,Semiurban,Y


In [31]:
# Names of the columns that contain at least one missing value, in column order.
raw_data.isna().any().loc[lambda has_missing: has_missing].index.to_list()

['Gender',
 'Married',
 'Dependents',
 'Self_Employed',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History']

In [32]:
# Summarize each column's dtype and non-null count; the dtypes shown here drive
# the categorical/numerical column split in the following cells.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [33]:
# One boolean per column: True where the column is stored with object dtype.
categorical_condition = [dtype == object for dtype in raw_data.dtypes]

# Keep the names of the object-dtype columns; these are treated as categorical.
categorical_cols = [
    column
    for column, is_object in zip(raw_data.columns, categorical_condition)
    if is_object
]

# Loan_Status is used as the target (y) further down, so it must not be
# encoded as an input feature.
categorical_cols.remove("Loan_Status")

categorical_cols

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area']

In [34]:
# Select columns by *numeric* dtype rather than by "anything that is not object":
# the negative test would also classify bool, category, datetime or dedicated
# string dtypes as numerical and feed them to the numeric imputer.
numerical_cols = raw_data.select_dtypes(include="number").columns.to_list()

# Boolean mask aligned with raw_data.columns, kept so the name stays available
# to any later code that relied on it.
numerical_condition = raw_data.columns.isin(numerical_cols).tolist()

numerical_cols

['ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History']

In [35]:
from sklearn.impute import SimpleImputer , KNNImputer
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
)

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest , chi2

## Preprocessing Data with and without Normalizing


<h2>The main benefits of normalizing numerical data</h2>
 
 
 
<h3>1- Improves model training stability:</h3>
 
Many machine learning algorithms work better when numerical features are on a similar scale.
 
Normalization prevents large-valued features from dominating the learning process.
 
 
<h3>2- Speeds up convergence:</h3>
 
Gradient-based models (like neural networks, logistic regression, SVM) converge faster when input features are scaled.
 
This means training takes fewer iterations.

 
<h3>3- Improves accuracy for distance-based models:</h3>
 
Algorithms like K-Nearest Neighbors or K-Means use distances between points.
Without normalization, features with large ranges overpower others, leading to biased results.
 
 
<h3>4- Makes weights more interpretable:</h3>
 
In linear models trained on normalized features, the magnitudes of the learned coefficients (weights) can be compared with one another as a rough indication of feature importance.
 
 
<h3>5- Reduces numerical instability:</h3>
 
Some optimization algorithms are sensitive to differences in magnitude, which can cause overflow/underflow errors.
Scaling helps avoid these issues.
 

In [36]:
# Numeric branch, normalized variant: impute missing values from the 3 nearest
# neighbours, then rescale with MinMaxScaler (default settings).
# NOTE(review): imputation runs before scaling, so neighbours are searched on
# the unscaled values in both variants. That keeps the imputed values identical
# between the two variants, but large-valued columns dominate the distance.
num_pipeline_norm = Pipeline(
    [
        ("knn_imputer", KNNImputer(n_neighbors=3)),
        ("scale", MinMaxScaler()),
    ]
)

# Numeric branch, non-normalized variant: same imputation, no rescaling.
num_pipeline = Pipeline([("knn_imputer", KNNImputer(n_neighbors=3))])

# Categorical branch (shared by both variants): fill gaps with the most
# frequent value, then one-hot encode into a dense integer array. Categories
# not seen during fit are ignored at transform time instead of raising.
ohe_cat_pipeline = Pipeline(
    [
        ("simple_imputer", SimpleImputer(strategy="most_frequent")),
        (
            "one-hot",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype="int"),
        ),
    ]
)

In [37]:
# Column-wise preprocessing, normalized variant. Numeric columns go through
# imputation + scaling, categorical columns through imputation + one-hot
# encoding; any column not listed is dropped.
col_trans_norm = ColumnTransformer(
    [
        ("num_pipeline_norm", num_pipeline_norm, numerical_cols),
        ("ohe_cat_pipeline", ohe_cat_pipeline, categorical_cols),
    ],
    remainder="drop",
    n_jobs=-1,
)

# Same layout, but the numeric branch skips the scaling step.
col_trans = ColumnTransformer(
    [
        ("num_pipeline", num_pipeline, numerical_cols),
        ("ohe_cat_pipeline", ohe_cat_pipeline, categorical_cols),
    ],
    remainder="drop",
    n_jobs=-1,
)

# Univariate feature selection scored with chi2, keeping the 20 best columns.
# NOTE(review): the transformed frames displayed later have exactly 20 columns,
# so with this dataset k=20 presumably keeps every column - confirm whether a
# smaller k was intended.
featurer_selector = Pipeline([("feat_selector", SelectKBest(chi2, k=20))])


In [38]:
from sklearn.base import clone

# Full preprocessing for the normalized variant: column-wise transforms
# followed by feature selection.
#
# Each pipeline receives its own unfitted copy of the selector. A Pipeline
# keeps references to the step objects it is given and fits them in place, so
# passing the same `featurer_selector` instance to both pipelines would make
# fitting one pipeline overwrite the selector state used by the other.
preprocess_pipeline_norm = Pipeline(
    steps=[
        ("col_trans_norm", col_trans_norm),
        ("selector", clone(featurer_selector)),
    ]
)

# Full preprocessing for the non-normalized variant, with an independent
# selector copy.
preprocess_pipeline = Pipeline(
    steps=[
        ("col_trans", col_trans),
        ("selector", clone(featurer_selector)),
    ]
)


In [39]:
from sklearn import set_config

# Show estimators as diagrams rather than plain-text reprs.
set_config(display="diagram")

# Render both preprocessing pipelines, normalized variant first.
display(preprocess_pipeline_norm, preprocess_pipeline)


0,1,2
,steps,"[('col_trans_norm', ...), ('selector', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_pipeline_norm', ...), ('ohe_cat_pipeline', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,-1
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,n_neighbors,3
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,'int'
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,steps,"[('feat_selector', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,score_func,<function chi...x729dad9772e0>
,k,20


0,1,2
,steps,"[('col_trans', ...), ('selector', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_pipeline', ...), ('ohe_cat_pipeline', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,-1
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,n_neighbors,3
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,'int'
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,steps,"[('feat_selector', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,score_func,<function chi...x729dad9772e0>
,k,20


In [40]:
# Feature matrix: numerical columns first, then categorical ones - the same
# column lists that the column transformers are configured with.
X = raw_data.loc[:, numerical_cols + categorical_cols]

# Target vector.
y = raw_data.loc[:, "Loan_Status"]


In [41]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 15% of the rows as the test set. Stratifying on y keeps the
# class proportions, and the fixed seed makes the split repeatable.
X_tv, X_test, y_tv, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

# Step 2: split the remaining 85% into train and validation. Taking
# 0.2 / (0.2 + 0.65) of the remainder gives roughly 65% train / 20% validation
# of the full dataset.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_tv,
    y_tv,
    test_size=0.2 / (0.2 + 0.65),
    stratify=y_tv,
    random_state=42,
)



In [42]:
# Fit each preprocessing pipeline on the training split only, then apply the
# fitted pipeline to the validation and test splits.
#
# The normalized variant is computed first because it needs the raw
# X_train / X_valid / X_test, which the second group of statements overwrites.
X_train_norm = preprocess_pipeline_norm.fit_transform(X_train,y_train)
X_valid_norm = preprocess_pipeline_norm.transform(X_valid)
X_test_norm = preprocess_pipeline_norm.transform(X_test)

# NOTE(review): from here on X_train / X_valid / X_test hold the transformed
# arrays instead of the raw feature frames, so this cell must not be re-run
# without first re-running the train/validation/test split cell.
X_train = preprocess_pipeline.fit_transform(X_train,y_train)
X_valid = preprocess_pipeline.transform(X_valid)
X_test = preprocess_pipeline.transform(X_test)



In [43]:
# Wrap the transformed splits in DataFrames so they can be written to CSV.
# No column names are supplied, so the columns are labelled 0..n-1.
X_train_norm, X_valid_norm, X_test_norm = (
    pd.DataFrame(split) for split in (X_train_norm, X_valid_norm, X_test_norm)
)

X_train, X_valid, X_test = (
    pd.DataFrame(split) for split in (X_train, X_valid, X_test)
)



In [44]:

from pathlib import Path

# Destination folder for every processed split. It is created when missing so
# the export does not fail on a fresh checkout.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# File stem -> object to export. Each entry is written to
# ../data/processed/<stem>.csv without the index column.
processed_splits = {
    "X_train_norm": X_train_norm,
    "X_valid_norm": X_valid_norm,
    "X_test_norm": X_test_norm,
    "X_train": X_train,
    "X_valid": X_valid,
    "X_test": X_test,
    "y_train": y_train,
    "y_valid": y_valid,
    "y_test": y_test,
}

for split_name, split_data in processed_splits.items():
    split_data.to_csv(processed_dir / f"{split_name}.csv", index=False)



In [45]:
# Inspect the preprocessed training features of the normalized variant.
X_train_norm


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.050390,0.00000,0.121523,0.74359,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.095594,0.00000,0.377745,0.74359,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.051244,0.00000,0.200586,0.74359,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
3,0.032046,0.11265,0.153734,0.74359,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.036675,0.00000,0.083455,0.74359,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,0.019681,0.14670,0.111274,0.74359,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
394,0.028147,0.11510,0.175695,0.74359,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
395,0.033073,0.09215,0.155198,1.00000,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
396,0.042790,0.07295,0.185944,0.74359,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [46]:
# Inspect the preprocessed training features of the non-normalized variant.
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,4281.0,0.0,100.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,7933.0,0.0,275.0,360.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,4350.0,0.0,154.0,360.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
3,2799.0,2253.0,122.0,360.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,3173.0,0.0,74.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,1800.0,2934.0,93.0,360.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
394,2484.0,2302.0,137.0,360.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
395,2882.0,1843.0,123.0,480.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
396,3667.0,1459.0,144.0,360.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [27]:
# NOTE(review): disabled scratch code - consider deleting this cell.
# If re-enabled, the loop below would call fit() on both preprocessing
# pipelines for the train, validation and test splits in turn, so the
# pipelines would learn from validation/test data and end up fitted on the
# test split only.
# list_x = ['X_train' , 'X_valid' , 'X_test']
# list_y = ['y_train' , 'y_valid' , 'y_test']



# for i ,j in zip(list_x , list_y):

#     i , j = globals()[i] , globals()[j]
#     preprocess_pipeline_norm.fit(i,j)
#     preprocess_pipeline.fit(i,j)