In [1]:
# Importing the dataset
# The CSV location can be overridden with the CREDIT_RISK_TRAIN_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original local path is used unchanged.

import os

import pandas as pd

path = os.environ.get(
    "CREDIT_RISK_TRAIN_CSV",
    "/Users/rohityadav/Desktop/Git Projects/ml-credit-risk-modeling/data/train.csv",
)
df = pd.read_csv(path)

# Preview the first rows
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [7]:
# Show every column label currently present in the frame, as a plain list
list(df.columns)

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area',
 'Loan_Status']

In [2]:
# Dropping non predictive column
# Rebinding `df` (instead of inplace = True) together with errors = 'ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# once 'Loan_ID' has already been removed.

df = df.drop(columns = 'Loan_ID', errors = 'ignore')

In [5]:
# Encode Target for risk modeling (the positive class is the rejected loan)
# default = 0 -> Loan approved (Loan_Status = Y)
# default = 1 -> Loan rejected (Loan_Status = N)

y = df['Loan_Status'].map({'Y': 0, 'N': 1})

# .map() yields NaN for any label missing from the dict; fail loudly rather
# than carrying NaN targets into the split and the model.
if y.isna().any():
    raise ValueError("Loan_Status contains values other than 'Y' or 'N'")

# Features: everything except the target column
X = df.drop('Loan_Status', axis = 1)

# Class balance of the encoded target (as proportions)
y.value_counts(normalize = True)

Loan_Status
0    0.687296
1    0.312704
Name: proportion, dtype: float64

In [10]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 42)

# Shapes of the two feature partitions
(X_train.shape, X_test.shape)

((491, 11), (123, 11))

In [15]:
# Per-column count of missing entries in the training features

X_train.isna().sum()

Gender               11
Married               3
Dependents           10
Education             0
Self_Employed        26
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           19
Loan_Amount_Term      9
Credit_History       35
Property_Area         0
dtype: int64

In [16]:
# Per-column count of missing entries in the test features

X_test.isna().sum()

Gender                2
Married               0
Dependents            5
Education             0
Self_Employed         6
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            3
Loan_Amount_Term      5
Credit_History       15
Property_Area         0
dtype: int64

In [11]:
# Column groups used to route features to their preprocessing steps.
# Credit_History is kept in a group of its own, separate from the other
# numerical columns.

credit_history_feature = ['Credit_History']

numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

categorical_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

In [18]:
# Imputers for missing values: median for numerical columns, most frequent
# value (mode) for categorical columns and for Credit_History.

from sklearn.impute import SimpleImputer

# Numerical columns -> median
num_imputer = SimpleImputer(strategy = "median")

# Categorical columns -> mode
cat_imputer = SimpleImputer(strategy = "most_frequent")

# Credit_History -> mode (separate instance from the categorical imputer)
ch_imputer = SimpleImputer(strategy = "most_frequent")

In [21]:
# Transformers applied after imputation: a standard scaler and a one-hot
# encoder configured with handle_unknown = 'ignore'.

from sklearn.preprocessing import OneHotEncoder, StandardScaler

scaler = StandardScaler()
cat_encoder = OneHotEncoder(handle_unknown = 'ignore')

In [27]:
# Combine everything in one leakage-safe preprocessor

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Categorical columns: impute, then one-hot encode
categorical_pipeline = Pipeline(steps = [
    ('imputer', cat_imputer),
    ('onehot', cat_encoder)
])

# Numerical columns: impute, then standardize.
# The second step is named 'scaler' (it was previously mislabeled 'onehot'),
# and each pipeline receives its own clone of `scaler` so the two pipelines
# never share a single scaler object.
numerical_pipeline = Pipeline(steps = [
    ('imputer', num_imputer),
    ('scaler', clone(scaler))
])

# Credit_History: impute, then standardize with its own scaler
ch_pipeline = Pipeline(steps = [
    ('imputer', ch_imputer),
    ('scaler', clone(scaler))
])

# Route each column group to its pipeline; columns not listed are dropped
preprocessor = ColumnTransformer(
    transformers = [
        ('cat', categorical_pipeline, categorical_features),
        ('num', numerical_pipeline, numerical_features),
        ('ch', ch_pipeline, credit_history_feature)
    ],
    remainder = 'drop'
)

In [28]:
# Fit on train only, then apply the fitted preprocessor to train and test

X_train_processed = preprocessor.fit_transform(X_train)

# transform (not fit_transform): re-fitting here would learn the imputation
# values, categories and scaling statistics from the test set, leaking test
# information and leaving the preprocessor fitted on the wrong data.
X_test_processed = preprocessor.transform(X_test)

print('X_train_processed:', X_train_processed.shape)
print('X_test_processed:', X_test_processed.shape)
print('y_train:', y_train.shape, 'y_test:', y_test.shape)

X_train_processed: (491, 20)
X_test_processed: (123, 20)
y_train: (491,) y_test: (123,)


In [30]:
# Sanity check: retrieve the output feature names from the preprocessor and
# display how many there are alongside the first 20.

feature_names = preprocessor.get_feature_names_out()
(len(feature_names), feature_names[:20])

(20,
 array(['cat__Gender_Female', 'cat__Gender_Male', 'cat__Married_No',
        'cat__Married_Yes', 'cat__Dependents_0', 'cat__Dependents_1',
        'cat__Dependents_2', 'cat__Dependents_3+',
        'cat__Education_Graduate', 'cat__Education_Not Graduate',
        'cat__Self_Employed_No', 'cat__Self_Employed_Yes',
        'cat__Property_Area_Rural', 'cat__Property_Area_Semiurban',
        'cat__Property_Area_Urban', 'num__ApplicantIncome',
        'num__CoapplicantIncome', 'num__LoanAmount',
        'num__Loan_Amount_Term', 'ch__Credit_History'], dtype=object))