In [13]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,StandardScaler
import joblib
import os

In [4]:
# Read the raw loan CSV into a DataFrame and report its (rows, columns) shape.
data_path = "../data/raw/loan_data.csv"
df = pd.read_csv(filepath_or_buffer=data_path)
print(df.shape)

(45000, 14)


In [5]:
# Print the dtype of every column in the raw frame to see which columns are
# numeric (float64/int64) and which are string-valued (object) and will need
# encoding later.
print(df.dtypes)

person_age                        float64
person_gender                      object
person_education                   object
person_income                     float64
person_emp_exp                      int64
person_home_ownership              object
loan_amnt                         float64
loan_intent                        object
loan_int_rate                     float64
loan_percent_income               float64
cb_person_cred_hist_length        float64
credit_score                        int64
previous_loan_defaults_on_file     object
loan_status                         int64
dtype: object


In [None]:
# Sanity-check the raw data: count missing values per column, then count
# fully duplicated rows.
print(df.isna().sum())
print(df.duplicated().sum())
# Both checks report zero for this dataset (see the recorded output), so no
# imputation (mean/median/group-based) or de-duplication step is applied here.
# Revisit this if the source CSV changes.

person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
loan_status                       0
dtype: int64
0


In [8]:
# Feature engineering: add two derived columns to df.
#
# debt_to_income: requested loan amount divided by the applicant's income.
# NOTE(review): a person_income of 0 would yield inf here - confirm the data
# never contains it.
df["debt_to_income"] = df["loan_amnt"] / df["person_income"]

# age_group: bucket person_age into (0, 25], (25, 40], (40, 60], (60, 100].
# The labels MUST be an ordered sequence (list), not a set: a set has no
# defined order, so the names could be attached to the wrong bins and the
# assignment could differ from one kernel session to the next.
# Ages outside (0, 100] fall in no bin and become NaN.
# The "MiddelAge" spelling is kept as-is because the label strings are what
# the age_group encoder is later fitted on and saved with.
df["age_group"] = pd.cut(
    df["person_age"],
    bins=[0, 25, 40, 60, 100],
    labels=["Young", "Adult", "MiddelAge", "Senior"],
)


In [10]:
# Re-check the dtypes after feature engineering: debt_to_income and age_group
# should now be listed alongside the original columns.
print(df.dtypes)

person_age                         float64
person_gender                       object
person_education                    object
person_income                      float64
person_emp_exp                       int64
person_home_ownership               object
loan_amnt                          float64
loan_intent                         object
loan_int_rate                      float64
loan_percent_income                float64
cb_person_cred_hist_length         float64
credit_score                         int64
previous_loan_defaults_on_file      object
loan_status                          int64
debt_to_income                     float64
age_group                         category
dtype: object


In [11]:
# Separate the label (loan_status) from the predictor columns.
y = df["loan_status"]
X = df.drop(labels=["loan_status"], axis=1)

In [None]:
# Encode each categorical feature column as integer codes, using one
# LabelEncoder per column, and keep the fitted encoders in a dict keyed by
# column name so the same mapping can be reused later.
# LabelEncoder numbers the classes in sorted order of the label strings, so
# the integer codes carry no meaningful ranking.
categorical_cols = ['person_gender','person_education','person_home_ownership',
                    'loan_intent','previous_loan_defaults_on_file','age_group']

encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Fit on the untouched column in df rather than on X[col]. X[col] is
    # overwritten below, so if this cell is re-run, fitting on X would learn
    # the classes "0", "1", ... instead of the real labels and the encoder
    # saved afterwards would be useless. On a fresh run the result is the same.
    # astype(str) also gives every column a uniform string dtype.
    # NOTE(review): missing values (e.g. an age_group with no matching bin)
    # presumably turn into the literal string "nan" and get their own code.
    X[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le
print(encoders)


{'person_gender': LabelEncoder(), 'person_education': LabelEncoder(), 'person_home_ownership': LabelEncoder(), 'loan_intent': LabelEncoder(), 'previous_loan_defaults_on_file': LabelEncoder(), 'age_group': LabelEncoder()}


In [17]:
# Confirm that the encoded categorical columns in X are now integer-typed.
print(X.dtypes)

person_age                        float64
person_gender                       int64
person_education                    int64
person_income                     float64
person_emp_exp                      int64
person_home_ownership               int64
loan_amnt                         float64
loan_intent                         int64
loan_int_rate                     float64
loan_percent_income               float64
cb_person_cred_hist_length        float64
credit_score                        int64
previous_loan_defaults_on_file      int64
debt_to_income                    float64
age_group                           int64
dtype: object


In [18]:
# Preview the first rows of the encoded feature matrix.
print(X.head())

   person_age  person_gender  person_education  person_income  person_emp_exp  \
0        22.0              0                 4        71948.0               0   
1        21.0              0                 3        12282.0               0   
2        25.0              0                 3        12438.0               3   
3        23.0              0                 1        79753.0               0   
4        24.0              1                 4        66135.0               1   

   person_home_ownership  loan_amnt  loan_intent  loan_int_rate  \
0                      3    35000.0            4          16.02   
1                      2     1000.0            1          11.14   
2                      0     5500.0            3          12.87   
3                      3    35000.0            3          15.23   
4                      3    35000.0            3          14.27   

   loan_percent_income  cb_person_cred_hist_length  credit_score  \
0                 0.49                    

In [19]:
# Standardise the selected continuous columns (zero mean, unit variance) and
# keep the fitted scaler so the same transform can be applied to new data.
scaling_cols = ["person_age", "person_income", "loan_amnt", "credit_score", "loan_int_rate"]
scaler = StandardScaler()
# Fit on the raw values in df, not on X. X is overwritten with the scaled
# values, so re-running this cell against X would fit the scaler on data that
# is already standardised. The scaler saved afterwards would then be
# near-identity and would mis-scale raw inputs at inference time.
# On a fresh run the result is identical.
# NOTE(review): the scaler is fitted on the full dataset; if a train/test
# split happens downstream, test-set statistics leak into it - confirm.
X[scaling_cols] = scaler.fit_transform(df[scaling_cols])
print(X.head())

   person_age  person_gender  person_education  person_income  person_emp_exp  \
0   -0.953538              0                 4      -0.104090               0   
1   -1.118963              0                 3      -0.846005               0   
2   -0.457264              0                 3      -0.844065               3   
3   -0.788113              0                 1      -0.007039               0   
4   -0.622689              1                 4      -0.176371               1   

   person_home_ownership  loan_amnt  loan_intent  loan_int_rate  \
0                      3   4.024953            4       1.683039   
1                      2  -1.359209            1       0.044782   
2                      0  -0.646600            3       0.625557   
3                      3   4.024953            3       1.417829   
4                      3   4.024953            3       1.095549   

   loan_percent_income  cb_person_cred_hist_length  credit_score  \
0                 0.49                    

In [20]:
# Persist the fitted encoders and scaler to the models folder so the same
# preprocessing can be reloaded later.
# Create the folder first: joblib.dump does not create missing directories,
# so on a fresh checkout without ../models the save would fail.
os.makedirs("../models", exist_ok=True)
joblib.dump(encoders, "../models/encoders_dict.pkl")
joblib.dump(scaler, "../models/scaler.pkl")

['../models/scaler.pkl']

In [21]:
# Recombine the transformed features with the target column. assign() returns
# a new frame, so X itself is left unchanged; y.values attaches the labels by
# position.
processed_df = X.assign(loan_status=y.values)


In [22]:
# Final dtype check before saving: every column of the processed frame
# should be numeric at this point.
print(processed_df.dtypes)

person_age                        float64
person_gender                       int64
person_education                    int64
person_income                     float64
person_emp_exp                      int64
person_home_ownership               int64
loan_amnt                         float64
loan_intent                         int64
loan_int_rate                     float64
loan_percent_income               float64
cb_person_cred_hist_length        float64
credit_score                      float64
previous_loan_defaults_on_file      int64
debt_to_income                    float64
age_group                           int64
loan_status                         int64
dtype: object


In [23]:
# Write the processed dataset to disk, creating the output folder if needed.
os.makedirs(name="../data/processed", exist_ok=True)
processed_df.to_csv(
    path_or_buf="../data/processed/processed_loan_data.csv",
    index=False,
)

In [None]:
# Outliers were not removed: the data was judged good enough to use as-is.