In [7]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import pandas as pd

In [10]:
# Location of the raw credit-risk CSV on the local machine
raw_csv_path = r"C:\Users\theow\Documents\Project\Explainable-Loan-Default\data\raw\credit_risk_dataset.csv"
df = pd.read_csv(raw_csv_path)

# Column dtypes / non-null counts first, then a preview of the first rows
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [12]:
# Normalise the header names step by step: lower-case them, trim surrounding
# whitespace, then swap hyphens for underscores.
lowered_names = df.columns.str.lower()
trimmed_names = lowered_names.str.strip()
df.columns = trimmed_names.str.replace("-", "_")

# No column renames are applied here; the df.info() listing already shows
# `cb_person_cred_hist_length` spelled correctly.
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [13]:
# Count the missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

# Option 1: discard every row that has at least one missing value
# (only reasonable when the share of affected rows is small).
# NOTE(review): the printed counts show 3116 missing loan_int_rate values out of
# 32581 rows (~9.6%) -- confirm that dropping is preferable to imputing.
df.dropna(axis=0, how="any", inplace=True)



person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64


In [14]:
# Re-count missing values to confirm none are left after dropping rows
remaining_missing = df.isna().sum()
print(remaining_missing)

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64


In [15]:
# Encode loan_grade as an ordinal feature. The category list is passed
# explicitly so the integer codes follow the grade order A, B, ..., G
# (A -> 0, G -> 6) instead of being inferred from the data.
grade_order = [["A", "B", "C", "D", "E", "F", "G"]]
ordinal_enc = OrdinalEncoder(categories=grade_order)
df["loan_grade_encoded"] = ordinal_enc.fit_transform(df[["loan_grade"]])

# Binary encode cb_person_default_on_file (Y -> 1, N -> 0).
# Series.map turns every value that is not a key of the dict into NaN, and this
# step runs after the missing-value check, so validate the result before
# overwriting the column. The guard also stops an accidental re-run of this cell
# (column already 0/1) from silently replacing the whole column with NaN.
default_flag = df["cb_person_default_on_file"].map({"Y": 1, "N": 0})
if default_flag.isna().any():
    unexpected_values = (
        df.loc[default_flag.isna(), "cb_person_default_on_file"].unique().tolist()
    )
    raise ValueError(
        "cb_person_default_on_file contains values other than 'Y'/'N': "
        f"{unexpected_values}"
    )
df["cb_person_default_on_file"] = default_flag

# One-hot encode nominal categories; drop_first=True omits the first level of
# each feature so the remaining indicator columns are not redundant.
# NOTE(review): get_dummies derives the columns from whatever values are present
# in this frame and leaves no fitted object to persist -- confirm how the same
# columns will be reproduced on new data.
df = pd.get_dummies(df, columns=["loan_intent", "person_home_ownership"], drop_first=True)

In [16]:
# Select numerical features to scale
num_features = [
    "person_age", "person_income", "person_emp_length",
    "loan_amnt", "loan_int_rate", "loan_percent_income",
    "cb_person_cred_hist_length"
]

# Fit the scaler on the training rows only. Fitting on the full frame would let
# the held-out test rows influence the mean/std used for scaling (data leakage),
# and the same statistics would end up in the persisted scaler.
#
# The split arguments below must stay identical to the train_test_split call
# that later creates X_train/X_test: with the same row order, stratify labels,
# test_size and random_state, both calls select the same training rows.
train_rows, _ = train_test_split(
    df, test_size=0.2, stratify=df["loan_status"], random_state=42
)

scaler = StandardScaler()
scaler.fit(train_rows[num_features])

# Apply the train-fitted statistics to every row; the later split then hands
# each row to its train or test partition already scaled.
df[num_features] = scaler.transform(df[num_features])

In [17]:
# Remove the raw loan_grade column (its ordinal encoding lives in loan_grade_encoded)
df.drop("loan_grade", axis=1, inplace=True)

# Separate the target from the feature matrix
target_col = "loan_status"
y = df[target_col]
X = df.drop(columns=[target_col])

In [21]:
# Stratified 80/20 hold-out split with a fixed seed.
# train_test_split is already imported in the notebook's first cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Save to processed folder: each destination path is paired with the split it receives
processed_outputs = {
    r"C:\Users\theow\Documents\Project\Explainable-Loan-Default\data\processed\X_train.csv": X_train,
    r"C:\Users\theow\Documents\Project\Explainable-Loan-Default\data\processed\X_test.csv": X_test,
    r"C:\Users\theow\Documents\Project\Explainable-Loan-Default\data\processed\y_train.csv": y_train,
    r"C:\Users\theow\Documents\Project\Explainable-Loan-Default\data\processed\y_test.csv": y_test,
}
for csv_path, split_part in processed_outputs.items():
    # index=False keeps the row index out of the written files
    split_part.to_csv(csv_path, index=False)

In [23]:
# Persist the fitted preprocessing objects so they can be reloaded later
scaler_path = r"C:\Users\theow\Documents\Project\Explainable-Loan-Default\models\scaler.pkl"
encoder_path = r"C:\Users\theow\Documents\Project\Explainable-Loan-Default\models\ordinal_encoder.pkl"

joblib.dump(scaler, scaler_path)
# Left as the final bare expression so the cell still displays the written file list
joblib.dump(ordinal_enc, encoder_path)

['C:\\Users\\theow\\Documents\\Project\\Explainable-Loan-Default\\models\\ordinal_encoder.pkl']