In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
data1 = pd.read_csv("financial_risk_analysis_large.csv.zip")
data1

Unnamed: 0,CreditScore,AnnualIncome,LoanAmount,LoanDuration,Age,EmploymentStatus,MaritalStatus,NumberOfDependents,EducationLevel,HomeOwnershipStatus,...,JobTenure,MonthlySavings,AnnualBonuses,AnnualExpenses,MonthlyHousingCosts,MonthlyTransportationCosts,MonthlyFoodCosts,MonthlyHealthcareCosts,MonthlyEntertainmentCosts,LoanApproved
0,402,63295,18830,13,29,Self-Employed,Widowed,2,Doctorate,Other,...,24,378,3741,40058,977,412,399,136,124,0
1,735,55936,23729,1,42,Self-Employed,Divorced,3,Master,Own,...,10,575,4115,16745,695,206,898,252,131,0
2,570,62547,19660,7,54,Self-Employed,Single,3,Doctorate,Mortgage,...,16,691,4105,23273,627,266,392,73,36,0
3,406,46129,21674,23,25,Self-Employed,Divorced,3,High School,Other,...,6,452,4559,42163,397,307,250,378,-32,0
4,371,57725,12189,26,42,Employed,Widowed,4,Master,Own,...,2,690,7856,30087,723,315,114,88,68,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,414,83679,19053,10,54,Unemployed,Married,2,Bachelor,Other,...,33,523,5740,31540,890,351,599,148,138,0
999996,333,79096,22567,4,21,Self-Employed,Married,2,Bachelor,Rent,...,35,525,5669,26843,2082,80,427,366,100,0
999997,668,55138,13939,28,52,Employed,Divorced,0,Bachelor,Rent,...,20,386,3475,27936,393,-85,534,8,98,0
999998,627,62867,19115,8,23,Employed,Married,4,Associate,Mortgage,...,18,287,6476,17752,1226,311,227,119,167,0


In [8]:
df = pd.DataFrame(data1)
df

Unnamed: 0,CreditScore,AnnualIncome,LoanAmount,LoanDuration,Age,EmploymentStatus,MaritalStatus,NumberOfDependents,EducationLevel,HomeOwnershipStatus,...,JobTenure,MonthlySavings,AnnualBonuses,AnnualExpenses,MonthlyHousingCosts,MonthlyTransportationCosts,MonthlyFoodCosts,MonthlyHealthcareCosts,MonthlyEntertainmentCosts,LoanApproved
0,402,63295,18830,13,29,Self-Employed,Widowed,2,Doctorate,Other,...,24,378,3741,40058,977,412,399,136,124,0
1,735,55936,23729,1,42,Self-Employed,Divorced,3,Master,Own,...,10,575,4115,16745,695,206,898,252,131,0
2,570,62547,19660,7,54,Self-Employed,Single,3,Doctorate,Mortgage,...,16,691,4105,23273,627,266,392,73,36,0
3,406,46129,21674,23,25,Self-Employed,Divorced,3,High School,Other,...,6,452,4559,42163,397,307,250,378,-32,0
4,371,57725,12189,26,42,Employed,Widowed,4,Master,Own,...,2,690,7856,30087,723,315,114,88,68,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,414,83679,19053,10,54,Unemployed,Married,2,Bachelor,Other,...,33,523,5740,31540,890,351,599,148,138,0
999996,333,79096,22567,4,21,Self-Employed,Married,2,Bachelor,Rent,...,35,525,5669,26843,2082,80,427,366,100,0
999997,668,55138,13939,28,52,Employed,Divorced,0,Bachelor,Rent,...,20,386,3475,27936,393,-85,534,8,98,0
999998,627,62867,19115,8,23,Employed,Married,4,Associate,Mortgage,...,18,287,6476,17752,1226,311,227,119,167,0


In [11]:
df.memory_usage(deep=True).sum() / 1e6

896.67585

In [13]:
ordinal_col = ['EducationLevel']

onehot_cols = [
    'EmploymentStatus',
    'HomeOwnershipStatus',
    'LoanPurpose',
    'HealthInsuranceStatus',
    'LifeInsuranceStatus',
    'CarInsuranceStatus',
    'HomeInsuranceStatus',
    'EmployerType'
]

target_col = 'LoanApproved'
drop_cols = ['MaritalStatus']  # explicitly removed


In [21]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb




education_order = [
    ['High School', 'Associate', 'Bachelor', 'Master', 'Doctorate']
]

ordinal_pipeline = Pipeline(steps=[
    ('ordinal_encoder', OrdinalEncoder(categories=education_order))
])

In [23]:
onehot_pipeline = Pipeline(steps=[
    ('onehot_encoder', OneHotEncoder(
        drop='first',
        handle_unknown='ignore',
        sparse_output=False
    ))
])

In [25]:
preprocessor = ColumnTransformer(
    transformers=[
        ('ord', ordinal_pipeline, ordinal_col),
        ('ohe', onehot_pipeline, onehot_cols)
    ],
    remainder='passthrough'   # keeps numerical columns automatically
)

In [27]:
# Features and target
X = df.drop(columns=[target_col] + drop_cols)
y = df[target_col]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [29]:
xgb_model = xgb.XGBClassifier(
    n_estimators=800,
    learning_rate=0.01,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum()
)


In [31]:
model_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', xgb_model)
])

In [33]:
model_pipeline.fit(X_train, y_train)

# Predict.
y_pred = model_pipeline.predict(X_test)
y_proba = model_pipeline.predict_proba(X_test)[:, 1]

# Evaluation.
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

ROC-AUC: 0.8013482009041222
              precision    recall  f1-score   support

           0       1.00      0.60      0.75    125366
           1       0.60      1.00      0.75     74634

    accuracy                           0.75    200000
   macro avg       0.80      0.80      0.75    200000
weighted avg       0.85      0.75      0.75    200000



In [35]:
import joblib
joblib.dump(model_pipeline, "loan_pipeline.pkl")

['loan_pipeline.pkl']