In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    OrdinalEncoder,
    LabelEncoder
)
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    roc_curve,
    confusion_matrix,
)

In [2]:
columns_dict = {
    'level_0': 'level_0',
    'index': 'id',
    'chênh lệch thu chi': 'income_expenditure_difference',
    'thu nhập': 'income',
    'giới tính': 'gender',
    'cơ quan công tác': 'working_agency',
    'tổng chi phí': 'total_expenses',
    'loại hình cư trú': 'type_of_residence',
    'sản phẩm tại ngân hàng': 'bank_product',
    'tổng thu nhập': 'total_income',
    'thời hạn vay vốn': 'loan_term',
    'lương, phụ cấp': 'salary_allowance',
    'số lượng sản phẩm đang sử dụng': 'number_of_products_in_use',
    'hình thức nhận lương': 'salary_receiving_method',
    'số dư casa': 'casa_balance',
    'phân khúc khách hàng': 'customer_segment',
    'tình trạng hôn nhân': 'marital_status',
    'số người phụ thuộc': 'number_of_dependents',
    'độ tuổi': 'age',
    'số dư tiền gửi kỳ hạn': 'term_deposit_balance',
    'trình độ học vấn': 'educational_level',
    'bảo hiểm': 'insurance',
    'chức danh': 'position',
    'số sản phẩm phi tín dụng': 'number_of_non_credit_products',
    'nhà ở': 'housing',
    'nguồn trả nợ': 'debt_repayment_source',
    'thời gian công tác': 'work_tenure',
    'dư nợ tại ngân hàng': 'bank_debt_balance',
    'số ngân hàng đang phát sinh dư nợ': 'number_of_banks_with_outstanding_debt',
    'điện thoại liên lạc': 'contact_phone',
    'số tiền vay': 'loan_amount',
    'lãi vay dự kiến': 'expected_loan_interest',
    'hợp đồng lao động': 'labor_contract',
    'thời han duy trì quan hệ tại ngân hàng': 'duration_of_relationship_with_the_bank',
    'kỳ hạn đề xuất': 'proposed_term',
    'ngành kinh tế': 'economic_sector',
    'thông tin nhóm nợ': 'debt_group_information',
    'lịch sử quá hạn': 'overdue_history',
    'RESULT': 'result'
}

# Data Processing

## EDA

In [3]:
df = pd.read_csv('data_source/credit-dataset.csv', encoding='utf-8', usecols=lambda x: x not in ["level_0", "index", "giới tính", "điện thoại liên lạc", "hình thức nhận lương"])
df.head()

Unnamed: 0,chênh lệch thu chi,thu nhập,cơ quan công tác,tổng chi phí,loại hình cư trú,sản phẩm tại ngân hàng,tổng thu nhập,thời hạn vay vốn,"lương, phụ cấp",số lượng sản phẩm đang sử dụng,...,số ngân hàng đang phát sinh dư nợ,số tiền vay,lãi vay dự kiến,hợp đồng lao động,thời han duy trì quan hệ tại ngân hàng,kỳ hạn đề xuất,ngành kinh tế,thông tin nhóm nợ,lịch sử quá hạn,RESULT
0,17520000.0,30500000.0,cq hanh chi su nghiep,12980000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,30500000.0,12.0,10500000.0,0,...,0,100000000.0,980000.0,"cong chuc, vien chuc",3,12.0,van phong,khong co du no,da co quan he nhung chua tung qua han,AA-
1,4749121.67,7219121.67,cq hanh chi su nghiep,2470000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,7219121.67,60.0,7219121.67,0,...,0,100000000.0,2470000.0,"cong chuc, vien chuc",3,60.0,giao dich,khong co du no,da co quan he nhung chua tung qua han,A+
2,668138.0,5668138.0,cq hanh chi su nghiep,5000000.0,ho khau thuong tru,da tung co tien gui thanh toan hoac sp khac 12...,5668138.0,12.0,5668138.0,0,...,0,30000000.0,0.0,hop dong khong xac dinh thoi han,0,12.0,van phong,khong co thong tin,khong co thong tin,A+
3,9016754.0,22266754.0,cq hanh chi su nghiep,13250000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,22266754.0,12.0,14266754.0,0,...,0,100000000.0,9250000.0,hop dong khong xac dinh thoi han,0,12.0,van phong,khong co du no,khong co quan he tin dung,AA+
4,22579692.0,35614692.0,cq hanh chi su nghiep,13035000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,35614692.0,60.0,14614692.0,0,...,0,350000000.0,9035000.0,"cong chuc, vien chuc",3,60.0,van phong,khong co du no,da co quan he nhung chua tung qua han,A+


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120017 entries, 0 to 120016
Data columns (total 34 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   chênh lệch thu chi                      120017 non-null  float64
 1   thu nhập                                120017 non-null  float64
 2   cơ quan công tác                        120017 non-null  object 
 3   tổng chi phí                            120017 non-null  float64
 4   loại hình cư trú                        120017 non-null  object 
 5   sản phẩm tại ngân hàng                  120017 non-null  object 
 6   tổng thu nhập                           120017 non-null  float64
 7   thời hạn vay vốn                        120017 non-null  float64
 8   lương, phụ cấp                          120017 non-null  float64
 9   số lượng sản phẩm đang sử dụng          120017 non-null  int64  
 10  số dư casa                              1200

In [5]:
df.duplicated().sum()

5303

In [6]:
df.drop_duplicates(inplace=True)

In [7]:
df.duplicated().sum()

0

In [8]:
df.replace("unknown", pd.NA, inplace=True)
df = df.dropna()

In [9]:
df.describe()

Unnamed: 0,chênh lệch thu chi,thu nhập,tổng chi phí,tổng thu nhập,thời hạn vay vốn,"lương, phụ cấp",số lượng sản phẩm đang sử dụng,số dư casa,số người phụ thuộc,độ tuổi,số dư tiền gửi kỳ hạn,số sản phẩm phi tín dụng,thời gian công tác,dư nợ tại ngân hàng,số ngân hàng đang phát sinh dư nợ,số tiền vay,lãi vay dự kiến,thời han duy trì quan hệ tại ngân hàng,kỳ hạn đề xuất
count,88862.0,88862.0,88862.0,88862.0,88862.0,88862.0,88862.0,88862.0,88862.0,88862.0,88862.0,88862.0,88862.0,88862.0,88862.0,88862.0,88862.0,88862.0,88862.0
mean,-1592821000.0,527312200.0,2120132000.0,527312200.0,22.668126,20363930.0,0.429689,16129830.0,60251.17,39.832651,95270560.0,25324.28,163.682575,87950180.0,0.361898,87950180.0,6610657.0,10.115325,22.668126
std,245283100000.0,97488720000.0,225074100000.0,97488720000.0,21.920522,364580000.0,0.876914,124529100.0,9820678.0,7.583986,1337755000.0,4794236.0,89.440017,70604830.0,11.980785,70604830.0,1342220000.0,123.25636,21.920522
min,-39761540000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,30.0,0.0,30.0,0.0,0.0,0.0
25%,6662839.0,13011120.0,4000000.0,13011120.0,12.0,8500000.0,0.0,336366.2,0.0,34.0,0.0,4.0,100.0,50000000.0,0.0,50000000.0,0.0,0.0,12.0
50%,12000000.0,19805450.0,6330000.0,19805450.0,12.0,11153160.0,0.0,2349392.0,0.0,39.0,0.0,5.0,156.0,60000000.0,0.0,60000000.0,0.0,1.0,12.0
75%,19947750.0,28592350.0,10000000.0,28592350.0,36.0,15600000.0,1.0,10000000.0,1.0,45.0,0.0,5.0,219.0,100000000.0,0.0,100000000.0,2206250.0,4.0,36.0
max,27921030000000.0,27921040000000.0,39762050000000.0,27921040000000.0,360.0,60000000000.0,78.0,21560600000.0,2000000000.0,123.0,135000000000.0,1000000000.0,696.0,1500000000.0,3563.0,1500000000.0,400001500000.0,2023.0,360.0


In [10]:
df.drop(columns=['lương, phụ cấp', 'thu nhập'])

Unnamed: 0,chênh lệch thu chi,cơ quan công tác,tổng chi phí,loại hình cư trú,sản phẩm tại ngân hàng,tổng thu nhập,thời hạn vay vốn,số lượng sản phẩm đang sử dụng,số dư casa,phân khúc khách hàng,...,số ngân hàng đang phát sinh dư nợ,số tiền vay,lãi vay dự kiến,hợp đồng lao động,thời han duy trì quan hệ tại ngân hàng,kỳ hạn đề xuất,ngành kinh tế,thông tin nhóm nợ,lịch sử quá hạn,RESULT
0,17520000.00,cq hanh chi su nghiep,12980000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,3.050000e+07,12.0,0,31678621.0,khach hang cap 1,...,0,100000000.0,980000.0,"cong chuc, vien chuc",3,12.0,van phong,khong co du no,da co quan he nhung chua tung qua han,AA-
1,4749121.67,cq hanh chi su nghiep,2470000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,7.219122e+06,60.0,0,85675.0,khach hang cap 5,...,0,100000000.0,2470000.0,"cong chuc, vien chuc",3,60.0,giao dich,khong co du no,da co quan he nhung chua tung qua han,A+
2,668138.00,cq hanh chi su nghiep,5000000.0,ho khau thuong tru,da tung co tien gui thanh toan hoac sp khac 12...,5.668138e+06,12.0,0,31267083.0,khach hang cap 1,...,0,30000000.0,0.0,hop dong khong xac dinh thoi han,0,12.0,van phong,khong co thong tin,khong co thong tin,A+
3,9016754.00,cq hanh chi su nghiep,13250000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,2.226675e+07,12.0,0,14774112.0,khach hang cap 1,...,0,100000000.0,9250000.0,hop dong khong xac dinh thoi han,0,12.0,van phong,khong co du no,khong co quan he tin dung,AA+
4,22579692.00,cq hanh chi su nghiep,13035000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,3.561469e+07,60.0,0,99847.0,khach hang cap 1,...,0,350000000.0,9035000.0,"cong chuc, vien chuc",3,60.0,van phong,khong co du no,da co quan he nhung chua tung qua han,A+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120011,13000000.00,cq hanh chi su nghiep,5000000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,1.800000e+07,50.0,0,17732959.0,khach hang cap 2,...,0,50000000.0,0.0,hop dong khong xac dinh thoi han,0,50.0,y te,khong co du no,da co quan he nhung chua tung qua han,AA
120012,11046328.40,cq hanh chi su nghiep,4.6,ho khau thuong tru,co tien gui thanh toan hoac sp khac,1.104633e+07,12.0,0,109497.0,khach hang cap 1,...,1,50000000.0,0.0,hop dong khong xac dinh thoi han,8,12.0,nganh khac,chi co no nhom 1,da co quan he nhung chua tung qua han,A+
120014,40000000.00,doanh nghiep nhan nuoc tren 50% von dieu le,8000000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,4.800000e+07,0.0,1,15468384.0,khach hang cap 1,...,0,100000000.0,0.0,hop dong khong xac dinh thoi han,0,0.0,van phong,chi co no nhom 1,da co quan he nhung chua tung qua han,AA
120015,8783163.00,cq hanh chi su nghiep,6700000.0,ho khau thuong tru,chua su dung sp nao,1.548316e+07,24.0,1,3599633.0,khach hang cap 1,...,0,50000000.0,500000.0,hop dong khong xac dinh thoi han,3,24.0,van phong,chi co no nhom 1,da co quan he nhung chua tung qua han,A


In [11]:
df["RESULT"].value_counts()

AA+    19984
AA     18309
AA-    17837
A+     14204
A       6818
A-      4753
AAA     4441
BBB     1596
BB       728
B        192
Name: RESULT, dtype: int64

In [12]:
from sklearn.utils import resample

majority_class = df[df["RESULT"] == "AA+"]
minority_class = df[df["RESULT"].isin(["BBB", "BB", "B"])] # get lowest 3 classes

minority_upsampled = resample(
    minority_class,
    replace=True,
    n_samples=len(majority_class),  # match number of samples in majority class
    random_state=42,  # reproducible results
)

# Combine majority class with upsampled minority class
upsampled_data = pd.concat([minority_class, minority_upsampled, df])

# Shuffle the upsampled df
upsampled_data = upsampled_data.sample(frac=1, replace=True, random_state=42)

upsampled_data["RESULT"].value_counts()

AA+    19875
AA     18313
AA-    17768
BBB    16106
A+     14058
BB      7387
A       6655
A-      4858
AAA     4443
B       1899
Name: RESULT, dtype: int64

In [13]:
upsampled_data.isnull().sum()

chênh lệch thu chi                        0
thu nhập                                  0
cơ quan công tác                          0
tổng chi phí                              0
loại hình cư trú                          0
sản phẩm tại ngân hàng                    0
tổng thu nhập                             0
thời hạn vay vốn                          0
lương, phụ cấp                            0
số lượng sản phẩm đang sử dụng            0
số dư casa                                0
phân khúc khách hàng                      0
tình trạng hôn nhân                       0
số người phụ thuộc                        0
độ tuổi                                   0
số dư tiền gửi kỳ hạn                     0
trình độ học vấn                          0
bảo hiểm                                  0
chức danh                                 0
số sản phẩm phi tín dụng                  0
nhà ở                                     0
nguồn trả nợ                              0
thời gian công tác              

In [14]:
from datetime import datetime

upsampled_data = upsampled_data.rename(columns=columns_dict)
upsampled_data["event_timestamp"] = pd.Timestamp(datetime.now(), tz="UTC")
upsampled_data["id"] = upsampled_data.reset_index().index
upsampled_data.head()

Unnamed: 0,income_expenditure_difference,income,working_agency,total_expenses,type_of_residence,bank_product,total_income,loan_term,salary_allowance,number_of_products_in_use,...,expected_loan_interest,labor_contract,duration_of_relationship_with_the_bank,proposed_term,economic_sector,debt_group_information,overdue_history,result,event_timestamp,id
79968,8615330.0,14615330.0,cq hanh chi su nghiep,6000000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,14615330.0,36.0,12615330.0,0,...,0.0,hop dong khong xac dinh thoi han,2,36.0,y te,khong co du no,khong co quan he tin dung,BBB,2024-04-19 17:54:11.870477+00:00,0
33083,81762782.0,94962782.0,cq hanh chi su nghiep,13200000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,94962782.0,12.0,13962782.0,1,...,1200000.0,hop dong khong xac dinh thoi han,5,12.0,van phong,chi co no nhom 1,da co quan he nhung chua tung qua han,BBB,2024-04-19 17:54:11.870477+00:00,1
105175,181350000.0,205000000.0,doanh nghiep tu nhan,23650000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,205000000.0,0.0,135000000.0,0,...,4400000.0,hop dong khong xac dinh thoi han,0,0.0,nganh khac,chi co no nhom 1,da co quan he nhung chua tung qua han,A-,2024-04-19 17:54:11.870477+00:00,2
117977,28800000.0,35800000.0,cq hanh chi su nghiep,7000000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,35800000.0,60.0,15800000.0,1,...,0.0,"cong chuc, vien chuc",3,60.0,giao dich,chi co no nhom 1,da co quan he nhung chua tung qua han,A+,2024-04-19 17:54:11.870477+00:00,3
55147,13500000.0,30000000.0,cq hanh chi su nghiep,16500000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,30000000.0,12.0,20000000.0,0,...,1500000.0,hop dong khong xac dinh thoi han,5,12.0,van phong,chi co no nhom 1,da co quan he nhung chua tung qua han,AA-,2024-04-19 17:54:11.870477+00:00,4


In [15]:
# Save
# upsampled_data.to_parquet("credit-dataset.parquet", index=False)
# upsampled_data.to_csv("credit-dataset.csv", index=False)
# upsampled_data[:100].to_parquet("batch_request.csv", index=False)

In [22]:
columns = upsampled_data.columns[:-3]
for col in columns:
    if col in ordinal_cols or col in ohe_cols:
        # print(f'Field(name="{col}", dtype=String),')
        # print(f'"{col}": np.object_,')
        continue
    else:
        # print(f'Field(name="{col}", dtype=Float32),')
        print(f'"{col}": np.float64,')
    # print(f'"credit_stats:{col}",')

"income_expenditure_difference": np.float64,
"income": np.float64,
"total_expenses": np.float64,
"total_income": np.float64,
"loan_term": np.float64,
"salary_allowance": np.float64,
"number_of_products_in_use": np.float64,
"casa_balance": np.float64,
"number_of_dependents": np.float64,
"age": np.float64,
"term_deposit_balance": np.float64,
"number_of_non_credit_products": np.float64,
"work_tenure": np.float64,
"bank_debt_balance": np.float64,
"number_of_banks_with_outstanding_debt": np.float64,
"loan_amount": np.float64,
"expected_loan_interest": np.float64,
"duration_of_relationship_with_the_bank": np.float64,
"proposed_term": np.float64,


In [17]:
features = upsampled_data.drop(columns_dict["RESULT"], axis=1)
target = upsampled_data[columns_dict["RESULT"]]

## Data encoder

In [18]:
continuous_features = features.select_dtypes(include=[np.number]).columns
continuous_features

Index(['income_expenditure_difference', 'income', 'total_expenses',
       'total_income', 'loan_term', 'salary_allowance',
       'number_of_products_in_use', 'casa_balance', 'number_of_dependents',
       'age', 'term_deposit_balance', 'number_of_non_credit_products',
       'work_tenure', 'bank_debt_balance',
       'number_of_banks_with_outstanding_debt', 'loan_amount',
       'expected_loan_interest', 'duration_of_relationship_with_the_bank',
       'proposed_term', 'id'],
      dtype='object')

In [19]:
category_features = features.select_dtypes(include=[np.object_]).columns
features[category_features].describe()

Unnamed: 0,working_agency,type_of_residence,bank_product,customer_segment,marital_status,educational_level,insurance,position,housing,debt_repayment_source,labor_contract,economic_sector,debt_group_information,overdue_history
count,111362,111362,111362,111362,111362,111362,111362,111362,111362,111362,111362,111362,111362,111362
unique,5,3,5,4,6,6,6,6,5,7,4,13,5,5
top,cq hanh chi su nghiep,ho khau thuong tru,co tien gui thanh toan hoac sp khac,khach hang cap 1,da ket hon,dai hoc,nhieu hon 1 loai bao hiem,chuyen vien,so huu toan bo,luong,hop dong khong xac dinh thoi han,van phong,khong co du no,da co quan he nhung chua tung qua han
freq,81807,104912,95150,93208,73588,103002,67866,58483,98046,63414,53696,48570,60330,72791


In [20]:
ordinal_cols = [
    "phân khúc khách hàng",
    "trình độ học vấn",
    "bảo hiểm",
    "nhà ở",
    "lịch sử quá hạn",
]

ohe_cols = [
    "cơ quan công tác",
    "loại hình cư trú",
    "sản phẩm tại ngân hàng",
    "tình trạng hôn nhân",
    "chức danh",
    "nguồn trả nợ",
    "ngành kinh tế",
    "hợp đồng lao động",
    "thông tin nhóm nợ"
]
ordinal_cols = [columns_dict[col] for col in ordinal_cols]
ohe_cols = [columns_dict[col] for col in ohe_cols]
assert len(ordinal_cols) + len(ohe_cols) == len(category_features)
ordinal_cols

['customer_segment',
 'educational_level',
 'insurance',
 'housing',
 'overdue_history']

In [27]:
features.columns

Index(['income_expenditure_difference', 'income', 'working_agency',
       'total_expenses', 'type_of_residence', 'bank_product', 'total_income',
       'loan_term', 'salary_allowance', 'number_of_products_in_use',
       'casa_balance', 'customer_segment', 'marital_status',
       'number_of_dependents', 'age', 'term_deposit_balance',
       'educational_level', 'insurance', 'position',
       'number_of_non_credit_products', 'housing', 'debt_repayment_source',
       'work_tenure', 'bank_debt_balance',
       'number_of_banks_with_outstanding_debt', 'loan_amount',
       'expected_loan_interest', 'labor_contract',
       'duration_of_relationship_with_the_bank', 'proposed_term',
       'economic_sector', 'debt_group_information', 'overdue_history'],
      dtype='object')

# Modeling

## Pipeline for data transformers

In [36]:
continuous_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

category_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # ("ohe", OneHotEncoder(handle_unknown="ignore"), ohe_cols),
        # ("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ordinal_cols),
        ("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", continuous_transformer, continuous_features),
        ("category", category_transformer, category_features)
    ], remainder="passthrough"
)

In [37]:
label_encoder = LabelEncoder()

### Split data

In [38]:
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
    shuffle=True,
)

In [39]:
X_train

Unnamed: 0,income_expenditure_difference,income,working_agency,total_expenses,type_of_residence,bank_product,total_income,loan_term,salary_allowance,number_of_products_in_use,...,bank_debt_balance,number_of_banks_with_outstanding_debt,loan_amount,expected_loan_interest,labor_contract,duration_of_relationship_with_the_bank,proposed_term,economic_sector,debt_group_information,overdue_history
75333,51548424.67,74048424.67,cq hanh chi su nghiep,22500000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,74048424.67,0.0,19048424.67,1,...,200000000.0,1,200000000.0,200000.0,"cong chuc, vien chuc",15,0.0,van phong,chi co no nhom 1,da co quan he nhung chua tung qua han
9492,46000000.00,51000000.00,doanh nghiep tu nhan,5000000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,51000000.00,12.0,50000000.00,1,...,50000000.0,0,50000000.0,0.0,hop dong khong xac dinh thoi han,0,12.0,cong nghe,khong co thong tin,da co quan he nhung chua tung qua han
35038,17000000.00,24000000.00,cq hanh chi su nghiep,7000000.0,ho khau thuong tru,da tung co tien gui thanh toan hoac sp khac 12...,24000000.00,12.0,10000000.00,0,...,40000000.0,0,40000000.0,0.0,hop dong khong xac dinh thoi han,0,12.0,giao dich,khong co du no,da co quan he nhung chua tung qua han
26554,6378233.00,11378233.00,cq hanh chi su nghiep,5000000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,11378233.00,12.0,11378233.00,0,...,30000000.0,0,30000000.0,0.0,hop dong khong xac dinh thoi han,4,12.0,van phong,khong co du no,da co quan he nhung chua tung qua han
91525,19546334.00,38093834.00,cq hanh chi su nghiep,18547500.0,ho khau thuong tru,da tung co tien gui thanh toan hoac sp khac 12...,38093834.00,60.0,11293834.00,0,...,250000000.0,2,250000000.0,6347500.0,hop dong khong xac dinh thoi han,2,60.0,giao dich,chi co no nhom 1,da co quan he nhung chua tung qua han
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81877,14650000.00,31000000.00,doanh nghiep tu nhan,16350000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,31000000.00,0.0,31000000.00,0,...,70000000.0,3,70000000.0,0.0,hop dong khong xac dinh thoi han,0,0.0,van phong,chi co no nhom 1,da co quan he nhung chua tung qua han
45145,33500000.00,40500000.00,doanh nghiep tu nhan,7000000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,40500000.00,0.0,40500000.00,0,...,50000000.0,0,50000000.0,0.0,hop dong khong xac dinh thoi han,0,0.0,nganh khac,khong co du no,khong co quan he tin dung
27800,11846153.00,23129153.00,cq hanh chi su nghiep,11283000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,23129153.00,48.0,9569153.00,0,...,200000000.0,0,200000000.0,6283000.0,"cong chuc, vien chuc",0,48.0,y te,khong co du no,khong co quan he tin dung
7715,1700000.00,9200000.00,cq hanh chi su nghiep,7500000.0,ho khau thuong tru,co tien gui thanh toan hoac sp khac,9200000.00,12.0,9200000.00,0,...,50000000.0,0,50000000.0,0.0,"cong chuc, vien chuc",0,12.0,van phong,khong co du no,khong co quan he tin dung


In [40]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [45]:
label_encoder.classes_

array(['A', 'A+', 'A-', 'AA', 'AA+', 'AA-', 'AAA', 'B', 'BB', 'BBB'],
      dtype=object)

In [48]:
# ohe_feature_names = preprocessor.named_transformers_['ohe'].get_feature_names_out(ohe_cols)

# all_feature_names = np.concatenate(
#     [continuous_features, ohe_feature_names, ordinal_cols]
# )

# X_train_processed_df = pd.DataFrame(X_train_processed, columns=all_feature_names)

# X_train_processed_df.head()

### Evaluation function

In [49]:
def evaluate_model(model_pipeline, X_val, Y_val):
    predictions = model_pipeline.predict(X_val)

    report = classification_report(Y_val, predictions, output_dict=True)
    f1_macro = report["macro avg"]["f1-score"]
    precision_macro = report["macro avg"]["precision"]
    recall_macro = report["macro avg"]["recall"]

    metrics_df = pd.DataFrame(
        {
            "F1-Score (Macro)": [f1_macro],
            "Precision (Macro)": [precision_macro],
            "Recall (Macro)": [recall_macro],
        }
    )

    return metrics_df

## Model pipelines

In [50]:
xgb_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "model",
            XGBClassifier(
                max_depth=10,
                n_estimators=200,
                learning_rate=0.05,
                random_state=42,
                subsample=0.8,
            ),
        ),
    ]
)

### Model fitting

In [51]:
xgb_pipeline.fit(X_train, y_train_encoded)

### Evaluation

In [52]:
results = []

for name, pipeline in [
    ("XGBoost Classifier", xgb_pipeline),
]:
    result = evaluate_model(pipeline, X_test, y_test_encoded)
    result.index = [name]
    results.append(result)

results_df = pd.concat(results)
results_df

Unnamed: 0,F1-Score (Macro),Precision (Macro),Recall (Macro)
XGBoost Classifier,0.820308,0.844845,0.805488
