In [1]:
!pwd

/home/quyanh/Projects/credit-ranking/src/training_pipeline/nbs


In [11]:
import pandas as pd
from pathlib import Path
import numpy as np

SEED = 43
np.random.seed(SEED)

# Load Data

In [5]:
DATA_DIR = Path("../data")
FILE_NAME = "credit-dataset.parquet"
DATA_PATH = DATA_DIR / FILE_NAME

if not DATA_PATH.is_file():
    raise Exception("DATA_PATH not found")

In [8]:
df = pd.read_parquet(DATA_PATH)
df.head()

Unnamed: 0,id,income_expenditure_difference,income,total_expenses,loan_term,expected_loan_interest,result,event_timestamp
0,0,17520000.0,30500000.0,12980000.0,12.0,980000.0,AA-,2024-03-30 14:45:28.686584+00:00
1,1,4749121.67,7219121.67,2470000.0,60.0,2470000.0,A+,2024-03-30 14:45:28.686584+00:00
2,2,668138.0,5668138.0,5000000.0,12.0,0.0,A+,2024-03-30 14:45:28.686584+00:00
3,3,9016754.0,22266754.0,13250000.0,12.0,9250000.0,AA+,2024-03-30 14:45:28.686584+00:00
4,4,22579692.0,35614692.0,13035000.0,60.0,9035000.0,A+,2024-03-30 14:45:28.686584+00:00


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120017 entries, 0 to 120016
Data columns (total 8 columns):
 #   Column                         Non-Null Count   Dtype              
---  ------                         --------------   -----              
 0   id                             120017 non-null  int64              
 1   income_expenditure_difference  120017 non-null  float64            
 2   income                         120017 non-null  float64            
 3   total_expenses                 120017 non-null  float64            
 4   loan_term                      120017 non-null  float64            
 5   expected_loan_interest         120017 non-null  float64            
 6   result                         120017 non-null  object             
 7   event_timestamp                120017 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), float64(5), int64(1), object(1)
memory usage: 7.3+ MB


In [20]:
df.describe()

Unnamed: 0,id,income_expenditure_difference,income,total_expenses,loan_term,expected_loan_interest
count,120017.0,120017.0,120017.0,120017.0,120017.0,120017.0
mean,60014.127657,-1179054000.0,390924100.0,1569876000.0,44.229368,4949105.0
std,34656.541324,211060000000.0,83886500000.0,193671800000.0,70.941731,1155007000.0
min,0.0,-39761540000000.0,0.0,0.0,0.0,0.0
25%,30004.0,0.0,5100000.0,0.0,12.0,0.0
50%,60008.0,8419780.0,15380000.0,5000000.0,12.0,0.0
75%,90012.0,16583330.0,25000000.0,8500000.0,60.0,1000000.0
max,124049.0,27921030000000.0,27921040000000.0,39762050000000.0,1092.0,400001500000.0


# EDA

# Processing

In [25]:
features_name = df.columns[1:-2].tolist()
target_name = df.columns[-2]
features_name, target_name

(['income_expenditure_difference',
  'income',
  'total_expenses',
  'loan_term',
  'expected_loan_interest'],
 'result')

In [27]:
labels = df[target_name].unique()
labels

array(['AA-', 'A+', 'AA+', 'BBB', 'A-', 'A', 'AA', 'AAA', 'BB', 'B'],
      dtype=object)

In [31]:
# id_to_labels = dict(enumerate(labels))
label_to_ids = {l: i for i, l in enumerate(labels)}
label_to_ids

{'AA-': 0,
 'A+': 1,
 'AA+': 2,
 'BBB': 3,
 'A-': 4,
 'A': 5,
 'AA': 6,
 'AAA': 7,
 'BB': 8,
 'B': 9}

In [32]:
# Replace
df[target_name] = df[target_name].replace(label_to_ids)
df.head()

Unnamed: 0,id,income_expenditure_difference,income,total_expenses,loan_term,expected_loan_interest,result,event_timestamp
0,0,17520000.0,30500000.0,12980000.0,12.0,980000.0,0,2024-03-30 14:45:28.686584+00:00
1,1,4749121.67,7219121.67,2470000.0,60.0,2470000.0,1,2024-03-30 14:45:28.686584+00:00
2,2,668138.0,5668138.0,5000000.0,12.0,0.0,1,2024-03-30 14:45:28.686584+00:00
3,3,9016754.0,22266754.0,13250000.0,12.0,9250000.0,2,2024-03-30 14:45:28.686584+00:00
4,4,22579692.0,35614692.0,13035000.0,60.0,9035000.0,1,2024-03-30 14:45:28.686584+00:00


# Split

In [33]:
features_name = df.columns[1:-2].tolist()
target_name = df.columns[-2]
features_name, target_name

(['income_expenditure_difference',
  'income',
  'total_expenses',
  'loan_term',
  'expected_loan_interest'],
 'result')

In [40]:
df[target_name].value_counts()

9    29680
2    20362
6    18631
0    18195
1    14479
5     6939
4     4848
7     4516
3     1625
8      742
Name: result, dtype: int64

In [34]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

TEST_SIZE = 0.2
X, y = df[features_name], df[target_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((96013, 5), (24004, 5), (96013,), (24004,))

# Training model

In [43]:
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(X_train, y_train)

# Evaluation

In [44]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.26      0.25      0.25      3581
           1       0.29      0.27      0.28      2981
           2       0.36      0.71      0.48      4061
           3       0.19      0.03      0.05       330
           4       0.26      0.11      0.15      1000
           5       0.20      0.06      0.09      1382
           6       0.26      0.21      0.23      3694
           7       0.23      0.02      0.04       889
           8       0.26      0.06      0.10       142
           9       1.00      0.99      1.00      5944

    accuracy                           0.48     24004
   macro avg       0.33      0.27      0.27     24004
weighted avg       0.46      0.48      0.45     24004

