In [1]:
!pwd

/home/quyanh/Projects/credit-ranking/src/training_pipeline/nbs


In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

SEED = 43
np.random.seed(SEED)

# Load Data

In [2]:
DATA_DIR = Path("../data")
FILE_NAME = "credit-dataset.parquet"
DATA_PATH = DATA_DIR / FILE_NAME

if not DATA_PATH.is_file():
    raise Exception("DATA_PATH not found")

In [3]:
df = pd.read_parquet(DATA_PATH)
df.head()

Unnamed: 0,id,income_expenditure_difference,income,total_expenses,loan_term,expected_loan_interest,result,event_timestamp
0,0,17520000.0,30500000.0,12980000.0,12.0,980000.0,AA-,2024-03-30 14:45:28.686584+00:00
1,1,4749121.67,7219121.67,2470000.0,60.0,2470000.0,A+,2024-03-30 14:45:28.686584+00:00
2,2,668138.0,5668138.0,5000000.0,12.0,0.0,A+,2024-03-30 14:45:28.686584+00:00
3,3,9016754.0,22266754.0,13250000.0,12.0,9250000.0,AA+,2024-03-30 14:45:28.686584+00:00
4,4,22579692.0,35614692.0,13035000.0,60.0,9035000.0,A+,2024-03-30 14:45:28.686584+00:00


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120017 entries, 0 to 120016
Data columns (total 8 columns):
 #   Column                         Non-Null Count   Dtype              
---  ------                         --------------   -----              
 0   id                             120017 non-null  int64              
 1   income_expenditure_difference  120017 non-null  float64            
 2   income                         120017 non-null  float64            
 3   total_expenses                 120017 non-null  float64            
 4   loan_term                      120017 non-null  float64            
 5   expected_loan_interest         120017 non-null  float64            
 6   result                         120017 non-null  object             
 7   event_timestamp                120017 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), float64(5), int64(1), object(1)
memory usage: 7.3+ MB


In [5]:
df.describe()

Unnamed: 0,id,income_expenditure_difference,income,total_expenses,loan_term,expected_loan_interest
count,120017.0,120017.0,120017.0,120017.0,120017.0,120017.0
mean,60014.127657,-1179054000.0,390924100.0,1569876000.0,44.229368,4949105.0
std,34656.541324,211060000000.0,83886500000.0,193671800000.0,70.941731,1155007000.0
min,0.0,-39761540000000.0,0.0,0.0,0.0,0.0
25%,30004.0,0.0,5100000.0,0.0,12.0,0.0
50%,60008.0,8419780.0,15380000.0,5000000.0,12.0,0.0
75%,90012.0,16583330.0,25000000.0,8500000.0,60.0,1000000.0
max,124049.0,27921030000000.0,27921040000000.0,39762050000000.0,1092.0,400001500000.0


# EDA

# Processing

In [6]:
features_name = df.columns[1:-2].tolist()
target_name = df.columns[-2]
features_name, target_name

(['income_expenditure_difference',
  'income',
  'total_expenses',
  'loan_term',
  'expected_loan_interest'],
 'result')

In [7]:
labels = df[target_name].unique()
labels

array(['AA-', 'A+', 'AA+', 'BBB', 'A-', 'A', 'AA', 'AAA', 'BB', 'B'],
      dtype=object)

In [8]:
# id_to_labels = dict(enumerate(labels))
label_to_ids = {l: i for i, l in enumerate(labels)}
label_to_ids

{'AA-': 0,
 'A+': 1,
 'AA+': 2,
 'BBB': 3,
 'A-': 4,
 'A': 5,
 'AA': 6,
 'AAA': 7,
 'BB': 8,
 'B': 9}

In [9]:
# Replace
df[target_name] = df[target_name].replace(label_to_ids)
df.head()

Unnamed: 0,id,income_expenditure_difference,income,total_expenses,loan_term,expected_loan_interest,result,event_timestamp
0,0,17520000.0,30500000.0,12980000.0,12.0,980000.0,0,2024-03-30 14:45:28.686584+00:00
1,1,4749121.67,7219121.67,2470000.0,60.0,2470000.0,1,2024-03-30 14:45:28.686584+00:00
2,2,668138.0,5668138.0,5000000.0,12.0,0.0,1,2024-03-30 14:45:28.686584+00:00
3,3,9016754.0,22266754.0,13250000.0,12.0,9250000.0,2,2024-03-30 14:45:28.686584+00:00
4,4,22579692.0,35614692.0,13035000.0,60.0,9035000.0,1,2024-03-30 14:45:28.686584+00:00


# Split

In [10]:
features_name = df.columns[1:-2].tolist()
target_name = df.columns[-2]
features_name, target_name

(['income_expenditure_difference',
  'income',
  'total_expenses',
  'loan_term',
  'expected_loan_interest'],
 'result')

In [11]:
df[target_name].value_counts()

9    29680
2    20362
6    18631
0    18195
1    14479
5     6939
4     4848
7     4516
3     1625
8      742
Name: result, dtype: int64

In [12]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

TEST_SIZE = 0.2
X, y = df[features_name], df[target_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((96013, 5), (24004, 5), (96013,), (24004,))

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def eval_metrics(actual, pred):
    accuracy = accuracy_score(actual, pred)
    precision = precision_score(actual, pred, average='macro')
    recall = recall_score(actual, pred, average='macro')
    f1 = f1_score(actual, pred, average='macro')
    return accuracy, precision, recall, f1

# Training model

In [19]:
from mlflow.tracking import MlflowClient
import mlflow

def yield_artifacts(run_id, path=None):
    """Yield all artifacts in the specified run"""
    client = MlflowClient()
    for item in client.list_artifacts(run_id, path):
        if item.is_dir:
            yield from yield_artifacts(run_id, item.path)
        else:
            yield item.path

def fetch_logged_data(run_id):
    """Fetch params, metrics, tags, and artifacts in the specified run"""
    client = MlflowClient()
    data = client.get_run(run_id).data
    # Exclude system tags: https://www.mlflow.org/docs/latest/tracking.html#system-tags
    tags = {k: v for k, v in data.tags.items() if not k.startswith("mlflow.")}
    artifacts = list(yield_artifacts(run_id))
    return {
        "params": data.params,
        "metrics": data.metrics,
        "tags": tags,
        "artifacts": artifacts,
    }

MLFLOW_TRACKING_URI = "http://127.0.0.0:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("xgb")
print((mlflow.get_tracking_uri(), mlflow.get_artifact_uri()))
mlflow.sklearn.autolog()

2024/04/16 23:23:49 INFO mlflow.tracking.fluent: Experiment with name 'xgb' does not exist. Creating a new experiment.


('http://127.0.0.0:5000', 'mlflow-artifacts:/368780712213608333/2a55d712357c4e70a37b757777ded7f2/artifacts')


In [15]:
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(X_train, y_train)

# Evaluation

In [27]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)

accuracy, precision, recall, f1 = eval_metrics(y_test, y_pred)
accuracy, precision, recall, f1

(0.4781286452257957, 0.332258913258397, 0.2712510156718058, 0.2680585041213551)

In [29]:
import uuid
from mlflow.models.signature import infer_signature

mlflow.set_tag("mlflow.runName", uuid.uuid1())
mlflow.log_param("features", features_name)
mlflow.log_metric("testing_accuracy", accuracy)
mlflow.log_metric("testing_precision", precision)
mlflow.log_metric("testing_recall", recall)
mlflow.log_metric("testing_recall", f1)

signature = infer_signature(X_train, model.predict(X_train))
mlflow.sklearn.log_model(
    sk_model=model,
    artifact_path="model",
    signature=signature,
)
mlflow.end_run()

run_id = mlflow.last_active_run().info.run_id
print("Logged data and model in run {}".format(run_id))
for key, data in fetch_logged_data(run_id).items():
    print("\n---------- logged {} ----------".format(key))
    print(data)



Logged data and model in run 2a55d712357c4e70a37b757777ded7f2

---------- logged params ----------
{'features': "['income_expenditure_difference', 'income', 'total_expenses', 'loan_term', 'expected_loan_interest']"}

---------- logged metrics ----------
{'testing_recall': 0.2680585041213551, 'testing_accuracy': 0.4781286452257957, 'testing_precision': 0.332258913258397}

---------- logged tags ----------
{}

---------- logged artifacts ----------
['model/MLmodel', 'model/conda.yaml', 'model/metadata/MLmodel', 'model/metadata/conda.yaml', 'model/metadata/python_env.yaml', 'model/metadata/requirements.txt', 'model/model.pkl', 'model/python_env.yaml', 'model/requirements.txt']


# Reload model

In [30]:
run_id

'2a55d712357c4e70a37b757777ded7f2'

In [31]:
loaded_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
loaded_model

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

mlflow.pyfunc.loaded_model:
  artifact_path: model
  flavor: mlflow.sklearn
  run_id: 2a55d712357c4e70a37b757777ded7f2

In [32]:
predictions = loaded_model.predict(X_test[:2])
predictions

array([6, 2])