# Documentation

index one-hot encoding

# Import packages

In [1]:
import tqdm
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import csv
import optuna
import xgboost as xgb
from optuna.integration import XGBoostPruningCallback
from train_evaluate import calculate_metrics
from ensemble_encode import index_encode

In [2]:
num_act = 29
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Set printed tensor format
torch.set_printoptions(sci_mode=False, precision=3)

# Load data

In [4]:
# define file path
train_prefix_act_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/train_prefix_act_0616_l.pt'
train_prefix_time_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/train_prefix_time_0616_l.pt'
train_suffix_act_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/train_suffix_act_0616.pt'

val_prefix_act_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/val_prefix_act_0616_l.pt'
val_prefix_time_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/val_prefix_time_0616_l.pt'
val_suffix_act_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/val_suffix_act_0616.pt'

test_prefix_act_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/test_prefix_act_0616_l.pt'
test_prefix_time_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/test_prefix_time_0616_l.pt'
test_suffix_act_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/test_suffix_act_0616.pt'

## X_train

In [5]:
train_prefix_act_tensor = torch.load(train_prefix_act_tensor_path)
print(train_prefix_act_tensor.shape)

train_prefix_time_tensor = torch.load(train_prefix_time_tensor_path)
print(train_prefix_time_tensor.shape)

  train_prefix_act_tensor = torch.load(train_prefix_act_tensor_path)


torch.Size([630994, 87])


  train_prefix_time_tensor = torch.load(train_prefix_time_tensor_path)


torch.Size([630994, 87, 2])


In [6]:
print(train_prefix_act_tensor[1])
print(train_prefix_time_tensor[1])

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5])
tensor([[-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -1000

In [7]:
X_train = index_encode(train_prefix_act_tensor, train_prefix_time_tensor, num_act)
print(X_train.shape)
type(X_train)

(630994, 2697)


numpy.ndarray

In [8]:
X_train[1]

array([1.        , 0.        , 0.        , ..., 0.        , 0.0030776 ,
       0.00316715], dtype=float32)

## X_val

In [9]:
val_prefix_act_tensor = torch.load(val_prefix_act_tensor_path)
print(val_prefix_act_tensor.shape)

val_prefix_time_tensor = torch.load(val_prefix_time_tensor_path)
print(val_prefix_time_tensor.shape)

  val_prefix_act_tensor = torch.load(val_prefix_act_tensor_path)


torch.Size([150877, 87])
torch.Size([150877, 87, 2])


  val_prefix_time_tensor = torch.load(val_prefix_time_tensor_path)


In [10]:
print(val_prefix_act_tensor[1])
print(val_prefix_time_tensor[1])

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5])
tensor([[-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -1000

In [11]:
X_val = index_encode(val_prefix_act_tensor, val_prefix_time_tensor, num_act)
print(X_val.shape)
type(X_val)

(150877, 2697)


numpy.ndarray

In [12]:
X_val[1]

array([1.        , 0.        , 0.        , ..., 0.        , 0.00276367,
       0.00284408], dtype=float32)

## X_test

In [13]:
test_prefix_act_tensor = torch.load(test_prefix_act_tensor_path)
print(test_prefix_act_tensor.shape)

test_prefix_time_tensor = torch.load(test_prefix_time_tensor_path)
print(test_prefix_time_tensor.shape)

  test_prefix_act_tensor = torch.load(test_prefix_act_tensor_path)


torch.Size([241181, 87])
torch.Size([241181, 87, 2])


  test_prefix_time_tensor = torch.load(test_prefix_time_tensor_path)


In [14]:
print(test_prefix_act_tensor[1])
print(test_prefix_time_tensor[1])

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8])
tensor([[-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -1000

In [15]:
X_test = index_encode(test_prefix_act_tensor, test_prefix_time_tensor, num_act)
print(X_test.shape)
type(X_test)

(241181, 2697)


numpy.ndarray

In [16]:
X_test[1]

array([1.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
       1.3115592e-04, 1.3497197e-04], dtype=float32)

## y_train

In [17]:
train_suffix_act_tensor = torch.load(train_suffix_act_tensor_path)
print(train_suffix_act_tensor.shape)

y_train_tensor = train_suffix_act_tensor[:, 0]
print(y_train_tensor.shape)

y_train = y_train_tensor.numpy()
type(y_train)

  train_suffix_act_tensor = torch.load(train_suffix_act_tensor_path)


torch.Size([630994, 87])
torch.Size([630994])


numpy.ndarray

In [18]:
y_train[0]

5

## y_val

In [19]:
val_suffix_act_tensor = torch.load(val_suffix_act_tensor_path)
print(val_suffix_act_tensor.shape)

y_val_tensor = val_suffix_act_tensor[:, 0]
print(y_val_tensor.shape)

y_val = y_val_tensor.numpy()
type(y_val)

  val_suffix_act_tensor = torch.load(val_suffix_act_tensor_path)


torch.Size([150877, 87])
torch.Size([150877])


numpy.ndarray

In [20]:
y_val[0]

5

## y_test

In [21]:
test_suffix_act_tensor = torch.load(test_suffix_act_tensor_path)
print(test_suffix_act_tensor.shape)

y_test_tensor = test_suffix_act_tensor[:, 0]
print(y_test_tensor.shape)

y_test = y_test_tensor.numpy()
type(y_test)

  test_suffix_act_tensor = torch.load(test_suffix_act_tensor_path)


torch.Size([241181, 87])
torch.Size([241181])


numpy.ndarray

In [22]:
y_test[0]

8

# Hyperparameter tuning

In [23]:
def objective(trial):
    # Define the search space for hyperparameters
    param = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': num_act,
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('lambda', 0.1, 10.0),
        'eta': trial.suggest_float('eta', 0.01, 0.3, log=True)
    }

    # Convert the data into DMatrix format
    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dvalid = xgb.DMatrix(data=X_val, label=y_val)

    # Define the pruning callback for early stopping
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation-mlogloss')

    # Train the model with early stopping
    model = xgb.train(param, 
                      dtrain, 
                      evals=[(dvalid, 'validation')], 
                      num_boost_round = 100000, 
                      early_stopping_rounds=20, 
                      callbacks=[pruning_callback],
                      verbose_eval=False)

    return model.best_score

In [24]:
# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=7))
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2025-06-21 19:24:33,958] A new study created in memory with name: no-name-16f7cd53-ae62-42aa-83ad-630ddf19c70c


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-06-21 23:43:47,520] Trial 0 finished with value: 0.29281825491937435 and parameters: {'max_depth': 3, 'min_child_weight': 8.019269130161032, 'subsample': 0.7192046157204468, 'colsample_bytree': 0.8617325889154706, 'lambda': 9.782096168766367, 'eta': 0.062434364533734486}. Best is trial 0 with value: 0.29281825491937435.


[I 2025-06-22 00:15:10,985] Trial 1 finished with value: 0.29304800637353273 and parameters: {'max_depth': 8, 'min_child_weight': 1.648460200237854, 'subsample': 0.6342194900509356, 'colsample_bytree': 0.74994125041278, 'lambda': 6.82437696159731, 'eta': 0.153893406647721}. Best is trial 0 with value: 0.29281825491937435.


[I 2025-06-22 02:37:14,880] Trial 2 finished with value: 0.289645412499421 and parameters: {'max_depth': 6, 'min_child_weight': 1.593427122153146, 'subsample': 0.6440727996539968, 'colsample_bytree': 0.9547967638598068, 'lambda': 2.212515000441164, 'eta': 0.046541664776787166}. Best is trial 2 with value: 0.289645412499421.


[I 2025-06-22 03:15:34,299] Trial 3 finished with value: 0.2907469851373043 and parameters: {'max_depth': 12, 'min_child_weight': 1.224093047953132, 'subsample': 0.8002744587320613, 'colsample_bytree': 0.9750647502068228, 'lambda': 2.3799985023075516, 'eta': 0.06459309846458268}. Best is trial 2 with value: 0.289645412499421.


[I 2025-06-22 04:13:43,583] Trial 4 finished with value: 0.2891276865867367 and parameters: {'max_depth': 12, 'min_child_weight': 2.1985250118332513, 'subsample': 0.761706290336883, 'colsample_bytree': 0.8752049295510174, 'lambda': 6.723231084750746, 'eta': 0.04908261480571623}. Best is trial 4 with value: 0.2891276865867367.


[I 2025-06-22 04:37:58,203] Trial 5 pruned. Trial was pruned at iteration 146.


[I 2025-06-22 04:38:19,658] Trial 6 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 04:59:46,238] Trial 7 finished with value: 0.2934248396373904 and parameters: {'max_depth': 9, 'min_child_weight': 4.333159746892317, 'subsample': 0.7295464889457157, 'colsample_bytree': 0.8596620612545207, 'lambda': 4.188619108226963, 'eta': 0.21822126884966023}. Best is trial 4 with value: 0.2891276865867367.


[I 2025-06-22 05:00:06,038] Trial 8 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 05:50:11,116] Trial 9 finished with value: 0.2919650588929062 and parameters: {'max_depth': 7, 'min_child_weight': 1.0128419250648237, 'subsample': 0.546131172923376, 'colsample_bytree': 0.8546971968626064, 'lambda': 5.291021407975452, 'eta': 0.10673661296072598}. Best is trial 4 with value: 0.2891276865867367.


[I 2025-06-22 05:50:33,086] Trial 10 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 05:50:53,145] Trial 11 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 05:51:14,021] Trial 12 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 05:51:33,814] Trial 13 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 06:48:07,069] Trial 14 finished with value: 0.2900458993273704 and parameters: {'max_depth': 7, 'min_child_weight': 5.756512909572162, 'subsample': 0.6449059270855146, 'colsample_bytree': 0.9198297640193214, 'lambda': 2.115630095260182, 'eta': 0.09763705328094417}. Best is trial 4 with value: 0.2891276865867367.


[I 2025-06-22 06:48:28,115] Trial 15 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 06:48:48,199] Trial 16 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 07:40:38,196] Trial 17 finished with value: 0.28867654652693947 and parameters: {'max_depth': 8, 'min_child_weight': 5.361891734274888, 'subsample': 0.9618841777880232, 'colsample_bytree': 0.6582241534026132, 'lambda': 8.002051385874758, 'eta': 0.10167697633502167}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 07:57:18,819] Trial 18 finished with value: 0.2928298552002892 and parameters: {'max_depth': 11, 'min_child_weight': 5.940036564846178, 'subsample': 0.9447342964452043, 'colsample_bytree': 0.6285098329592018, 'lambda': 7.955084400764418, 'eta': 0.2922977813893516}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 07:57:39,285] Trial 19 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 07:58:00,329] Trial 20 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 07:58:20,236] Trial 21 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 07:58:39,030] Trial 22 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 07:58:58,595] Trial 23 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 08:26:41,674] Trial 24 finished with value: 0.29003299323215104 and parameters: {'max_depth': 11, 'min_child_weight': 1.8674076626915284, 'subsample': 0.8350222840273821, 'colsample_bytree': 0.6930094877928307, 'lambda': 8.61748217546385, 'eta': 0.12501447588573117}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 08:27:02,067] Trial 25 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 08:27:22,627] Trial 26 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 08:27:45,242] Trial 27 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 09:02:00,579] Trial 28 pruned. Trial was pruned at iteration 174.


[I 2025-06-22 09:02:21,160] Trial 29 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 09:02:40,252] Trial 30 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 09:29:34,171] Trial 31 finished with value: 0.290577535929854 and parameters: {'max_depth': 11, 'min_child_weight': 1.6910040560603639, 'subsample': 0.8197142286177913, 'colsample_bytree': 0.6967999586470214, 'lambda': 8.282960862310418, 'eta': 0.12828231309773894}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 09:49:26,992] Trial 32 finished with value: 0.29168081752985525 and parameters: {'max_depth': 12, 'min_child_weight': 1.9269070749473243, 'subsample': 0.8418719663027149, 'colsample_bytree': 0.6838498412182695, 'lambda': 9.849185508886809, 'eta': 0.17914137983552253}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 10:14:07,174] Trial 33 finished with value: 0.29188110993423405 and parameters: {'max_depth': 11, 'min_child_weight': 1.431095948196229, 'subsample': 0.7125343952920604, 'colsample_bytree': 0.7566911929332866, 'lambda': 7.0811863782447295, 'eta': 0.14295291954657957}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 10:14:28,995] Trial 34 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 10:33:01,111] Trial 35 finished with value: 0.2927635385193692 and parameters: {'max_depth': 10, 'min_child_weight': 3.6406892272699247, 'subsample': 0.8220973961799578, 'colsample_bytree': 0.7587948003637034, 'lambda': 7.5732871750599315, 'eta': 0.23054887870055552}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 10:33:23,116] Trial 36 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 10:33:43,656] Trial 37 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 10:34:04,174] Trial 38 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 10:54:05,261] Trial 39 finished with value: 0.2918177976161026 and parameters: {'max_depth': 11, 'min_child_weight': 1.4396064088715315, 'subsample': 0.7344107974844878, 'colsample_bytree': 0.7261820491584801, 'lambda': 7.897380874805487, 'eta': 0.17224656478912012}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 10:54:25,529] Trial 40 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 10:54:45,739] Trial 41 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 10:55:05,706] Trial 42 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 10:55:26,175] Trial 43 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 10:55:46,531] Trial 44 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 10:56:06,222] Trial 45 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 10:56:27,298] Trial 46 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 10:56:48,221] Trial 47 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 11:02:49,170] Trial 48 pruned. Trial was pruned at iteration 36.


[I 2025-06-22 11:23:37,507] Trial 49 finished with value: 0.2923785825541405 and parameters: {'max_depth': 12, 'min_child_weight': 5.569308315086402, 'subsample': 0.7216649831330747, 'colsample_bytree': 0.9242985489114115, 'lambda': 1.7328605557591312, 'eta': 0.14878599580636395}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 11:23:57,622] Trial 50 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 11:50:47,463] Trial 51 finished with value: 0.2904439994982651 and parameters: {'max_depth': 11, 'min_child_weight': 1.7067186991626209, 'subsample': 0.8276888421575018, 'colsample_bytree': 0.6965845825650367, 'lambda': 8.327507369703344, 'eta': 0.12961030032731954}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 11:51:09,284] Trial 52 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 12:10:21,142] Trial 53 finished with value: 0.2912669564988149 and parameters: {'max_depth': 10, 'min_child_weight': 1.9260657217057209, 'subsample': 0.8460890711545402, 'colsample_bytree': 0.7815475342859417, 'lambda': 7.873726145476933, 'eta': 0.18509420179264915}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 12:10:47,768] Trial 54 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 12:17:07,138] Trial 55 pruned. Trial was pruned at iteration 35.


[I 2025-06-22 12:17:27,886] Trial 56 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 12:17:48,052] Trial 57 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 12:31:33,532] Trial 58 pruned. Trial was pruned at iteration 69.


[I 2025-06-22 13:00:13,130] Trial 59 finished with value: 0.2905396686521372 and parameters: {'max_depth': 12, 'min_child_weight': 5.460701371957095, 'subsample': 0.7393532109923847, 'colsample_bytree': 0.6186698349434505, 'lambda': 8.953388245403577, 'eta': 0.1337605283619569}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 13:00:34,042] Trial 60 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 13:00:55,325] Trial 61 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 13:01:17,358] Trial 62 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 13:25:55,610] Trial 63 finished with value: 0.29055902263424527 and parameters: {'max_depth': 12, 'min_child_weight': 5.008555570669058, 'subsample': 0.7925256329691098, 'colsample_bytree': 0.9740117198307867, 'lambda': 9.02569758469706, 'eta': 0.14206727256965}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 13:26:16,512] Trial 64 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 13:37:05,560] Trial 65 pruned. Trial was pruned at iteration 55.


[I 2025-06-22 14:01:02,268] Trial 66 finished with value: 0.29159888325610417 and parameters: {'max_depth': 11, 'min_child_weight': 5.579655475584983, 'subsample': 0.73887736039029, 'colsample_bytree': 0.7133048678321241, 'lambda': 6.649521433911328, 'eta': 0.1619195718111225}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 14:01:23,564] Trial 67 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 14:01:44,555] Trial 68 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 14:16:18,819] Trial 69 pruned. Trial was pruned at iteration 71.


[I 2025-06-22 14:16:39,534] Trial 70 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 14:41:38,260] Trial 71 finished with value: 0.2900325480060671 and parameters: {'max_depth': 12, 'min_child_weight': 4.747134572444972, 'subsample': 0.7945627235224213, 'colsample_bytree': 0.9862586954777832, 'lambda': 9.204648739347649, 'eta': 0.13760189033907874}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 15:03:47,476] Trial 72 finished with value: 0.29047762515827125 and parameters: {'max_depth': 12, 'min_child_weight': 4.1069492899879725, 'subsample': 0.796119637708633, 'colsample_bytree': 0.9403751265411532, 'lambda': 9.861264502454208, 'eta': 0.14156930286878613}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 15:04:10,988] Trial 73 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 15:21:46,318] Trial 74 finished with value: 0.29161408754385326 and parameters: {'max_depth': 12, 'min_child_weight': 3.869155950851884, 'subsample': 0.7972166802205117, 'colsample_bytree': 0.9535259226300793, 'lambda': 9.97624934563475, 'eta': 0.190804486213419}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 15:22:07,131] Trial 75 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 15:22:28,021] Trial 76 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 15:22:49,209] Trial 77 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 15:47:08,829] Trial 78 finished with value: 0.2904243638654457 and parameters: {'max_depth': 10, 'min_child_weight': 4.132390094103499, 'subsample': 0.8993273122996658, 'colsample_bytree': 0.9087939041310749, 'lambda': 2.9854643354936115, 'eta': 0.16735612676114167}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 16:10:08,653] Trial 79 finished with value: 0.29034723898722226 and parameters: {'max_depth': 10, 'min_child_weight': 3.263536891778843, 'subsample': 0.999929881550403, 'colsample_bytree': 0.9639870735699917, 'lambda': 2.865840745108778, 'eta': 0.16428337254979788}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 16:31:00,661] Trial 80 finished with value: 0.290483821734899 and parameters: {'max_depth': 10, 'min_child_weight': 3.5482479307679955, 'subsample': 0.9426412746148711, 'colsample_bytree': 0.8884356663366519, 'lambda': 3.066846008880979, 'eta': 0.16428091126703193}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 16:31:21,837] Trial 81 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 16:31:42,389] Trial 82 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 16:32:03,601] Trial 83 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 16:42:19,216] Trial 84 pruned. Trial was pruned at iteration 55.


[I 2025-06-22 16:51:38,998] Trial 85 pruned. Trial was pruned at iteration 51.


[I 2025-06-22 16:51:59,142] Trial 86 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 17:06:44,177] Trial 87 pruned. Trial was pruned at iteration 74.


[I 2025-06-22 17:07:04,167] Trial 88 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 17:07:24,158] Trial 89 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 17:21:35,331] Trial 90 pruned. Trial was pruned at iteration 76.


[I 2025-06-22 17:42:14,721] Trial 91 finished with value: 0.2911398729245011 and parameters: {'max_depth': 12, 'min_child_weight': 3.9844829322246014, 'subsample': 0.7702449474336215, 'colsample_bytree': 0.9406176058757088, 'lambda': 3.989698131647266, 'eta': 0.14963448790949133}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 17:42:35,260] Trial 92 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 17:42:57,407] Trial 93 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 17:43:17,856] Trial 94 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 17:43:39,237] Trial 95 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 18:01:51,195] Trial 96 finished with value: 0.2916859928447727 and parameters: {'max_depth': 12, 'min_child_weight': 2.869411932057625, 'subsample': 0.970654747865119, 'colsample_bytree': 0.9277464369828698, 'lambda': 2.1145159415607253, 'eta': 0.1708337060130787}. Best is trial 17 with value: 0.28867654652693947.


[I 2025-06-22 18:02:12,579] Trial 97 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 18:02:32,783] Trial 98 pruned. Trial was pruned at iteration 0.


[I 2025-06-22 18:02:54,149] Trial 99 pruned. Trial was pruned at iteration 0.


In [25]:
# Print the best hyperparameters and the best RMSE
best_params = study.best_params
best_loss = study.best_value
print("Best Hyperparameters: ", best_params)
print("Best loss: ", best_loss)

Best Hyperparameters:  {'max_depth': 8, 'min_child_weight': 5.361891734274888, 'subsample': 0.9618841777880232, 'colsample_bytree': 0.6582241534026132, 'lambda': 8.002051385874758, 'eta': 0.10167697633502167}
Best loss:  0.28867654652693947


# Performance metrics

## Retrain the model with best hyperparameters

Best Hyperparameters:    
{'max_depth': 8,   
'min_child_weight': 5.361891734274888,   
'subsample': 0.9618841777880232,   
'colsample_bytree': 0.6582241534026132,   
'lambda': 8.002051385874758,   
'eta': 0.10167697633502167}   
Best loss:  0.28867654652693947

In [23]:
param = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': num_act,
        'max_depth': 8,
        'min_child_weight': 5.361891734274888,
        'subsample': 0.9618841777880232,
        'colsample_bytree': 0.6582241534026132,
        'lambda': 8.002051385874758,
        'eta': 0.10167697633502167
    }

dtrain = xgb.DMatrix(data=X_train, label=y_train)
dvalid = xgb.DMatrix(data=X_val, label=y_val)

model = xgb.train(param, 
                   dtrain, 
                   evals=[(dtrain, 'train'), (dvalid, 'validation')], 
                   num_boost_round = 100000,
                   early_stopping_rounds=20,
                   verbose_eval=10)

[0]	train-mlogloss:2.27924	validation-mlogloss:2.27826
[10]	train-mlogloss:0.86822	validation-mlogloss:0.87787
[20]	train-mlogloss:0.52777	validation-mlogloss:0.54007
[30]	train-mlogloss:0.39563	validation-mlogloss:0.41083
[40]	train-mlogloss:0.33701	validation-mlogloss:0.35479
[50]	train-mlogloss:0.30929	validation-mlogloss:0.32869
[60]	train-mlogloss:0.29446	validation-mlogloss:0.31527
[70]	train-mlogloss:0.28559	validation-mlogloss:0.30757
[80]	train-mlogloss:0.27969	validation-mlogloss:0.30293
[90]	train-mlogloss:0.27527	validation-mlogloss:0.29973
[100]	train-mlogloss:0.27176	validation-mlogloss:0.29753
[110]	train-mlogloss:0.26821	validation-mlogloss:0.29562
[120]	train-mlogloss:0.26534	validation-mlogloss:0.29443
[130]	train-mlogloss:0.26253	validation-mlogloss:0.29344
[140]	train-mlogloss:0.25977	validation-mlogloss:0.29254
[150]	train-mlogloss:0.25704	validation-mlogloss:0.29177
[160]	train-mlogloss:0.25449	validation-mlogloss:0.29134
[170]	train-mlogloss:0.25191	validation-ml

In [24]:
model.save_model("xgb_2017_20250621_2.json")

In [24]:
model = xgb.Booster()
model.load_model("xgb_2017_20250621_2.json")

## Validation set

In [25]:
dvalid = xgb.DMatrix(X_val)

preds = model.predict(dvalid)  # shape: (num_samples, num_class)
print(preds.shape)
print(preds[0])
pred_labels = preds.argmax(axis=1)  # shape: (num_samples,)
print(pred_labels.shape)
print(pred_labels[0])
pred_labels_tensor = torch.from_numpy(pred_labels)

(150877, 29)
[8.66868595e-06 8.66874325e-06 8.66873506e-06 5.62735404e-05
 8.66880146e-06 6.42681837e-01 2.98163050e-05 2.37068117e-01
 1.19678110e-01 1.16176365e-04 7.69391845e-05 4.12472946e-06
 1.47358805e-05 2.47646894e-05 6.11837231e-06 4.30179389e-05
 1.92481366e-05 6.19082402e-06 1.21121366e-05 8.09033554e-06
 1.75395053e-05 6.59329180e-06 2.70536166e-05 1.36669469e-05
 3.87922091e-06 1.63274581e-05 4.30776117e-06 2.12152354e-05
 9.03822092e-06]
(150877,)
5


In [26]:
accuracy, precision_macro, recall_macro, f1_macro = calculate_metrics(pred_labels_tensor, y_val_tensor, device, num_act)
print('Validation Accuracy:', accuracy)
print('Validation Macro Precision:', precision_macro)
print('Validation Macro Recall:', recall_macro)
print('Validation Macro F1 score:', f1_macro)

Validation Accuracy: tensor(0.892)
Validation Macro Precision: tensor(0.818)
Validation Macro Recall: tensor(0.751)
Validation Macro F1 score: tensor(0.762)


## Test set

In [27]:
dtest = xgb.DMatrix(X_test)

preds = model.predict(dtest)  # shape: (num_samples, num_class)
print(preds.shape)
print(preds[0])
pred_labels = preds.argmax(axis=1)  # shape: (num_samples,)
print(pred_labels.shape)
print(pred_labels[0])
pred_labels_tensor = torch.from_numpy(pred_labels)

(241181, 29)
[8.66868595e-06 8.66874325e-06 8.66873506e-06 5.62735404e-05
 8.66880146e-06 6.42681837e-01 2.98163050e-05 2.37068117e-01
 1.19678110e-01 1.16176365e-04 7.69391845e-05 4.12472946e-06
 1.47358805e-05 2.47646894e-05 6.11837231e-06 4.30179389e-05
 1.92481366e-05 6.19082402e-06 1.21121366e-05 8.09033554e-06
 1.75395053e-05 6.59329180e-06 2.70536166e-05 1.36669469e-05
 3.87922091e-06 1.63274581e-05 4.30776117e-06 2.12152354e-05
 9.03822092e-06]
(241181,)
5


In [28]:
accuracy, precision_macro, recall_macro, f1_macro = calculate_metrics(pred_labels_tensor, y_test_tensor, device, num_act)
print('Test Accuracy:', accuracy)
print('Test Macro Precision:', precision_macro)
print('Test Macro Recall:', recall_macro)
print('Test Macro F1 score:', f1_macro)

Test Accuracy: tensor(0.887)
Test Macro Precision: tensor(0.813)
Test Macro Recall: tensor(0.756)
Test Macro F1 score: tensor(0.765)


In [29]:
y_true_np = y_test_tensor.cpu().numpy()
y_pred_np = pred_labels_tensor.cpu().numpy()

In [30]:
from sklearn.metrics import classification_report
report_dict = classification_report(y_true_np, y_pred_np, output_dict=True)
# Convert to DataFrame for tabular view
report_df = pd.DataFrame(report_dict).transpose()
report_df.to_csv('2017_xgb.csv')
report_df

Unnamed: 0,precision,recall,f1-score,support
3,0.955735,0.986687,0.970964,7812.0
5,0.639579,1.0,0.780175,3707.0
6,0.974335,0.978366,0.976346,8459.0
7,0.851255,0.884059,0.867347,26936.0
8,0.998284,0.903398,0.948474,5797.0
9,0.727885,0.400067,0.516339,5944.0
10,0.956242,0.720722,0.821943,8035.0
11,1.0,1.0,1.0,8073.0
12,0.833699,0.973436,0.898165,7416.0
13,0.913787,0.992104,0.951336,37232.0


In [31]:
# Only keep rows for actual class labels (exclude "accuracy", "macro avg", etc.)
report_df = report_df[report_df.index.str.isdigit()]

# Display the result
print(report_df[['precision', 'recall', 'f1-score']])

    precision    recall  f1-score
3    0.955735  0.986687  0.970964
5    0.639579  1.000000  0.780175
6    0.974335  0.978366  0.976346
7    0.851255  0.884059  0.867347
8    0.998284  0.903398  0.948474
9    0.727885  0.400067  0.516339
10   0.956242  0.720722  0.821943
11   1.000000  1.000000  1.000000
12   0.833699  0.973436  0.898165
13   0.913787  0.992104  0.951336
14   1.000000  1.000000  1.000000
15   0.967631  0.836642  0.897381
16   0.819271  0.908009  0.861360
17   1.000000  0.999883  0.999941
18   0.878400  0.868499  0.873421
19   0.646154  0.310345  0.419301
20   0.870101  0.878102  0.874083
21   1.000000  0.999815  0.999907
22   0.439024  0.043186  0.078637
23   0.388087  0.054211  0.095133
24   1.000000  1.000000  1.000000
25   0.000000  0.000000  0.000000
26   0.977906  0.869342  0.920434
27   0.905728  0.904648  0.905188
28   0.571429  0.400000  0.470588


In [32]:
total_support = report_df['support'].sum()
report_df['support (%)'] = 100.0 * report_df['support'] / total_support
report_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  report_df['support (%)'] = 100.0 * report_df['support'] / total_support


Unnamed: 0,precision,recall,f1-score,support,support (%)
3,0.955735,0.986687,0.970964,7812.0,3.239061
5,0.639579,1.0,0.780175,3707.0,1.53702
6,0.974335,0.978366,0.976346,8459.0,3.507324
7,0.851255,0.884059,0.867347,26936.0,11.168376
8,0.998284,0.903398,0.948474,5797.0,2.403589
9,0.727885,0.400067,0.516339,5944.0,2.464539
10,0.956242,0.720722,0.821943,8035.0,3.331523
11,1.0,1.0,1.0,8073.0,3.347279
12,0.833699,0.973436,0.898165,7416.0,3.074869
13,0.913787,0.992104,0.951336,37232.0,15.437369


In [33]:
def get_freq(y):
    unique, counts = np.unique(y, return_counts=True)
    percentages = counts / counts.sum() * 100
    distribution = dict(zip(unique, percentages))
    return distribution

In [34]:
dist_train = get_freq(y_train)
dist_train 

{3: 2.683543742095804,
 5: 1.7253729829443705,
 6: 4.013033404438078,
 7: 12.784907621942523,
 8: 2.683543742095804,
 9: 2.683543742095804,
 10: 3.5952798283343426,
 11: 3.5952798283343426,
 12: 3.3475754127614525,
 13: 16.33628845916126,
 14: 2.6719746938956632,
 15: 1.7092080114866386,
 16: 18.628227843687895,
 17: 3.1218997328025306,
 18: 1.9692738758213233,
 19: 0.14374146188394818,
 20: 11.82055613841019,
 21: 1.6706973441902775,
 22: 0.8805155041093893,
 23: 1.4809966497304254,
 24: 1.4809966497304254,
 25: 0.32203158825598976,
 26: 0.4050751671172785,
 27: 0.21901951524103241,
 28: 0.027417059433211727}

In [35]:
dist_train = get_freq(y_test)
dist_train 

{3: 3.2390611200716473,
 5: 1.5370199145040446,
 6: 3.5073243746397935,
 7: 11.16837561831156,
 8: 2.4035890057674525,
 9: 2.4645390806075107,
 10: 3.331522798230375,
 11: 3.3472785998897097,
 12: 3.0748690817270017,
 13: 15.437368615272348,
 14: 2.4661975860453356,
 15: 2.0888875989402154,
 16: 16.586712883684868,
 17: 3.543811494271937,
 18: 2.0967654997698824,
 19: 0.16833830193920749,
 20: 15.871067787263508,
 21: 2.235250703828245,
 22: 0.8640813331066709,
 23: 1.6444081416031944,
 24: 1.6912609202217421,
 25: 0.39472429420227956,
 26: 0.48552746692318216,
 27: 0.34787151558373175,
 28: 0.004146263594561761}