# Documentation

aggregation encoding

# Import packages

In [1]:
import tqdm
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import csv
import optuna
import xgboost as xgb
from optuna.integration import XGBoostPruningCallback
from train_evaluate import calculate_metrics
from ensemble_encode import agg_encode

In [2]:
num_act = 29
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Set printed tensor format
torch.set_printoptions(sci_mode=False, precision=3)

# Load data

In [4]:
# define file path
train_prefix_act_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/train_prefix_act_0616_l.pt'
train_prefix_time_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/train_prefix_time_0616_l.pt'
train_suffix_act_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/train_suffix_act_0616.pt'

val_prefix_act_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/val_prefix_act_0616_l.pt'
val_prefix_time_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/val_prefix_time_0616_l.pt'
val_suffix_act_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/val_suffix_act_0616.pt'

test_prefix_act_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/test_prefix_act_0616_l.pt'
test_prefix_time_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/test_prefix_time_0616_l.pt'
test_suffix_act_tensor_path = '/scratch/leuven/370/vsc37039/tensor_2017_0616/test_suffix_act_0616.pt'

## X_train

In [5]:
train_prefix_act_tensor = torch.load(train_prefix_act_tensor_path)
print(train_prefix_act_tensor.shape)

train_prefix_time_tensor = torch.load(train_prefix_time_tensor_path)
print(train_prefix_time_tensor.shape)

  train_prefix_act_tensor = torch.load(train_prefix_act_tensor_path)


torch.Size([630994, 87])


  train_prefix_time_tensor = torch.load(train_prefix_time_tensor_path)


torch.Size([630994, 87, 2])


In [6]:
print(train_prefix_act_tensor[1])
print(train_prefix_time_tensor[1])

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5])
tensor([[-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -1000

In [7]:
X_train = agg_encode(train_prefix_act_tensor, train_prefix_time_tensor, num_act)
print(X_train.shape)
type(X_train)

(630994, 30)


numpy.ndarray

In [8]:
X_train[1]

array([0.        , 0.        , 0.        , 1.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.0015388 , 0.00158357],
      dtype=float32)

## X_val

In [9]:
val_prefix_act_tensor = torch.load(val_prefix_act_tensor_path)
print(val_prefix_act_tensor.shape)

val_prefix_time_tensor = torch.load(val_prefix_time_tensor_path)
print(val_prefix_time_tensor.shape)

torch.Size([150877, 87])


  val_prefix_act_tensor = torch.load(val_prefix_act_tensor_path)
  val_prefix_time_tensor = torch.load(val_prefix_time_tensor_path)


torch.Size([150877, 87, 2])


In [10]:
print(val_prefix_act_tensor[1])
print(val_prefix_time_tensor[1])

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5])
tensor([[-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -1000

In [11]:
X_val = agg_encode(val_prefix_act_tensor, val_prefix_time_tensor, num_act)
print(X_val.shape)
type(X_val)

(150877, 30)


numpy.ndarray

In [12]:
X_val[1]

array([0.        , 0.        , 0.        , 1.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.00138183, 0.00142204],
      dtype=float32)

## X_test

In [13]:
test_prefix_act_tensor = torch.load(test_prefix_act_tensor_path)
print(test_prefix_act_tensor.shape)

test_prefix_time_tensor = torch.load(test_prefix_time_tensor_path)
print(test_prefix_time_tensor.shape)

  test_prefix_act_tensor = torch.load(test_prefix_act_tensor_path)


torch.Size([241181, 87])


  test_prefix_time_tensor = torch.load(test_prefix_time_tensor_path)


torch.Size([241181, 87, 2])


In [14]:
print(test_prefix_act_tensor[1])
print(test_prefix_time_tensor[1])

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8])
tensor([[-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -10000.000],
        [-10000.000, -1000

In [15]:
X_test = agg_encode(test_prefix_act_tensor, test_prefix_time_tensor, num_act)
print(X_test.shape)
type(X_test)

(241181, 30)


numpy.ndarray

In [16]:
X_test[1]

array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       6.5577959e-05, 6.7485984e-05], dtype=float32)

## y_train

In [17]:
train_suffix_act_tensor = torch.load(train_suffix_act_tensor_path)
print(train_suffix_act_tensor.shape)

y_train_tensor = train_suffix_act_tensor[:, 0]
print(y_train_tensor.shape)

y_train = y_train_tensor.numpy()
type(y_train)

  train_suffix_act_tensor = torch.load(train_suffix_act_tensor_path)


torch.Size([630994, 87])
torch.Size([630994])


numpy.ndarray

In [18]:
y_train[0]

5

## y_val

In [19]:
val_suffix_act_tensor = torch.load(val_suffix_act_tensor_path)
print(val_suffix_act_tensor.shape)

y_val_tensor = val_suffix_act_tensor[:, 0]
print(y_val_tensor.shape)

y_val = y_val_tensor.numpy()
type(y_val)

torch.Size([150877, 87])
torch.Size([150877])


  val_suffix_act_tensor = torch.load(val_suffix_act_tensor_path)


numpy.ndarray

In [20]:
y_val[0]

5

## y_test

In [21]:
test_suffix_act_tensor = torch.load(test_suffix_act_tensor_path)
print(test_suffix_act_tensor.shape)

y_test_tensor = test_suffix_act_tensor[:, 0]
print(y_test_tensor.shape)

y_test = y_test_tensor.numpy()
type(y_test)

  test_suffix_act_tensor = torch.load(test_suffix_act_tensor_path)


torch.Size([241181, 87])
torch.Size([241181])


numpy.ndarray

In [22]:
y_test[0]

8

# Hyperparameter tuning

In [23]:
def objective(trial):
    # Define the search space for hyperparameters
    param = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': num_act,
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('lambda', 0.1, 10.0),
        'eta': trial.suggest_float('eta', 0.01, 0.3, log=True)
    }

    # Convert the data into DMatrix format
    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dvalid = xgb.DMatrix(data=X_val, label=y_val)

    # Define the pruning callback for early stopping
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation-mlogloss')

    # Train the model with early stopping
    model = xgb.train(param, 
                      dtrain, 
                      evals=[(dvalid, 'validation')], 
                      num_boost_round = 100000, 
                      early_stopping_rounds=20, 
                      callbacks=[pruning_callback],
                      verbose_eval=False)

    return model.best_score

In [24]:
# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=7))
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2025-07-09 16:31:08,835] A new study created in memory with name: no-name-7df7258c-5ab9-4f65-b0b5-329a43b6b81d


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-07-09 16:40:22,776] Trial 0 finished with value: 0.4159197700436676 and parameters: {'max_depth': 3, 'min_child_weight': 8.019269130161032, 'subsample': 0.7192046157204468, 'colsample_bytree': 0.8617325889154706, 'lambda': 9.782096168766367, 'eta': 0.062434364533734486}. Best is trial 0 with value: 0.4159197700436676.
[I 2025-07-09 16:41:24,029] Trial 1 finished with value: 0.41115179380153266 and parameters: {'max_depth': 8, 'min_child_weight': 1.648460200237854, 'subsample': 0.6342194900509356, 'colsample_bytree': 0.74994125041278, 'lambda': 6.82437696159731, 'eta': 0.153893406647721}. Best is trial 1 with value: 0.41115179380153266.
[I 2025-07-09 16:48:40,224] Trial 2 finished with value: 0.4099238406247269 and parameters: {'max_depth': 6, 'min_child_weight': 1.593427122153146, 'subsample': 0.6440727996539968, 'colsample_bytree': 0.9547967638598068, 'lambda': 2.212515000441164, 'eta': 0.046541664776787166}. Best is trial 2 with value: 0.4099238406247269.
[I 2025-07-09 16:49:

In [25]:
# Print the best hyperparameters and the best RMSE
best_params = study.best_params
best_loss = study.best_value
print("Best Hyperparameters: ", best_params)
print("Best loss: ", best_loss)

Best Hyperparameters:  {'max_depth': 8, 'min_child_weight': 5.361891734274888, 'subsample': 0.9618841777880232, 'colsample_bytree': 0.6582241534026132, 'lambda': 3.3025003636212737, 'eta': 0.10167697633502167}
Best loss:  0.4097789016480817


# Performance metrics

## Retrain the model with best hyperparameters

Best Hyperparameters:  
{'max_depth': 8,   
'min_child_weight': 5.361891734274888,   
'subsample': 0.9618841777880232,   
'colsample_bytree': 0.6582241534026132,   
'lambda': 3.3025003636212737,   
'eta': 0.10167697633502167}  
Best loss:  0.4097789016480817

In [26]:
param = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': num_act,
        'max_depth': 8,
        'min_child_weight': 5.361891734274888,
        'subsample': 0.9618841777880232,
        'colsample_bytree': 0.6582241534026132,
        'lambda': 3.3025003636212737,
        'eta': 0.10167697633502167
    }

dtrain = xgb.DMatrix(data=X_train, label=y_train)
dvalid = xgb.DMatrix(data=X_val, label=y_val)

model = xgb.train(param, 
                   dtrain, 
                   evals=[(dtrain, 'train'), (dvalid, 'validation')], 
                   num_boost_round = 100000,
                   early_stopping_rounds=20,
                   verbose_eval=10)

[0]	train-mlogloss:2.57191	validation-mlogloss:2.58177
[10]	train-mlogloss:1.10607	validation-mlogloss:1.11573
[20]	train-mlogloss:0.72497	validation-mlogloss:0.73976
[30]	train-mlogloss:0.56771	validation-mlogloss:0.58561
[40]	train-mlogloss:0.49389	validation-mlogloss:0.51408
[50]	train-mlogloss:0.45366	validation-mlogloss:0.47573
[60]	train-mlogloss:0.43069	validation-mlogloss:0.45422
[70]	train-mlogloss:0.41629	validation-mlogloss:0.44144
[80]	train-mlogloss:0.40691	validation-mlogloss:0.43336
[90]	train-mlogloss:0.40055	validation-mlogloss:0.42799
[100]	train-mlogloss:0.39579	validation-mlogloss:0.42425
[110]	train-mlogloss:0.39185	validation-mlogloss:0.42147
[120]	train-mlogloss:0.38866	validation-mlogloss:0.41950
[130]	train-mlogloss:0.38579	validation-mlogloss:0.41791
[140]	train-mlogloss:0.38313	validation-mlogloss:0.41641
[150]	train-mlogloss:0.38058	validation-mlogloss:0.41532
[160]	train-mlogloss:0.37838	validation-mlogloss:0.41445
[170]	train-mlogloss:0.37633	validation-ml

In [27]:
model.save_model("xgb_20250621_1.json")

In [28]:
model = xgb.Booster()
model.load_model("xgb_20250621_1.json")

## Validation set

In [29]:
dvalid = xgb.DMatrix(X_val)

preds = model.predict(dvalid)  # shape: (num_samples, num_class)
print(preds.shape)
print(preds[0])
pred_labels = preds.argmax(axis=1)  # shape: (num_samples,)
print(pred_labels.shape)
print(pred_labels[0])
pred_labels_tensor = torch.from_numpy(pred_labels)

(150877, 29)
[7.4788336e-06 7.4788691e-06 7.4787977e-06 7.0097049e-06 7.4787476e-06
 6.4275593e-01 8.4082894e-05 2.3697212e-01 1.1990678e-01 4.0533152e-05
 7.6094493e-06 3.5545177e-06 1.3474127e-05 4.5101438e-06 7.3853821e-06
 1.3625157e-05 8.2178321e-06 5.4330235e-06 5.8030305e-06 2.1547412e-05
 4.5708689e-06 4.9924170e-06 1.3318877e-05 5.7432894e-06 1.1641090e-06
 1.2536317e-05 1.1035225e-06 4.2435786e-05 2.6627506e-05]
(150877,)
5


In [30]:
accuracy, precision_macro, recall_macro, f1_macro = calculate_metrics(pred_labels_tensor, y_val_tensor, device, num_act)
print('Validation Accuracy:', accuracy)
print('Validation Macro Precision:', precision_macro)
print('Validation Macro Recall:', recall_macro)
print('Validation Macro F1 score:', f1_macro)

Validation Accuracy: tensor(0.844)
Validation Macro Precision: tensor(0.778)
Validation Macro Recall: tensor(0.723)
Validation Macro F1 score: tensor(0.726)


## Test set

In [31]:
dtest = xgb.DMatrix(X_test)

preds = model.predict(dtest)  # shape: (num_samples, num_class)
print(preds.shape)
print(preds[0])
pred_labels = preds.argmax(axis=1)  # shape: (num_samples,)
print(pred_labels.shape)
print(pred_labels[0])
pred_labels_tensor = torch.from_numpy(pred_labels)

(241181, 29)
[7.4788336e-06 7.4788691e-06 7.4787977e-06 7.0097049e-06 7.4787476e-06
 6.4275593e-01 8.4082894e-05 2.3697212e-01 1.1990678e-01 4.0533152e-05
 7.6094493e-06 3.5545177e-06 1.3474127e-05 4.5101438e-06 7.3853821e-06
 1.3625157e-05 8.2178321e-06 5.4330235e-06 5.8030305e-06 2.1547412e-05
 4.5708689e-06 4.9924170e-06 1.3318877e-05 5.7432894e-06 1.1641090e-06
 1.2536317e-05 1.1035225e-06 4.2435786e-05 2.6627506e-05]
(241181,)
5


In [32]:
accuracy, precision_macro, recall_macro, f1_macro = calculate_metrics(pred_labels_tensor, y_test_tensor, device, num_act)
print('Test Accuracy:', accuracy)
print('Test Macro Precision:', precision_macro)
print('Test Macro Recall:', recall_macro)
print('Test Macro F1 score:', f1_macro)

Test Accuracy: tensor(0.833)
Test Macro Precision: tensor(0.781)
Test Macro Recall: tensor(0.714)
Test Macro F1 score: tensor(0.720)


In [33]:
y_true_np = y_test_tensor.cpu().numpy()
y_pred_np = pred_labels_tensor.cpu().numpy()

In [34]:
from sklearn.metrics import classification_report


In [35]:
report_dict = classification_report(y_true_np, y_pred_np, output_dict=True)

In [36]:
# Convert to DataFrame for tabular view
report_df = pd.DataFrame(report_dict).transpose()
report_df

Unnamed: 0,precision,recall,f1-score,support
3,0.869969,0.910394,0.889723,7812.0
5,0.639579,1.0,0.780175,3707.0
6,0.989881,0.971391,0.980549,8459.0
7,0.82628,0.857477,0.841589,26936.0
8,0.999427,0.902708,0.948609,5797.0
9,0.797218,0.318136,0.454786,5944.0
10,0.914259,0.732545,0.813377,8035.0
11,1.0,0.999381,0.99969,8073.0
12,0.797598,0.967098,0.874208,7416.0
13,0.888421,0.975183,0.929782,37232.0


In [37]:
# Only keep rows for actual class labels (exclude "accuracy", "macro avg", etc.)
report_df = report_df[report_df.index.str.isdigit()]

# Display the result
print(report_df[['precision', 'recall', 'f1-score']])

    precision    recall  f1-score
3    0.869969  0.910394  0.889723
5    0.639579  1.000000  0.780175
6    0.989881  0.971391  0.980549
7    0.826280  0.857477  0.841589
8    0.999427  0.902708  0.948609
9    0.797218  0.318136  0.454786
10   0.914259  0.732545  0.813377
11   1.000000  0.999381  0.999690
12   0.797598  0.967098  0.874208
13   0.888421  0.975183  0.929782
14   1.000000  1.000000  1.000000
15   0.941139  0.882295  0.910767
16   0.708003  0.808919  0.755104
17   0.838507  0.828010  0.833225
18   0.849132  0.880364  0.864466
19   0.397727  0.172414  0.240550
20   0.797281  0.772141  0.784509
21   0.865920  0.802634  0.833077
22   0.364865  0.012956  0.025023
23   0.193548  0.022693  0.040623
24   1.000000  1.000000  1.000000
25   0.125000  0.005252  0.010081
26   0.996596  1.000000  0.998295
27   0.724466  0.727056  0.725758
28   1.000000  0.300000  0.461538


In [38]:
total_support = report_df['support'].sum()
report_df['support (%)'] = 100.0 * report_df['support'] / total_support
report_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  report_df['support (%)'] = 100.0 * report_df['support'] / total_support


Unnamed: 0,precision,recall,f1-score,support,support (%)
3,0.869969,0.910394,0.889723,7812.0,3.239061
5,0.639579,1.0,0.780175,3707.0,1.53702
6,0.989881,0.971391,0.980549,8459.0,3.507324
7,0.82628,0.857477,0.841589,26936.0,11.168376
8,0.999427,0.902708,0.948609,5797.0,2.403589
9,0.797218,0.318136,0.454786,5944.0,2.464539
10,0.914259,0.732545,0.813377,8035.0,3.331523
11,1.0,0.999381,0.99969,8073.0,3.347279
12,0.797598,0.967098,0.874208,7416.0,3.074869
13,0.888421,0.975183,0.929782,37232.0,15.437369
