In [1]:
import sys
sys.path.append('/home/apoorva/Desktop/Work/olr/')

In [2]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import optuna
from sklearn.metrics import f1_score
from scripts.utils.load import load_pca_anomaly

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
pca_x, olr_labels = load_pca_anomaly()

In [3]:
pca_x.shape, olr_labels.shape

((5920, 5920), (40, 134))

In [4]:
pca_x_50 = pca_x[:, :50]
pca_x_50.shape

(5920, 50)

In [5]:
def xgb(X_train, y_train, X_test, y_test):
    def objective(trial):
        """Define the objective function"""

        params = {
            'max_depth': trial.suggest_int('max_depth', 1, 9),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
            'eval_metric': 'mlogloss'
        }

        # Fit the model
        optuna_model = XGBClassifier(**params)
        optuna_model.fit(X_train, y_train)

        # Make predictions
        y_pred = optuna_model.predict(X_test)

        # Evaluate predictions
        accuracy = f1_score(y_test, y_pred, average='macro')
        return accuracy

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50)
    print('Number of finished trials: {}'.format(len(study.trials)))
    print('Best trial:')
    trial = study.best_trial

    print('  Value: {}'.format(trial.value))
    print('  Params: ')

    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))
    params = trial.params
    model = XGBClassifier(**params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_train)
    print("Training\n")
    print(confusion_matrix(y_pred=y_pred, y_true=y_train))
    print(classification_report(y_pred=y_pred, y_true=y_train))

    y_pred = model.predict(X_test)
    print("Testing\n")
    print(confusion_matrix(y_pred=y_pred, y_true=y_test))
    print(classification_report(y_pred=y_pred, y_true=y_test))

In [6]:
def pentad_data(count):
    '''
    count is 0-indexed
    count = 0 corresponds to first leading pentad
    count = 1 corresponds to second leading pentad
    count = 2 corresponds to third leading pentad
    '''
    global olr_labels, pca_x
    assert count == 0 or count == 1 or count == 2
    pca_x_50 = pca_x[:, :50]
    pca_x_50 = np.array([pca_x_50[i*40+j:i*40+j+15, :] for i in range(40) for j in range(134 - (5*count))])
    labels = np.reshape(np.reshape(olr_labels, (40, 134))[:, (5*count):], (-1))
    X_train, X_test, y_train, y_test = train_test_split(pca_x_50, labels, random_state=1337, train_size=0.875, stratify=labels)
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))
    return X_train, X_test, y_train, y_test

# XGBoost-Optuna

## First Pentad

In [7]:
X_train, X_test, y_train, y_test = pentad_data(0)
xgb(X_train, y_train, X_test, y_test)

[I 2024-05-25 03:28:45,872] A new study created in memory with name: no-name-9d67b130-70e4-4998-ab04-2c313332ced5


[I 2024-05-25 03:29:24,460] Trial 0 finished with value: 0.3581055525480625 and parameters: {'max_depth': 7, 'learning_rate': 0.5821693355023613, 'n_estimators': 456, 'min_child_weight': 5, 'gamma': 0.009729813145320136, 'subsample': 0.17940244936564992, 'colsample_bytree': 0.018394605734343348, 'reg_alpha': 0.00017203484646750018, 'reg_lambda': 0.45634054428021686}. Best is trial 0 with value: 0.3581055525480625.
[I 2024-05-25 03:29:57,079] Trial 1 finished with value: 0.35673206087713866 and parameters: {'max_depth': 7, 'learning_rate': 0.5104840972918051, 'n_estimators': 358, 'min_child_weight': 9, 'gamma': 0.0005804237126166922, 'subsample': 0.27104362473328364, 'colsample_bytree': 0.27228768368566925, 'reg_alpha': 3.4144573024364596e-05, 'reg_lambda': 1.0370418634532727e-08}. Best is trial 0 with value: 0.3581055525480625.
[I 2024-05-25 03:30:15,208] Trial 2 finished with value: 0.3502615246196201 and parameters: {'max_depth': 7, 'learning_rate': 0.8281635732527914, 'n_estimators'

Number of finished trials: 50
Best trial:
  Value: 0.4090466149382361
  Params: 
    max_depth: 3
    learning_rate: 0.18268651366344776
    n_estimators: 285
    min_child_weight: 8
    gamma: 0.002998293326224193
    subsample: 0.35144208731398047
    colsample_bytree: 0.23895341821871915
    reg_alpha: 2.0435257900668234e-06
    reg_lambda: 8.393075923753608e-07
Training

[[ 217  360   37]
 [  91 3206  104]
 [  36  429  210]]
              precision    recall  f1-score   support

         0.0       0.63      0.35      0.45       614
         1.0       0.80      0.94      0.87      3401
         2.0       0.60      0.31      0.41       675

    accuracy                           0.77      4690
   macro avg       0.68      0.54      0.58      4690
weighted avg       0.75      0.77      0.75      4690

Testing

[[ 21  59   8]
 [ 42 397  47]
 [  8  73  15]]
              precision    recall  f1-score   support

         0.0       0.30      0.24      0.26        88
         1.0       0.7

## Second Pentad

In [8]:
X_train, X_test, y_train, y_test = pentad_data(1)
xgb(X_train, y_train, X_test, y_test)

[I 2024-05-25 03:43:59,557] A new study created in memory with name: no-name-9c1462b3-f27f-4602-bfbd-ef1196ffb89c


[I 2024-05-25 03:44:23,147] Trial 0 finished with value: 0.3864953527520923 and parameters: {'max_depth': 6, 'learning_rate': 0.11370108516672386, 'n_estimators': 385, 'min_child_weight': 4, 'gamma': 0.0014006354834385979, 'subsample': 0.04705108907769987, 'colsample_bytree': 0.031035524438344276, 'reg_alpha': 1.2443599916453006e-05, 'reg_lambda': 0.05340986782636175}. Best is trial 0 with value: 0.3864953527520923.
[I 2024-05-25 03:44:31,974] Trial 1 finished with value: 0.3570035500666701 and parameters: {'max_depth': 4, 'learning_rate': 0.7407058284827324, 'n_estimators': 424, 'min_child_weight': 3, 'gamma': 3.8538605089335395e-08, 'subsample': 0.022803349585781903, 'colsample_bytree': 0.8422379662535426, 'reg_alpha': 0.000630230055800853, 'reg_lambda': 7.716708949874697e-05}. Best is trial 0 with value: 0.3864953527520923.
[I 2024-05-25 03:44:42,253] Trial 2 finished with value: 0.2981230008200926 and parameters: {'max_depth': 2, 'learning_rate': 0.04219390862670692, 'n_estimators'

Number of finished trials: 50
Best trial:
  Value: 0.4143957356860583
  Params: 
    max_depth: 5
    learning_rate: 0.09691643484055391
    n_estimators: 286
    min_child_weight: 9
    gamma: 0.9247357806545483
    subsample: 0.05418253443274064
    colsample_bytree: 0.05003591739365998
    reg_alpha: 1.5969654931312175e-05
    reg_lambda: 0.033878590287384526
Training

[[ 114  440   36]
 [  52 3121   77]
 [  29  513  133]]
              precision    recall  f1-score   support

         0.0       0.58      0.19      0.29       590
         1.0       0.77      0.96      0.85      3250
         2.0       0.54      0.20      0.29       675

    accuracy                           0.75      4515
   macro avg       0.63      0.45      0.48      4515
weighted avg       0.71      0.75      0.69      4515

Testing

[[ 14  63   7]
 [ 14 418  33]
 [  5  77  14]]
              precision    recall  f1-score   support

         0.0       0.42      0.17      0.24        84
         1.0       0.75  

## Third Pentad

In [9]:
X_train, X_test, y_train, y_test = pentad_data(2)
xgb(X_train, y_train, X_test, y_test)

[I 2024-05-25 03:57:09,421] A new study created in memory with name: no-name-ef50ac40-8347-43b2-a7da-72d8ec6b8428


[I 2024-05-25 03:57:22,390] Trial 0 finished with value: 0.38324889025823605 and parameters: {'max_depth': 3, 'learning_rate': 0.3958103895276559, 'n_estimators': 380, 'min_child_weight': 1, 'gamma': 0.00018587074493888798, 'subsample': 0.08076672556216737, 'colsample_bytree': 0.019035456576298564, 'reg_alpha': 0.032049299134038645, 'reg_lambda': 4.193518708773556e-06}. Best is trial 0 with value: 0.38324889025823605.
[I 2024-05-25 03:57:36,657] Trial 1 finished with value: 0.3472838874680307 and parameters: {'max_depth': 6, 'learning_rate': 0.07483511763793442, 'n_estimators': 457, 'min_child_weight': 7, 'gamma': 0.41962322316264644, 'subsample': 0.016417470510183576, 'colsample_bytree': 0.7046475018196833, 'reg_alpha': 0.00012699859813369445, 'reg_lambda': 0.9274622041774321}. Best is trial 0 with value: 0.38324889025823605.
[I 2024-05-25 03:58:07,005] Trial 2 finished with value: 0.3639180307018448 and parameters: {'max_depth': 6, 'learning_rate': 0.032224635394821725, 'n_estimators

Number of finished trials: 50
Best trial:
  Value: 0.42456168060801697
  Params: 
    max_depth: 1
    learning_rate: 0.1630372114999095
    n_estimators: 462
    min_child_weight: 10
    gamma: 0.9282139015361749
    subsample: 0.05405069793500808
    colsample_bytree: 0.7069936647123539
    reg_alpha: 0.12362002443054881
    reg_lambda: 1.3852945789235005e-06
Training

[[ 127  399   42]
 [ 135 2804  158]
 [  41  486  148]]
              precision    recall  f1-score   support

         0.0       0.42      0.22      0.29       568
         1.0       0.76      0.91      0.83      3097
         2.0       0.43      0.22      0.29       675

    accuracy                           0.71      4340
   macro avg       0.53      0.45      0.47      4340
weighted avg       0.66      0.71      0.67      4340

Testing

[[ 13  61   7]
 [ 28 391  24]
 [  5  72  19]]
              precision    recall  f1-score   support

         0.0       0.28      0.16      0.20        81
         1.0       0.75   