In [1]:
import sys
sys.path.append('/home/apoorva/Desktop/olr_baseline/')

In [3]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import optuna
from sklearn.metrics import f1_score
from scripts.utils.load import load_pca_anomaly

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
pca_x, olr_labels = load_pca_anomaly()
pca_x.shape, olr_labels.shape

((5960, 5960), (40, 135))

In [5]:
pca_x_50 = pca_x[:, :50]
pca_x_50.shape

(5960, 50)

In [6]:
def xgb(X_train, y_train, X_test, y_test):
    def objective(trial):
        """Define the objective function"""

        params = {
            'max_depth': trial.suggest_int('max_depth', 1, 9),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
            'eval_metric': 'mlogloss'
        }

        # Fit the model
        optuna_model = XGBClassifier(**params)
        optuna_model.fit(X_train, y_train)

        # Make predictions
        y_pred = optuna_model.predict(X_test)

        # Evaluate predictions
        accuracy = f1_score(y_test, y_pred, average='macro')
        return accuracy

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)
    print('Number of finished trials: {}'.format(len(study.trials)))
    print('Best trial:')
    trial = study.best_trial

    print('  Value: {}'.format(trial.value))
    print('  Params: ')

    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))
    params = trial.params
    model = XGBClassifier(**params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_train)
    print("Training\n")
    print(confusion_matrix(y_pred=y_pred, y_true=y_train))
    print(classification_report(y_pred=y_pred, y_true=y_train))

    y_pred = model.predict(X_test)
    print("Testing\n")
    print(confusion_matrix(y_pred=y_pred, y_true=y_test))
    print(classification_report(y_pred=y_pred, y_true=y_test))

In [7]:
def pentad_data(count):
    '''
    count is 0-indexed
    count = 0 corresponds to first leading pentad
    count = 1 corresponds to second leading pentad
    count = 2 corresponds to third leading pentad
    '''
    global olr_labels, pca_x
    assert count == 0 or count == 1 or count == 2
    pca_x_50 = pca_x[:, :50]
    pca_x_50 = np.array([pca_x_50[i*40+j:i*40+j+15, :] for j in range(134 - (5*count)) for i in range(40)])
    labels = np.reshape(np.reshape(olr_labels, (40, 135))[:, 1+(5*count):], (-1))
    X_train, X_test, y_train, y_test = train_test_split(pca_x_50, labels, random_state=1337, train_size=0.875, stratify=labels)
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))
    return X_train, X_test, y_train, y_test

# XGBoost-Optuna

## First Pentad

In [8]:
X_train, X_test, y_train, y_test = pentad_data(0)
xgb(X_train, y_train, X_test, y_test)

[I 2024-01-12 21:17:34,639] A new study created in memory with name: no-name-58ec0946-e649-4352-a79a-740bbcdf13a7


[I 2024-01-12 21:17:48,749] Trial 0 finished with value: 0.33009765889631243 and parameters: {'max_depth': 3, 'learning_rate': 0.11562898800169173, 'n_estimators': 370, 'min_child_weight': 4, 'gamma': 0.00016306359435604688, 'subsample': 0.052780136711504874, 'colsample_bytree': 0.20297036037815877, 'reg_alpha': 0.1863664391368987, 'reg_lambda': 2.6205567065971126e-07}. Best is trial 0 with value: 0.33009765889631243.
[I 2024-01-12 21:18:08,995] Trial 1 finished with value: 0.3032440810218588 and parameters: {'max_depth': 9, 'learning_rate': 0.02059521907044512, 'n_estimators': 146, 'min_child_weight': 5, 'gamma': 3.241168063772057e-08, 'subsample': 0.5637509093478659, 'colsample_bytree': 0.026008490778288984, 'reg_alpha': 3.092010662856703e-06, 'reg_lambda': 0.04303961208482573}. Best is trial 0 with value: 0.33009765889631243.
[I 2024-01-12 21:18:11,670] Trial 2 finished with value: 0.28027681660899656 and parameters: {'max_depth': 2, 'learning_rate': 0.014923944222583226, 'n_estimat

Number of finished trials: 100
Best trial:
  Value: 0.3695033006994905
  Params: 
    max_depth: 5
    learning_rate: 0.10560565743164968
    n_estimators: 450
    min_child_weight: 1
    gamma: 0.012751472230109246
    subsample: 0.03127736194247236
    colsample_bytree: 0.028886963481484302
    reg_alpha: 7.287349136824299e-07
    reg_lambda: 0.0023336701558706228
Training

[[ 210  340   64]
 [ 289 2824  288]
 [  77  385  213]]
              precision    recall  f1-score   support

         0.0       0.36      0.34      0.35       614
         1.0       0.80      0.83      0.81      3401
         2.0       0.38      0.32      0.34       675

    accuracy                           0.69      4690
   macro avg       0.51      0.50      0.50      4690
weighted avg       0.68      0.69      0.68      4690

Testing

[[ 18  56  14]
 [ 58 350  78]
 [ 16  62  18]]
              precision    recall  f1-score   support

         0.0       0.20      0.20      0.20        88
         1.0       0.

## Second Pentad

In [9]:
X_train, X_test, y_train, y_test = pentad_data(1)
xgb(X_train, y_train, X_test, y_test)

[I 2024-01-12 21:57:04,234] A new study created in memory with name: no-name-45ddb7a4-8622-4f07-a098-41584067aa6b
[I 2024-01-12 21:57:14,296] Trial 0 finished with value: 0.28851369920302067 and parameters: {'max_depth': 1, 'learning_rate': 0.05042422007548241, 'n_estimators': 497, 'min_child_weight': 6, 'gamma': 0.0007207430928831768, 'subsample': 0.034799825540364156, 'colsample_bytree': 0.5646196925150743, 'reg_alpha': 0.00166286455375801, 'reg_lambda': 2.3054671233694634e-05}. Best is trial 0 with value: 0.28851369920302067.
[I 2024-01-12 21:57:20,942] Trial 1 finished with value: 0.27927927927927926 and parameters: {'max_depth': 2, 'learning_rate': 0.012791322062867296, 'n_estimators': 238, 'min_child_weight': 7, 'gamma': 3.245156248330105e-05, 'subsample': 0.0158245175074423, 'colsample_bytree': 0.13511245203960318, 'reg_alpha': 0.00014932580829747448, 'reg_lambda': 1.711064652028969e-08}. Best is trial 0 with value: 0.28851369920302067.
[I 2024-01-12 21:57:23,545] Trial 2 finish

Number of finished trials: 100
Best trial:
  Value: 0.3750746181366797
  Params: 
    max_depth: 6
    learning_rate: 0.6124550561214089
    n_estimators: 169
    min_child_weight: 7
    gamma: 4.039758168352528e-06
    subsample: 0.0294508202584288
    colsample_bytree: 0.1349192800503863
    reg_alpha: 0.28187118519009
    reg_lambda: 4.4884047946473477e-07
Training

[[  68  417  105]
 [ 277 2466  507]
 [  63  473  139]]
              precision    recall  f1-score   support

         0.0       0.17      0.12      0.14       590
         1.0       0.73      0.76      0.75      3250
         2.0       0.19      0.21      0.19       675

    accuracy                           0.59      4515
   macro avg       0.36      0.36      0.36      4515
weighted avg       0.58      0.59      0.58      4515

Testing

[[ 11  62  11]
 [ 31 340  94]
 [  8  62  26]]
              precision    recall  f1-score   support

         0.0       0.22      0.13      0.16        84
         1.0       0.73     

## Third Pentad

In [10]:
X_train, X_test, y_train, y_test = pentad_data(2)
xgb(X_train, y_train, X_test, y_test)

[I 2024-01-12 22:14:09,890] A new study created in memory with name: no-name-5abda2d7-9541-451e-8d9e-3dcdc63e0db3
[I 2024-01-12 22:14:14,553] Trial 0 finished with value: 0.3226960540070537 and parameters: {'max_depth': 6, 'learning_rate': 0.4649901288526891, 'n_estimators': 151, 'min_child_weight': 5, 'gamma': 0.0005212251274516677, 'subsample': 0.02565343129585531, 'colsample_bytree': 0.03388625546219889, 'reg_alpha': 8.64414117699916e-08, 'reg_lambda': 1.0429200126191378e-07}. Best is trial 0 with value: 0.3226960540070537.
[I 2024-01-12 22:14:51,855] Trial 1 finished with value: 0.32650559933587625 and parameters: {'max_depth': 9, 'learning_rate': 0.6717476767495242, 'n_estimators': 273, 'min_child_weight': 1, 'gamma': 0.0012906665651359647, 'subsample': 0.3315726175935281, 'colsample_bytree': 0.05355682612137961, 'reg_alpha': 1.836532404688506e-08, 'reg_lambda': 8.404511744821143e-06}. Best is trial 1 with value: 0.32650559933587625.
[I 2024-01-12 22:14:57,080] Trial 2 finished wi

Number of finished trials: 100
Best trial:
  Value: 0.3806377891106079
  Params: 
    max_depth: 6
    learning_rate: 0.711147171755325
    n_estimators: 236
    min_child_weight: 6
    gamma: 1.434604718480666e-06
    subsample: 0.21914896518456406
    colsample_bytree: 0.05991325325914827
    reg_alpha: 1.4502476023842539e-08
    reg_lambda: 1.8531870754657047e-07
Training

[[ 223  288   57]
 [ 239 2592  266]
 [  64  318  293]]
              precision    recall  f1-score   support

         0.0       0.42      0.39      0.41       568
         1.0       0.81      0.84      0.82      3097
         2.0       0.48      0.43      0.45       675

    accuracy                           0.72      4340
   macro avg       0.57      0.55      0.56      4340
weighted avg       0.71      0.72      0.71      4340

Testing

[[ 15  53  13]
 [ 51 339  53]
 [ 10  68  18]]
              precision    recall  f1-score   support

         0.0       0.20      0.19      0.19        81
         1.0       0.