In [1]:
import json

import numpy as np
import pandas as pd

In [2]:
# Location of the trial log that the rest of the notebook analyses.
# The path is relative, so it is resolved against the kernel's current
# working directory.
filename = "../bayesian_optimization_undersampling_no_lambda_logs.json"

In [3]:
# The log is read as JSON Lines: one JSON object (one logged trial) per line.
# The encoding is given explicitly so the result does not depend on the
# platform's default encoding, and blank lines (e.g. a trailing newline at the
# end of the file) are skipped because json.loads rejects empty input.
with open(filename, "r", encoding="utf-8") as infile:
    json_list = [json.loads(line) for line in infile if line.strip()]
# Flatten the parsed records into a table with one row per trial.
df = pd.json_normalize(json_list)

**Current best mcc, f1_macro, balanced_accuracy and accuracy results, respectively.**

In [4]:
# Index label of the best trial for each metric, in this order:
# matthews_corrcoef, f1_macro, balanced_accuracy, accuracy.
best_mcc_f1_accuracy_indices = [
    df[metric].idxmax()
    for metric in ("matthews_corrcoef", "f1_macro", "balanced_accuracy", "accuracy")
]
# idxmax() returns index *labels*, so the rows are selected with the
# label-based .loc. The position-based .iloc only gives the same rows while
# df keeps a default 0..n-1 index. A label may appear more than once when one
# trial is best on several metrics; the row is then repeated on purpose so the
# output stays aligned with the metric order above.
df.loc[
    best_mcc_f1_accuracy_indices,
    [
        "model",
        "dataset",
        "scaled",
        "pca_components",
        "matthews_corrcoef",
        "f1_macro",
        "balanced_accuracy",
        "accuracy",
    ],
]

Unnamed: 0,model,dataset,scaled,pca_components,matthews_corrcoef,f1_macro,balanced_accuracy,accuracy
53,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,0.60335,0.640675,0.642713,0.642659
53,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,0.60335,0.640675,0.642713,0.642659
53,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,0.60335,0.640675,0.642713,0.642659
53,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,0.60335,0.640675,0.642713,0.642659


**Distribution of Datasets in Top n Results**

In [5]:
# Rank all trials by MCC (best first), keep the first 500 rows and count how
# many of them belong to each dataset.
(
    df.sort_values("matthews_corrcoef", ascending=False)
    .head(500)
    .value_counts("dataset")
)

dataset
dataset_00_all.pickle    500
dtype: int64

**Detailed summary of current best mcc result.**

In [6]:
# The first entry is the index label of the trial with the highest MCC (it
# comes from idxmax()), so look the row up by label with .loc instead of by
# position with .iloc.
df.loc[best_mcc_f1_accuracy_indices[0]]

model                    XGBoostClassifier
dataset              dataset_00_all.pickle
timestamp                       1658751145
scaled                                 yes
pca_components                         0.0
resampling                            none
learning_rate                     0.181088
n_estimators                           419
max_depth                                3
gamma                             0.174787
reg_alpha                           0.4416
seed                                  1962
matthews_corrcoef                  0.60335
f1_macro                          0.640675
balanced_accuracy                 0.642713
accuracy                          0.642659
Name: 53, dtype: object

**Number of trials logged.**

In [7]:
# Every row of df is one logged trial, so the row count is the trial count.
len(df)

530

**Group results by timestamp, pca_components and scaled, then show current best mcc by group.**

In [8]:
# For every (timestamp, pca_components, scaled) combination, find the index
# label of the trial with the highest MCC.
indices_for_best_mcc = df.groupby(
    ["timestamp", "pca_components", "scaled"]
)["matthews_corrcoef"].idxmax()
# groupby(...).idxmax() yields index *labels*, so select the rows with the
# label-based .loc; the position-based .iloc is only equivalent while df has
# a default 0..n-1 index.
df.loc[
    indices_for_best_mcc,
    [
        "model",
        "dataset",
        "pca_components",
        "matthews_corrcoef",
        "f1_macro",
        "balanced_accuracy",
        "accuracy",
    ],
]

Unnamed: 0,model,dataset,pca_components,matthews_corrcoef,f1_macro,balanced_accuracy,accuracy
53,XGBoostClassifier,dataset_00_all.pickle,0.0,0.60335,0.640675,0.642713,0.642659
63,XGBoostClassifier,dataset_00_all.pickle,0.95,0.52851,0.566887,0.575038,0.574792
163,XGBoostClassifier,dataset_00_all.pickle,0.0,0.60335,0.640675,0.642713,0.642659
268,XGBoostClassifier,dataset_00_all.pickle,0.0,0.60335,0.640675,0.642713,0.642659
373,XGBoostClassifier,dataset_00_all.pickle,0.0,0.585327,0.616935,0.626408,0.626039
478,XGBoostClassifier,dataset_00_all.pickle,0.0,0.60335,0.640675,0.642713,0.642659


**Best Result by Dataset**

In [9]:
# Index label of the highest-MCC trial within each dataset.
indices_for_best_mcc = df.groupby(["dataset"])["matthews_corrcoef"].idxmax()
# The values above are index labels (that is what idxmax() returns), so the
# rows are fetched with .loc rather than the position-based .iloc.
df.loc[
    indices_for_best_mcc,
    [
        "model",
        "dataset",
        "pca_components",
        "matthews_corrcoef",
        "f1_macro",
        "balanced_accuracy",
        "accuracy",
    ],
].sort_values("dataset")

Unnamed: 0,model,dataset,pca_components,matthews_corrcoef,f1_macro,balanced_accuracy,accuracy
53,XGBoostClassifier,dataset_00_all.pickle,0.0,0.60335,0.640675,0.642713,0.642659


**Top Results (mcc > .6)**

In [10]:
# Keep only the trials whose MCC is above 0.6, order them best-first, and show
# the hyperparameters side by side with the scores.
(
    df.loc[df["matthews_corrcoef"] > 0.6]
    .sort_values("matthews_corrcoef", ascending=False)
    .loc[
        :,
        [
            "model",
            "dataset",
            "scaled",
            "pca_components",
            "learning_rate",
            "n_estimators",
            "max_depth",
            "gamma",
            "reg_alpha",
            "matthews_corrcoef",
            "f1_macro",
            "balanced_accuracy",
            "accuracy",
        ],
    ]
)

Unnamed: 0,model,dataset,scaled,pca_components,learning_rate,n_estimators,max_depth,gamma,reg_alpha,matthews_corrcoef,f1_macro,balanced_accuracy,accuracy
53,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,0.181088,419,3,0.174787,0.4416,0.60335,0.640675,0.642713,0.642659
163,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,0.181088,419,3,0.174787,0.4416,0.60335,0.640675,0.642713,0.642659
268,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,0.181088,419,3,0.174787,0.4416,0.60335,0.640675,0.642713,0.642659
478,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,0.181088,419,3,0.174787,0.4416,0.60335,0.640675,0.642713,0.642659


In [11]:
# Number of logged trials per dataset, listed in ascending dataset-name order
# (sort_index() sorts ascending by default).
df.value_counts("dataset").sort_index()

dataset
dataset_00_all.pickle    530
dtype: int64

In [12]:
# Show the full trial table; the notebook renders a truncated preview (first
# and last rows) when the frame is longer than pandas' display limit.
df


Unnamed: 0,model,dataset,timestamp,scaled,pca_components,resampling,learning_rate,n_estimators,max_depth,gamma,reg_alpha,seed,matthews_corrcoef,f1_macro,balanced_accuracy,accuracy
0,XGBoostClassifier,dataset_00_all.pickle,1658751145,yes,0.0,none,0.106177,847,7,0.809896,15.199520,1962,0.523757,0.564450,0.570795,0.570637
1,XGBoostClassifier,dataset_00_all.pickle,1658751145,yes,0.0,none,0.817638,202,4,3.180340,14.920890,1962,0.455661,0.509784,0.509874,0.509695
2,XGBoostClassifier,dataset_00_all.pickle,1658751145,yes,0.0,none,0.912352,988,8,4.064809,17.454995,1962,0.434301,0.496031,0.490411,0.490305
3,XGBoostClassifier,dataset_00_all.pickle,1658751145,yes,0.0,none,0.556415,993,3,3.588053,1.935608,1962,0.522013,0.568127,0.569349,0.569252
4,XGBoostClassifier,dataset_00_all.pickle,1658751145,yes,0.0,none,0.114840,426,8,0.327162,6.880454,1962,0.552933,0.592201,0.597051,0.596953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525,XGBoostClassifier,dataset_00_all.pickle,1658770467,yes,0.0,none,0.484099,416,3,0.099597,0.336338,1962,0.563459,0.603580,0.606678,0.606648
526,XGBoostClassifier,dataset_00_all.pickle,1658770467,yes,0.0,none,0.350578,424,7,1.124034,0.705625,1962,0.529525,0.572768,0.576256,0.576177
527,XGBoostClassifier,dataset_00_all.pickle,1658770467,yes,0.0,none,0.165103,952,8,0.269528,5.824548,1962,0.546525,0.588078,0.591610,0.591413
528,XGBoostClassifier,dataset_00_all.pickle,1658770467,yes,0.0,none,0.056950,417,4,1.005806,2.689339,1962,0.573014,0.608962,0.614992,0.614958
