In [1]:
import json

import numpy as np
import pandas as pd

In [2]:
# Relative path of the trial log that the next cell reads line by line
# (one JSON document per line).
filename = "../bayesian_optimization_undersampling_logs.json"

In [3]:
# Load the trial log: every non-blank line is parsed as one JSON document.
# The encoding is pinned to UTF-8 so the read does not depend on the platform's
# locale default, and blank lines are skipped because json.loads would reject
# them with a JSONDecodeError.
with open(filename, "r", encoding="utf-8") as infile:
    json_list = [json.loads(line) for line in infile if line.strip()]
# Flatten the parsed records into a single table, one row per logged trial.
df = pd.json_normalize(json_list)

**Current best mcc, f1_macro, balanced_accuracy and accuracy results, respectively.**

In [4]:
# Index labels of the best trial for each metric, in this order:
# matthews_corrcoef, f1_macro, balanced_accuracy, accuracy.
best_mcc_f1_accuracy_indices = [
    df[metric].idxmax()
    for metric in ("matthews_corrcoef", "f1_macro", "balanced_accuracy", "accuracy")
]
# idxmax() yields index *labels*, so the rows are selected with label-based
# .loc; positional .iloc only matches while df keeps a default RangeIndex.
df.loc[
    best_mcc_f1_accuracy_indices,
    [
        "model",
        "dataset",
        "scaled",
        "pca_components",
        "resampling",
        "matthews_corrcoef",
        "f1_macro",
        "balanced_accuracy",
        "accuracy",
    ],
]

Unnamed: 0,model,dataset,scaled,pca_components,resampling,matthews_corrcoef,f1_macro,balanced_accuracy,accuracy
163,XGBoostClassifier,dataset_00_all.pickle,yes,0,SMOTE,0.655902,0.61415,0.592576,0.719438
157,XGBoostClassifier,dataset_00_all.pickle,yes,0,SMOTE,0.653104,0.616732,0.595196,0.717096
188,XGBoostClassifier,dataset_00_all.pickle,yes,0,SMOTE,0.632559,0.602107,0.603063,0.698126
163,XGBoostClassifier,dataset_00_all.pickle,yes,0,SMOTE,0.655902,0.61415,0.592576,0.719438


**Distribution of Datasets in Top 500 Results (by mcc)**

In [5]:
# Count how often each dataset appears among the 500 highest-mcc trials
# (or among all trials when fewer than 500 are logged).
(
    df.sort_values(by="matthews_corrcoef", ascending=False)
    .head(500)
    .value_counts(subset="dataset")
)

dataset
dataset_00_all.pickle    195
dtype: int64

**Detailed summary of current best mcc result.**

In [6]:
# The first entry of best_mcc_f1_accuracy_indices is the idxmax() label of the
# best matthews_corrcoef trial; it is an index label, so look it up with .loc
# rather than positional .iloc.
df.loc[best_mcc_f1_accuracy_indices[0]]

model                    XGBoostClassifier
dataset              dataset_00_all.pickle
timestamp                       1659044338
scaled                                 yes
pca_components                           0
resampling                           SMOTE
learning_rate                     0.080805
n_estimators                           423
max_depth                                8
gamma                             0.051367
reg_alpha                         8.308685
seed                                  1962
matthews_corrcoef                 0.655902
f1_macro                           0.61415
balanced_accuracy                 0.592576
accuracy                          0.719438
Name: 163, dtype: object

**Number of trials logged.**

In [7]:
# One row per logged trial, so the row count is the number of trials.
len(df)

195

**Group results by timestamp, pca_components and scaled, then show current best mcc by group.**

In [8]:
# For every (timestamp, pca_components, scaled) group, find the index label of
# the trial with the highest matthews_corrcoef.
indices_for_best_mcc = df.groupby(
    ["timestamp", "pca_components", "scaled"]
)["matthews_corrcoef"].idxmax()
# The groupby idxmax() result holds index labels, so select rows with
# label-based .loc; .iloc would only agree while df has a default RangeIndex.
df.loc[
    indices_for_best_mcc,
    [
        "model",
        "dataset",
        "pca_components",
        "resampling",
        "matthews_corrcoef",
        "f1_macro",
        "balanced_accuracy",
        "accuracy",
    ],
]

Unnamed: 0,model,dataset,pca_components,resampling,matthews_corrcoef,f1_macro,balanced_accuracy,accuracy
52,XGBoostClassifier,dataset_00_all.pickle,0,under,0.498629,0.481393,0.597625,0.559719
163,XGBoostClassifier,dataset_00_all.pickle,0,SMOTE,0.655902,0.61415,0.592576,0.719438


**Best Result by Dataset**

In [9]:
# Index label of the highest-mcc trial within each dataset.
indices_for_best_mcc = df.groupby(["dataset"])["matthews_corrcoef"].idxmax()
# Labels from idxmax() are resolved with .loc (label-based), not .iloc
# (position-based), so the lookup stays correct for any index on df.
df.loc[
    indices_for_best_mcc,
    [
        "model",
        "dataset",
        "pca_components",
        "resampling",
        "matthews_corrcoef",
        "f1_macro",
        "balanced_accuracy",
        "accuracy",
    ],
].sort_values("dataset")

Unnamed: 0,model,dataset,pca_components,resampling,matthews_corrcoef,f1_macro,balanced_accuracy,accuracy
163,XGBoostClassifier,dataset_00_all.pickle,0,SMOTE,0.655902,0.61415,0.592576,0.719438


**Top Results (mcc > .6)**

In [10]:
# Trials whose mcc exceeds 0.6, best first, restricted to the identifying
# columns, the tuned hyperparameters and the four evaluation metrics.
(
    df.loc[df["matthews_corrcoef"] > 0.6]
    .sort_values(by="matthews_corrcoef", ascending=False)
    .loc[
        :,
        [
            "model",
            "dataset",
            "scaled",
            "pca_components",
            "learning_rate",
            "n_estimators",
            "max_depth",
            "gamma",
            "reg_alpha",
            "matthews_corrcoef",
            "f1_macro",
            "balanced_accuracy",
            "accuracy",
        ],
    ]
)

Unnamed: 0,model,dataset,scaled,pca_components,learning_rate,n_estimators,max_depth,gamma,reg_alpha,matthews_corrcoef,f1_macro,balanced_accuracy,accuracy
163,XGBoostClassifier,dataset_00_all.pickle,yes,0,0.080805,423,8,0.051367,8.308685,0.655902,0.614150,0.592576,0.719438
157,XGBoostClassifier,dataset_00_all.pickle,yes,0,0.061025,425,9,0.151822,7.722392,0.653104,0.616732,0.595196,0.717096
139,XGBoostClassifier,dataset_00_all.pickle,yes,0,0.090870,846,8,0.103085,14.734839,0.652459,0.612167,0.594148,0.716393
191,XGBoostClassifier,dataset_00_all.pickle,yes,0,0.039355,844,7,0.267192,17.418931,0.652186,0.609997,0.594894,0.715925
149,XGBoostClassifier,dataset_00_all.pickle,yes,0,0.145262,847,8,0.221591,15.625838,0.650990,0.612869,0.595310,0.715222
...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,XGBoostClassifier,dataset_00_all.pickle,yes,0,0.700857,573,5,1.740721,9.994474,0.605884,0.575305,0.575305,0.676581
147,XGBoostClassifier,dataset_00_all.pickle,yes,0,0.491075,416,5,2.916760,6.902364,0.605706,0.581288,0.577480,0.676581
110,XGBoostClassifier,dataset_00_all.pickle,yes,0,0.404746,454,4,3.724345,10.717888,0.604997,0.572945,0.575105,0.675410
113,XGBoostClassifier,dataset_00_all.pickle,yes,0,0.019368,431,4,3.893144,2.932606,0.603990,0.570943,0.589466,0.672834


In [11]:
# Number of logged trials per dataset, listed in ascending dataset-name order.
df.value_counts(subset="dataset").sort_index()

dataset
dataset_00_all.pickle    195
dtype: int64

In [12]:
# Display the complete trial table; pandas elides the middle rows of a long
# frame in the rendered output.
df

Unnamed: 0,model,dataset,timestamp,scaled,pca_components,resampling,learning_rate,n_estimators,max_depth,gamma,reg_alpha,seed,matthews_corrcoef,f1_macro,balanced_accuracy,accuracy
0,XGBoostClassifier,dataset_00_all.pickle,1659037616,yes,0,under,0.106177,847,7,0.809896,15.199520,1962,0.462909,0.444006,0.563480,0.525059
1,XGBoostClassifier,dataset_00_all.pickle,1659037616,yes,0,under,0.817638,202,4,3.180340,14.920890,1962,0.398066,0.390400,0.491584,0.464871
2,XGBoostClassifier,dataset_00_all.pickle,1659037616,yes,0,under,0.912352,988,8,4.064809,17.454995,1962,0.376729,0.378115,0.478421,0.447775
3,XGBoostClassifier,dataset_00_all.pickle,1659037616,yes,0,under,0.556415,993,3,3.588053,1.935608,1962,0.462287,0.445214,0.555581,0.526464
4,XGBoostClassifier,dataset_00_all.pickle,1659037616,yes,0,under,0.114840,426,8,0.327162,6.880454,1962,0.480628,0.461960,0.575894,0.542623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,XGBoostClassifier,dataset_00_all.pickle,1659044338,yes,0,SMOTE,0.491149,845,3,1.311293,18.846047,1962,0.618475,0.593878,0.594387,0.686651
191,XGBoostClassifier,dataset_00_all.pickle,1659044338,yes,0,SMOTE,0.039355,844,7,0.267192,17.418931,1962,0.652186,0.609997,0.594894,0.715925
192,XGBoostClassifier,dataset_00_all.pickle,1659044338,yes,0,SMOTE,0.077993,845,6,0.328737,16.845840,1962,0.648087,0.607945,0.592136,0.712646
193,XGBoostClassifier,dataset_00_all.pickle,1659044338,yes,0,SMOTE,0.516273,845,6,0.776966,18.385006,1962,0.613421,0.582941,0.576564,0.683372
