In [1]:
import json

import pandas as pd

In [2]:
# Relative path of the merged trial log that the next cell opens and parses.
filename = "../merged_logs.json"

In [3]:
# Parse the log as JSON Lines: one JSON document per non-blank line.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale, and blank lines (which json.loads would
# reject) are skipped instead of aborting the whole load.
with open(filename, "r", encoding="utf-8") as infile:
    json_list = [json.loads(line) for line in infile if line.strip()]
# Flatten the parsed records into a table; nested keys become dotted columns.
df = pd.json_normalize(json_list)

**Current best mcc, f1_macro and accuracy results, respectively.**

In [4]:
# Index labels of the rows holding the highest mcc, f1_macro and accuracy,
# in that order. The same row may appear more than once.
best_mcc_f1_accuracy_indices = [
    df[metric].idxmax() for metric in ("matthews_corrcoef", "f1_macro", "accuracy")
]
# idxmax() yields index *labels*, so the rows are selected with label-based
# .loc rather than positional .iloc; the two only coincide while df keeps
# its default 0..n-1 index.
df.loc[
    best_mcc_f1_accuracy_indices,
    [
        "model",
        "dataset",
        "scaled",
        "pca_components",
        "resampling",
        "matthews_corrcoef",
        "f1_macro",
        "accuracy",
    ],
]

Unnamed: 0,model,dataset,scaled,pca_components,resampling,matthews_corrcoef,f1_macro,accuracy
266,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,SMOTETOMEK,0.666237,0.608285,0.729215
139,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,SMOTE,0.637998,0.62041,0.702635
266,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,SMOTETOMEK,0.666237,0.608285,0.729215


**Distribution of datasets among the top 400 results (ranked by mcc).**

In [5]:
# Count how many of the 400 highest-mcc trials come from each dataset.
# Only the "dataset" column is counted, so no column projection is needed
# before value_counts().
(
    df.sort_values("matthews_corrcoef", ascending=False)
    .head(400)
    .value_counts("dataset")
)

dataset
dataset_00_all.pickle              325
dataset_07_pvtt_mean_cov.pickle     53
dataset_04_mean_cov_icov.pickle     22
dtype: int64

**Detailed summary of current best mcc result.**

In [6]:
# Full record of the best-mcc trial. The stored value is an index label
# produced by idxmax(), so it is looked up with .loc rather than .iloc.
df.loc[best_mcc_f1_accuracy_indices[0]]

model                    XGBoostClassifier
dataset              dataset_00_all.pickle
timestamp                       1658136355
scaled                                 yes
pca_components                         0.0
resampling                      SMOTETOMEK
learning_rate                     0.221461
n_estimators                           827
max_depth                                4
gamma                             0.524969
reg_alpha                         4.327827
seed                                  1962
fit_tim                         193.808802
core_tim                           0.37062
matthews_corrcoef                 0.666237
f1_macro                          0.608285
accuracy                          0.729215
Name: 266, dtype: object

**Number of trials logged.**

In [7]:
# One row per logged trial, so the row count is the number of trials.
len(df)

3569

**Group results by timestamp, resampling, pca_components and scaled, then show the current best mcc for each group.**

In [8]:
# For every (timestamp, resampling, pca_components, scaled) combination,
# find the index label of the trial with the highest mcc.
indices_for_best_mcc = df.groupby(
    ["timestamp", "resampling", "pca_components", "scaled"]
)["matthews_corrcoef"].idxmax()
# idxmax() returns index labels, so select label-based with .loc; .iloc
# would silently pick the wrong rows if df ever had a non-default index.
df.loc[
    indices_for_best_mcc,
    [
        "model",
        "dataset",
        "scaled",
        "pca_components",
        "resampling",
        "matthews_corrcoef",
        "f1_macro",
        "accuracy",
    ],
]

Unnamed: 0,model,dataset,scaled,pca_components,resampling,matthews_corrcoef,f1_macro,accuracy
573,XGBoostClassifier,dataset_01_mean.pickle,no,0.00,SMOTE,0.417633,0.391702,0.512646
1013,XGBoostClassifier,dataset_01_mean.pickle,yes,0.00,SMOTE,0.422323,0.397869,0.517096
816,XGBoostClassifier,dataset_01_mean.pickle,no,0.95,SMOTE,0.296870,0.273581,0.380386
1239,XGBoostClassifier,dataset_01_mean.pickle,yes,0.95,SMOTE,0.396496,0.371039,0.495785
641,XGBoostClassifier,dataset_01_mean.pickle,no,0.00,SMOTEENN,0.360060,0.335622,0.428981
...,...,...,...,...,...,...,...,...
3034,XGBoostClassifier,dataset_06_pvtt_mean.pickle,yes,0.95,SMOTEENN,0.401433,0.369256,0.468208
2895,XGBoostClassifier,dataset_06_pvtt_mean.pickle,yes,0.00,SMOTETOMEK,0.511877,0.431531,0.606850
3115,XGBoostClassifier,dataset_06_pvtt_mean.pickle,yes,0.95,SMOTETOMEK,0.466482,0.385388,0.571311
2730,XGBoostClassifier,dataset_06_pvtt_mean.pickle,yes,0.00,none,0.511877,0.431531,0.606850


**Best Result by Dataset**

In [9]:
# Index label of the highest-mcc trial within each dataset.
indices_for_best_mcc = df.groupby("dataset")["matthews_corrcoef"].idxmax()
# The labels come from idxmax(), hence label-based .loc instead of .iloc.
df.loc[
    indices_for_best_mcc,
    [
        "model",
        "dataset",
        "scaled",
        "pca_components",
        "resampling",
        "matthews_corrcoef",
        "f1_macro",
        "accuracy",
    ],
].sort_values("dataset")

Unnamed: 0,model,dataset,scaled,pca_components,resampling,matthews_corrcoef,f1_macro,accuracy
266,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,SMOTETOMEK,0.666237,0.608285,0.729215
527,XGBoostClassifier,dataset_01_mean.pickle,no,0.0,none,0.446103,0.363238,0.556733
1463,XGBoostClassifier,dataset_03_mean_icov.pickle,yes,0.0,SMOTE,0.541714,0.509942,0.624063
1861,XGBoostClassifier,dataset_04_mean_cov_icov.pickle,yes,0.0,none,0.557263,0.491526,0.643267
2282,XGBoostClassifier,dataset_05_pvtt.pickle,yes,0.0,none,0.438515,0.334268,0.549824
2730,XGBoostClassifier,dataset_06_pvtt_mean.pickle,yes,0.0,none,0.511877,0.431531,0.60685
3162,XGBoostClassifier,dataset_07_pvtt_mean_cov.pickle,yes,0.0,none,0.56761,0.496467,0.650995


**Top Results (mcc > .645)**

In [30]:
# Trials whose mcc exceeds 0.645, best first, reduced to the summary columns.
(
    df.loc[
        df["matthews_corrcoef"] > 0.645,
        [
            "model",
            "dataset",
            "scaled",
            "pca_components",
            "resampling",
            "matthews_corrcoef",
            "f1_macro",
            "accuracy",
        ],
    ].sort_values("matthews_corrcoef", ascending=False)
)

Unnamed: 0,model,dataset,scaled,pca_components,resampling,matthews_corrcoef,f1_macro,accuracy
266,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,SMOTETOMEK,0.666237,0.608285,0.729215
262,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,SMOTETOMEK,0.65956,0.597724,0.723888
48,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,none,0.65956,0.597724,0.723888
4,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,none,0.658313,0.598556,0.723009
218,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,SMOTETOMEK,0.658313,0.598556,0.723009
53,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,none,0.658313,0.598556,0.723009
263,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,SMOTETOMEK,0.652382,0.583239,0.718326
214,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,SMOTETOMEK,0.648024,0.579273,0.714871
0,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,none,0.648024,0.579273,0.714871
49,XGBoostClassifier,dataset_00_all.pickle,yes,0.0,none,0.648024,0.579273,0.714871


In [10]:
# Number of logged trials per dataset, most frequent first.
df.value_counts(subset="dataset")

dataset
dataset_01_mean.pickle             880
dataset_00_all.pickle              489
dataset_03_mean_icov.pickle        440
dataset_04_mean_cov_icov.pickle    440
dataset_05_pvtt.pickle             440
dataset_06_pvtt_mean.pickle        440
dataset_07_pvtt_mean_cov.pickle    440
dtype: int64