In [1]:
import sys

sys.path.append("..")

In [2]:
import os
import random
import statistics
from itertools import product

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from src.data_loaders.cvs_loader import CVSLoader
from src.training.experiment_conventional_multiclass import experiment, init_data
from src.utils.label_convertors import convert2vec
from src.utils.training_utils import open_log

In [3]:
# from functools import partial

# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import accuracy_score
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.model_selection import cross_validate
# import numpy as np

# from utils.data_loaders.cvs_loader import CVSLoader
# from utils.label_convertors import convert2vec

In [4]:
# CSV dataset handed to every experiment as `data_path`.
# NOTE(review): the file name suggests CID / InChI / SMILES / fingerprint /
# label / one-hot columns -- confirm against the loader.
DATAPATH = "../data/fromraw_cid_inchi_smiles_fp_labels_onehots.csv"
# Root log directory for the AdaBoost runs; each sweep cell joins its own
# "experiment_<id>" sub-directory onto this path.
LOGPATH = os.path.join("..", "logs", "convention", "adaboost")

In [8]:
def experiment_adaboost(
    data_path,
    log_path,
    n_estimators,
    max_depth,
    learning_rate,
    splitter,
    max_features,
    rand_seed=None,
):
    """Run a single AdaBoost experiment with a decision tree as base learner.

    An ``AdaBoostClassifier`` wrapping a ``DecisionTreeClassifier`` is built
    from the given hyper-parameters and handed to ``experiment`` together
    with the data path, log path and random seed.

    Args:
        data_path: Forwarded unchanged to ``experiment``.
        log_path: Forwarded unchanged to ``experiment``.
        n_estimators: ``n_estimators`` of the ``AdaBoostClassifier``.
        max_depth: ``max_depth`` of the base ``DecisionTreeClassifier``.
        learning_rate: ``learning_rate`` of the ``AdaBoostClassifier``.
        splitter: ``splitter`` of the base ``DecisionTreeClassifier``.
        max_features: ``max_features`` of the base ``DecisionTreeClassifier``.
        rand_seed: Forwarded to ``experiment`` only. It is not set as
            ``random_state`` on either estimator here.
            NOTE(review): reproducibility therefore depends on what
            ``experiment`` does with the seed -- confirm.

    Returns:
        A 2-tuple with the two values returned by ``experiment`` (callers in
        this notebook treat them as accuracy and model).
    """
    tree_options = {
        "max_depth": max_depth,
        "splitter": splitter,
        "max_features": max_features,
    }
    weak_learner = DecisionTreeClassifier(**tree_options)

    # The base learner stays positional: the keyword for it is not spelled
    # out in the original call either.
    booster = AdaBoostClassifier(
        weak_learner, n_estimators=n_estimators, learning_rate=learning_rate
    )

    accuracy, fitted = experiment(data_path, booster, log_path, rand_seed)
    return accuracy, fitted


def do_experiment(
    data_path,
    log_path,
    n_estimators,
    max_depth,
    learning_rate,
    splitter,
    max_features,
    rand_seed=None,
):
    """Grid-search AdaBoost hyper-parameters and print the best run per seed.

    Every combination of the given value lists is passed to
    ``experiment_adaboost``. One line is printed per run, followed by a
    summary that lists, for each random seed, the highest accuracy and the
    parameters that produced it (the first such run wins ties).

    Args:
        data_path: Forwarded to ``experiment_adaboost`` for every run.
        log_path: Forwarded to ``experiment_adaboost`` for every run.
        n_estimators: Iterable of ``n_estimators`` values to try.
        max_depth: Iterable of ``max_depth`` values to try.
        learning_rate: Iterable of ``learning_rate`` values to try.
        splitter: Iterable of ``splitter`` values to try.
        max_features: Iterable of ``max_features`` values to try.
        rand_seed: Iterable of random seeds; the whole grid is run once per
            seed. ``None`` (the default) runs the grid once with
            ``rand_seed=None``.

    Returns:
        None. All results are reported on stdout.
    """
    # product() cannot iterate over None, so the declared default used to
    # raise TypeError; treat it as one pass with no explicit seed.
    seeds = [None] if rand_seed is None else rand_seed

    # seed -> (n_estimators, max_depth, learning_rate, splitter, max_features, acc)
    best = {}
    for ne, md, lr, sp, mf, rs in product(
        n_estimators, max_depth, learning_rate, splitter, max_features, seeds
    ):
        # Start from -inf rather than 0 so that a run scoring exactly 0 still
        # records its real parameters instead of leaving placeholder values.
        best.setdefault(rs, (None, None, None, None, None, float("-inf")))
        acc, _ = experiment_adaboost(
            data_path=data_path,
            log_path=log_path,
            n_estimators=ne,
            max_depth=md,
            learning_rate=lr,
            splitter=sp,
            max_features=mf,
            rand_seed=rs,
        )
        print(
            f"num estimators: {ne}, "
            f"max depth: {md}, learning rate: {lr}, splitter: {sp}, max features: {mf}, acc: {acc}"
        )
        # Strict comparison keeps the earliest run on ties.
        if acc > best[rs][-1]:
            best[rs] = (ne, md, lr, sp, mf, acc)

    print("=" * 80)
    for seed, (top_ne, top_md, top_lr, top_sp, top_mf, top_acc) in best.items():
        print(f"Random seed: {str(seed)}")
        print(f"Accuracy: {top_acc:.5f}")
        print("Best params:")
        print(f"  num estimators: {top_ne}")
        print(f"  max depth: {top_md}")
        print(f"  learning rate: {top_lr}")
        print(f"  splitter: {top_sp}")
        print(f"  max features: {top_mf}")
        print("=" * 80)

In [9]:
# Running counter used to name each sweep's log directory ("experiment_<id>").
# Every sweep cell increments it after its run, so re-executing this cell
# restarts the numbering at 0.
experiment_id = 0

In [11]:
# Sweep: ensemble size x learning rate, with tree depth, splitter,
# feature count and seed held fixed.
logp = os.path.join(LOGPATH, f"experiment_{experiment_id}")

n_estimators = list(range(500, 2001, 500))  # 500, 1000, 1500, 2000
max_depth = [20]
learning_rate = [0.1, 0.2, 0.3, 0.4]
splitter = ["best"]
max_features = [512]
rand_seed = [0]

do_experiment(
    DATAPATH,
    logp,
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    splitter=splitter,
    max_features=max_features,
    rand_seed=rand_seed,
)

# Advance the counter so the next sweep logs into its own directory.
experiment_id += 1

num estimators: 500, max depth: 20, learning rate: 0.1, splitter: best, max features: 512, acc: 0.5183989670755326
num estimators: 500, max depth: 20, learning rate: 0.2, splitter: best, max features: 512, acc: 0.5125887669464171
num estimators: 500, max depth: 20, learning rate: 0.3, splitter: best, max features: 512, acc: 0.5125887669464171
num estimators: 500, max depth: 20, learning rate: 0.4, splitter: best, max features: 512, acc: 0.5248547449967721
num estimators: 1000, max depth: 20, learning rate: 0.1, splitter: best, max features: 512, acc: 0.5235635894125242
num estimators: 1000, max depth: 20, learning rate: 0.2, splitter: best, max features: 512, acc: 0.5009683666881859
num estimators: 1000, max depth: 20, learning rate: 0.3, splitter: best, max features: 512, acc: 0.5164622336991608
num estimators: 1000, max depth: 20, learning rate: 0.4, splitter: best, max features: 512, acc: 0.5248547449967721
num estimators: 1500, max depth: 20, learning rate: 0.1, splitter: best, max

In [12]:
# Sweep: tree depth x feature count, with ensemble size, learning rate,
# splitter and seed held fixed.
logp = os.path.join(LOGPATH, f"experiment_{experiment_id}")

n_estimators = [500]
max_depth = list(range(15, 36, 5))  # 15, 20, 25, 30, 35
learning_rate = [0.4]
splitter = ["best"]
max_features = [512, 1024, 2048]
# Larger seed list kept for reference:
# rand_seeds=[0, 1029, 1829, 189, 9382, 128989, 812, 204, 486, 6987]
rand_seed = [0]

do_experiment(
    DATAPATH,
    logp,
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    splitter=splitter,
    max_features=max_features,
    rand_seed=rand_seed,
)

# Advance the counter so the next sweep logs into its own directory.
experiment_id += 1

num estimators: 500, max depth: 15, learning rate: 0.4, splitter: best, max features: 512, acc: 0.5087153001936734
num estimators: 500, max depth: 15, learning rate: 0.4, splitter: best, max features: 1024, acc: 0.5203357004519045
num estimators: 500, max depth: 15, learning rate: 0.4, splitter: best, max features: 2048, acc: 0.5203357004519045
num estimators: 500, max depth: 20, learning rate: 0.4, splitter: best, max features: 512, acc: 0.5235635894125242
num estimators: 500, max depth: 20, learning rate: 0.4, splitter: best, max features: 1024, acc: 0.5222724338282763
num estimators: 500, max depth: 20, learning rate: 0.4, splitter: best, max features: 2048, acc: 0.5196901226597805
num estimators: 500, max depth: 25, learning rate: 0.4, splitter: best, max features: 512, acc: 0.5145255003227889
num estimators: 500, max depth: 25, learning rate: 0.4, splitter: best, max features: 1024, acc: 0.5209812782440284
num estimators: 500, max depth: 25, learning rate: 0.4, splitter: best, max

In [14]:
# Sweep: learning rate x feature count, with ensemble size, tree depth,
# splitter and seed held fixed.
logp = os.path.join(LOGPATH, f"experiment_{experiment_id}")

n_estimators = [500]
max_depth = [25]
learning_rate = [0.1, 0.2, 0.3, 0.4, 0.5]
splitter = ["best"]
max_features = [1024, 2048]
# Larger seed list kept for reference:
# rand_seeds=[0, 1029, 1829, 189, 9382, 128989, 812, 204, 486, 6987]
rand_seed = [0]

do_experiment(
    DATAPATH,
    logp,
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    splitter=splitter,
    max_features=max_features,
    rand_seed=rand_seed,
)

# Advance the counter so the next sweep logs into its own directory.
experiment_id += 1

num estimators: 500, max depth: 25, learning rate: 0.1, splitter: best, max features: 1024, acc: 0.525500322788896
num estimators: 500, max depth: 25, learning rate: 0.1, splitter: best, max features: 2048, acc: 0.5145255003227889
num estimators: 500, max depth: 25, learning rate: 0.2, splitter: best, max features: 1024, acc: 0.5196901226597805
num estimators: 500, max depth: 25, learning rate: 0.2, splitter: best, max features: 2048, acc: 0.5196901226597805
num estimators: 500, max depth: 25, learning rate: 0.3, splitter: best, max features: 1024, acc: 0.5196901226597805
num estimators: 500, max depth: 25, learning rate: 0.3, splitter: best, max features: 2048, acc: 0.5242091672046482
num estimators: 500, max depth: 25, learning rate: 0.4, splitter: best, max features: 1024, acc: 0.525500322788896
num estimators: 500, max depth: 25, learning rate: 0.4, splitter: best, max features: 2048, acc: 0.5242091672046482
num estimators: 500, max depth: 25, learning rate: 0.5, splitter: best, ma

In [15]:
# Repeat one fixed configuration over ten random seeds; every
# hyper-parameter list holds a single value.
logp = os.path.join(LOGPATH, f"experiment_{experiment_id}")

n_estimators = [500]
max_depth = [25]
learning_rate = [0.5]
splitter = ["best"]
max_features = [1024]
rand_seed = [0, 1029, 1829, 189, 9382, 128989, 812, 204, 486, 6987]
# Single-seed alternative kept for reference:
# rand_seed=[0]

do_experiment(
    DATAPATH,
    logp,
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    splitter=splitter,
    max_features=max_features,
    rand_seed=rand_seed,
)

# Advance the counter so the next sweep logs into its own directory.
experiment_id += 1

num estimators: 500, max depth: 25, learning rate: 0.5, splitter: best, max features: 1024, acc: 0.5293737895416397
num estimators: 500, max depth: 25, learning rate: 0.5, splitter: best, max features: 1024, acc: 0.5274370561652679
num estimators: 500, max depth: 25, learning rate: 0.5, splitter: best, max features: 1024, acc: 0.5358295674628792
num estimators: 500, max depth: 25, learning rate: 0.5, splitter: best, max features: 1024, acc: 0.5100064557779213
num estimators: 500, max depth: 25, learning rate: 0.5, splitter: best, max features: 1024, acc: 0.5293737895416397
num estimators: 500, max depth: 25, learning rate: 0.5, splitter: best, max features: 1024, acc: 0.5125887669464171
num estimators: 500, max depth: 25, learning rate: 0.5, splitter: best, max features: 1024, acc: 0.5351839896707553
num estimators: 500, max depth: 25, learning rate: 0.5, splitter: best, max features: 1024, acc: 0.5300193673337638
num estimators: 500, max depth: 25, learning rate: 0.5, splitter: best, 