In [1]:
import sys

sys.path.append("..")

In [2]:
import os
import random
import statistics
from itertools import product

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from src.training.experiment_conventional_multiclass import experiment, init_data
from src.utils.label_convertors import convert2vec
from src.utils.training_utils import open_log

In [3]:
DATAPATH = "../data/cyp450_smiles_GINfp_labels.json"
LOGPATH = os.path.join("..", "logs", "convention", "adaboost_ginfp")

In [4]:
def experiment_adaboost(
    data_path,
    log_path,
    n_estimators,
    max_depth,
    learning_rate,
    splitter,
    max_features,
    rand_seed=None,
):
    model = AdaBoostClassifier(
        DecisionTreeClassifier(
            max_depth=max_depth, splitter=splitter, max_features=max_features,
        ),
        n_estimators=n_estimators,
        learning_rate=learning_rate,
    )
    acc, model = experiment(data_path, model, log_path, rand_seed)
    return acc, model


def do_experiment(
    data_path,
    log_path,
    n_estimators,
    max_depth,
    learning_rate,
    splitter,
    max_features,
    rand_seed=None,
):
    best = dict()
    for ne, md, lr, sp, mf, rs in product(
        n_estimators, max_depth, learning_rate, splitter, max_features, rand_seed
    ):
        best.setdefault(rs, [0, 0, 0, 0, 0, 0])
        acc, _ = experiment_adaboost(
            data_path=data_path,
            log_path=log_path,
            n_estimators=ne,
            max_depth=md,
            learning_rate=lr,
            splitter=sp,
            max_features=mf,
            rand_seed=rs,
        )
        print(
            f"num estimators: {ne}, "
            f"max depth: {md}, learning rate: {lr}, splitter: {sp}, max features: {mf}, acc: {acc}"
        )
        if acc > best.get(rs)[-1]:
            best.get(rs)[0] = ne
            best.get(rs)[1] = md
            best.get(rs)[2] = lr
            best.get(rs)[3] = sp
            best.get(rs)[4] = mf
            best.get(rs)[5] = acc
    print("=" * 80)
    for k, v in best.items():
        print(f"Random seed: {str(k)}")
        print(f"Accuracy: {v[-1]:.5f}")
        print("Best params:")
        print(f"  num estimators: {v[0]}")
        print(f"  max depth: {v[1]}")
        print(f"  learning rate: {v[2]}")
        print(f"  splitter: {v[3]}")
        print(f"  max features: {v[4]}")
        print("=" * 80)

In [5]:
experiment_id = 0

In [None]:
logp = os.path.join(LOGPATH, "experiment_" + str(experiment_id))

n_estimators = [500, 1000, 1500, 2000]
max_depth = [20]
learning_rate = [0.1, 0.2, 0.3, 0.4, 0.5]
splitter = ["best"]
max_features = [50, 100, 200, 300]
rand_seed = [0]

do_experiment(
    data_path=DATAPATH,
    log_path=logp,
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    splitter=splitter,
    max_features=max_features,
    rand_seed=rand_seed,
)

experiment_id += 1

num estimators: 500, max depth: 20, learning rate: 0.1, splitter: best, max features: 50, acc: 0.486765655261459
num estimators: 500, max depth: 20, learning rate: 0.1, splitter: best, max features: 100, acc: 0.486765655261459
num estimators: 500, max depth: 20, learning rate: 0.1, splitter: best, max features: 200, acc: 0.49580374435119434
num estimators: 500, max depth: 20, learning rate: 0.1, splitter: best, max features: 300, acc: 0.49063912201420273
num estimators: 500, max depth: 20, learning rate: 0.2, splitter: best, max features: 50, acc: 0.486765655261459
num estimators: 500, max depth: 20, learning rate: 0.2, splitter: best, max features: 100, acc: 0.48870238863783083
num estimators: 500, max depth: 20, learning rate: 0.2, splitter: best, max features: 200, acc: 0.5048418334409296
num estimators: 500, max depth: 20, learning rate: 0.2, splitter: best, max features: 300, acc: 0.4925758553905746
num estimators: 500, max depth: 20, learning rate: 0.3, splitter: best, max featur

In [6]:
logp = os.path.join(LOGPATH, "experiment_" + str(experiment_id))

n_estimators = [500]
max_depth = [10, 15, 20, 25]
learning_rate = [0.2]
splitter = ["best"]
max_features = [200]
rand_seed = [0]

do_experiment(
    data_path=DATAPATH,
    log_path=logp,
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    splitter=splitter,
    max_features=max_features,
    rand_seed=rand_seed,
)

experiment_id += 1

num estimators: 500, max depth: 10, learning rate: 0.2, splitter: best, max features: 200, acc: 0.48482892188508714
num estimators: 500, max depth: 15, learning rate: 0.2, splitter: best, max features: 200, acc: 0.4790187217559716
num estimators: 500, max depth: 20, learning rate: 0.2, splitter: best, max features: 200, acc: 0.48999354422207875
num estimators: 500, max depth: 25, learning rate: 0.2, splitter: best, max features: 200, acc: 0.49580374435119434
Random seed: 0
Accuracy: 0.49580
Best params:
  num estimators: 500
  max depth: 25
  learning rate: 0.2
  splitter: best
  max features: 200


In [7]:
logp = os.path.join(LOGPATH, "experiment_" + str(experiment_id))

n_estimators = [500]
max_depth = [25, 30, 35, 40]
learning_rate = [0.2]
splitter = ["best"]
max_features = [200]
rand_seed = [0]

do_experiment(
    data_path=DATAPATH,
    log_path=logp,
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    splitter=splitter,
    max_features=max_features,
    rand_seed=rand_seed,
)

experiment_id += 1

num estimators: 500, max depth: 25, learning rate: 0.2, splitter: best, max features: 200, acc: 0.4970948999354422
num estimators: 500, max depth: 30, learning rate: 0.2, splitter: best, max features: 200, acc: 0.500322788896062
num estimators: 500, max depth: 35, learning rate: 0.2, splitter: best, max features: 200, acc: 0.38089089735313103
num estimators: 500, max depth: 40, learning rate: 0.2, splitter: best, max features: 200, acc: 0.4357650096836669
Random seed: 0
Accuracy: 0.50032
Best params:
  num estimators: 500
  max depth: 30
  learning rate: 0.2
  splitter: best
  max features: 200


In [8]:
logp = os.path.join(LOGPATH, "experiment_" + str(experiment_id))

n_estimators = [500]
max_depth = [30]
learning_rate = [0.2]
splitter = ["best"]
max_features = [200]
rand_seed = [0, 1029, 1829, 189, 9382, 128989, 812, 204, 486, 6987]

do_experiment(
    data_path=DATAPATH,
    log_path=logp,
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    splitter=splitter,
    max_features=max_features,
    rand_seed=rand_seed,
)

experiment_id += 1

num estimators: 500, max depth: 30, learning rate: 0.2, splitter: best, max features: 200, acc: 0.5035506778566817
num estimators: 500, max depth: 30, learning rate: 0.2, splitter: best, max features: 200, acc: 0.3653970303421562
num estimators: 500, max depth: 30, learning rate: 0.2, splitter: best, max features: 200, acc: 0.5061329890251776
num estimators: 500, max depth: 30, learning rate: 0.2, splitter: best, max features: 200, acc: 0.42156229825693997
num estimators: 500, max depth: 30, learning rate: 0.2, splitter: best, max features: 200, acc: 0.41897998708844414
num estimators: 500, max depth: 30, learning rate: 0.2, splitter: best, max features: 200, acc: 0.49386701097482244
num estimators: 500, max depth: 30, learning rate: 0.2, splitter: best, max features: 200, acc: 0.513879922530665
num estimators: 500, max depth: 30, learning rate: 0.2, splitter: best, max features: 200, acc: 0.5061329890251776
num estimators: 500, max depth: 30, learning rate: 0.2, splitter: best, max fe