In [1]:
import sys
# Add the parent directory to the module search path so the `src.*` imports
# used by this notebook can be resolved (assumes the notebook is run from a
# directory one level below the project root -- TODO confirm).
sys.path.append("..")

In [2]:
import os
import random
import statistics
from itertools import product

import xgboost as xgb
from sklearn.metrics import accuracy_score

from src.training.experiment_conventional_multiclass import init_data
from src.utils.training_utils import open_log

In [3]:
def experiment_xgboost(
    data_path,
    log_path,
    learning_rate,
    max_depth,
    gamma,
    lambda_,
    num_class=32,
    n_round=17,
    rand_seed=None,
):
    """Train one XGBoost multiclass model, evaluate it and log the result.

    Args:
        data_path: Passed to ``init_data`` to load the dataset.
        log_path: Passed to ``open_log`` to obtain the log file handle.
        learning_rate: Value for the XGBoost ``eta`` parameter.
        max_depth: Value for the XGBoost ``max_depth`` parameter.
        gamma: Value for the XGBoost ``gamma`` parameter.
        lambda_: Value for the XGBoost ``lambda`` parameter.
        num_class: Number of classes given to the ``multi:softmax`` objective.
        n_round: Number of boosting rounds passed to ``xgb.train``.
        rand_seed: Forwarded to ``init_data`` only; it is not passed to
            XGBoost itself (presumably it controls the train/test split --
            verify against ``init_data``).

    Returns:
        tuple: ``(acc, bst)`` -- the accuracy on the test split and the
        trained booster.

    Side effects:
        Writes the accuracy and one ``"<prediction> <truth>"`` line per test
        sample to the log handle returned by ``open_log``. The handle is
        closed even if writing fails.
    """
    # init data
    x_train, y_train, x_test, y_test = init_data(data_path, rand_seed)
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)

    # setup parameters (training is pinned to GPU 0 with the GPU histogram
    # tree method)
    param = {
        "objective": "multi:softmax",
        "eta": learning_rate,
        "max_depth": max_depth,
        "gamma": gamma,
        "lambda": lambda_,
        "gpu_id": 0,
        "tree_method": "gpu_hist",
        "num_class": num_class,
    }

    bst = xgb.train(param, dtrain, n_round)
    # The predictions are cast to int so they compare and print as class ids.
    preds = bst.predict(dtest).astype(int)
    # Documented argument order is (y_true, y_pred); accuracy is symmetric, so
    # the value is unchanged.
    acc = accuracy_score(y_test, preds)

    # Logging the experiment results. try/finally guarantees the handle is
    # released even when a write raises part-way through.
    log_f, _ = open_log(log_path)
    try:
        log_f.write("Experiment with xgboost. Accuracy is: {}\n".format(acc))
        # Write prediction and true label
        log_f.write("@prediction-truth\n")
        for p, t in zip(preds, y_test):
            log_f.write(f"{p!s} {t!s}\n")
        log_f.write("=" * 80 + "\n")
    finally:
        log_f.close()
    return acc, bst


def do_experiment(
    data_path, log_path, learining_rate, gamma, lambda_, max_depth, n_round, rand_seeds
):
    """Grid-search XGBoost hyperparameters and print the best setting per seed.

    Runs ``experiment_xgboost`` once for every combination in the Cartesian
    product of the given value lists, prints each run's accuracy, and finally
    prints the highest-accuracy combination found for each random seed.

    Args:
        data_path: Dataset path forwarded to ``experiment_xgboost``.
        log_path: Log path forwarded to ``experiment_xgboost``.
        learining_rate: Iterable of learning rates to try. The misspelled
            name is kept so existing keyword callers continue to work.
        gamma: Iterable of ``gamma`` values to try.
        lambda_: Iterable of ``lambda`` values to try.
        max_depth: Iterable of tree depths to try.
        n_round: Iterable of boosting-round counts to try.
        rand_seeds: Iterable of random seeds; results are tracked per seed.

    Returns:
        None. Results are reported on stdout only.
    """
    # seed -> [learning rate, gamma, lambda, max depth, rounds, accuracy]
    best = dict()
    # Iterate the function's own arguments rather than same-named notebook
    # globals, so the sweep does not depend on hidden kernel state.
    for lr, g, lam, m, n, r in product(
        learining_rate, gamma, lambda_, max_depth, n_round, rand_seeds
    ):
        # Placeholder record with accuracy 0: a run must score strictly above
        # 0 to replace it.
        best.setdefault(r, [0, 0, 0, 0, 0, 0])
        acc, _ = experiment_xgboost(
            data_path=data_path,
            log_path=log_path,
            learning_rate=lr,
            gamma=g,
            lambda_=lam,
            max_depth=m,
            n_round=n,
            rand_seed=r,
        )
        print(
            f"learning_rate: {lr}, gamma: {g}, lambda: {lam}, max depth: {m}, rounds: {n}, acc: {acc}"
        )
        # Strict comparison: on ties the earliest combination is kept.
        if acc > best[r][5]:
            best[r] = [lr, g, lam, m, n, acc]
    print("=" * 80)
    for k, v in best.items():
        print(f"Random seed: {str(k)}")
        print("Best params:")
        print(f"  learning rate: {v[0]}")
        print(f"  gamma: {v[1]}")
        print(f"  lambda: {v[2]}")
        print(f"  max_depth: {v[3]}")
        print(f"  n_round: {v[4]}")
        print(f"  accuracy: {v[5]}")
    print("=" * 80)

In [4]:
# Dataset file handed to the experiments as their data path.
datap = os.path.join("..", "data", "cyp450_smiles_GINfp_labels.json")
# Base directory under which each sweep derives its own `experiment_<id>` path.
LOGPATH = os.path.join("..", "logs", "convention", "xgboost_ginfp")

In [5]:
# Running counter used to build each sweep's log path; every sweep cell
# increments it after it finishes. Re-run this cell to start again from 0.
experiment_id = 0

In [6]:
# Coarse grid: varies learning rate, gamma, lambda and max depth with the
# number of boosting rounds fixed at 17 and a single random seed (0).
logp = os.path.join(LOGPATH, f"experiment_{experiment_id}")

learning_rate = [0.1, 0.2, 0.3, 0.4, 0.5]
gamma = [0, 2, 4, 8, 16]
lambda_ = [0, 1, 2]
max_depth = [6, 7, 8]
n_round = [17]
rand_seeds = [0]

do_experiment(
    datap,
    logp,
    learning_rate,
    gamma,
    lambda_,
    max_depth,
    n_round,
    rand_seeds
)

# Advance the counter so the next sweep derives a new `logp`. Note that this
# makes the cell non-idempotent: re-running it computes a different `logp`.
experiment_id += 1

learning_rate: 0.1, gamma: 0, lambda: 0, max depth: 6, rounds: 17, acc: 0.5326016785022595
learning_rate: 0.1, gamma: 0, lambda: 0, max depth: 7, rounds: 17, acc: 0.5293737895416397
learning_rate: 0.1, gamma: 0, lambda: 0, max depth: 8, rounds: 17, acc: 0.5332472562943835
learning_rate: 0.1, gamma: 0, lambda: 1, max depth: 6, rounds: 17, acc: 0.5242091672046482
learning_rate: 0.1, gamma: 0, lambda: 1, max depth: 7, rounds: 17, acc: 0.5319561007101355
learning_rate: 0.1, gamma: 0, lambda: 1, max depth: 8, rounds: 17, acc: 0.5364751452550033
learning_rate: 0.1, gamma: 0, lambda: 2, max depth: 6, rounds: 17, acc: 0.5261459005810201
learning_rate: 0.1, gamma: 0, lambda: 2, max depth: 7, rounds: 17, acc: 0.5351839896707553
learning_rate: 0.1, gamma: 0, lambda: 2, max depth: 8, rounds: 17, acc: 0.5345384118786314
learning_rate: 0.1, gamma: 2, lambda: 0, max depth: 6, rounds: 17, acc: 0.5293737895416397
learning_rate: 0.1, gamma: 2, lambda: 0, max depth: 7, rounds: 17, acc: 0.5280826339573919

In [8]:
# Learning rate fixed at 0.2 and lambda at 1; varies gamma (0-2), max depth
# (8-20) and boosting rounds (10-20) with a single random seed (0).
logp = os.path.join(LOGPATH, f"experiment_{experiment_id}")

learning_rate = [0.2]
gamma = [0, 1, 2]
lambda_ = [1]
max_depth = [8, 10, 15, 17, 20]
n_round = [10, 15, 17, 20]
rand_seeds = [0]

do_experiment(
    datap,
    logp,
    learning_rate,
    gamma,
    lambda_,
    max_depth,
    n_round,
    rand_seeds
)

# Advance the counter so the next sweep derives a new `logp`; re-running this
# cell therefore computes a different `logp`.
experiment_id += 1

learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 8, rounds: 10, acc: 0.5280826339573919
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 8, rounds: 15, acc: 0.5397030342156229
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 8, rounds: 17, acc: 0.5422853453841188
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 8, rounds: 20, acc: 0.5429309231762427
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 10, rounds: 10, acc: 0.5319561007101355
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 10, rounds: 15, acc: 0.539057456423499
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 10, rounds: 17, acc: 0.5416397675919948
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 10, rounds: 20, acc: 0.5468043899289864
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 10, acc: 0.5248547449967721
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 15, acc: 0.5397030342156229
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 17, acc: 0.5397030342

In [9]:
# Learning rate 0.2, gamma 0 and lambda 1 fixed; varies max depth (15-19) and
# boosting rounds (15-30) with a single random seed (0).
logp = os.path.join(LOGPATH, f"experiment_{experiment_id}")

learning_rate = [0.2]
gamma = [0]
lambda_ = [1]
max_depth = [15, 16, 17, 18, 19]
n_round = [15, 20, 25, 30]
rand_seeds = [0]

do_experiment(
    datap,
    logp,
    learning_rate,
    gamma,
    lambda_,
    max_depth,
    n_round,
    rand_seeds
)

# Advance the counter so the next sweep derives a new `logp`; re-running this
# cell therefore computes a different `logp`.
experiment_id += 1

learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 15, acc: 0.5397030342156229
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 20, acc: 0.5377663008392511
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 25, acc: 0.5493867010974822
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 30, acc: 0.5500322788896062
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 15, acc: 0.5384118786313751
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 20, acc: 0.5513234344738541
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 25, acc: 0.5558424790187217
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 30, acc: 0.5584247901872176
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 17, rounds: 15, acc: 0.5429309231762427
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 17, rounds: 20, acc: 0.5493867010974822
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 17, rounds: 25, acc: 0.55003

In [10]:
# Same fixed learning rate / gamma / lambda and depth range (15-19) as the
# values below show; boosting rounds now range over 30-50. Single seed (0).
logp = os.path.join(LOGPATH, f"experiment_{experiment_id}")

learning_rate = [0.2]
gamma = [0]
lambda_ = [1]
max_depth = [15, 16, 17, 18, 19]
n_round = [30, 35, 40, 45, 50]
rand_seeds = [0]

do_experiment(
    datap,
    logp,
    learning_rate,
    gamma,
    lambda_,
    max_depth,
    n_round,
    rand_seeds
)

# Advance the counter so the next sweep derives a new `logp`; re-running this
# cell therefore computes a different `logp`.
experiment_id += 1

learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 30, acc: 0.5500322788896062
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 35, acc: 0.5545513234344739
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 40, acc: 0.5551969012265978
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 45, acc: 0.5577792123950936
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 50, acc: 0.5545513234344739
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 30, acc: 0.5584247901872176
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 35, acc: 0.5564880568108457
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 40, acc: 0.5597159457714654
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 45, acc: 0.5577792123950936
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 50, acc: 0.5577792123950936
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 17, rounds: 30, acc: 0.55261

In [11]:
# Learning rate 0.2, gamma 0 and lambda 1 fixed; max depth 15-19; boosting
# rounds range over 50-70. Single random seed (0).
logp = os.path.join(LOGPATH, f"experiment_{experiment_id}")

learning_rate = [0.2]
gamma = [0]
lambda_ = [1]
max_depth = [15, 16, 17, 18, 19]
n_round = [50, 55, 60, 65, 70]
rand_seeds = [0]

do_experiment(
    datap,
    logp,
    learning_rate,
    gamma,
    lambda_,
    max_depth,
    n_round,
    rand_seeds
)

# Advance the counter so the next sweep derives a new `logp`; re-running this
# cell therefore computes a different `logp`.
experiment_id += 1

learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 50, acc: 0.5545513234344739
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 55, acc: 0.55390574564235
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 60, acc: 0.5571336346029696
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 65, acc: 0.5590703679793415
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 70, acc: 0.5597159457714654
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 50, acc: 0.5577792123950936
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 55, acc: 0.5597159457714654
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 60, acc: 0.5577792123950936
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 65, acc: 0.5610071013557134
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 70, acc: 0.565526145900581
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 17, rounds: 50, acc: 0.56488056

In [12]:
# Learning rate 0.2, gamma 0 and lambda 1 fixed; max depth 15-19; boosting
# rounds range over 70-90. Single random seed (0).
logp = os.path.join(LOGPATH, f"experiment_{experiment_id}")

learning_rate = [0.2]
gamma = [0]
lambda_ = [1]
max_depth = [15, 16, 17, 18, 19]
n_round = [70, 75, 80, 85, 90]
rand_seeds = [0]

do_experiment(
    datap,
    logp,
    learning_rate,
    gamma,
    lambda_,
    max_depth,
    n_round,
    rand_seeds
)

# Advance the counter so the next sweep derives a new `logp`; re-running this
# cell therefore computes a different `logp`.
experiment_id += 1

learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 70, acc: 0.5597159457714654
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 75, acc: 0.5584247901872176
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 80, acc: 0.5603615235635894
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 85, acc: 0.5597159457714654
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 15, rounds: 90, acc: 0.5577792123950936
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 70, acc: 0.565526145900581
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 75, acc: 0.566171723692705
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 80, acc: 0.5629438347320852
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 85, acc: 0.5635894125242091
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 90, acc: 0.5648805681084571
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 17, rounds: 70, acc: 0.5610071

In [13]:
# Single configuration (learning rate 0.2, gamma 0, lambda 1, max depth 16,
# 75 rounds) repeated over ten random seeds, so the only thing varying between
# runs is the seed passed through to the experiment.
logp = os.path.join(LOGPATH, f"experiment_{experiment_id}")

learning_rate = [0.2]
gamma = [0]
lambda_ = [1]
max_depth = [16]
n_round = [75]
rand_seeds = [0, 1029, 1829, 189, 9382, 128989, 812, 204, 486, 6987]

do_experiment(
    datap,
    logp,
    learning_rate,
    gamma,
    lambda_,
    max_depth,
    n_round,
    rand_seeds
)

# Advance the counter so any later sweep derives a new `logp`; re-running this
# cell therefore computes a different `logp`.
experiment_id += 1

learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 75, acc: 0.566171723692705
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 75, acc: 0.5513234344738541
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 75, acc: 0.5500322788896062
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 75, acc: 0.5487411233053583
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 75, acc: 0.5429309231762427
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 75, acc: 0.5409941897998709
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 75, acc: 0.5506778566817302
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 75, acc: 0.5377663008392511
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 75, acc: 0.552614590058102
learning_rate: 0.2, gamma: 0, lambda: 1, max depth: 16, rounds: 75, acc: 0.5513234344738541
Random seed: 0
Best params:
  learning rate: 0.2
  gamma: 0
  lambda: 1
  max_dept