In [47]:
import sys
sys.path.append("..")

In [48]:
import os
import random
import statistics
from itertools import product

import xgboost as xgb
from sklearn.metrics import accuracy_score

from src.data_loaders.cvs_loader import CVSLoader
from src.training.experiment_conventional_multiclass import init_data
from src.utils.training_utils import open_log

In [23]:
def experiment_xgboost(data_path,
                       log_path,
                       learning_rate,
                       max_depth,
                       gamma,
                       lambda_,
                       num_class=32,
                       n_round=17,
                       rand_seed=None):
    """Train one XGBoost multi-class model, score it and log the outcome.

    The split returned by ``init_data(data_path, rand_seed)`` is wrapped in
    ``xgb.DMatrix`` objects, a booster is trained for ``n_round`` rounds with
    the ``multi:softmax`` objective, and the test-set accuracy together with
    every (prediction, truth) pair is written to the log returned by
    ``open_log(log_path)``.

    Training is pinned to ``tree_method='gpu_hist'`` on ``gpu_id=0``.

    Args:
        data_path: Passed to ``init_data`` to obtain the train/test data.
        log_path: Passed to ``open_log`` to obtain the log handle.
        learning_rate: Value for the XGBoost ``eta`` parameter.
        max_depth: Value for the XGBoost ``max_depth`` parameter.
        gamma: Value for the XGBoost ``gamma`` parameter.
        lambda_: Value for the XGBoost ``lambda`` parameter.
        num_class: Value for the XGBoost ``num_class`` parameter.
        n_round: Number of boosting rounds handed to ``xgb.train``.
        rand_seed: Forwarded to ``init_data`` (presumably seeds the
            train/test split -- confirm against ``init_data``).

    Returns:
        tuple: ``(acc, bst)`` -- the accuracy reported by ``accuracy_score``
        on the test data and the trained booster.
    """
    # Load the split and wrap it in the containers XGBoost trains on.
    x_train, y_train, x_test, y_test = init_data(data_path, rand_seed)
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)

    param = {
        'objective': 'multi:softmax',
        'eta': learning_rate,
        'max_depth': max_depth,
        'gamma': gamma,
        'lambda': lambda_,
        'gpu_id': 0,
        'tree_method': 'gpu_hist',
        'num_class': num_class,
    }
    bst = xgb.train(param, dtrain, n_round)

    # The predictions are cast to int so they can be compared with the labels.
    preds = bst.predict(dtest).astype(int)
    # accuracy_score is documented as (y_true, y_pred); accuracy itself is
    # symmetric, so the resulting value is unchanged.
    acc = accuracy_score(y_test, preds)

    # Log the result. The handle is closed in ``finally`` so a failing write
    # cannot leak it.
    log_f, _ = open_log(log_path)
    try:
        log_f.write(
            "Experiment with xgboost. Accuracy is: {}\n".format(acc))
        # One "<prediction> <truth>" line per test sample.
        log_f.write("@prediction-truth\n")
        for p, t in zip(preds, y_test):
            log_f.write(f"{p} {t}\n")
        log_f.write("="*80+"\n")
    finally:
        log_f.close()
    return acc, bst

In [28]:
# Input CSV and log directory, both reached through the parent of the
# working directory.
datap = os.path.join("..", "data", "fromraw_cid_inchi_smiles_fp_labels_onehots.csv")
logp = os.path.join("..", "logs", "convention", "xgboost")

# Coarse grid over learning rate, gamma, lambda and tree depth, with a single
# round count and a single seed.
learning_rate = [0.1, 0.2, 0.3, 0.4, 0.5]
gamma = [0, 2, 4, 8, 16]
lambda_ = [0, 1, 2]
max_depth = [6, 7, 8]
n_round = [17]
rand_seeds = [0]

# best[seed] -> [learning_rate, gamma, lambda, max_depth, n_round, accuracy]
# of the most accurate run seen so far for that seed.
best = {}
for lr, g, l, m, n, r in product(learning_rate, gamma, lambda_, max_depth, n_round, rand_seeds):
    top = best.setdefault(r, [0, 0, 0, 0, 0, 0])
    acc, _ = experiment_xgboost(data_path=datap, log_path=logp,
                                learning_rate=lr, gamma=g, lambda_=l,
                                max_depth=m, n_round=n, rand_seed=r)
    print(f"learning_rate: {lr}, gamma: {g}, lambda: {l}, max depth: {m}, rounds: {n}, acc: {acc}")
    # Strictly better accuracy replaces the stored record in place.
    if acc > top[5]:
        top[:] = [lr, g, l, m, n, acc]
print("="*80)

# Per-seed report of the winning configuration.
for k, v in best.items():
    for line in (
        f"Random seed: {str(k)}",
        "Best params:",
        f"  learning rate: {v[0]}",
        f"  gamma: {v[1]}",
        f"  lambda: {v[2]}",
        f"  max_depth: {v[3]}",
        f"  n_round: {v[4]}",
        f"  accuracy: {v[5]}",
        "="*80,
    ):
        print(line)


learning_rate: 0.1, gamma: 0, lambda: 0, max depth: 6, rounds: 17, acc: 0.5345384118786314
learning_rate: 0.1, gamma: 0, lambda: 0, max depth: 7, rounds: 17, acc: 0.5442220787604907
learning_rate: 0.1, gamma: 0, lambda: 0, max depth: 8, rounds: 17, acc: 0.5468043899289864
learning_rate: 0.1, gamma: 0, lambda: 1, max depth: 6, rounds: 17, acc: 0.5326016785022595
learning_rate: 0.1, gamma: 0, lambda: 1, max depth: 7, rounds: 17, acc: 0.540348612007747
learning_rate: 0.1, gamma: 0, lambda: 1, max depth: 8, rounds: 17, acc: 0.539057456423499
learning_rate: 0.1, gamma: 0, lambda: 2, max depth: 6, rounds: 17, acc: 0.5261459005810201
learning_rate: 0.1, gamma: 0, lambda: 2, max depth: 7, rounds: 17, acc: 0.5351839896707553
learning_rate: 0.1, gamma: 0, lambda: 2, max depth: 8, rounds: 17, acc: 0.5422853453841188
learning_rate: 0.1, gamma: 2, lambda: 0, max depth: 6, rounds: 17, acc: 0.5326016785022595
learning_rate: 0.1, gamma: 2, lambda: 0, max depth: 7, rounds: 17, acc: 0.5429309231762427
l

In [30]:
# Learning rate and gamma are held fixed; lambda, tree depth and the number
# of boosting rounds are varied, with a single seed.
learning_rate = [0.2]
gamma = [2]
lambda_ = [1, 2, 3]
max_depth = [8, 10, 15, 17, 20]
n_round = [10, 15, 17, 20]
rand_seeds = [0]

# best[seed] -> [learning_rate, gamma, lambda, max_depth, n_round, accuracy]
# of the most accurate run seen so far for that seed.
best = {}
for lr, g, l, m, n, r in product(learning_rate, gamma, lambda_, max_depth, n_round, rand_seeds):
    top = best.setdefault(r, [0, 0, 0, 0, 0, 0])
    acc, _ = experiment_xgboost(data_path=datap, log_path=logp,
                                learning_rate=lr, gamma=g, lambda_=l,
                                max_depth=m, n_round=n, rand_seed=r)
    print(f"learning_rate: {lr}, gamma: {g}, lambda: {l}, max depth: {m}, rounds: {n}, acc: {acc}")
    # Strictly better accuracy replaces the stored record in place.
    if acc > top[5]:
        top[:] = [lr, g, l, m, n, acc]
print("="*80)

# Per-seed report of the winning configuration.
for k, v in best.items():
    for line in (
        f"Random seed: {str(k)}",
        "Best params:",
        f"  learning rate: {v[0]}",
        f"  gamma: {v[1]}",
        f"  lambda: {v[2]}",
        f"  max_depth: {v[3]}",
        f"  n_round: {v[4]}",
        f"  accuracy: {v[5]}",
        "="*80,
    ):
        print(line)

learning_rate: 0.2, gamma: 2, lambda: 1, max depth: 8, rounds: 10, acc: 0.5448676565526146
learning_rate: 0.2, gamma: 2, lambda: 1, max depth: 8, rounds: 15, acc: 0.5513234344738541
learning_rate: 0.2, gamma: 2, lambda: 1, max depth: 8, rounds: 17, acc: 0.5551969012265978
learning_rate: 0.2, gamma: 2, lambda: 1, max depth: 8, rounds: 20, acc: 0.5584247901872176
learning_rate: 0.2, gamma: 2, lambda: 1, max depth: 10, rounds: 10, acc: 0.5338928340865075
learning_rate: 0.2, gamma: 2, lambda: 1, max depth: 10, rounds: 15, acc: 0.540348612007747
learning_rate: 0.2, gamma: 2, lambda: 1, max depth: 10, rounds: 17, acc: 0.5409941897998709
learning_rate: 0.2, gamma: 2, lambda: 1, max depth: 10, rounds: 20, acc: 0.5442220787604907
learning_rate: 0.2, gamma: 2, lambda: 1, max depth: 15, rounds: 10, acc: 0.539057456423499
learning_rate: 0.2, gamma: 2, lambda: 1, max depth: 15, rounds: 15, acc: 0.551969012265978
learning_rate: 0.2, gamma: 2, lambda: 1, max depth: 15, rounds: 17, acc: 0.551969012265

In [31]:
# Learning rate and gamma are held fixed; lambda, tree depth and the number
# of boosting rounds are varied, with a single seed.
# NOTE(review): the output saved under this cell lists rounds 20-24, which
# this n_round grid cannot produce -- the cell looks to have been edited
# after its last run; re-run to refresh the output.
learning_rate = [0.2]
gamma = [2]
lambda_ = [0, 1]
max_depth = [8, 10, 20]
n_round = [10, 15, 17, 20]
rand_seeds = [0]

# best[seed] -> [learning_rate, gamma, lambda, max_depth, n_round, accuracy]
# of the most accurate run seen so far for that seed.
best = {}
for lr, g, l, m, n, r in product(learning_rate, gamma, lambda_, max_depth, n_round, rand_seeds):
    top = best.setdefault(r, [0, 0, 0, 0, 0, 0])
    acc, _ = experiment_xgboost(data_path=datap, log_path=logp,
                                learning_rate=lr, gamma=g, lambda_=l,
                                max_depth=m, n_round=n, rand_seed=r)
    print(f"learning_rate: {lr}, gamma: {g}, lambda: {l}, max depth: {m}, rounds: {n}, acc: {acc}")
    # Strictly better accuracy replaces the stored record in place.
    if acc > top[5]:
        top[:] = [lr, g, l, m, n, acc]
print("="*80)

# Per-seed report of the winning configuration.
for k, v in best.items():
    for line in (
        f"Random seed: {str(k)}",
        "Best params:",
        f"  learning rate: {v[0]}",
        f"  gamma: {v[1]}",
        f"  lambda: {v[2]}",
        f"  max_depth: {v[3]}",
        f"  n_round: {v[4]}",
        f"  accuracy: {v[5]}",
        "="*80,
    ):
        print(line)

learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 8, rounds: 20, acc: 0.5493867010974822
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 8, rounds: 21, acc: 0.5500322788896062
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 8, rounds: 22, acc: 0.5500322788896062
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 8, rounds: 23, acc: 0.5493867010974822
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 8, rounds: 24, acc: 0.5513234344738541
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 10, rounds: 20, acc: 0.5448676565526146
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 10, rounds: 21, acc: 0.5455132343447385
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 10, rounds: 22, acc: 0.5448676565526146
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 10, rounds: 23, acc: 0.5455132343447385
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 10, rounds: 24, acc: 0.5455132343447385
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 20, rounds: 20, acc: 0.5597159457

In [32]:
# Learning rate, gamma and lambda are held fixed; only tree depth (20-23)
# and the number of boosting rounds (22-27) are varied, with a single seed.
learning_rate = [0.2]
gamma = [2]
lambda_ = [0]
max_depth = [20, 21, 22, 23]
n_round = [22, 23, 24, 25, 26, 27]
rand_seeds = [0]

# best[seed] -> [learning_rate, gamma, lambda, max_depth, n_round, accuracy]
# of the most accurate run seen so far for that seed.
best = {}
for lr, g, l, m, n, r in product(learning_rate, gamma, lambda_, max_depth, n_round, rand_seeds):
    top = best.setdefault(r, [0, 0, 0, 0, 0, 0])
    acc, _ = experiment_xgboost(data_path=datap, log_path=logp,
                                learning_rate=lr, gamma=g, lambda_=l,
                                max_depth=m, n_round=n, rand_seed=r)
    print(f"learning_rate: {lr}, gamma: {g}, lambda: {l}, max depth: {m}, rounds: {n}, acc: {acc}")
    # Strictly better accuracy replaces the stored record in place.
    if acc > top[5]:
        top[:] = [lr, g, l, m, n, acc]
print("="*80)

# Per-seed report of the winning configuration.
for k, v in best.items():
    for line in (
        f"Random seed: {str(k)}",
        "Best params:",
        f"  learning rate: {v[0]}",
        f"  gamma: {v[1]}",
        f"  lambda: {v[2]}",
        f"  max_depth: {v[3]}",
        f"  n_round: {v[4]}",
        f"  accuracy: {v[5]}",
        "="*80,
    ):
        print(line)

learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 20, rounds: 22, acc: 0.5610071013557134
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 20, rounds: 23, acc: 0.5603615235635894
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 20, rounds: 24, acc: 0.5597159457714654
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 20, rounds: 25, acc: 0.5590703679793415
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 20, rounds: 26, acc: 0.5590703679793415
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 20, rounds: 27, acc: 0.5577792123950936
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 21, rounds: 22, acc: 0.5500322788896062
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 21, rounds: 23, acc: 0.5500322788896062
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 21, rounds: 24, acc: 0.5500322788896062
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 21, rounds: 25, acc: 0.5500322788896062
learning_rate: 0.2, gamma: 2, lambda: 0, max depth: 21, rounds: 26, acc: 0.54809

In [34]:
# Everything except gamma is held fixed; gamma is swept from 1.0 to 3.0 in
# steps of 0.1.
learning_rate = [0.2]
# A real list rather than a one-shot ``map`` iterator: the iterator would be
# left exhausted after the loop below, so any later use of ``gamma`` would
# silently iterate over nothing.
gamma = [x / 10. for x in range(10, 31)]
lambda_ = [0]
max_depth = [20]
n_round = [22]
rand_seeds = [0]

# best[seed] -> [learning_rate, gamma, lambda, max_depth, n_round, accuracy]
# of the most accurate run seen so far for that seed.
best = dict()
for lr, g, l, m, n, r in product(learning_rate, gamma, lambda_, max_depth, n_round, rand_seeds):
    top = best.setdefault(r, [0, 0, 0, 0, 0, 0])
    acc, _ = experiment_xgboost(
        data_path=datap,
        log_path=logp,
        learning_rate=lr,
        gamma=g,
        lambda_=l,
        max_depth=m,
        n_round=n,
        rand_seed=r
    )
    print(f"learning_rate: {lr}, gamma: {g}, lambda: {l}, max depth: {m}, rounds: {n}, acc: {acc}")
    # Strictly better accuracy replaces the stored record in place.
    if acc > top[5]:
        top[:] = [lr, g, l, m, n, acc]
print("="*80)

# Per-seed report of the winning configuration.
for k, v in best.items():
    print(f"Random seed: {str(k)}")
    print("Best params:")
    print(f"  learning rate: {v[0]}")
    print(f"  gamma: {v[1]}")
    print(f"  lambda: {v[2]}")
    print(f"  max_depth: {v[3]}")
    print(f"  n_round: {v[4]}")
    print(f"  accuracy: {v[5]}")
    print("="*80)

learning_rate: 0.2, gamma: 1.0, lambda: 0, max depth: 20, rounds: 22, acc: 0.5564880568108457
learning_rate: 0.2, gamma: 1.1, lambda: 0, max depth: 20, rounds: 22, acc: 0.5597159457714654
learning_rate: 0.2, gamma: 1.2, lambda: 0, max depth: 20, rounds: 22, acc: 0.5603615235635894
learning_rate: 0.2, gamma: 1.3, lambda: 0, max depth: 20, rounds: 22, acc: 0.5564880568108457
learning_rate: 0.2, gamma: 1.4, lambda: 0, max depth: 20, rounds: 22, acc: 0.5545513234344739
learning_rate: 0.2, gamma: 1.5, lambda: 0, max depth: 20, rounds: 22, acc: 0.5597159457714654
learning_rate: 0.2, gamma: 1.6, lambda: 0, max depth: 20, rounds: 22, acc: 0.5584247901872176
learning_rate: 0.2, gamma: 1.7, lambda: 0, max depth: 20, rounds: 22, acc: 0.5551969012265978
learning_rate: 0.2, gamma: 1.8, lambda: 0, max depth: 20, rounds: 22, acc: 0.552614590058102
learning_rate: 0.2, gamma: 1.9, lambda: 0, max depth: 20, rounds: 22, acc: 0.55390574564235
learning_rate: 0.2, gamma: 2.0, lambda: 0, max depth: 20, round

In [49]:
# One fixed configuration (lambda = 1) evaluated once per seed, over ten seeds.
learning_rate = [0.2]
gamma = [1]
lambda_ = [1]
max_depth = [20]
n_round = [21]
rand_seeds = [0, 1029, 1829, 189, 9382, 128989, 812, 204, 486, 6987]

# best[seed] -> [learning_rate, gamma, lambda, max_depth, n_round, accuracy]
# of the most accurate run seen so far for that seed.
best = {}
for lr, g, l, m, n, r in product(learning_rate, gamma, lambda_, max_depth, n_round, rand_seeds):
    top = best.setdefault(r, [0, 0, 0, 0, 0, 0])
    acc, _ = experiment_xgboost(data_path=datap, log_path=logp,
                                learning_rate=lr, gamma=g, lambda_=l,
                                max_depth=m, n_round=n, rand_seed=r)
    print(f"learning_rate: {lr}, gamma: {g}, lambda: {l}, max depth: {m}, rounds: {n}, acc: {acc:.5f}")
    # Strictly better accuracy replaces the stored record in place.
    if acc > top[5]:
        top[:] = [lr, g, l, m, n, acc]
print("="*80)

# Mean and sample standard deviation of the per-seed best accuracies.
results = [v[5] for v in best.values()]
print(f"Best result: {statistics.mean(results):.5f} +- {statistics.stdev(results):.5f}")
print("="*80)

learning_rate: 0.2, gamma: 1, lambda: 1, max depth: 20, rounds: 21, acc: 0.55649
learning_rate: 0.2, gamma: 1, lambda: 1, max depth: 20, rounds: 21, acc: 0.57134
learning_rate: 0.2, gamma: 1, lambda: 1, max depth: 20, rounds: 21, acc: 0.55713
learning_rate: 0.2, gamma: 1, lambda: 1, max depth: 20, rounds: 21, acc: 0.53518
learning_rate: 0.2, gamma: 1, lambda: 1, max depth: 20, rounds: 21, acc: 0.55326
learning_rate: 0.2, gamma: 1, lambda: 1, max depth: 20, rounds: 21, acc: 0.54874
learning_rate: 0.2, gamma: 1, lambda: 1, max depth: 20, rounds: 21, acc: 0.54229
learning_rate: 0.2, gamma: 1, lambda: 1, max depth: 20, rounds: 21, acc: 0.54745
learning_rate: 0.2, gamma: 1, lambda: 1, max depth: 20, rounds: 21, acc: 0.53518
learning_rate: 0.2, gamma: 1, lambda: 1, max depth: 20, rounds: 21, acc: 0.56617
Best result: 0.55132 +- 0.01206


In [46]:
# One fixed configuration (lambda = 2) evaluated once per seed, over ten seeds.
learning_rate = [0.2]
gamma = [1]
lambda_ = [2]
max_depth = [20]
n_round = [21]
rand_seeds = [0, 1029, 1829, 189, 9382, 128989, 812, 204, 486, 6987]

# best[seed] -> [learning_rate, gamma, lambda, max_depth, n_round, accuracy]
# of the most accurate run seen so far for that seed.
best = {}
for lr, g, l, m, n, r in product(learning_rate, gamma, lambda_, max_depth, n_round, rand_seeds):
    top = best.setdefault(r, [0, 0, 0, 0, 0, 0])
    acc, _ = experiment_xgboost(data_path=datap, log_path=logp,
                                learning_rate=lr, gamma=g, lambda_=l,
                                max_depth=m, n_round=n, rand_seed=r)
    print(f"learning_rate: {lr}, gamma: {g}, lambda: {l}, max depth: {m}, rounds: {n}, acc: {acc:.5f}")
    # Strictly better accuracy replaces the stored record in place.
    if acc > top[5]:
        top[:] = [lr, g, l, m, n, acc]
print("="*80)

# Mean and sample standard deviation of the per-seed best accuracies.
results = [v[5] for v in best.values()]
print(f"Best result: {statistics.mean(results):.5f} +- {statistics.stdev(results):.5f}")
print("="*80)

learning_rate: 0.2, gamma: 1, lambda: 2, max depth: 20, rounds: 21, acc: 0.56036
learning_rate: 0.2, gamma: 1, lambda: 2, max depth: 20, rounds: 21, acc: 0.55972
learning_rate: 0.2, gamma: 1, lambda: 2, max depth: 20, rounds: 21, acc: 0.55003
learning_rate: 0.2, gamma: 1, lambda: 2, max depth: 20, rounds: 21, acc: 0.53454
learning_rate: 0.2, gamma: 1, lambda: 2, max depth: 20, rounds: 21, acc: 0.54616
learning_rate: 0.2, gamma: 1, lambda: 2, max depth: 20, rounds: 21, acc: 0.54616
learning_rate: 0.2, gamma: 1, lambda: 2, max depth: 20, rounds: 21, acc: 0.55197
learning_rate: 0.2, gamma: 1, lambda: 2, max depth: 20, rounds: 21, acc: 0.55197
learning_rate: 0.2, gamma: 1, lambda: 2, max depth: 20, rounds: 21, acc: 0.54164
learning_rate: 0.2, gamma: 1, lambda: 2, max depth: 20, rounds: 21, acc: 0.55584
Best result: 0.54984 +- 0.00804
