In [1]:
import sys
sys.path.append("..")

In [3]:
import os
import random
import statistics
from itertools import product

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

from src.data_loaders.cvs_loader import CVSLoader
from src.training.experiment_conventional_multiclass import init_data
from src.utils.training_utils import open_log
from src.utils.label_convertors import convert2vec
from src.training.experiment_conventional_multiclass import experiment

In [4]:
# from functools import partial


# from sklearn.ensemble import BaggingClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import accuracy_score
# import numpy as np

# from utils.data_loaders.cvs_loader import CVSLoader
# from utils.label_convertors import convert2vec

In [5]:
def experiment_rf(data_path,
                  log_path,
                  max_features,
                  n_estimators,
                  max_depth,
                  min_samples_leaf,
                  rand_seed=None,
                  n_jobs=-1):
    r""" Test random forest
    """
    model = RandomForestClassifier(bootstrap=True,
                                   max_depth=max_depth,
                                   max_features=max_features,
                                   n_estimators=n_estimators,
                                   min_samples_leaf=min_samples_leaf,
                                   n_jobs=n_jobs)
    acc, model = experiment(data_path, model, log_path, rand_seed)
    return acc, model

def do_experiment(data_path, log_path, n_estimators, max_depth, max_features, min_samples_leaf, rand_seeds):
    best = dict()
    for ne, md, mf, msl, rs in product(n_estimators, max_depth, max_features, min_samples_leaf, rand_seeds):
        best.setdefault(rs, [0, 0, 0, 0, 0])
        acc, _ = experiment_rf(
            data_path=data_path,
            log_path=log_path,
            n_estimators=ne,
            max_depth=md,
            max_features=mf,
            min_samples_leaf=msl,
            rand_seed=rs,
            n_jobs=30
        )
        print(f"num_estimators: {ne}, max depth: {md}, max features: {mf}, min samples leaf: {msl}, acc: {acc:.5f}")
        if acc > best.get(rs)[-1]:
            best.get(rs)[0] = ne
            best.get(rs)[1] = md
            best.get(rs)[2] = mf
            best.get(rs)[3] = msl
            best.get(rs)[4] = acc
    print("="*80)
    for k, v in best.items():
        print(f"Random seed: {str(k)}")
        print(f"Accuracy: {v[4]:.5f}")
        print("Best params:")
        print(f"  num_estimators: {v[0]}")
        print(f"  max_depth: {v[1]}")
        print(f"  max_features: {v[2]}")
        print(f"  min samples leaf: {v[3]}")
        print("="*80)

In [7]:
experiment_id = 7

In [8]:
datap = os.path.join("..", "data", "fromraw_cid_inchi_smiles_fp_labels_onehots.csv")
logp = os.path.join("..", "logs", "convention", "randomforest", "experiment_"+str(experiment_id))

n_estimators = [100, 500, 1000, 1500]
max_depth = [5, 10, 15]
max_features = ["auto", "log2", None, 512, 1024]
min_samples_leaf = [1, 5, 10, 15, 20]
# rand_seeds = [0, 1029, 1829, 189, 9382, 128989, 812, 204, 486, 6987]
rand_seeds = [0]

do_experiment(
    data_path=datap,
    log_path=logp,
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    rand_seeds=rand_seeds
)
experiment_id += 1

num_estimators: 100, max depth: 5, max features: auto, min samples leaf: 1, acc: 0.44739
num_estimators: 100, max depth: 5, max features: auto, min samples leaf: 5, acc: 0.44803
num_estimators: 100, max depth: 5, max features: auto, min samples leaf: 10, acc: 0.44609
num_estimators: 100, max depth: 5, max features: auto, min samples leaf: 15, acc: 0.44480
num_estimators: 100, max depth: 5, max features: auto, min samples leaf: 20, acc: 0.44028
num_estimators: 100, max depth: 5, max features: log2, min samples leaf: 1, acc: 0.43125
num_estimators: 100, max depth: 5, max features: log2, min samples leaf: 5, acc: 0.43125
num_estimators: 100, max depth: 5, max features: log2, min samples leaf: 10, acc: 0.43125
num_estimators: 100, max depth: 5, max features: log2, min samples leaf: 15, acc: 0.43125
num_estimators: 100, max depth: 5, max features: log2, min samples leaf: 20, acc: 0.43125
num_estimators: 100, max depth: 5, max features: None, min samples leaf: 1, acc: 0.47256
num_estimators:

In [10]:
logp = os.path.join("..", "logs", "convention", "randomforest", "experiment_"+str(experiment_id))

n_estimators = [1500, 2000]
max_depth = [15, 20, 25, 30, 35, 40]
max_features = [512, 1024, 2048]
min_samples_leaf = [1]
# rand_seeds = [0, 1029, 1829, 189, 9382, 128989, 812, 204, 486, 6987]
rand_seeds = [0]

do_experiment(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    rand_seeds=rand_seeds
)

experiment_id += 1

num_estimators: 1500, max depth: 15, max features: 512, min samples leaf: 1, acc: 0.53002
num_estimators: 1500, max depth: 15, max features: 1024, min samples leaf: 1, acc: 0.53260
num_estimators: 1500, max depth: 15, max features: 2048, min samples leaf: 1, acc: 0.52744
num_estimators: 1500, max depth: 20, max features: 512, min samples leaf: 1, acc: 0.53712
num_estimators: 1500, max depth: 20, max features: 1024, min samples leaf: 1, acc: 0.53325
num_estimators: 1500, max depth: 20, max features: 2048, min samples leaf: 1, acc: 0.53260
num_estimators: 1500, max depth: 15, max features: 512, min samples leaf: 1, acc: 0.53196
num_estimators: 1500, max depth: 15, max features: 1024, min samples leaf: 1, acc: 0.52808
num_estimators: 1500, max depth: 15, max features: 2048, min samples leaf: 1, acc: 0.53066
num_estimators: 1500, max depth: 30, max features: 512, min samples leaf: 1, acc: 0.53906
num_estimators: 1500, max depth: 30, max features: 1024, min samples leaf: 1, acc: 0.53712
num

In [11]:
logp = os.path.join("..", "logs", "convention", "randomforest", "experiment_"+str(experiment_id))

n_estimators = [1500, 2000, 2500, 3000]
max_depth = [40, 45, 50, 55, 60]
max_features = [512]
min_samples_leaf = [1]
# rand_seeds = [0, 1029, 1829, 189, 9382, 128989, 812, 204, 486, 6987]
rand_seeds = [0]

do_experiment(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    rand_seeds=rand_seeds
)
    
experiment_id += 1

num_estimators: 1500, max depth: 40, max features: 512, min samples leaf: 1, acc: 0.53970
num_estimators: 1500, max depth: 45, max features: 512, min samples leaf: 1, acc: 0.54164
num_estimators: 1500, max depth: 50, max features: 512, min samples leaf: 1, acc: 0.53777
num_estimators: 1500, max depth: 55, max features: 512, min samples leaf: 1, acc: 0.54035
num_estimators: 1500, max depth: 60, max features: 512, min samples leaf: 1, acc: 0.54035
num_estimators: 2000, max depth: 40, max features: 512, min samples leaf: 1, acc: 0.54099
num_estimators: 2000, max depth: 45, max features: 512, min samples leaf: 1, acc: 0.53648
num_estimators: 2000, max depth: 50, max features: 512, min samples leaf: 1, acc: 0.53970
num_estimators: 2000, max depth: 55, max features: 512, min samples leaf: 1, acc: 0.54035
num_estimators: 2000, max depth: 60, max features: 512, min samples leaf: 1, acc: 0.53906
num_estimators: 2500, max depth: 40, max features: 512, min samples leaf: 1, acc: 0.54035
num_estima

In [14]:
logp = os.path.join("..", "logs", "convention", "randomforest", "experiment_"+str(experiment_id))

n_estimators = [1300, 1400, 1500, 1600, 1700]
max_depth = [43, 44, 45, 46, 47]
max_features = [512]
min_samples_leaf = [1]
# rand_seeds = [0, 1029, 1829, 189, 9382, 128989, 812, 204, 486, 6987]
rand_seeds = [0]

do_experiment(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    rand_seeds=rand_seeds
)
    
experiment_id += 1

num_estimators: 1300, max depth: 43, max features: 512, min samples leaf: 1, acc: 0.54035
num_estimators: 1300, max depth: 44, max features: 512, min samples leaf: 1, acc: 0.54293
num_estimators: 1300, max depth: 45, max features: 512, min samples leaf: 1, acc: 0.54164
num_estimators: 1300, max depth: 46, max features: 512, min samples leaf: 1, acc: 0.53777
num_estimators: 1300, max depth: 47, max features: 512, min samples leaf: 1, acc: 0.53777
num_estimators: 1400, max depth: 43, max features: 512, min samples leaf: 1, acc: 0.53970
num_estimators: 1400, max depth: 44, max features: 512, min samples leaf: 1, acc: 0.53970
num_estimators: 1400, max depth: 45, max features: 512, min samples leaf: 1, acc: 0.54035
num_estimators: 1400, max depth: 46, max features: 512, min samples leaf: 1, acc: 0.53970
num_estimators: 1400, max depth: 47, max features: 512, min samples leaf: 1, acc: 0.53970
num_estimators: 1500, max depth: 43, max features: 512, min samples leaf: 1, acc: 0.53970
num_estima

In [15]:
logp = os.path.join("..", "logs", "convention", "randomforest", "experiment_"+str(experiment_id))

n_estimators = [1300]
max_depth = [44]
max_features = [512]
min_samples_leaf = [1]
rand_seeds = [0, 1029, 1829, 189, 9382, 128989, 812, 204, 486, 6987]
# rand_seeds = [0]

do_experiment(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    rand_seeds=rand_seeds
)
    
experiment_id += 1

num_estimators: 1300, max depth: 44, max features: 512, min samples leaf: 1, acc: 0.53970
num_estimators: 1300, max depth: 44, max features: 512, min samples leaf: 1, acc: 0.55649
num_estimators: 1300, max depth: 44, max features: 512, min samples leaf: 1, acc: 0.54874
num_estimators: 1300, max depth: 44, max features: 512, min samples leaf: 1, acc: 0.52937
num_estimators: 1300, max depth: 44, max features: 512, min samples leaf: 1, acc: 0.54939
num_estimators: 1300, max depth: 44, max features: 512, min samples leaf: 1, acc: 0.53906
num_estimators: 1300, max depth: 44, max features: 512, min samples leaf: 1, acc: 0.55003
num_estimators: 1300, max depth: 44, max features: 512, min samples leaf: 1, acc: 0.54487
num_estimators: 1300, max depth: 44, max features: 512, min samples leaf: 1, acc: 0.52937
num_estimators: 1300, max depth: 44, max features: 512, min samples leaf: 1, acc: 0.55649
Random seed: 0
Accuracy: 0.53970
Best params:
  num_estimators: 1300
  max_depth: 44
  max_features: