In [1]:
import sys
sys.path.append("..")

In [2]:
import os
import random
import statistics
from itertools import product

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

from src.data_loaders.cvs_loader import CVSLoader
from src.training.experiment_conventional_multiclass import init_data
from src.utils.training_utils import open_log
from src.utils.label_convertors import convert2vec
from src.training.experiment_conventional_multiclass import experiment

In [3]:
def experiment_rf(data_path,
                  log_path,
                  max_features,
                  n_estimators,
                  max_depth,
                  min_samples_leaf,
                  rand_seed=None,
                  n_jobs=-1):
    r""" Test random forest
    """
    model = RandomForestClassifier(bootstrap=True,
                                   max_depth=max_depth,
                                   max_features=max_features,
                                   n_estimators=n_estimators,
                                   min_samples_leaf=min_samples_leaf,
                                   n_jobs=n_jobs)
    acc, model = experiment(data_path, model, log_path, rand_seed)
    return acc, model

def do_experiment(data_path, log_path, n_estimators, max_depth, max_features, min_samples_leaf, rand_seeds):
    best = dict()
    for ne, md, mf, msl, rs in product(n_estimators, max_depth, max_features, min_samples_leaf, rand_seeds):
        best.setdefault(rs, [0, 0, 0, 0, 0])
        acc, _ = experiment_rf(
            data_path=data_path,
            log_path=log_path,
            n_estimators=ne,
            max_depth=md,
            max_features=mf,
            min_samples_leaf=msl,
            rand_seed=rs,
            n_jobs=30
        )
        print(f"num_estimators: {ne}, max depth: {md}, max features: {mf}, min samples leaf: {msl}, acc: {acc:.5f}")
        if acc > best.get(rs)[-1]:
            best.get(rs)[0] = ne
            best.get(rs)[1] = md
            best.get(rs)[2] = mf
            best.get(rs)[3] = msl
            best.get(rs)[4] = acc
    print("="*80)
    for k, v in best.items():
        print(f"Random seed: {str(k)}")
        print(f"Accuracy: {v[4]:.5f}")
        print("Best params:")
        print(f"  num_estimators: {v[0]}")
        print(f"  max_depth: {v[1]}")
        print(f"  max_features: {v[2]}")
        print(f"  min samples leaf: {v[3]}")
        print("="*80)

In [4]:
experiment_id = 0

In [5]:
datap = os.path.join("..", "data", "cyp450_smiles_GINfp_labels.json")
logp = os.path.join("..", "logs", "convention", "randomforest_ginfp", "experiment_"+str(experiment_id))

n_estimators = [100, 500, 1000, 1500]
max_depth = [5, 10, 15]
max_features = ["auto", "log2", 150, None]
min_samples_leaf = [1, 5, 10, 15, 20]
# rand_seeds = [0, 1029, 1829, 189, 9382, 128989, 812, 204, 486, 6987]
rand_seeds = [0]

do_experiment(
    data_path=datap,
    log_path=logp,
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    rand_seeds=rand_seeds
)
experiment_id += 1

num_estimators: 100, max depth: 5, max features: auto, min samples leaf: 1, acc: 0.44932
num_estimators: 100, max depth: 5, max features: auto, min samples leaf: 5, acc: 0.44739
num_estimators: 100, max depth: 5, max features: auto, min samples leaf: 10, acc: 0.44868
num_estimators: 100, max depth: 5, max features: auto, min samples leaf: 15, acc: 0.44674
num_estimators: 100, max depth: 5, max features: auto, min samples leaf: 20, acc: 0.44674
num_estimators: 100, max depth: 5, max features: log2, min samples leaf: 1, acc: 0.44416
num_estimators: 100, max depth: 5, max features: log2, min samples leaf: 5, acc: 0.44287
num_estimators: 100, max depth: 5, max features: log2, min samples leaf: 10, acc: 0.44351
num_estimators: 100, max depth: 5, max features: log2, min samples leaf: 15, acc: 0.44287
num_estimators: 100, max depth: 5, max features: log2, min samples leaf: 20, acc: 0.44351
num_estimators: 100, max depth: 5, max features: 150, min samples leaf: 1, acc: 0.45965
num_estimators: 

In [7]:
logp = os.path.join(
    "..", "logs", "convention", "randomforest", "experiment_" + str(experiment_id)
)

n_estimators = [500, 1000, 1500]
max_depth = [15, 20, 25, 30, 35, 40]
max_features = [100, 200, 300]
min_samples_leaf = [1]
# rand_seeds = [0, 1029, 1829, 189, 9382, 128989, 812, 204, 486, 6987]
rand_seeds = [0]

do_experiment(
    data_path=datap,
    log_path=logp,
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    rand_seeds=rand_seeds,
)

experiment_id += 1

num_estimators: 500, max depth: 15, max features: 100, min samples leaf: 1, acc: 0.54099
num_estimators: 500, max depth: 15, max features: 200, min samples leaf: 1, acc: 0.53583
num_estimators: 500, max depth: 15, max features: 300, min samples leaf: 1, acc: 0.53648
num_estimators: 500, max depth: 20, max features: 100, min samples leaf: 1, acc: 0.54229
num_estimators: 500, max depth: 20, max features: 200, min samples leaf: 1, acc: 0.54164
num_estimators: 500, max depth: 20, max features: 300, min samples leaf: 1, acc: 0.54229
num_estimators: 500, max depth: 25, max features: 100, min samples leaf: 1, acc: 0.54099
num_estimators: 500, max depth: 25, max features: 200, min samples leaf: 1, acc: 0.54229
num_estimators: 500, max depth: 25, max features: 300, min samples leaf: 1, acc: 0.54035
num_estimators: 500, max depth: 30, max features: 100, min samples leaf: 1, acc: 0.54164
num_estimators: 500, max depth: 30, max features: 200, min samples leaf: 1, acc: 0.54745
num_estimators: 500, 

In [9]:
logp = os.path.join("..", "logs", "convention", "randomforest_ginfp", "experiment_"+str(experiment_id))

n_estimators = [1000]
max_depth = [20]
max_features = [200]
min_samples_leaf = [1]
rand_seeds = [0, 1029, 1829, 189, 9382, 128989, 812, 204, 486, 6987]
# rand_seeds = [0]

do_experiment(
    data_path=datap,
    log_path=logp,
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    rand_seeds=rand_seeds
)
    
experiment_id += 1

num_estimators: 1000, max depth: 20, max features: 200, min samples leaf: 1, acc: 0.54422
num_estimators: 1000, max depth: 20, max features: 200, min samples leaf: 1, acc: 0.55326
num_estimators: 1000, max depth: 20, max features: 200, min samples leaf: 1, acc: 0.54874
num_estimators: 1000, max depth: 20, max features: 200, min samples leaf: 1, acc: 0.53777
num_estimators: 1000, max depth: 20, max features: 200, min samples leaf: 1, acc: 0.54099
num_estimators: 1000, max depth: 20, max features: 200, min samples leaf: 1, acc: 0.53325
num_estimators: 1000, max depth: 20, max features: 200, min samples leaf: 1, acc: 0.55584
num_estimators: 1000, max depth: 20, max features: 200, min samples leaf: 1, acc: 0.54551
num_estimators: 1000, max depth: 20, max features: 200, min samples leaf: 1, acc: 0.53260
num_estimators: 1000, max depth: 20, max features: 200, min samples leaf: 1, acc: 0.54551
Random seed: 0
Accuracy: 0.54422
Best params:
  num_estimators: 1000
  max_depth: 20
  max_features: