In [10]:
import sys
import json
import os
import numpy as np
import joblib
from catboost.utils import read_cd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
from catboost import Pool, CatBoostRegressor
from scipy.stats import ttest_rel
import math
from collections import defaultdict
import pandas as pd

In [2]:
def create_dir(name):
    """Create the directory `name`, including any missing parents.

    Calling this for a directory that already exists is a no-op.

    Raises:
        FileExistsError: if `name` exists but is not a directory.
    """
    # exist_ok=True replaces the separate "exists?" check, which could race
    # with another process creating the same directory in between.
    os.makedirs(name, exist_ok=True)

In [3]:
def generate_rf_ensemble_regression(dataset_name, n_splits=1,
                                    num_models=10, n_estimators = 1000, compress=3, 
                                    n_jobs=-1, max_depth=10):
    """Train and save an ensemble of random-forest regressors per fold.

    The training data is read from ``datasets/<dataset_name>/full_train`` using
    the column description ``datasets/<dataset_name>/pool.cd``. Every model is
    fitted on that same data; ensemble members differ only in `random_state`.
    Each fitted model is dumped with joblib to
    ``results/models/<dataset_name>_rf_f<fold>_<i>``.

    Args:
        dataset_name: name of the sub-directory of ``datasets`` to read from.
        n_splits: number of folds to generate models for.
        num_models: number of ensemble members per fold.
        n_estimators: forwarded to `RandomForestRegressor`.
        compress: forwarded to `joblib.dump`.
        n_jobs: forwarded to `RandomForestRegressor`.
        max_depth: forwarded to `RandomForestRegressor`.
    """
    # The training data does not depend on the fold, so load it once.
    data_dir = os.path.join('datasets', dataset_name)
    full_train_pool = Pool(data=os.path.join(data_dir, 'full_train'),
                           column_description=os.path.join(data_dir, 'pool.cd'))
    train_features = full_train_pool.get_features()
    train_labels = full_train_pool.get_label()

    # Make sure the output location exists so joblib.dump cannot fail on it.
    models_dir = "results/models"
    os.makedirs(models_dir, exist_ok=True)

    # Seeds are 10 * fold + i for the usual num_models <= 10. For larger
    # ensembles the stride grows so that seeds never repeat across folds
    # (a repeated seed would produce an identical model, since every fold
    # trains on the same data).
    seed_stride = max(10, num_models)

    for fold in range(n_splits):
        for i in range(num_models):
            seed = seed_stride * fold + i  # distinct seed per (fold, member)
            model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth,
                                          n_jobs=n_jobs, random_state=seed, max_features=1.0)
            model.fit(train_features, train_labels)

            model_file = dataset_name + "_" + "rf" + "_f" + str(fold) + "_" + str(i)
            joblib.dump(model, os.path.join(models_dir, model_file), compress=compress)

In [4]:
# Datasets for which random-forest ensembles are generated.
datasets = ["parkinsons"]

# Output directory that generate_rf_ensemble_regression dumps the models into.
create_dir("results/models")

print('Regression - Bagging: Generating models..')
for name in datasets:
    print("Dataset =", name)

    # Training all models
    print("Training models...")
    # A single fold; the remaining arguments keep their defaults.
    generate_rf_ensemble_regression(name, n_splits=1)
print('Done!!')

Regression - Bagging: Generating models..
Dataset = parkinsons
Training models...
Done!!


----

In [2]:
def prr_regression(targets, preds, measure, pos_label=1):
    """Score how well `measure` ranks samples for rejection.

    Samples are sorted ascending by `measure`; the `i` samples at the end of
    that ordering have their prediction replaced by the target, and the
    relative MSE reduction is recorded against the rejected fraction. The area
    under that curve is rescaled between a diagonal baseline curve and the
    curve obtained when sorting by the actual squared error.

    Args:
        targets: 1-D array of ground-truth values.
        preds: predictions; squeezed before use.
        measure: 1-D array of per-sample scores used for ranking.
        pos_label: when not equal to 1, `measure` is negated before ranking.

    Returns:
        (area_measure - area_baseline) / (area_error_sorted - area_baseline).
    """
    measure_loc = measure if pos_label == 1 else -1.0 * measure
    preds = np.squeeze(preds)

    # Per-sample squared error and the MSE before any rejection.
    error = (preds - targets) ** 2
    MSE_0 = np.mean(error)

    # Columns: 0 = prediction, 1 = target, 2 = squared error, 3 = ranking score.
    array = np.concatenate(
        [column[:, np.newaxis] for column in (preds, targets, error, measure_loc)], axis=1)

    n_samples = array.shape[0]
    fractions = [float(i) / float(n_samples) for i in range(1, n_samples)]

    def rejection_curve(ranking):
        """Return [fraction, relative MSE gain] points for one sample ordering."""
        ordered = array[ranking]
        curve = [[0.0, 0.0]]
        for i, fraction in zip(range(1, n_samples), fractions):
            # Keep predictions for the first n - i samples, use targets for the last i.
            patched = np.concatenate((ordered[:-i, 0], ordered[-i:, 1]), axis=0)
            mse = np.mean((patched - ordered[:, 1]) ** 2)
            curve.append([fraction, (MSE_0 - mse) / MSE_0])
        return curve

    def area(curve):
        """Area under a curve given as a list of [x, y] points."""
        return auc([point[0] for point in curve], [point[1] for point in curve])

    results_max = rejection_curve(array[:, 2].argsort())  # sorted by error
    results_min = [[0.0, 0.0]] + [[fraction, fraction] for fraction in fractions]  # diagonal
    results_var = rejection_curve(array[:, 3].argsort())  # sorted by the supplied measure

    max_auc = area(results_max)
    var_auc = area(results_var)
    min_auc = area(results_min)

    return (var_auc - min_auc) / (max_auc - min_auc)

def nll_regression(target, mu, var, epsilon=1e-8, raw=False):
    """Gaussian negative log-likelihood of `target` under N(mu, var).

    Args:
        target: observed values.
        mu: predicted means.
        var: predicted variances.
        epsilon: small constant added to the denominator and inside the log.
        raw: if truthy, return the per-element values instead of their mean.

    Returns:
        The element-wise NLL when `raw` is truthy, otherwise its mean.
    """
    quadratic_term = (target - mu)**2 / (2.0 * var + epsilon)
    log_variance_term = np.log(var + epsilon) / 2.0
    normaliser = np.log(2 * np.pi) / 2.0

    per_element = quadratic_term + log_variance_term + normaliser
    return per_element if raw else np.mean(per_element)

def ens_nll_regression(target, preds, epsilon=1e-8, raw=False):
    """Negative log of the likelihood averaged over axis 0 of `preds`.

    Each slice ``preds[m]`` supplies Gaussian parameters (``[..., 0]`` is the
    mean, ``[..., 1]`` the variance). The per-slice Gaussian likelihoods of
    `target` are averaged over axis 0 and the negative log of that average is
    returned.

    The average is evaluated in log space (max-shifted log-mean-exp), so very
    unlikely targets yield a large finite value instead of ``inf`` caused by
    ``exp(-nll)`` underflowing to zero.

    Args:
        target: observed values, broadcastable against ``preds[:, :, 0]``.
        preds: 3-D array indexed as ``[member, sample, (mean, var)]``.
        epsilon: small constant added to the denominator and inside the log.
        raw: if truthy, return per-sample values instead of their mean.

    Returns:
        Per-sample values when `raw` is truthy, otherwise their mean.
    """
    mu = preds[:, :, 0]
    var = preds[:, :, 1]
    nll = (target - mu)**2 / (2.0 * var + epsilon) + np.log(var + epsilon) / 2.0 + np.log(2 * np.pi) / 2.0

    log_lik = -1 * nll
    # Shift by the per-sample maximum so the largest exponent is exp(0) = 1.
    shift = np.max(log_lik, axis=0, keepdims=True)
    # If every member has -inf log-likelihood, shifting would give inf - inf = nan;
    # use a zero shift there so the result stays +inf as before.
    shift = np.where(np.isfinite(shift), shift, 0.0)
    log_mean_lik = shift[0] + np.log(np.mean(np.exp(log_lik - shift), axis=0))

    ens_nll = -1 * log_mean_lik
    if raw: # for individual predictions
        return ens_nll
    return np.mean(ens_nll)

def ood_detect(domain_labels, in_measure, out_measure, mode, pos_label=1):
    """Score how well a per-sample measure separates two groups of samples.

    `in_measure` and `out_measure` are concatenated (in that order) and scored
    against `domain_labels`, which must therefore follow the same ordering.

    Args:
        domain_labels: binary labels for the concatenated scores.
        in_measure: scores of the first group.
        out_measure: scores of the second group.
        mode: ``'PR'`` for the area under the precision-recall curve or
            ``'ROC'`` for the ROC AUC.
        pos_label: when not equal to 1, the scores are negated before scoring.

    Returns:
        The requested area as a float.

    Raises:
        ValueError: if `mode` is neither ``'PR'`` nor ``'ROC'`` (previously
            this silently returned ``None``).
    """
    if mode not in ('PR', 'ROC'):
        raise ValueError("Unsupported mode %r; expected 'PR' or 'ROC'" % (mode,))

    scores = np.concatenate((in_measure, out_measure), axis=0)
    scores = np.asarray(scores, dtype=np.longdouble)
    if pos_label != 1:
        scores *= -1.0

    if mode == 'PR':
        precision, recall, _ = precision_recall_curve(domain_labels, scores)
        return auc(recall, precision)

    return roc_auc_score(domain_labels, scores)

In [3]:
def normal_KL(params1, params2, epsilon=1e-20):
    """KL divergence KL(N1 || N2) between two univariate Gaussians.

    Args:
        params1: indexable with ``[0]`` = mean and ``[1]`` = variance of N1.
        params2: indexable with ``[0]`` = mean and ``[1]`` = variance of N2.
        epsilon: added to each variance before taking its log.

    Returns:
        ``0.5 * (mu_1 - mu_2)**2 / var_2
        + 0.5 * (var_1 / var_2 - 1 + log(var_2) - log(var_1))``
        with the epsilon-adjusted variances.
    """
    mu_1 = params1[0]
    mu_2 = params2[0]

    logvar1 = np.log(params1[1] + epsilon)
    logvar2 = np.log(params2[1] + epsilon)

    # Square the difference directly instead of exp(2 * log|diff| - logvar2):
    # the log form evaluates log(0) whenever the two means coincide, which
    # emits a divide-by-zero warning (or raises under np.errstate).
    mean_term = 0.5 * np.square(mu_1 - mu_2) * np.exp(-logvar2)
    sigma_term = 0.5 * (np.exp(logvar1 - logvar2) - 1.0 + logvar2 - logvar1)

    return mean_term + sigma_term

def epkl_reg(preds):
    """Average pairwise KL divergence between the models' Gaussians, per sample.

    preds: array [n_samples, n_models, 2]

    For every sample, `normal_KL` is summed over all ordered pairs of distinct
    models and divided by ``n_models * (n_models - 1)``.

    Returns an array with one value per sample.
    """
    n_models = preds.shape[1]
    n_ordered_pairs = n_models * (n_models - 1)

    per_sample = []
    for sample in preds:
        total = sum(
            (normal_KL(first, second)
             for i, first in enumerate(sample)
             for j, second in enumerate(sample)
             if i != j),
            0.0,
        )
        per_sample.append(total / n_ordered_pairs)
    return np.asarray(per_sample)

def ensemble_uncertainties_regression(preds):
    """Compute per-sample uncertainty measures from ensemble predictions.

    preds: array [n_samples, n_models, 2] - last dim is mean, var

    Returns a dict with:
        'tvar': variance of the means plus mean of the variances,
        'mvar': mean over models of the predicted variances,
        'varm': variance over models of the predicted means,
        'epkl': result of `epkl_reg(preds)`.
    """
    pairwise_kl = epkl_reg(preds)

    member_means = preds[:, :, 0]
    member_vars = preds[:, :, 1]
    variance_of_means = np.var(member_means, axis=1)
    mean_of_variances = np.mean(member_vars, axis=1)

    return {'tvar': variance_of_means + mean_of_variances,
            'mvar': mean_of_variances,
            'varm': variance_of_means,
            'epkl': pairwise_kl}

In [4]:
def calc_rmse(preds, target, raw=False):
    """Root-mean-squared error between `preds` and `target`.

    When `raw` is truthy the element-wise squared errors are returned instead
    (no mean and no square root are applied in that case).
    """
    squared_error = (preds - target)**2
    if not raw:
        return np.sqrt(np.mean(squared_error))
    return squared_error # for individual predictions

def ens_rmse(target, preds, epsilon=1e-8, raw=False):
    """RMSE of the prediction obtained by averaging the means over axis 0.

    ``preds[:, :, 0]`` holds the predicted means; they are averaged over
    axis 0 and scored with `calc_rmse`. `epsilon` is accepted but not used.
    With a truthy `raw`, `calc_rmse` returns per-sample squared errors.
    """
    averaged_mean = np.mean(preds[:, :, 0], axis=0)
    # calc_rmse only tests the truthiness of `raw`, so it can be forwarded as is.
    return calc_rmse(averaged_mean, target, raw=raw)

In [16]:
def load_and_predict(X, name, alg, fold, i, models_dir=None):
    """Load one saved ensemble member and predict on `X`.

    The model file is ``<models_dir>/<name>_<alg>_f<fold>_<i>``. ``alg == "rf"``
    is loaded with joblib; any other value is loaded as a CatBoostRegressor.

    Args:
        X: features to predict on.
        name: dataset name used in the model file name.
        alg: algorithm tag used in the model file name.
        fold: fold index used in the model file name.
        i: ensemble-member index used in the model file name.
        models_dir: directory holding the model files. When ``None`` the
            historical defaults are used: ``results_best/models/`` for "rf"
            and ``results/models/`` otherwise.

    Returns:
        ``(preds, model)`` where `preds` is what `predict(X, model, alg)` returns.
    """
    if models_dir is None:
        # NOTE(review): the two algorithms default to different directories;
        # pass `models_dir` explicitly if the random forests live elsewhere.
        models_dir = "results_best/models/" if alg == "rf" else "results/models/"
    model_path = os.path.join(models_dir, name + "_" + alg + "_f" + str(fold) + "_" + str(i))

    if alg == "rf":
        # joblib.load unpickles the file: only load model files you trust.
        model = joblib.load(model_path)
    else:
        model = CatBoostRegressor()
        model.load_model(model_path)

    # Reuse predict() so the RF post-processing lives in one place.
    return predict(X, model, alg), model
    
def predict(X, model, alg):
    """Predict on `X` with `model`, pairing "rf" outputs with a constant 1.

    For ``alg == "rf"`` every prediction ``p`` becomes the row ``(p, 1)``,
    where 1 is a placeholder for the unknown variance. For any other `alg`
    the output of ``model.predict`` is returned unchanged.
    """
    outputs = model.predict(X)
    if alg != "rf":
        return outputs
    return np.array([(value, 1) for value in outputs])
            
def aggregate_results(name, modes = ["single", "ens"], 
                      algorithms = ['sglb-fixed', 'rf'], num_models = 10, 
                      raw=False):
    """Evaluate saved models and collect metrics per (mode, algorithm).

    For every mode (outer loop) and algorithm (inner loop) a dict of metric
    lists is built, with one list entry per fold (a single fold is used):
    "rmse", "nll", "TU_prr", "KU_prr", "TU_auc", "KU_auc". In "single" mode
    model 0 of the ensemble is evaluated and the KU metrics are NaN; in "ens"
    mode all `num_models` members are evaluated together.

    Args:
        name: dataset name used for the data and model file names.
        modes: iterable containing "single" and/or "ens".
        algorithms: algorithm tags passed on to `load_and_predict`.
        num_models: number of ensemble members loaded in "ens" mode.
        raw: forwarded to the rmse / nll helpers.

    Returns:
        ``np.array`` of the per-(mode, algorithm) dicts, modes varying slowest.
    """
    n_splits=1

    results = [] # metric values for all algorithms and all folds

    # for ood evaluation
    ood_X_test = np.loadtxt("datasets/ood/" + name + '_rf')
    # Drops the last row and the last column of the loaded matrix.
    # NOTE(review): presumably the last column is the label - confirm why the
    # last row is dropped as well.
    ood_X_test = ood_X_test[:-1, :-1]
    ood_size = len(ood_X_test)

    for mode in modes:
        for alg in algorithms:

            values = defaultdict(list) # metric values for all folds for given algorithm

            for fold in range(n_splits):
                if name != 'parkinsons':
                    # NOTE(review): make_train_val_test, X, y, index_train and
                    # index_test are not defined in the visible part of this
                    # notebook; this branch needs them from elsewhere.
                    X_train_all, y_train_all, X_train, y_train, X_validation, y_validation, X_test, y_test = make_train_val_test(
                                                                                        X, y, index_train, index_test, fold)
                else:
                    ood_test_pool = Pool(data="datasets/ood/" + name + "_rf", column_description="datasets/"+name+"/pool.cd")
                    X_test, y_test = ood_test_pool.get_features(), ood_test_pool.get_label()
                    y_test = np.array(y_test).astype(np.float64)
                test_size = len(X_test)
                # 0 for the test samples, 1 for the OOD samples (same order as
                # the scores concatenated inside ood_detect).
                domain_labels = np.concatenate([np.zeros(test_size), np.ones(ood_size)])

                if mode == "single":
                    # use 0th model from ensemble as a single model
                    preds, model = load_and_predict(X_test, name, alg, fold, 0)

                    values["rmse"].append(calc_rmse(preds[:, 0], y_test, raw=raw))
                    values["nll"].append(nll_regression(y_test, preds[:, 0], preds[:, 1], raw=raw))
                    values["TU_prr"].append(prr_regression(y_test, preds[:, 0], preds[:, 1]))
                    # A single model provides no spread between members.
                    values["KU_prr"].append(float("nan"))
                    values["KU_auc"].append(float("nan"))

                    ood_preds = predict(ood_X_test, model, alg)
                    in_measure = preds[:, 1]
                    out_measure = ood_preds[:, 1]
                    values["TU_auc"].append(ood_detect(domain_labels, in_measure, out_measure, mode="ROC"))

                elif mode == "ens":
                    member_preds = [] # predictions of all models in ensemble
                    member_preds_ood = []

                    for i in range(num_models):
                        test_preds, model = load_and_predict(X_test, name, alg, fold, i)
                        member_preds.append(test_preds)
                        member_preds_ood.append(predict(ood_X_test, model, alg))
                    all_preds = np.array(member_preds)
                    all_preds_ood = np.array(member_preds_ood)

                    values["rmse"].append(ens_rmse(y_test, all_preds, raw=raw))
                    values["nll"].append(ens_nll_regression(y_test, all_preds, raw=raw))

                    # Compute each uncertainty dict once; it includes a pairwise
                    # KL term that is expensive to evaluate.
                    uncertainty = ensemble_uncertainties_regression(np.swapaxes(all_preds, 0, 1))
                    TU = uncertainty["tvar"]
                    KU = uncertainty["varm"]

                    mean_preds = np.mean(all_preds[:, :, 0], axis=0)

                    values["TU_prr"].append(prr_regression(y_test, mean_preds, TU))
                    values["KU_prr"].append(prr_regression(y_test, mean_preds, KU))

                    uncertainty_ood = ensemble_uncertainties_regression(np.swapaxes(all_preds_ood, 0, 1))
                    TU_ood = uncertainty_ood["tvar"]
                    KU_ood = uncertainty_ood["varm"]
                    values["TU_auc"].append(ood_detect(domain_labels, TU, TU_ood, mode="ROC"))
                    values["KU_auc"].append(ood_detect(domain_labels, KU, KU_ood, mode="ROC"))

            results.append(values)

    return np.array(results)
    
def make_table(values):
    """Arrange PRR and AUC-ROC results into a two-row DataFrame (in percent).

    `values` is a sequence of metric dicts, one per configuration, expected in
    the order Single/SGLB, Single/RF, Ensemble/SGLB, Ensemble/RF (the column
    labels are hard-coded for exactly these four). Rows are 'TU' and 'KU';
    entries are ``100 * abs(metric)``.
    """
    def gather(key):
        # One entry per configuration; squeeze drops singleton axes.
        return np.squeeze(np.array([entry[key] for entry in values]))

    rows = []
    for prr_key, auc_key in (("TU_prr", "TU_auc"), ("KU_prr", "KU_auc")):
        rows.append(np.concatenate((gather(prr_key), gather(auc_key)), axis=0))

    table = pd.DataFrame(100*np.abs(np.stack(rows)), index=['TU', 'KU'])
    table.columns = pd.MultiIndex.from_tuples(
        [(metric, mode, algorithm)
         for metric in ('PRR%', 'AUC-ROC%')
         for mode in ('Single', 'Ensemble')
         for algorithm in ('SGLB', 'RF')])
    return table

In [17]:
# Dataset whose saved models are evaluated in this cell.
dataset = "parkinsons"

# Banner printed above the rendered table.
print('\t\t\tRegression - Bagging')
print("\t===Comparison with random forest, PRR and AUC-ROC===")

# The order matters: make_table hard-codes its column labels as
# Single/SGLB, Single/RF, Ensemble/SGLB, Ensemble/RF, which matches
# modes=["single", "ens"] (outer loop) and algorithms=["sglb-fixed", "rf"] (inner loop).
values = aggregate_results(dataset, algorithms=["sglb-fixed", "rf"], modes=["single", "ens"], raw=False)

# Bare last expression so the notebook displays the returned DataFrame.
make_table(values)

			Regression - Bagging
	===Comparison with random forest, PRR and AUC-ROC===


Unnamed: 0_level_0,PRR%,PRR%,PRR%,PRR%,AUC-ROC%,AUC-ROC%,AUC-ROC%,AUC-ROC%
Unnamed: 0_level_1,Single,Single,Ensemble,Ensemble,Single,Single,Ensemble,Ensemble
Unnamed: 0_level_2,SGLB,RF,SGLB,RF,SGLB,RF,SGLB,RF
TU,49.905147,20.063157,15.207253,5.25285,97.244254,50.0,64.376507,91.929877
KU,,,13.438849,5.25285,,,59.164811,91.930171
