In [1]:
import sys
import json
import os
import numpy as np
from catboost import Pool, CatBoostClassifier

In [2]:
# Experiment type switch checked by the training cells below.
# Hard-coded here; the trailing comment suggests it was once read from sys.argv[1].
mode = "classification" #sys.argv[1]

def create_dir(name):
    """Create directory `name` (including missing parents) if nothing exists there.

    Args:
        name: Path of the directory to create.

    Calling this repeatedly is safe. If `name` already exists (as a directory
    or as any other kind of path) nothing is done.
    """
    if not os.path.exists(name):
        # exist_ok closes the window in which another process could create the
        # directory between the existence check and the makedirs call
        os.makedirs(name, exist_ok=True)

In [13]:
def tune_parameters_classification(dataset_name, alg='sgb'):
    """Grid-search tree depth, learning rate and sample rate for one algorithm.

    For every grid point a CatBoostClassifier is fit on the `train` file of
    `datasets/<dataset_name>` and scored by the validation Logloss recorded at
    the last iteration. The number of trees is not tuned.

    Args:
        dataset_name: Sub-directory of `datasets/` holding the `train`,
            `validation` and `pool.cd` files.
        alg: One of 'sgb', 'sglb', 'sgb-fixed', 'sglb-fixed'. The '-fixed'
            variants use a single sample rate (0.5 for sgb, 1.0 for sglb).

    Returns:
        dict with keys 'depth', 'lr' and 'sample' for the grid point with the
        lowest validation Logloss.

    Raises:
        ValueError: if `alg` is not one of the supported names.
    """

    # sample-rate grid for each supported algorithm name
    sample_grids = {
        'sgb': [0.25, 0.5, 0.75],   # by default, we tune sample rate
        'sglb': [0.25, 0.5, 0.75],
        'sgb-fixed': [0.5],         # sgb without sample rate tuning
        'sglb-fixed': [1.0],        # sglb without sample rate tuning
    }
    # fail fast, before any data is loaded, instead of hitting an unbound
    # `samples` / `model` variable later on
    if alg not in sample_grids:
        raise ValueError("unknown algorithm " + repr(alg) +
                         "; expected one of " + str(sorted(sample_grids)))
    samples = sample_grids[alg]

    # the sglb variants differ from sgb only by enabling posterior sampling
    extra_params = {'posterior_sampling': True} if alg in ('sglb', 'sglb-fixed') else {}

    # load and prepare data (the test split is not needed for tuning)
    data_dir = os.path.join('datasets', dataset_name)
    train_file = os.path.join(data_dir, 'train')
    validation_file = os.path.join(data_dir, 'validation')
    cd_file = os.path.join(data_dir, 'pool.cd')

    train_pool = Pool(data=train_file, column_description=cd_file)
    validation_pool = Pool(data=validation_file, column_description=cd_file)

    seed = 1000 # starting random seed for hyperparameter tuning

    # list of hyperparameters for grid search
    # we do not tune the number of trees, it is important for virtual ensembles
    depths = [3, 4, 5, 6] # tree depth
    lrs = [0.001, 0.01, 0.1] # learning rate
    shape = (len(depths), len(lrs), len(samples))

    # perform grid search; the seed advances once per grid point in
    # depth -> learning rate -> sample rate order
    results = np.zeros(shape)
    for depth_idx, depth in enumerate(depths):
        for lr_idx, lr in enumerate(lrs):
            for sample_idx, sample in enumerate(samples):
                model = CatBoostClassifier(loss_function='Logloss', learning_rate=lr,
                                           depth=depth, subsample=sample,
                                           bootstrap_type='Bernoulli', verbose=False,
                                           random_seed=seed, **extra_params)

                model.fit(train_pool, eval_set=validation_pool, use_best_model=False)

                # validation nll at the final iteration
                results[depth_idx, lr_idx, sample_idx] = model.evals_result_['validation']['Logloss'][-1]

                seed += 1 # update seed

    # get best parameters
    argmin = np.unravel_index(np.argmin(results), shape)
    params = {'depth': depths[argmin[0]],
              'lr': lrs[argmin[1]],
              'sample': samples[argmin[2]]}

    return params
    
def generate_ensemble_classification(dataset_name, params, alg="sgb", num_models=10):
    """Train and save an ensemble of CatBoost classifiers that differ only in seed.

    Each model is fit on the `full_train` file of `datasets/<dataset_name>`
    and written to `results/models/<dataset_name>_<alg>_<i>` in cbm format,
    where `i` is both the ensemble index and the random seed.

    Args:
        dataset_name: Sub-directory of `datasets/` holding `full_train`,
            `test` and `pool.cd`.
        params: dict with keys 'depth', 'lr' and 'sample'.
        alg: One of 'sgb', 'sglb', 'sgb-fixed', 'sglb-fixed'; the sglb
            variants enable posterior sampling.
        num_models: Number of ensemble members to train.

    Raises:
        ValueError: if `alg` is not one of the supported names.
    """

    # validate up front so an unsupported name fails before any data is loaded
    if alg in ('sgb', 'sgb-fixed'):
        extra_params = {}
    elif alg in ('sglb', 'sglb-fixed'):
        extra_params = {'posterior_sampling': True}
    else:
        raise ValueError("unknown algorithm " + repr(alg) +
                         "; expected 'sgb', 'sglb', 'sgb-fixed' or 'sglb-fixed'")

    # load and prepare data
    data_dir = os.path.join('datasets', dataset_name)
    full_train_file = os.path.join(data_dir, 'full_train')
    test_file = os.path.join(data_dir, 'test')
    cd_file = os.path.join(data_dir, 'pool.cd')

    full_train_pool = Pool(data=full_train_file, column_description=cd_file)
    test_pool = Pool(data=test_file, column_description=cd_file)

    # parameters
    depth = params['depth']
    lr = params['lr']
    sample = params['sample']

    # make sure the output directory exists before spending time on training
    model_dir = "results/models"
    os.makedirs(model_dir, exist_ok=True)

    for i in range(num_models):
        # new seed for each ensemble element: seeds are 0 .. num_models - 1
        model = CatBoostClassifier(loss_function='Logloss', verbose=False,
                                   learning_rate=lr, depth=depth, subsample=sample,
                                   bootstrap_type='Bernoulli', custom_metric='ZeroOneLoss',
                                   random_seed=i, **extra_params)

        model.fit(full_train_pool, eval_set=test_pool, use_best_model=False) # do not use test pool for choosing best iteration
        model.save_model(model_dir + "/" + dataset_name + "_" + alg + "_" + str(i), format="cbm")

In [5]:
if mode == "classification":

    # 1 -> run the grid search and store its result; 0 -> reuse stored parameter files
    tuning = 0 #int(sys.argv[2])

    datasets = ["churn"] #["adult", "amazon", "click", "internet", "appetency", "churn", "upselling", "kick"]

    # choose from ['sgb-fixed', 'sglb-fixed', 'sgb', 'sglb'];
    # for -fixed we do not tune sample rate and use 0.5 for sgb and 1. for sglb
    algorithms = ['sgb-fixed', 'sglb-fixed']

    for name in datasets:
        print("dataset =", name)

        if tuning == 1:
            create_dir("results/params")

            # Tune hyperparameters and store them as one JSON file per algorithm
            print("tuning hyperparameters...")
            for alg in algorithms:
                print(alg)
                params = tune_parameters_classification(name, alg=alg)
                with open(f"results/params/{name}_{alg}.json", 'w') as fp:
                    json.dump(params, fp)

        # Training all models from the stored parameter files
        # (these must already exist when tuning == 0)
        print("training models...")
        create_dir("results/models")
        for alg in algorithms:
            print(alg)
            with open(f"results/params/{name}_{alg}.json", 'r') as fp:
                params = json.load(fp)
            generate_ensemble_classification(name, params, alg=alg)
        print()
  

dataset = churn
training models...
sgb-fixed
sglb-fixed



In [8]:
if mode == "classification":

    # 1 -> run the grid search and store its result; 0 -> reuse stored parameter files
    tuning = 0 #int(sys.argv[2])

    datasets = ["credit"] #["adult", "amazon", "click", "internet", "appetency", "churn", "upselling", "kick"]

    # choose from ['sgb-fixed', 'sglb-fixed', 'sgb', 'sglb'];
    # for -fixed we do not tune sample rate and use 0.5 for sgb and 1. for sglb
    algorithms = ['sgb-fixed', 'sglb-fixed']

    for name in datasets:
        print("dataset =", name)

        if tuning == 1:
            create_dir("results/params")

            # Tune hyperparameters and store them as one JSON file per algorithm
            print("tuning hyperparameters...")
            for alg in algorithms:
                print(alg)
                params = tune_parameters_classification(name, alg=alg)
                with open(f"results/params/{name}_{alg}.json", 'w') as fp:
                    json.dump(params, fp)

        # Training all models from the stored parameter files
        # (these must already exist when tuning == 0)
        print("training models...")
        create_dir("results/models")
        for alg in algorithms:
            print(alg)
            with open(f"results/params/{name}_{alg}.json", 'r') as fp:
                params = json.load(fp)
            generate_ensemble_classification(name, params, alg=alg)
        print()
  

dataset = credit
training models...
sgb-fixed
sglb-fixed



## Results

In [9]:
import numpy as np
from catboost import Pool, CatBoostClassifier
from catboost.utils import read_cd
from gbdt_uncertainty.data import process_classification_dataset
from gbdt_uncertainty.assessment import prr_class, ood_detect, nll_class
from gbdt_uncertainty.uncertainty import entropy_of_expected_class, expected_entropy_class, entropy
from sklearn.metrics import zero_one_loss, log_loss
from scipy.stats import ttest_rel
import math
import os
import joblib
import sys
from collections import defaultdict

In [10]:
# Algorithm identifiers for the results section; aggregate_results uses the
# same list as its default `algorithms` argument.
algorithms = ['sgb-fixed', 'sglb-fixed'] 

# Maps dataset identifiers to the display names printed in the LaTeX table rows.
convert_name = {"adult": "Adult", "amazon": "Amazon", "click": "Click", 
                "internet": "Internet", "appetency": "KDD-Appetency", "churn": "KDD-Churn",
                "upselling": "KDD-Upselling", "kick": "Kick", 'credit': 'Credit-Card'}

In [11]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Args:
        z: Scalar or array-like of real values.

    Returns:
        Element-wise sigmoid with the same shape as `z` (a NumPy scalar for
        scalar input).
    """
    z = np.asarray(z)
    # exp(-|z|) lies in (0, 1], so it can never overflow the way exp(-z) does
    # for large negative z
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)), the same value
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # indexing with () turns a 0-d result back into a scalar and leaves arrays as they are
    return result[()]
     
def load_model(name, alg, i):
    """Load the i-th saved model of algorithm `alg` for dataset `name`.

    Models live at `results/models/<name>_<alg>_<i>`. The "rf" algorithm is
    read with joblib; every other algorithm is loaded into a fresh
    CatBoostClassifier.
    """
    if alg != "rf":
        booster = CatBoostClassifier()
        booster.load_model("results/models/" + name + "_" + alg + "_" + str(i))
        return booster
    return joblib.load("results/models/" + name + "_" + alg + "_" + str(i))
    
def rf_virtual_ensembles_predict(model, X, count=10):
    """Split a forest's trees into `count` consecutive groups and average each group.

    Args:
        model: Object exposing its fitted trees as `estimators_`; each tree
            must provide `predict_proba`.
        X: Input passed unchanged to every tree's `predict_proba`.
        count: Number of groups (virtual ensemble members).

    Returns:
        Array of the per-group mean probabilities with the group axis moved to
        position 1, i.e. indexed as [sample, group, class].
    """
    estimators = model.estimators_
    total = len(estimators)
    # group k covers trees [int(k*total/count), int((k+1)*total/count))
    bounds = [int(k * total / count) for k in range(count + 1)]

    group_means = []
    for start, stop in zip(bounds[:-1], bounds[1:]):
        member_probas = np.array([estimators[pos].predict_proba(X)
                                  for pos in range(start, stop)])
        group_means.append(np.mean(member_probas, axis=0))

    return np.swapaxes(np.array(group_means), 0, 1)
    
def virtual_ensembles_predict(X, model, alg, num_models=10):
    """Return per-member class probabilities of a virtual ensemble.

    For "rf" the members come from rf_virtual_ensembles_predict. For any other
    algorithm the model's own `virtual_ensembles_predict` output is passed
    through a sigmoid and expanded to two columns (1 - p, p) along axis 2.
    In both cases the first two axes are swapped before returning.
    """
    if alg != "rf":
        raw = model.virtual_ensembles_predict(X, prediction_type='VirtEnsembles',
                                              virtual_ensembles_count=num_models)
        positive = sigmoid(raw)
        member_preds = np.concatenate((1 - positive, positive), axis=2)
    else:
        member_preds = rf_virtual_ensembles_predict(model, X, count=num_models)
    return np.swapaxes(member_preds, 0, 1)
    
def compute_significance(values_all, metric, minimize=True):
    """Find the best row by mean and the rows not significantly different from it.

    Args:
        values_all: 2-D collection, one row of per-sample values per algorithm.
        metric: Unused; kept for interface compatibility.
        minimize: If True the lowest mean wins, otherwise the highest.

    Returns:
        (values_mean, textbf): the row means and the set of row indices that
        hold the best row plus every row whose paired t-test against the best
        row has a p-value above 0.05.
    """
    values_mean = np.mean(values_all, axis=1)

    # choose best algorithm, ignoring NaN means
    best_idx = np.nanargmin(values_mean) if minimize else np.nanargmax(values_mean)
    reference = values_all[best_idx]

    # paired t-test of every row against the best one
    not_different = {
        idx for idx in range(len(values_mean))
        if ttest_rel(reference, values_all[idx])[1] > 0.05
    }

    return values_mean, {best_idx} | not_different

def compute_best(values, minimize=True):
    """Return the set of indices whose value equals the best (NaN-ignoring) value.

    Args:
        values: Indexable sequence of numbers; NaN entries are never selected.
        minimize: If True the smallest value is best, otherwise the largest.
    """
    pick_best = np.nanargmin if minimize else np.nanargmax
    best_idx = pick_best(values)
    best_value = values[best_idx]

    # exact ties with the best value are highlighted as well
    ties = {idx for idx in range(len(values)) if values[idx] == best_value}
    return {best_idx} | ties
    
def make_table_entry(values_all, metric, minimize=True, round=2):
    """Format the row means of `values_all` as LaTeX table cells.

    Each mean is rounded to `round` decimals and followed by "& "; entries that
    compute_significance marks as best or not significantly different from
    the best are wrapped in \\textbf{}.
    """
    values_mean, textbf = compute_significance(values_all, metric, minimize=minimize)

    # prepare all results in latex format
    cells = []
    for idx in range(len(values_all)):
        text = str(np.round(values_mean[idx], round))
        if idx in textbf:
            text = "\\textbf{" + text + "}"
        cells.append(text + " & ")

    return "".join(cells)

def normalize_test_labels(y_test):
    """Map labels to 0/1: the smallest label becomes 0, every other label 1.

    Returns:
        NumPy integer array with one entry per element of `y_test`.
    """
    smallest = min(y_test)
    return np.array([0 if label == smallest else 1 for label in y_test])
            
def aggregate_results(name, modes = ["single", "ens", "virt"], 
                      algorithms = ['sgb-fixed', 'sglb-fixed'], num_models = 10):
    """Evaluate the saved models of dataset `name` for every (mode, algorithm) pair.

    Args:
        name: Dataset identifier; data is read from `datasets/<name>/` and
            `datasets/ood/<name>`, models via load_model.
        modes: Evaluation modes, any of "single" (0th model of the ensemble),
            "ens" (average over `num_models` saved models) and "virt"
            (virtual ensemble built from the 0th model).
        algorithms: Algorithm identifiers. "virt" is skipped for "sgb" and
            "sgb-fixed"; "rf" takes a separate data-loading path.
        num_models: Number of saved models averaged in "ens" mode.

    Returns:
        NumPy array holding one dict per evaluated pair, ordered by mode and
        then by algorithm. Each dict has the keys "error", "nll", "TU_prr",
        "KU_prr", "TU_auc" and "KU_auc"; the KU entries are NaN in "single"
        mode.

    The list defaults are only read, never mutated.
    """

    results = [] # one metrics dict per evaluated (mode, algorithm) pair

    for mode in modes:
        for alg in algorithms:

            # we do not evaluate virtual sgb model; skip before any data is loaded
            if mode == "virt" and alg in ["sgb", "sgb-fixed"]:
                continue

            if alg == "rf":
                train_pool, y_train, test_pool, y_test, enc = process_classification_dataset(name)

                # process ood data
                cd = read_cd("datasets/"+name+"/pool.cd", data_file = "datasets/"+name+"/test")
                # the label column is looked up under 'Label' first and under
                # 'Target' if that key is missing
                try:
                    label_ind = cd['column_type_to_indices']['Label']
                except KeyError:
                    label_ind = cd['column_type_to_indices']['Target']

                ood_test_pool = np.loadtxt("datasets/ood/" + name, delimiter="\t", dtype="object")
                ood_test_pool = enc.transform(ood_test_pool).astype("float64")
                ood_test_pool = np.delete(ood_test_pool, label_ind, 1)
                ood_size = len(ood_test_pool)

            else:
                test_pool = Pool(data="datasets/"+name+"/test", column_description="datasets/"+name+"/pool.cd")
                ood_test_pool = Pool(data="datasets/ood/" + name, column_description="datasets/"+name+"/pool.cd")
                ood_size = ood_test_pool.num_row()

                y_test = test_pool.get_label()

            test_size = len(y_test)
            # 0 marks in-domain test rows, 1 marks out-of-domain rows
            domain_labels = np.concatenate([np.zeros(test_size), np.ones(ood_size)])

            y_test_norm = normalize_test_labels(y_test)

            values = {} # metric values for the current (mode, algorithm) pair

            if mode == "single":
                # use 0th model from ensemble as a single model
                model = load_model(name, alg, 0)
                preds = model.predict(test_pool)
                preds_proba = model.predict_proba(test_pool)

                # NOTE(review): this compares against the raw y_test, whereas the
                # "ens"/"virt" branches compare argmax predictions against
                # y_test_norm - confirm the label encodings agree.
                values["error"] = (preds != y_test).astype(int)
                values["nll"] = nll_class(y_test_norm, preds_proba)
                values["TU_prr"] = prr_class(y_test_norm, preds_proba, entropy(preds_proba), False)
                # a single model provides no KU measure
                values["KU_prr"] = float("nan")
                values["KU_auc"] = float("nan")

                ood_preds_proba = model.predict_proba(ood_test_pool)
                in_measure = entropy(preds_proba)
                out_measure = entropy(ood_preds_proba)
                values["TU_auc"] = ood_detect(domain_labels, in_measure, out_measure, mode="ROC")

            if mode == "ens":
                all_preds = [] # predictions of all models in ensemble
                all_preds_ood = []

                for i in range(num_models):
                    model = load_model(name, alg, i)
                    preds = model.predict_proba(test_pool)
                    all_preds.append(preds)
                    preds = model.predict_proba(ood_test_pool)
                    all_preds_ood.append(preds)

                all_preds = np.array(all_preds)
                preds_proba = np.mean(all_preds, axis=0)

                all_preds_ood = np.array(all_preds_ood)

                preds = np.argmax(preds_proba, axis=1)
                values["error"] = (preds != y_test_norm).astype(int)
                values["nll"] = nll_class(y_test_norm, preds_proba)

                # KU is the gap between the two entropy-based measures
                TU = entropy_of_expected_class(all_preds)
                DU = expected_entropy_class(all_preds)
                KU = TU - DU

                TU_ood = entropy_of_expected_class(all_preds_ood)
                DU_ood = expected_entropy_class(all_preds_ood)
                KU_ood = TU_ood - DU_ood

                values["TU_prr"] = prr_class(y_test_norm, preds_proba, TU, False)
                values["KU_prr"] = prr_class(y_test_norm, preds_proba, KU, False)

                values["TU_auc"] = ood_detect(domain_labels, TU, TU_ood, mode="ROC")
                values["KU_auc"] = ood_detect(domain_labels, KU, KU_ood, mode="ROC")

            if mode == "virt":
                # generate virtual ensemble from 0th model
                model = load_model(name, alg, 0)

                all_preds = virtual_ensembles_predict(test_pool, model, alg)

                preds_proba = np.mean(all_preds, axis=0)

                preds = np.argmax(preds_proba, axis=1)
                values["error"] = (preds != y_test_norm).astype(int)
                values["nll"] = nll_class(y_test_norm, preds_proba)

                TU = entropy_of_expected_class(all_preds)
                DU = expected_entropy_class(all_preds)
                KU = TU - DU

                all_preds_ood = virtual_ensembles_predict(ood_test_pool, model, alg)
                TU_ood = entropy_of_expected_class(all_preds_ood)
                DU_ood = expected_entropy_class(all_preds_ood)
                KU_ood = TU_ood - DU_ood

                values["TU_prr"] = prr_class(y_test_norm, preds_proba, TU, False)
                values["KU_prr"] = prr_class(y_test_norm, preds_proba, KU, False)

                values["TU_auc"] = ood_detect(domain_labels, TU, TU_ood, mode="ROC")
                values["KU_auc"] = ood_detect(domain_labels, KU, KU_ood, mode="ROC")

            results.append(values)

    return np.array(results)
    
def make_table_element(mean, textbf, idx):
    """Format `mean[idx]` as one LaTeX table cell ending in "& ".

    NaN becomes "---"; other values are rounded to the nearest integer and
    wrapped in \\textbf{} when `idx` is in `textbf`.
    """
    value = mean[idx]
    if np.isnan(value):
        return "--- & "

    rounded = str(int(np.rint(value)))
    if idx in textbf:
        return "\\textbf{" + rounded + "} & "
    return rounded + " & "

In [9]:
datasets = ["internet"] #["adult", "amazon", "click", "internet", "appetency", "churn", "upselling", "kick"]

table_type = "prr_auc" #sys.argv[1]

# Prints two LaTeX rows per dataset (TU and KU), each holding the PRR columns
# followed by the AUC-ROC columns, one column per aggregate_results entry.
if table_type == "prr_auc":
    print("===PRR and AUC-ROC Table===")
    
    for name in datasets:

        values = aggregate_results(name)
        
        # TU entries come first, KU entries second, so index num + k is the KU
        # value of the k-th entry
        prr_TU = np.array([values[i]["TU_prr"] for i in range(len(values))])
        prr_KU = np.array([values[i]["KU_prr"] for i in range(len(values))])
        prr = np.concatenate((prr_TU, prr_KU), axis=0)

        # the best value is picked across TU and KU together
        textbf_prr = compute_best(prr, minimize=False)
    
        auc_TU = np.array([values[i]["TU_auc"] for i in range(len(values))])
        auc_KU = np.array([values[i]["KU_auc"] for i in range(len(values))])
        auc = 100*np.concatenate((auc_TU, auc_KU), axis=0)
        
        textbf_auc = compute_best(auc, minimize=False)

        num = len(auc_TU)
    
        # backslashes are escaped explicitly: "\m" is not a valid escape sequence
        table = "\\multirow{2}{*} {" + convert_name[name] + "} & TU & "
        for idx in range(num):
            table += make_table_element(prr, textbf_prr, idx)

        for idx in range(num):
            table += make_table_element(auc, textbf_auc, idx)
            
        # drop the trailing "& " left by the last cell
        print(table.rstrip("& ") + " \\\\")
        
        table = " & KU & "
        for idx in range(num, 2*num):
            table += make_table_element(prr, textbf_prr, idx)
            
        for idx in range(num, 2*num):
            table += make_table_element(auc, textbf_auc, idx)
        print(table.rstrip("& ") + " \\\\")
        
        print("\\midrule")

===PRR and AUC-ROC Table===
\multirow{2}{*} {Internet} & TU & 75 & 79 & 78 & \textbf{79} & 79 & \textbf{50} & \textbf{50} & \textbf{50} & \textbf{50} & \textbf{50} \\
 & KU & --- & --- & 72 & 72 & 57 & --- & --- & \textbf{50} & \textbf{50} & \textbf{50} \\
\midrule


In [14]:
datasets = ["credit"] #["adult", "amazon", "click", "internet", "appetency", "churn", "upselling", "kick"]

table_type = "prr_auc" #sys.argv[1]

# Prints two LaTeX rows per dataset (TU and KU), each holding the PRR columns
# followed by the AUC-ROC columns, one column per aggregate_results entry.
if table_type == "prr_auc":
    print("===PRR and AUC-ROC Table===")
    
    for name in datasets:

        values = aggregate_results(name)
        
        # TU entries come first, KU entries second, so index num + k is the KU
        # value of the k-th entry
        prr_TU = np.array([values[i]["TU_prr"] for i in range(len(values))])
        prr_KU = np.array([values[i]["KU_prr"] for i in range(len(values))])
        prr = np.concatenate((prr_TU, prr_KU), axis=0)

        # the best value is picked across TU and KU together
        textbf_prr = compute_best(prr, minimize=False)
    
        auc_TU = np.array([values[i]["TU_auc"] for i in range(len(values))])
        auc_KU = np.array([values[i]["KU_auc"] for i in range(len(values))])
        auc = 100*np.concatenate((auc_TU, auc_KU), axis=0)
        
        textbf_auc = compute_best(auc, minimize=False)

        num = len(auc_TU)
    
        # backslashes are escaped explicitly: "\m" is not a valid escape sequence
        table = "\\multirow{2}{*} {" + convert_name[name] + "} & TU & "
        for idx in range(num):
            table += make_table_element(prr, textbf_prr, idx)

        for idx in range(num):
            table += make_table_element(auc, textbf_auc, idx)
            
        # drop the trailing "& " left by the last cell
        print(table.rstrip("& ") + " \\\\")
        
        table = " & KU & "
        for idx in range(num, 2*num):
            table += make_table_element(prr, textbf_prr, idx)
            
        for idx in range(num, 2*num):
            table += make_table_element(auc, textbf_auc, idx)
        print(table.rstrip("& ") + " \\\\")
        
        print("\\midrule")

===PRR and AUC-ROC Table===
\multirow{2}{*} {Credit-Card} & TU & 45 & \textbf{46} & 46 & 46 & 45 & 78 & 75 & 80 & 75 & 78 \\
 & KU & --- & --- & 20 & 18 & 11 & --- & --- & 99 & \textbf{99} & 92 \\
\midrule
