In [4]:
import sys
import json
import os
from category_encoders.leave_one_out import LeaveOneOutEncoder
import numpy as np
import joblib
from catboost.utils import read_cd
from sklearn.ensemble import RandomForestClassifier

In [5]:
def create_dir(name):
    directory = os.path.dirname(name)
    if not os.path.exists(name):
        os.makedirs(name)

In [7]:
def process_classification_dataset(name):
    # converting categorical features to numerical

    data_dir = os.path.join('datasets', name)
    train_file = os.path.join(data_dir, 'full_train')
    test_file = os.path.join(data_dir, 'test')
    cd_file = os.path.join(data_dir, 'pool.cd')

    train = np.loadtxt(train_file, delimiter="\t", dtype="object")
    test = np.loadtxt(test_file, delimiter="\t", dtype="object")
    cd = read_cd(cd_file, data_file=train_file)

    # Target can be called 'Label' or 'Target' in pool.cd
    try:
        label_ind = cd['column_type_to_indices']['Label']
    except:
        label_ind = cd['column_type_to_indices']['Target']

    np.random.seed(42)  # fix random seed
    train = np.random.permutation(train)

    y_train = train[:, label_ind]
    y_train = y_train.reshape(-1)

    y_test = test[:, label_ind]
    y_test = y_test.reshape(-1)
    
    try:
        cat_features = cd['column_type_to_indices']['Categ']  # features to be replaced

        enc = LeaveOneOutEncoder(cols=cat_features, return_df=False, random_state=10, sigma=0.3)
    except:
        enc = LeaveOneOutEncoder(return_df=False, random_state=10, sigma=0.3)

    transformed_train = enc.fit_transform(train, y_train).astype("float64")
    X_train = np.delete(transformed_train, label_ind, 1)  # remove target column

    transformed_test = enc.transform(test).astype("float64")
    X_test = np.delete(transformed_test, label_ind, 1)  # remove target column

    return np.nan_to_num(X_train), y_train, np.nan_to_num(X_test), y_test, enc

In [10]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

In [15]:
def generate_rf_ensemble_classification(dataset_name, num_models=10, 
                                        n_estimators = 1000, compress=3, 
                                        n_jobs=-1, max_depth=10):

    X_train, y_train, X_test, y_test, _ = process_classification_dataset(dataset_name)

   
    depths = [5,10,15,20]
    estimators = [1000,3000,5000]
    seed = 0
    param_grid = { 
    'n_estimators': [1000, 3000, 5000],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [5,10,15,20],
    'criterion' :['gini', 'entropy']
    }
    rf_model = RandomForestClassifier(random_state=seed)
    for i in range(num_models):
        
        shape = (len(depths), len(estimators))

        results = np.zeros(shape)
        CV_rfc = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv= 5)
        CV_rfc.fit(X_train, y_train)
        best_parameters = CV_rfc.best_params_
        print("Best Paprameter for i = ", i , "are ", best_parameters, "\n\n\n")

        seed += 1 # new seed for each ensemble element
        #model.fit(X_train, y_train) 
        #results[d, n] = model.evals_result_['validation']['Logloss'][-1]
        rfc1=RandomForestClassifier(random_state=42, max_features=best_parameters['max_features'], n_estimators= best_parameters['n_estimators'], 
                                    max_depth=best_parameters['max_depth'], criterion=best_parameters['criterion'])

        joblib.dump(rfc1, "results/models/" + dataset_name + "_" 
                    + "rf" + "_" + str(i), compress=compress)

In [16]:
mode = 'classification_rf' #sys.argv[1]

if mode == "classification_rf":
    datasets = ["internet"]
#     datasets = ["adult", "amazon", "click", "internet", 
#                 "appetency", "churn", "upselling", "kick"]
                
    create_dir("results/models")
    
    for name in datasets:
        print("dataset =", name)
    
        # Training all models
        print("training models...")
        generate_rf_ensemble_classification(name)

dataset = internet
training models...
Best Paprameter for i =  0 are  {'criterion': 'entropy', 'max_depth': 15, 'max_features': 'auto', 'n_estimators': 1000} 



Best Paprameter for i =  1 are  {'criterion': 'entropy', 'max_depth': 15, 'max_features': 'auto', 'n_estimators': 1000} 



Best Paprameter for i =  2 are  {'criterion': 'entropy', 'max_depth': 15, 'max_features': 'auto', 'n_estimators': 1000} 



Best Paprameter for i =  3 are  {'criterion': 'entropy', 'max_depth': 15, 'max_features': 'auto', 'n_estimators': 1000} 



Best Paprameter for i =  4 are  {'criterion': 'entropy', 'max_depth': 15, 'max_features': 'auto', 'n_estimators': 1000} 



Best Paprameter for i =  5 are  {'criterion': 'entropy', 'max_depth': 15, 'max_features': 'auto', 'n_estimators': 1000} 



Best Paprameter for i =  6 are  {'criterion': 'entropy', 'max_depth': 15, 'max_features': 'auto', 'n_estimators': 1000} 



Best Paprameter for i =  7 are  {'criterion': 'entropy', 'max_depth': 15, 'max_features': 'aut

KeyboardInterrupt: 

In [22]:
mode = 'classification_rf' #sys.argv[1]

if mode == "classification_rf":
    datasets = ["credit"]
#     datasets = ["adult", "amazon", "click", "internet", 
#                 "appetency", "churn", "upselling", "kick"]
                
    create_dir("results/models")
    
    for name in datasets:
        print("dataset =", name)
    
        # Training all models
        print("training models...")
        generate_rf_ensemble_classification(name)
        ## This function is generate_rf_ensemble_classification training models, this is the most important features

dataset = credit
training models...


# Results

In [8]:
import numpy as np
from catboost import Pool, CatBoostClassifier
from catboost.utils import read_cd
from gbdt_uncertainty.assessment import prr_class, ood_detect, nll_class
from gbdt_uncertainty.uncertainty import entropy_of_expected_class, expected_entropy_class, entropy
from sklearn.metrics import zero_one_loss, log_loss
from scipy.stats import ttest_rel
import math
import os
import joblib
import sys
from collections import defaultdict

In [19]:
algorithms = ['sgb-fixed', 'sglb-fixed'] 

# for proper tables
convert_name = {"adult": "Adult", "amazon": "Amazon", "click": "Click", 
                "internet": "Internet", "appetency": "KDD-Appetency", "churn": "KDD-Churn",
                "upselling": "KDD-Upselling", "kick": "Kick", 'credit': 'Credit-Card'}

In [27]:
def sigmoid(z):
    return 1/(1 + np.exp(-z))
     
def load_model(name, alg, i):
    if alg == "rf":
        model = joblib.load("results/models/" + name + "_" + alg + "_" + str(i))
    else:
        model = CatBoostClassifier()
        model.load_model("results/models/" + name + "_" + alg + "_" + str(i)) 
    return model
    
def rf_virtual_ensembles_predict(model, X, count=10):
    trees = model.estimators_
    num_trees = len(trees)
    ens_preds = []
    for i in range(count):
        indices = range(int(i*num_trees/count), int((i+1)*num_trees/count))
        all_preds = []
        for ind in indices:
            all_preds.append(trees[ind].predict_proba(X))
        all_preds = np.array(all_preds)
        preds = np.mean(all_preds, axis=0)
        ens_preds.append(preds)
    ens_preds = np.array(ens_preds)

    return np.swapaxes(ens_preds, 0, 1)
    
def virtual_ensembles_predict(X, model, alg, num_models=10):
    if alg == "rf":
        all_preds = rf_virtual_ensembles_predict(model, X, count=num_models)
    else:
        all_preds = model.virtual_ensembles_predict(X, prediction_type='VirtEnsembles', virtual_ensembles_count=num_models)
        all_preds = sigmoid(all_preds)
        all_preds = np.concatenate((1 - all_preds, all_preds), axis=2)
    return np.swapaxes(all_preds, 0, 1)
    
def compute_significance(values_all, metric, minimize=True):

    values_mean = np.mean(values_all, axis=1) 
    
    # choose best algorithm
    if minimize:
        best_idx = np.nanargmin(values_mean)
    else:
        best_idx = np.nanargmax(values_mean)
        
    textbf = {best_idx} # for all algorithms insignificantly different from the best one
    # compute statistical significance on test

    for idx in range(len(values_mean)):
        test = ttest_rel(values_all[best_idx], values_all[idx]) # paired t-test
        if test[1] > 0.05:
            textbf.add(idx)
            
    return values_mean, textbf

def compute_best(values, minimize=True):

    # choose best algorithm
    if minimize:
        best_idx = np.nanargmin(values)
    else:
        best_idx = np.nanargmax(values)
        
    textbf = {best_idx} 
    for idx in range(len(values)):
        if values[best_idx] == values[idx]: 
            textbf.add(idx)
            
    return textbf
    
def make_table_entry(values_all, metric, minimize=True, round=2):
    
    num_values = len(values_all)
    
    values_mean, textbf = compute_significance(values_all, metric, minimize=minimize)

    # prepare all results in latex format

    table = ""

    for idx in range(num_values):
        if idx in textbf:
            table += "\\textbf{" + str(np.round(values_mean[idx], round)) + "} "
        else:    
            table += str(np.round(values_mean[idx], round)) + " "
        table += "& " 
            
    return table

def normalize_test_labels(y_test):
    y_test_norm = []
    c0 = min(y_test)
    for y in y_test:
        if y == c0:
            y_test_norm.append(0)
        else:
            y_test_norm.append(1)
    return np.array(y_test_norm)
            
def aggregate_results(name, modes = ["single", "ens", "virt"], 
                      algorithms = ['sgb-fixed', 'sglb-fixed'], num_models = 10):
    

    results = [] # metric values for all algorithms and all folds
        
    for mode in modes:
        for alg in algorithms:
        
            if alg == "rf":
                train_pool, y_train, test_pool, y_test, enc = process_classification_dataset(name)
                
                # process ood data
                cd = read_cd("datasets/"+name+"/pool.cd", data_file = "datasets/"+name+"/test")
                try: 
                    label_ind = cd['column_type_to_indices']['Label']
                except:
                    label_ind = cd['column_type_to_indices']['Target']

                ood_test_pool = np.loadtxt("datasets/ood/" + name + "_rf", delimiter="\t", dtype="object")
                ood_test_pool = enc.transform(ood_test_pool).astype("float64")
                ood_test_pool = np.delete(ood_test_pool, label_ind, 1)
                ood_size = len(ood_test_pool)
                
            else:
                test_pool = Pool(data="datasets/"+name+"/test", column_description="datasets/"+name+"/pool.cd")
                ood_test_pool = Pool(data="datasets/ood/" + name, column_description="datasets/"+name+"/pool.cd")
                ood_size = ood_test_pool.num_row()

                y_test = test_pool.get_label()
            
            test_size = len(y_test)
            domain_labels = np.concatenate([np.zeros(test_size), np.ones(ood_size)])
                    
            y_test_norm = normalize_test_labels(y_test)
        
            values = defaultdict() # metric values for all folds for given algorithm

            if mode == "single":
                # use 0th model from ensemble as a single model
                model = load_model(name, alg, 0)
                preds = model.predict(test_pool)
                preds_proba = model.predict_proba(test_pool)
    
                values["error"] = (preds != y_test).astype(int)
                values["nll"] = nll_class(y_test_norm, preds_proba)
                values["TU_prr"] = prr_class(y_test_norm, preds_proba, entropy(preds_proba), False)
                values["KU_prr"] = float("nan")
                values["KU_auc"] = float("nan")
                    
                ood_preds_proba = model.predict_proba(ood_test_pool)
                in_measure = entropy(preds_proba)
                out_measure = entropy(ood_preds_proba)
                values["TU_auc"] = ood_detect(domain_labels, in_measure, out_measure, mode="ROC")

            if mode == "ens":
                all_preds = [] # predictions of all models in ensemble
                all_preds_ood = []
                    
                for i in range(num_models):
                    model = load_model(name, alg, i)
                    preds = model.predict_proba(test_pool)
                    all_preds.append(preds)
                    preds = model.predict_proba(ood_test_pool)
                    all_preds_ood.append(preds) 
                        
                all_preds = np.array(all_preds)
                preds_proba = np.mean(all_preds, axis=0)
                
                all_preds_ood = np.array(all_preds_ood)
                
                preds = np.argmax(preds_proba, axis=1)
                values["error"] = (preds != y_test_norm).astype(int)
                values["nll"] = nll_class(y_test_norm, preds_proba)
                
                TU = entropy_of_expected_class(all_preds)
                DU = expected_entropy_class(all_preds)
                KU = TU - DU
                
                TU_ood = entropy_of_expected_class(all_preds_ood)
                DU_ood = expected_entropy_class(all_preds_ood)
                KU_ood = TU_ood - DU_ood

                values["TU_prr"] = prr_class(y_test_norm, preds_proba, TU, False)
                values["KU_prr"] = prr_class(y_test_norm, preds_proba, KU, False)
                  
                values["TU_auc"] = ood_detect(domain_labels, TU, TU_ood, mode="ROC")
                values["KU_auc"] = ood_detect(domain_labels, KU, KU_ood, mode="ROC")
                        
            if mode == "virt":
                if alg in ["sgb", "sgb-fixed"]: # we do not evaluate virtual sgb model
                    continue
                    
                # generate virtual ensemble from 0th model
                model = load_model(name, alg, 0)

                all_preds = virtual_ensembles_predict(test_pool, model, alg)
                
                preds_proba = np.mean(all_preds, axis=0)
    
                preds = np.argmax(preds_proba, axis=1)
                values["error"] = (preds != y_test_norm).astype(int)
                values["nll"] = nll_class(y_test_norm, preds_proba)
                
                TU = entropy_of_expected_class(all_preds)
                DU = expected_entropy_class(all_preds)
                KU = TU - DU
                
                all_preds_ood = virtual_ensembles_predict(ood_test_pool, model, alg)
                TU_ood = entropy_of_expected_class(all_preds_ood)
                DU_ood = expected_entropy_class(all_preds_ood)
                KU_ood = TU_ood - DU_ood

                values["TU_prr"] = prr_class(y_test_norm, preds_proba, TU, False)
                values["KU_prr"] = prr_class(y_test_norm, preds_proba, KU, False)
                  
                values["TU_auc"] = ood_detect(domain_labels, TU, TU_ood, mode="ROC")
                values["KU_auc"] = ood_detect(domain_labels, KU, KU_ood, mode="ROC")
                        
            if mode == "virt" and alg in ["sgb", "sgb-fixed"]: # we do not evaluate virtual sgb model
                continue
            
            results.append(values)

    return np.array(results)
    
def make_table_element(mean, textbf, idx):
    table = ""
    if np.isnan(mean[idx]):
        table += "--- & "
        return table
    if idx in textbf:
        table += "\\textbf{" + str(int(np.rint(mean[idx]))) + "} "
    else:    
        table += str(int(np.rint(mean[idx]))) + " "
    table += "& "
    return table

In [18]:
table_type = 'rf_prr_auc' #sys.argv[1]

datasets = ["internet"] #["adult", "amazon", "click", "internet", "appetency", "churn", "upselling", "kick"]

if table_type == "rf_prr_auc":
    print("===Comparison with random forest, PRR and AUC-ROC===")
        
    for name in datasets:

        values = aggregate_results(name, algorithms=["sglb-fixed", "rf"], modes=["virt", "ens"])
        
        prr_TU = np.array([values[i]["TU_prr"] for i in range(len(values))])
        prr_KU = np.array([values[i]["KU_prr"] for i in range(len(values))])
        prr = np.concatenate((prr_TU, prr_KU), axis=0)

        textbf_prr = compute_best(prr, minimize=False)
    
        auc_TU = np.array([values[i]["TU_auc"] for i in range(len(values))])
        auc_KU = np.array([values[i]["KU_auc"] for i in range(len(values))])
        auc = 100*np.concatenate((auc_TU, auc_KU), axis=0)
        textbf_auc = compute_best(auc, minimize=False)

        num = len(auc_TU)
    
        table = "\multirow{2}{*} {" + convert_name[name] + "} & TU & "
        for idx in range(num):
            table += make_table_element(prr, textbf_prr, idx)

        for idx in range(num):
            table += make_table_element(auc, textbf_auc, idx)
            
        print(table.rstrip("& ") + " \\\\")
        
        table = " & KU & "
        for idx in range(num, 2*num):
            table += make_table_element(prr, textbf_prr, idx)
            
        for idx in range(num, 2*num):
            table += make_table_element(auc, textbf_auc, idx)
        print(table.rstrip("& ") + " \\\\")
        
        print("\midrule")

===Comparison with random forest, PRR and AUC-ROC===
\multirow{2}{*} {Internet} & TU & 79 & 69 & \textbf{79} & 68 & 64 & 75 & 64 & 74 \\
 & KU & 57 & 38 & 72 & 36 & 98 & 92 & \textbf{100} & 92 \\
\midrule


In [12]:
table_type = 'rf_nll_error' #sys.argv[1]

datasets = ["internet"] #["adult", "amazon", "click", "internet", "appetency", "churn", "upselling", "kick"]

if table_type == "rf_nll_error": 

    print("===Comparison with random forest, NLL and Error===")
    for name in datasets:

        values = aggregate_results(name, algorithms=["sglb-fixed", "rf"], modes=["single", "ens"])
        
        table = convert_name[name] + " & "
        
        values_nll = np.array([values[i]["nll"] for i in range(len(values))])
        values_error = np.array([values[i]["error"] for i in range(len(values))])
        
        table += make_table_entry(values_nll, "nll", round=3)
        table += make_table_entry(values_error*100, "error", round=1)
        
        print(table.rstrip("& ") + " \\\\")

===Comparison with random forest, NLL and Error===
Internet & \textbf{0.217} & 0.275 & \textbf{0.217} & 0.274 & \textbf{10.0} & 11.2 & \textbf{10.0} & 11.0 \\


----

In [28]:
table_type = 'rf_prr_auc' #sys.argv[1]

datasets = ["credit"] #["adult", "amazon", "click", "internet", "appetency", "churn", "upselling", "kick"]

if table_type == "rf_prr_auc":
    print("===Comparison with random forest, PRR and AUC-ROC===")
        
    for name in datasets:

        values = aggregate_results(name, algorithms=["sglb-fixed", "rf"], modes=["virt", "ens"])
        
        prr_TU = np.array([values[i]["TU_prr"] for i in range(len(values))])
        prr_KU = np.array([values[i]["KU_prr"] for i in range(len(values))])
        prr = np.concatenate((prr_TU, prr_KU), axis=0)

        textbf_prr = compute_best(prr, minimize=False)
    
        auc_TU = np.array([values[i]["TU_auc"] for i in range(len(values))])
        auc_KU = np.array([values[i]["KU_auc"] for i in range(len(values))])
        auc = 100*np.concatenate((auc_TU, auc_KU), axis=0)
        textbf_auc = compute_best(auc, minimize=False)

        num = len(auc_TU)
    
        table = "\multirow{2}{*} {" + convert_name[name] + "} & TU & "
        for idx in range(num):
            table += make_table_element(prr, textbf_prr, idx)

        for idx in range(num):
            table += make_table_element(auc, textbf_auc, idx)
            
        print(table.rstrip("& ") + " \\\\")
        
        table = " & KU & "
        for idx in range(num, 2*num):
            table += make_table_element(prr, textbf_prr, idx)
            
        for idx in range(num, 2*num):
            table += make_table_element(auc, textbf_auc, idx)
        print(table.rstrip("& ") + " \\\\")
        
        print("\midrule")

===Comparison with random forest, PRR and AUC-ROC===
\multirow{2}{*} {Credit-Card} & TU & 45 & \textbf{55} & 46 & 55 & 78 & 77 & 75 & 76 \\
 & KU & 11 & 37 & 18 & 39 & 92 & 55 & \textbf{99} & 60 \\
\midrule


In [29]:
table_type = 'rf_nll_error' #sys.argv[1]

datasets = ["credit"] #["adult", "amazon", "click", "internet", "appetency", "churn", "upselling", "kick"]

if table_type == "rf_nll_error": 

    print("===Comparison with random forest, NLL and Error===")
    for name in datasets:

        values = aggregate_results(name, algorithms=["sglb-fixed", "rf"], modes=["single", "ens"])
        
        table = convert_name[name] + " & "
        
        values_nll = np.array([values[i]["nll"] for i in range(len(values))])
        values_error = np.array([values[i]["error"] for i in range(len(values))])
        
        table += make_table_entry(values_nll, "nll", round=3)
        table += make_table_entry(values_error*100, "error", round=1)
        
        print(table.rstrip("& ") + " \\\\")

===Comparison with random forest, NLL and Error===
Credit-Card & \textbf{0.405} & \textbf{0.409} & \textbf{0.405} & \textbf{0.409} & \textbf{16.7} & 17.2 & \textbf{16.7} & 17.2 \\
