In [75]:

import os
import yaml
from Experiments.EXP1.trainer import trainer, model_callByName, loss_callByName
from data_utils import get_uci_data, common_processor_UCI, seed_all, normalize, splitter
from Experiments.EXP1.TestPerform import testPerform_muSigma, testPerform_isotonic, testPerform_kernel, testPerform_projKernel
import torch
import pandas as pd
from src.evaluations import obs_vs_exp, mu_sig_toQuants
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import random_projection
from scipy.spatial import distance



def special_check(dataname = "wine", modelname = None, width = 10, ratio = 0.3):
    
    base_seed = 1234
    num_repeat = 1
    big_df = {}

    err_mu_dic = {}
    err_std_dic = {}

    if dataname == "wine":

        raw_train_X, raw_train_Y = get_uci_data("wine", os.getcwd()+"/Dataset/UCI_datasets")
        test_X, test_Y = get_uci_data("wine-red", os.getcwd()+"/Dataset/UCI_datasets")

        raw_train_X, x_normalizer = normalize(raw_train_X)

        test_X = x_normalizer.transform(test_X)


    elif dataname in ["MPG12", "MPG13"]:

        df = pd.read_fwf(os.getcwd() + "/Dataset/UCI_datasets/auto_mpg.txt", colspecs='infer', widths=None, infer_nrows=100)
        dataset = df.iloc[:,0:8].to_numpy()
        ds1 = dataset[dataset[:, 7] == 1][:, :7]
        ds2 = dataset[dataset[:, 7] == 2][:, :7]
        ds3 = dataset[dataset[:, 7] == 3][:, :7]

        raw_train_X, raw_train_Y = ds1[:,1:], ds1[:,0]

        if dataname == "MPG12":

            test_X, test_Y = ds2[:,1:], ds2[:,0]

        elif dataname == "MPG13":

            test_X, test_Y = ds3[:,1:], ds3[:,0]

        raw_train_X, x_normalizer = normalize(raw_train_X)

        test_X = x_normalizer.transform(test_X) 



    elif dataname in ["concrete", "boston"]:

        x, y = get_uci_data(dataname, os.getcwd()+"/Dataset/UCI_datasets")

        x_norm, x_normalizer = normalize(x)

        x = x_norm

        tr_idx, te_idx = splitter(int(len(x)/2), int(len(x)/2), seed = 1234)

        # we resample from tr_idx to create a shifted distribution

        raw_train_X, raw_train_Y, raw_test_X, raw_test_Y = x[tr_idx], y[tr_idx], x[te_idx], y[te_idx]

        
        x_bar = np.mean(raw_train_X, axis = 0)

        cov_mat = np.cov(raw_train_X.T)

        x_seed = np.random.multivariate_normal(mean = x_bar, cov = cov_mat, size = 1)[0]


        resample_pickers = np.random.multivariate_normal(mean = x_seed, cov = 0.3 * cov_mat, size = 1000)

        picked_idx = []

        for picker in resample_pickers:

            closest_index = distance.cdist(raw_test_X, [picker]).argmin()
            picked_idx.append(closest_index)

        test_X = raw_test_X[np.array(picked_idx)]
        test_Y = raw_test_Y[np.array(picked_idx)]



    if modelname:
        model_trial = [modelname]
        
    else:
        model_trial = ["vanillaKernel_CovSelect", "vanillaKernel", "RFKernel", "vanillaKernel_RandomProj"]

    for modelname in model_trial:

        print("model: "+ modelname +" on data: "+dataname)


        # train base model

        with open(os.getcwd()+"/Experiments/EXP1/config_bin/vanillaPred_on_wine_config.yml", 'r') as file:
            base_configs = yaml.safe_load(file)


        base_misc_info = base_configs["misc_info"]
        base_train_config= base_configs["training_config"]


        if dataname in ["MPG12", "MPG13"]:

            base_train_config["bat_size"] = 10
            base_misc_info["model_config"]["hidden_layers"] = [10]

        else:

            base_train_config["bat_size"] = 64
            base_misc_info["model_config"]["hidden_layers"] = [10, 5]



        base_misc_info["model_config"]["n_input"] = raw_train_X.shape[1]

        base_train_config["LR"] = 5E-3



# -------------------------------------------------------------
        width = width
# -------------------------------------------------------------


        crits_dic = {}

        for k in range(num_repeat):

            SEED = base_seed + k

            # split some for recalibration

            
# -------------------------------------------------------------
            recal_ratio = ratio
# -------------------------------------------------------------


            N_recal = int(len(raw_train_X)*recal_ratio)
            N_train = len(raw_train_X) - N_recal

            tr_idx, recal_idx = splitter(N_train, N_recal)

            train_X, train_Y = raw_train_X[tr_idx], raw_train_Y[tr_idx]
            recal_X, recal_Y = raw_train_X[recal_idx], raw_train_Y[recal_idx]



            train_X, test_X, recal_X = torch.Tensor(train_X), torch.Tensor(test_X), torch.Tensor(recal_X)
            train_Y, test_Y, recal_Y = torch.Tensor(train_Y).to(torch.device("cuda")), torch.Tensor(test_Y).to(torch.device("cuda")), torch.Tensor(recal_Y).to(torch.device("cuda"))


            if modelname == "RFKernel":

                depth = 10

                base_model = RandomForestRegressor(max_depth=depth, random_state=0)
                base_model.fit(train_X.cpu().numpy(), train_Y.cpu().numpy())


                record = testPerform_kernel(test_X, test_Y, recal_X, recal_Y, model_name= modelname, model = base_model, wid = width)


            else:

                base_model = model_callByName[base_misc_info["model_init"]](**base_misc_info["model_config"])

                trainer(
                    seed = SEED,
                    raw_train_X = train_X,
                    raw_train_Y = train_Y,
                    model = base_model,
                    training_config = base_train_config,
                    harvestor = None,          
                    misc_info = base_misc_info,
                    diff_trainingset = True
                )


                if modelname == "vanillaKernel":

                    record = testPerform_kernel(test_X, test_Y, recal_X, recal_Y, 
                                                model_name= modelname, model = base_model, wid = width)

                else:

                    n_component = 4

                    if modelname == "vanillaKernel_RandomProj":

                        transformer = random_projection.GaussianRandomProjection(n_components = n_component)

                        reformer = lambda x : torch.Tensor(transformer.fit_transform(x.cpu().numpy()))

                    elif modelname == "vanillaKernel_CovSelect":

                        temp_y = recal_Y.cpu().numpy()
                        temp_x = recal_X.cpu().numpy()


                        corr_li = np.zeros(temp_x.shape[1])

                        for i in range(temp_x.shape[1]):

                            corr_li[i] = np.abs(np.corrcoef(temp_x[:, i], temp_y)[0,1])


                        sorted_CORR = np.sort(corr_li)



                        threshold = sorted_CORR[-n_component]


                        BEST_idx = np.where(corr_li >= threshold)[0]
                        if len(BEST_idx) > n_component:
                            BEST_idx = BEST_idx[:n_component]

                        reformer = lambda x : x[:, BEST_idx]

                    record = testPerform_projKernel(test_X, test_Y, recal_X, recal_Y, 
                                                    model_name = modelname, 
                                                    model= base_model, reformer= reformer, wid = width)          


            if k == 0:
                for key in record.keys():

                    crits_dic[modelname + "_"+key] = []

            for key in record.keys():

                crits_dic[modelname + "_"+key].append(record[key])

        for key in crits_dic.keys():
            err_mu_dic[key] = (max(crits_dic[key]) + min(crits_dic[key]))/2
            err_std_dic[key] = (max(crits_dic[key]) - min(crits_dic[key]))/2


    if len(big_df) == 0:
        big_df["idxes"] = list(err_mu_dic.keys())

    big_df[dataname +"_mu"] = list(err_mu_dic.values())
    big_df[dataname + "_std"] = list(err_std_dic.values())


    df = pd.DataFrame.from_dict(big_df)  

    # df.to_csv(os.getcwd()+"/Experiments/EXP4/record_bin/kernel_benchmarks.csv",index=False)

    print(df)





In [81]:
special_check(dataname = "boston", modelname = None, width = 1, ratio = 0.3)

model: vanillaKernel_CovSelect on data: boston
model: vanillaKernel on data: boston
model: RFKernel on data: boston
model: vanillaKernel_RandomProj on data: boston
                                  idxes  boston_mu  boston_std
0     vanillaKernel_CovSelect_MACE_Loss   0.081226         0.0
1     vanillaKernel_CovSelect_AGCE_Loss   0.094634         0.0
2    vanillaKernel_CovSelect_CheckScore   0.618851         0.0
3               vanillaKernel_MACE_Loss   0.106173         0.0
4               vanillaKernel_AGCE_Loss   0.128075         0.0
5              vanillaKernel_CheckScore   0.796128         0.0
6                    RFKernel_MACE_Loss   0.124756         0.0
7                    RFKernel_AGCE_Loss   0.153976         0.0
8                   RFKernel_CheckScore   0.781922         0.0
9    vanillaKernel_RandomProj_MACE_Loss   0.220460         0.0
10   vanillaKernel_RandomProj_AGCE_Loss   0.306100         0.0
11  vanillaKernel_RandomProj_CheckScore   1.375413         0.0
