In [4]:
import os, sys
import torch
import numpy as np
import argparse
from argparse import Namespace
import tqdm
import six
from scipy import stats
import pickle
from helper import set_seeds
from torch.utils.data import DataLoader
from datasets.datasets import get_scaled_data, get_synthetic_data
from utils.q_model_ens import QModelEns, MSEModel
from losses import batch_qr_loss, batch_interval_loss
import helper
from helper import SYN_DATA, REAL_DATA

sys.modules['sklearn.externals.six'] = six
np.warnings.filterwarnings('ignore')

os.environ["MKL_CBWR"] = 'AUTO'

results_path = helper.results_path

if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"


def get_loss_fn(loss_name):
    if loss_name == 'batch_qr' or loss_name == 'batch_wqr':
        fn = batch_qr_loss
    elif loss_name == 'batch_int':
        fn = batch_interval_loss
    else:
        raise ValueError('loss arg not valid')

    return fn


def update_results_during_training(y_upper, y_lower, x, y, set_name, results_dict, alpha):
    with torch.no_grad():
        if len(x) == 0 or len(y) == 0:
            return
        y = y.reshape(-1).to(device)
        idx = np.random.permutation(len(x))  # [:len(xx)]
        x = x[idx].to(device)
        quantiles = torch.Tensor([alpha / 2, 1 - alpha / 2]).to(device)
        

        if torch.is_tensor(y):
            curr_y = y.cpu().detach().numpy()[idx]
        else:
            curr_y = y[idx]
        in_the_range = ((curr_y >= y_lower) & (curr_y <= y_upper))
        lengths = (y_upper - y_lower)

        if 'pearsons_correlation' + '_over_' + set_name not in results_dict:
            results_dict['pearsons_correlation' + '_over_' + set_name] = []

        results_dict['pearsons_correlation' + '_over_' + set_name] += [
            stats.pearsonr(in_the_range, lengths)[0]]

        if 'coverage' + '_over_' + set_name not in results_dict:
            results_dict['coverage' + '_over_' + set_name] = []

        results_dict['coverage' + '_over_' + set_name] += [np.mean(in_the_range)]

        if 'interval_lengths' + '_over_' + set_name not in results_dict:
            results_dict['interval_lengths' + '_over_' + set_name] = []

        results_dict['interval_lengths' + '_over_' + set_name] += [np.mean(lengths)]


In [31]:
from Experiments.EXP1.trainer import model_callByName, loss_callByName

POSSIBLE_REAL_DATA_NAMES = ['kin8nm', 'naval', 'meps_19', 'meps_20', 'meps_21', 'facebook_1', 'facebook_2',
                            'blog_data', 'bio', 'scaled_bio', 'bike']


# net train
# ---------------------------------------------------------
DATA_NAME = 'blog_data'



data_type = REAL_DATA

# DATA_NAMES = ['meps_19', 'meps_20', 'meps_21', 'facebook_1', 'facebook_2', 'blog_data']




SEEDS = range(0, 1)

save_results_during_training = True


arg_loss = 'batch_int'
TRAINING_OVER_ALL_QUANTILES = True



d = DATA_NAME

print("On dataset "+ d)

for s in tqdm.tqdm(SEEDS):

    arg_data = d
    arg_seed = s


    set_seeds(arg_seed)
    data_args = Namespace(dataset=arg_data, seed=arg_seed)

    # if data_type == REAL_DATA:
    # Fetching data
    data_out = get_scaled_data(arg_data, arg_seed, recal_prop = 0.1)

    unscaled_x_train = None
    unscaled_x_test = None
    minority_group_uncertainty = None
    group_feature = None


    x_tr, x_va, x_te, y_tr, y_va, y_te, y_al = \
        data_out.x_tr, data_out.x_va, data_out.x_te, data_out.y_tr, \
        data_out.y_va, data_out.y_te, data_out.y_al
    
    # set aside some for recalibration
    #############################################
    x_recal = x_tr[:int(0.2 * len(x_tr))]
    y_recal = y_tr[:int(0.2 * len(y_tr))]
    x_tr = x_tr[int(0.2 * len(x_tr)):]
    y_tr = y_tr[int(0.2 * len(y_tr)):]
    #############################################

    x_va, y_va = x_va.to(device), y_va.to(device)
    x_tr, y_tr = x_tr.to(device), y_tr.to(device)
    y_te, x_te = y_te.to(device), x_te.to(device)

    x_test = x_te
    y_test = y_te

    w_tr, w_va, get_tr_weights = helper.get_wqr_weights(arg_loss, x_tr, y_tr, x_va, y_va, device = device)

    y_range = (y_al.max() - y_al.min()).item()

    # creating the model
    num_tr = x_tr.shape[0]
    dim_x = x_tr.shape[1]
    dim_y = y_tr.shape[1]

    model_ens = QModelEns(input_size=dim_x + 1, output_size=dim_y,
                          hidden_size=64, num_layers=2, dropout=0,
                          lr=1e-3, wd=0,
                          num_ens=1, device=device)

    loader = DataLoader(helper.IndexedDataset(x_tr, y_tr),
                        shuffle=True,
                        batch_size=1024)

    # Loss function
    loss_fn = get_loss_fn(arg_loss)
    batch_loss = True if 'batch' in arg_loss else False
    """ train loop """
    tr_loss_list = []
    va_loss_list = []
    te_loss_list = []
    batch_size = 1024
    for ep in range(2000):

        if model_ens.done_training:
            print('Done training ens at EP {}'.format(ep))
            break

        # Take train step
        ep_train_loss = []  # list of losses from each batch, for one epoch
        epoch_loss = []

        for xi, yi, index in loader:
            if TRAINING_OVER_ALL_QUANTILES:
                q_list = torch.rand(30)
            else:
                q_list = torch.Tensor([alpha / 2])

            arg_corr_mult = 0
            loss = model_ens.loss(loss_fn, xi, yi, q_list,
                                  batch_q=batch_loss,
                                  take_step=True, args=arg_corr_mult, weights=get_tr_weights(index))
            ep_train_loss.append(loss)

        ep_tr_loss = np.nanmean(np.stack(ep_train_loss, axis=0), axis=0)
        tr_loss_list.append(ep_tr_loss)



        # Validation loss
        # x_va, y_va = x_va.to(args.device), y_va.to(args.device)
        if TRAINING_OVER_ALL_QUANTILES:
            va_te_q_list = torch.linspace(0.01, 0.99, 99)
        else:
            va_te_q_list = torch.Tensor([alpha / 2, 1 - alpha / 2])

        ep_va_loss = model_ens.update_va_loss(
            loss_fn, x_va, y_va, va_te_q_list,
            batch_q=batch_loss, curr_ep=ep, num_wait=200,
            args=arg_corr_mult, weights=w_va
        )
        va_loss_list.append(ep_va_loss)

        # Printing some losses
        if (ep % 200 == 0):
            print('EP:{}'.format(ep))
            pass
        
    
    # Move everything to cpu
    x_tr, y_tr, x_va, y_va, x_te, y_te = \
        x_tr.cpu(), y_tr.cpu(), x_va.cpu(), y_va.cpu(), x_te.cpu(), y_te.cpu()
    model_ens.use_device(torch.device('cpu'))
    
    
    








    




On dataset blog_data


  0%|          | 0/1 [00:00<?, ?it/s]

EP:0
EP:200
EP:400


100%|██████████| 1/1 [02:38<00:00, 158.62s/it]

Done training ens at EP 547





In [34]:
from Experiments.EXP1.TestPerform import testPerform_projKernel
from sklearn import random_projection
from sklearn.ensemble import RandomForestRegressor
import torch
from src.kernel_methods import kernel_estimator
from losses import independence_penalty

## 


alpha = 0.1


quantiles = torch.Tensor([alpha/2, 1-alpha/2])


n_component = 50
transformer = random_projection.GaussianRandomProjection(n_components = n_component)
reformer = lambda x : torch.Tensor(transformer.fit_transform(x.cpu().numpy()))


###############################################
## might need to resample
resample_recal = 4
resample_test = 4

x_recald = x_recal[:int(len(x_recal)/resample_recal)]
y_recald = y_recal[:int(len(y_recal)/resample_recal)].view(-1).cuda()
x_ted = x_te[:int(len(x_te)/resample_test)]
y_ted = y_te[:int(len(y_te)/resample_test)].view(-1).cuda()

###############################################

recal_preds = model_ens.predict_q(
    x_recald, quantiles, ens_pred_type='conf',
    recal_model=None, recal_type=None
)

recal_preds = torch.permute(recal_preds, (1,0))
recal_mean_1 = recal_preds[0].cuda()
recal_mean_2 = recal_preds[1].cuda()


test_preds = model_ens.predict_q(
    x_ted, quantiles, ens_pred_type='conf',
    recal_model=None, recal_type=None
)

test_preds = torch.permute(test_preds, (1,0))
test_mean_1 = test_preds[0].cuda()
test_mean_2 = test_preds[1].cuda()






In [39]:
ker_range = [1, 5, 10, 15, 30]
for width in ker_range:
    print(width)

    # we do the recal mean for two quantiles 


    test_Z =  reformer(x_ted)

    recal_Z = reformer(x_recald)


    # lower part
    eps_diffQuants_1 = kernel_estimator(
        test_Z = test_Z.cuda(),
        recal_Z = recal_Z.cuda(),
        recal_epsilon = torch.Tensor(y_recald - recal_mean_1).cuda(),
        quants = np.array([alpha/2]),
        wid= width
    )

    y_lower = (eps_diffQuants_1 + test_mean_1.view(1,-1).repeat(len(eps_diffQuants_1),1)).cpu().numpy()

    
    # upper part
    eps_diffQuants_2 = kernel_estimator(
        test_Z = test_Z.cuda(),
        recal_Z = recal_Z.cuda(),
        recal_epsilon = torch.Tensor(y_recald - recal_mean_2).cuda(),
        quants = np.array([1-alpha/2]),
        wid= width
    )

    y_upper = (eps_diffQuants_2 + test_mean_2.view(1,-1).repeat(len(eps_diffQuants_2),1)).cpu().numpy()

    
    
    
    ret = {}
    val_criterias = [
            "interval_AGCE"
        ]  

    for key in val_criterias:

        real_loss = loss_callByName[key]

        real_err = real_loss( torch.Tensor(np.vstack((y_lower, y_upper))), y_ted.cpu(), q_interval = np.array([alpha/2, 1-alpha/2])).item()

        if isinstance(real_err, torch.Tensor):

            real_err = real_err.item()

        ret[key] = real_err


    print(ret)
    print("--------------------------------------------------")

1
{'interval_AGCE': 0.7931297421455383}
--------------------------------------------------
5
{'interval_AGCE': 0.07557249069213867}
--------------------------------------------------
10
{'interval_AGCE': 0.023664116859436035}
--------------------------------------------------
15
{'interval_AGCE': 0.052671730518341064}
--------------------------------------------------
30
{'interval_AGCE': 0.038931310176849365}
--------------------------------------------------
