In [1]:
import torch
import torch.nn.functional as F
from torch.distributions import Dirichlet
import math
import numpy as np

In [2]:
alpha = torch.tensor([[1,2,3,4,5],[0,0,1,2,3]])
print(alpha)
print(alpha.sum(1))
print(alpha.sum(1).unsqueeze(-1).repeat(1,5))

tensor([[1, 2, 3, 4, 5],
        [0, 0, 1, 2, 3]])
tensor([15,  6])
tensor([[15, 15, 15, 15, 15],
        [ 6,  6,  6,  6,  6]])


In [9]:

def dirichlet_kl_divergence(
    alphas, target_alphas, precision=None, target_precision=None, epsilon=1e-8
):
    """
    This function computes the Forward KL divergence between a model Dirichlet distribution
    and a target Dirichlet distribution based on the concentration (alpha) parameters of each.

    :param alphas: Tensor containing concentation parameters of model. Expected shape is batchsize X num_classes.
    :param target_alphas: Tensor containing target concentation parameters. Expected shape is batchsize X num_classes.
    :param precision: Optional argument. Can pass in precision of model. Expected shape is batchsize X 1
    :param target_precision: Optional argument. Can pass in target precision. Expected shape is batchsize X 1
    :param epsilon: Smoothing factor for numercal stability. Default value is 1e-8
    :return: Tensor for Batchsize X 1 of forward KL divergences between target Dirichlet and model
    """
    if not precision:
        precision = torch.sum(alphas, dim=1, keepdim=True)
    if not target_precision:
        target_precision = torch.sum(target_alphas, dim=1, keepdim=True)
    precision_term = torch.lgamma(target_precision) - torch.lgamma(precision)
    assert torch.all(torch.isfinite(precision_term)).item()
    alphas_term = torch.sum(
        torch.lgamma(alphas + epsilon)
        - torch.lgamma(target_alphas + epsilon)
        + (target_alphas - alphas)
        * (
            torch.digamma(target_alphas + epsilon)
            - torch.digamma(target_precision + epsilon)
        ),
        dim=1,
        keepdim=True,
    )
    # print(alphas_term)
    assert torch.all(torch.isfinite(alphas_term)).item()

    cost = torch.squeeze(precision_term + alphas_term)
    return cost
# alpha = torch.tensor([[1,2,3,4,5]])
alpha = torch.tensor([[0.1,0.1,6,0.1,0.1,]])
beta = torch.tensor([[1,1,1,1,1]])

alpha = torch.tensor([[0.1,0.1],[6,0.1],[0.1,1],[0.1,1],[0.1,1]])
beta = torch.tensor([[1,1],[1,1],[1,1],[1,1],[1,1]])

print(torch.sum(alpha, 1))

print(dirichlet_kl_divergence(alpha,beta).sum(), dirichlet_kl_divergence(alpha,beta))
print(dirichlet_kl_divergence(beta,alpha).sum(), dirichlet_kl_divergence(beta,alpha))

tensor([0.2000, 6.1000, 1.1000, 1.1000, 1.1000])
tensor(11.5703) tensor([1.1814, 6.1812, 1.4026, 1.4026, 1.4026])
tensor(35.1154) tensor([6.2611, 8.7620, 6.6974, 6.6974, 6.6974])


In [62]:
import tensorflow as tf 
import numpy as np
def _KL(alphas, target_alphas,  precision=None, target_precision=None, epsilon=1e-8):
    # print("K:",K)
    # beta=tf.constant(np.ones((1,K)),dtype=tf.float32)
    target_alphas = tf.cast(target_alphas,tf.float32)
    alphas = tf.cast(alphas,tf.float32)
    # print(target_alphas)
    if not precision:
        precision = tf.reduce_sum(alphas, axis=1, keepdims=True)
    if not target_precision:
        target_precision = tf.reduce_sum(target_alphas, axis=1, keepdims=True)
    precision = tf.cast(precision,tf.float32)
    target_precision = tf.cast(target_precision,tf.float32)
    
    precision_term = tf.compat.v1.lgamma(target_precision) - tf.compat.v1.lgamma(precision)
    # assert torch.all(torch.isfinite(precision_term)).item()
    alphas_term = tf.reduce_sum(
        tf.compat.v1.lgamma(alphas + epsilon)
        - tf.compat.v1.lgamma(target_alphas + epsilon)
        + (target_alphas - alphas)
        * (
            tf.compat.v1.digamma(target_alphas + epsilon)
            - tf.compat.v1.digamma(target_precision + epsilon)
        ),
        axis=1,
        keepdims=True,
    )
    # print(alphas_term)
    # assert torch.all(torch.isfinite(alphas_term)).item()

    cost = tf.squeeze(precision_term + alphas_term)
    return cost

def KL(alpha,K):
    # print("K:",K)
    beta=tf.constant(np.ones((1,K)),dtype=tf.float32)

    S_alpha = tf.reduce_sum(alpha,axis=1,keepdims=True)
    S_beta = tf.reduce_sum(beta,axis=1,keepdims=True)
    lnB = tf.compat.v1.lgamma(S_alpha) - tf.reduce_sum(tf.compat.v1.lgamma(alpha),axis=1,keepdims=True)
    lnB_uni = tf.reduce_sum(tf.compat.v1.lgamma(beta),axis=1,keepdims=True) - tf.compat.v1.lgamma(S_beta)
    
    dg0 = tf.compat.v1.digamma(S_alpha)
    dg1 = tf.compat.v1.digamma(alpha)
    # tf.print("alpha",alpha.shape)
    # tf.print("beta",beta.shape)
    kl = tf.reduce_sum((alpha - beta)*(dg1-dg0),axis=1,keepdims=True) + lnB + lnB_uni
    # print("kl", kl)
    return kl


logits = torch.tensor([[0.1,1]])
label = torch.tensor([[0,1]])
alphas = torch.exp(logits)

target_alphas = torch.empty_like(alphas, requires_grad=False).fill_(
    1
)
# print("ta", target_alphas)
target_concentration = torch.sum(alphas, 1,keepdims=True)
# print("tc", target_concentration)
# target_alphas[torch.tensor([0]), label] = target_concentration
target_alphas = target_alphas + (target_concentration * label)

# #alphafix
# alphas = alphas+1
# print("alphas",alphas)
# print("ta", target_alphas)
# print("forward")
# print(dirichlet_kl_divergence(alphas, target_alphas).sum(), dirichlet_kl_divergence(alphas, target_alphas))
# print("backward")
# print(dirichlet_kl_divergence(target_alphas, alphas).sum(), dirichlet_kl_divergence(target_alphas, alphas))


logits = tf.Variable([[0.1,0.1],[6,0.1],[0.1,1],[0.1,1],[0.1,1]])
alpha = tf.exp(logits)
labels = tf.Variable([[0,1],[1,0],[1,0],[0,1],[0,1]],dtype=tf.float32)


# logits = torch.tensor([[0.1,1]])
# labels = torch.tensor([[0,1]])
# alphas = torch.exp(logits)


target_concentration = tf.reduce_sum(alpha,axis=1,keepdims=True)
target_alphas = (tf.ones_like(alpha) + (target_concentration * labels))

alpha = alpha + 1
print("alpha",alpha)
print("tc",target_concentration)
print("ta",target_alphas)
# if tf.reduce_any(tf.math.is_nan(logits)):
    # tf.print("NaN val in KLloss",logits)
# inverse_labels = tf.math.abs(labels - 1)

# beta = target_alphas #+ inverse_labels

# # beta=tf.Variable(tf.ones_like(alpha),dtype=tf.float32)
print("forward")
print(tf.reduce_sum(_KL(alpha,target_alphas)),_KL(alpha,target_alphas))
print("reverse")
print(tf.reduce_sum(_KL(target_alphas,alpha)),_KL(target_alphas,alpha))
# print("Old")
# print(tf.reduce_sum(KL(alpha,2)),KL(alpha,2))


alpha tf.Tensor(
[[  2.105171    2.105171 ]
 [404.4288      2.105171 ]
 [  2.105171    3.7182817]
 [  2.105171    3.7182817]
 [  2.105171    3.7182817]], shape=(5, 2), dtype=float32)
tc tf.Tensor(
[[  2.210342 ]
 [404.53397  ]
 [  3.8234527]
 [  3.8234527]
 [  3.8234527]], shape=(5, 1), dtype=float32)
ta tf.Tensor(
[[  1.         3.210342]
 [405.53397    1.      ]
 [  4.823453   1.      ]
 [  1.         4.823453]
 [  1.         4.823453]], shape=(5, 2), dtype=float32)
forward
tf.Tensor(7.450365, shape=(), dtype=float32) tf.Tensor([0.94614553 0.68737984 4.136518   0.84016085 0.84016085], shape=(5,), dtype=float32)
reverse
tf.Tensor(5.895544, shape=(), dtype=float32) tf.Tensor([0.7998429  0.49340296 3.2784505  0.66192377 0.66192377], shape=(5,), dtype=float32)


In [31]:



logits = tf.Variable([[0.1,0.1],[6,0.1],[0.1,1],[0.1,1],[0.1,1]])
labels = tf.Variable([[0,1],[1,0],[1,0],[0,1],[0,1]],dtype=tf.float32)

evidence = tf.exp(logits)
alpha = evidence + 1
print("alpha",alpha)
p = labels
S = tf.reduce_sum(alpha,axis=1,keepdims=True) 
E = alpha - 1
m = alpha / S
A = tf.reduce_sum((p-m)**2, axis=1, keepdims=True) 
B = tf.reduce_sum(alpha*(S-alpha)/(S*S*(S+1)), axis=1, keepdims=True) 
# tf.print(annealing_coef)
# annealing_coef = 

alp = evidence*(1-p) + 1 
print(alp)

alpha tf.Tensor(
[[  2.105171    2.105171 ]
 [404.4288      2.105171 ]
 [  2.105171    3.7182817]
 [  2.105171    3.7182817]
 [  2.105171    3.7182817]], shape=(5, 2), dtype=float32)
tf.Tensor(
[[2.105171  1.       ]
 [1.        2.105171 ]
 [1.        3.7182817]
 [2.105171  1.       ]
 [2.105171  1.       ]], shape=(5, 2), dtype=float32)


In [57]:
logits = torch.tensor([[0.1,0.1],[6,0.1],[0.1,1],[0.1,1],[0.1,1]])
label = torch.tensor([[0,1],[1,0],[1,0],[0,1],[0,1]])


# logits = torch.tensor([[0.1,20]])
# label = torch.tensor([[0,1]])
alphas = torch.exp(logits)

target_alphas = torch.empty_like(alphas, requires_grad=False).fill_(
    1
)
# print("ta", target_alphas)
target_concentration = torch.sum(alphas, 1,keepdims=True)
# print("tc", target_concentration)
# target_alphas[torch.tensor([0]), label] = target_concentration
target_alphas = target_alphas + (target_concentration * label)

#alphafix
alphas = alphas+1
print("alphas",alphas)
print("ta", target_alphas)
print("forward")
print(dirichlet_kl_divergence(alphas, target_alphas).sum(), dirichlet_kl_divergence(alphas, target_alphas))
print("backward")
print(dirichlet_kl_divergence(target_alphas, alphas).sum(), dirichlet_kl_divergence(target_alphas, alphas))

alphas tensor([[  2.1052,   2.1052],
        [404.4288,   2.1052],
        [  2.1052,   3.7183],
        [  2.1052,   3.7183],
        [  2.1052,   3.7183]])
ta tensor([[  1.0000,   3.2103],
        [405.5340,   1.0000],
        [  4.8235,   1.0000],
        [  1.0000,   4.8235],
        [  1.0000,   4.8235]])
forward
tensor(7.4504) tensor([0.9461, 0.6874, 4.1365, 0.8402, 0.8402])
backward
tensor(5.8955) tensor([0.7998, 0.4934, 3.2785, 0.6619, 0.6619])


In [5]:
import tensorflow_probability as tfp
-tf.reduce_mean(tfp.distributions.Dirichlet(alpha).entropy())

<tf.Tensor: shape=(), dtype=float32, numpy=7.023075>

In [None]:
def _KL_old(alpha, beta, K=5 ):
    # print("K:",K)
    # beta=tf.constant(np.)),dtype=tf.float32)
    beta = tf.ones_like(alpha)
    # beta = tf.cast(beta,tf.float32)
    print("beta",beta)
    alpha = tf.cast(alpha,tf.float32)
    S_alpha = tf.reduce_sum(alpha,axis=1,keepdims=True)
    S_beta = tf.reduce_sum(beta,axis=1,keepdims=True)
    lnB = tf.compat.v1.lgamma(S_alpha) - tf.reduce_sum(tf.compat.v1.lgamma(alpha),axis=1,keepdims=True)
    lnB_uni = tf.reduce_sum(tf.compat.v1.lgamma(beta),axis=1,keepdims=True) - tf.compat.v1.lgamma(S_beta)
    
    dg0 = tf.compat.v1.digamma(S_alpha)
    dg1 = tf.compat.v1.digamma(alpha)
    # tf.print("alpha",alpha.shape)
    # tf.print("beta",beta.shape)
    kl = tf.reduce_sum((alpha - beta)*(dg1-dg0),axis=1,keepdims=True) + lnB + lnB_uni
    # print("kl", kl)
    return kl
# alpha = tf.Variable([[0.1,0.1],[6,0.1],[0.1,1],[0.1,1],[0.1,1]])
# beta = tf.Variable([[0,1],[0,1],[1,0],[0,1],[0,1]],dtype=tf.float32)
alpha = tf.Variable([[0.1,0.1],[6,0.1],[0.1,1],[0.1,1],[0.1,1]])
beta = tf.Variable([[0,1],[0,1],[1,0],[0,1],[0,1]],dtype=tf.float32)
target_concentration = tf.reduce_sum(alpha,axis=1,keepdims=True)
if tf.reduce_any(tf.math.is_nan(logits)):
    tf.print("NaN val in KLloss",logits)
target_alphas = (tf.ones_like(logits)* target_concentration) * labels
inverse_labels = tf.math.abs(labels - 1)

beta = target_alphas #+ inverse_labels
# beta=tf.constant(np.ones((1,2)),dtype=tf.float32)
print(beta)
print(tf.reduce_sum(_KL_old(alpha,beta,2)),_KL_old(alpha,beta,2))
print("----")
print(tf.reduce_sum(_KL_old(beta,alpha,2)),_KL_old(beta,alpha,2))

In [128]:
alphas = torch.tensor([[0.1,0.1,6,0.1,0.1],[0.1,0.1,7,0.1,0.1],[0.1,0.1,6,0.1,0.1]])
# beta = torch.tensor([[1,1,1,1,1]])
target_concentration = torch.sum(alphas,1)
print(target_concentration)
label = torch.tensor([[1,0,0,0,0],[0,1,0,0,0],[0,0,1,0,0]])
label = torch.tensor([[3],[0]])

target_alphas = torch.empty_like(alphas, requires_grad=False).fill_(
    1
)
print(torch.arange(len(label)))
for i in torch.arange(len(label)):
    target_alphas[i][label[i]] = target_concentration 
# target_alphas[torch.arange(len(label)),label] = target_concentration
print(target_alphas)

tensor([6.4000, 7.4000, 6.4000])
tensor([0, 1])


RuntimeError: shape mismatch: value tensor of shape [3] cannot be broadcast to indexing result of shape [1]

In [146]:
alpha = tf.Variable([[0.1,0.1],[6,0.1],[0.1,1],[0.1,1],[0.1,1]])
beta = tf.Variable([[0,1],[0,1],[1,0],[0,1],[0,1]],dtype=tf.float32)
reverse_beta = tf.math.abs(beta - 1)

# print(tf.reduce_sum(alpha,axis=1,keepdims=True))
# print(tf.ones_like(alpha))
target_concentration = (tf.ones_like(alpha)* tf.reduce_sum(alpha,axis=1,keepdims=True)) * beta
# target_concentration = tf.fill(alpha.shape, tf.reduce_sum(alpha)) * beta
print(reverse_beta)
TS = tf.cast(reverse_beta,tf.float32) + tf.cast(target_concentration,tf.float32)
print(TS)

tf.Tensor(
[[1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]], shape=(5, 2), dtype=float32)
tf.Tensor(
[[1.  0.2]
 [1.  6.1]
 [1.1 1. ]
 [1.  1.1]
 [1.  1.1]], shape=(5, 2), dtype=float32)


In [141]:
print(alpha.shape)
print(alpha.shape[-1])

(5, 2)
2


In [168]:
alpha = torch.tensor([[0.1,0.1],[6,0.1],[0.1,1],[0.1,1],[0.1,1]])
print(alpha)
print(alpha.flatten(start_dim=1))
print(alpha.flatten(start_dim=1).norm(2,1))
print(alpha.norm(2,1))

tensor([[0.1000, 0.1000],
        [6.0000, 0.1000],
        [0.1000, 1.0000],
        [0.1000, 1.0000],
        [0.1000, 1.0000]])
tensor([[0.1000, 0.1000],
        [6.0000, 0.1000],
        [0.1000, 1.0000],
        [0.1000, 1.0000],
        [0.1000, 1.0000]])
tensor([0.1414, 6.0008, 1.0050, 1.0050, 1.0050])
tensor([0.1414, 6.0008, 1.0050, 1.0050, 1.0050])


In [166]:
alpha = tf.Variable([[0.1,0.1],[6,0.1],[0.1,1],[0.1,1],[0.1,1]])
print(alpha)
print(tf.squeeze(alpha))
print(tf.norm(tf.squeeze(alpha),axis=1))
print(tf.norm(alpha,axis=1))

<tf.Variable 'Variable:0' shape=(5, 2) dtype=float32, numpy=
array([[0.1, 0.1],
       [6. , 0.1],
       [0.1, 1. ],
       [0.1, 1. ],
       [0.1, 1. ]], dtype=float32)>
tf.Tensor(
[[0.1 0.1]
 [6.  0.1]
 [0.1 1. ]
 [0.1 1. ]
 [0.1 1. ]], shape=(5, 2), dtype=float32)
tf.Tensor([0.14142136 6.000833   1.0049876  1.0049876  1.0049876 ], shape=(5,), dtype=float32)
tf.Tensor([0.14142136 6.000833   1.0049876  1.0049876  1.0049876 ], shape=(5,), dtype=float32)


In [176]:
alpha = tf.Variable([[0.1,0.1],[6,0.1],[0.1,1],[0.1,1],[0.1,1]])
x = tf.Variable( float("NaN"))
print(x)
print(tf.math.is_nan(alpha))
print(tf.reduce_any(tf.math.is_nan(alpha)))

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=nan>
tf.Tensor(
[[False False]
 [False False]
 [False False]
 [False False]
 [False False]], shape=(5, 2), dtype=bool)
tf.Tensor(False, shape=(), dtype=bool)


In [8]:
import torch
import torchvision
import torchvision.transforms as transforms

# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime


transform = transforms.Compose(
    [transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))])

# Create datasets for training & validation, download if necessary
training_set = torchvision.datasets.FashionMNIST('./data', train=True, transform=transform, download=True)
validation_set = torchvision.datasets.FashionMNIST('./data', train=False, transform=transform, download=True)

# Create data loaders for our datasets; shuffle for training, not for validation
training_loader = torch.utils.data.DataLoader(training_set, batch_size=4, shuffle=True, num_workers=2)
validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=4, shuffle=False, num_workers=2)

# Class labels
classes = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
        'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle Boot')

# Report split sizes
print('Training set has {} instances'.format(len(training_set)))
print('Validation set has {} instances'.format(len(validation_set)))

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data\FashionMNIST\raw\train-images-idx3-ubyte.gz


  0%|          | 0/26421880 [00:00<?, ?it/s]

Extracting ./data\FashionMNIST\raw\train-images-idx3-ubyte.gz to ./data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data\FashionMNIST\raw\train-labels-idx1-ubyte.gz


  0%|          | 0/29515 [00:00<?, ?it/s]

Extracting ./data\FashionMNIST\raw\train-labels-idx1-ubyte.gz to ./data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data\FashionMNIST\raw\t10k-images-idx3-ubyte.gz


  0%|          | 0/4422102 [00:00<?, ?it/s]

Extracting ./data\FashionMNIST\raw\t10k-images-idx3-ubyte.gz to ./data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data\FashionMNIST\raw\t10k-labels-idx1-ubyte.gz


  0%|          | 0/5148 [00:00<?, ?it/s]

Extracting ./data\FashionMNIST\raw\t10k-labels-idx1-ubyte.gz to ./data\FashionMNIST\raw

Training set has 60000 instances
Validation set has 10000 instances


In [2]:
import torch
import torch.nn.functional as F
from torch.distributions import Dirichlet
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append("..") # Adds higher directory to python modules path.
import os
import numpy as np
import sys
import os
import pickle
import argparse
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torchvision.transforms as trn
import torchvision.datasets as dset
import torch.nn.functional as F
from skimage.filters import gaussian as gblur
from PIL import Image as PILImage
# from uncertainty_est.models.ebm.mcmc import MCMC
# from uncertainty_est.models.priornet.dpn_losses import UnfixedDirichletKLLoss
# from uncertainty_est.models.priornet.uncertainties import (
#     dirichlet_prior_network_uncertainty,
# )
def classifier_loss(ld_logits, y_l, lg_logits = None):
        alpha = torch.exp(ld_logits)  # / self.p_y.unsqueeze(0).to(self.device)
        # evidence = exp_evidence(logits)
        # print(alpha)
        # if self.alpha_fix:
        alpha = alpha + 1
         # alpha = evidence + 1
        # print(alpha)
        # soft_output = F.one_hot(y_l, len(y_l))
        soft_output = y_l
        #get logits/Y
        # print(soft_output)
        alpha_0 = alpha.sum(1).unsqueeze(-1).repeat(1, len(y_l))
        #S = tf.reduce_sum(alpha,axis=1,keepdims=True) 
        #expand sum of alpha to size of classes which is unnessecary
        # print(alpha_0)
        # print(torch.digamma(alpha_0))
        # print(torch.digamma(alpha))
        UCE_loss = torch.mean(
            soft_output * (torch.digamma(alpha_0) - torch.digamma(alpha))
        )
        dirchlet_weight = 0.0001 * -Dirichlet(alpha).entropy().mean()
        print("dir",dirchlet_weight)
        ### this is the 
        UCE_loss = (
            UCE_loss + dirchlet_weight
        )
        # self.log("train/clf_loss", UCE_loss)

        return UCE_loss
import tensorflow_probability as tfp
def classifier_loss_tf(labels, logits):
    # alpha = torch.exp(ld_logits)  # / self.p_y.unsqueeze(0).to(self.device)
    
    evidence = tf.exp(logits)
    # print(evidence)
    # evidence = exp_evidence(logits)

    # if self.alpha_fix:
        # alpha = alpha + 1
    alpha = evidence + 1
    # print(alpha)
    # soft_output = F.one_hot(y_l, self.n_classes)
    #get lables/Y
    # print(labels)/
    # alpha_0 = alpha.sum(1).unsqueeze(-1).repeat(1, len(labels))
    S = tf.reduce_sum(alpha,keepdims=True)
    #expand sum of alpha to size of classes which is unnessecary
    # print(S)
    # print(tf.compat.v1.digamma(S) )
    # print(tf.compat.v1.digamma(alpha) )
    UCE_loss = tf.reduce_mean(
        labels * (tf.compat.v1.digamma(S) - tf.compat.v1.digamma(alpha))
    )
    ### this is the 
    _alpha = alpha.numpy()
    _alpha = torch.tensor(_alpha)
    # dirichlet_weight = 0.0001 * -Dirichlet(_alpha).entropy().mean()
    dirichlet_weight = 0.001 * tf.reduce_mean(-tfp.distributions.Dirichlet(alpha).entropy())
    print("dir",dirichlet_weight)
    UCE_loss = (
        UCE_loss + dirichlet_weight
    )
    # self.log("train/clf_loss", UCE_loss)

    return UCE_loss

logits = np.array([[0.5,0.5,1.5,0.5]])
labels =  np.array([0,0,1,0])
print("tf",classifier_loss_tf(labels, logits))
print("----")
print("torch",classifier_loss(torch.tensor(logits), torch.tensor(labels)))

# S_alpha = tf.reduce_sum(alpha,keepdims=True)
#     S_beta = tf.reduce_sum(beta,keepdims=True)
#     lnB = tf.compat.v1.lgamma(S_alpha) - tf.reduce_sum(tf.compat.v1.lgamma(alpha),keepdims=True)

dir tf.Tensor(0.002861670235601034, shape=(), dtype=float64)
tf tf.Tensor(0.2409108756761404, shape=(), dtype=float64)
----
dir tensor(0.0003, dtype=torch.float64)
torch tensor(0.2383, dtype=torch.float64)


In [3]:
import matplotlib.pyplot as plt
import numpy as np

# Helper function for inline image display
def matplotlib_imshow(img, one_channel=False):
    if one_channel:
        img = img.mean(dim=0)
    img = img / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    if one_channel:
        plt.imshow(npimg, cmap="Greys")
    else:
        plt.imshow(np.transpose(npimg, (1, 2, 0)))

dataiter = iter(training_loader)
images, labels = dataiter.next()

# Create a grid from the images and show them
img_grid = torchvision.utils.make_grid(images)
matplotlib_imshow(img_grid, one_channel=True)
print('  '.join(classes[labels[j]] for j in range(4)))

NameError: name 'training_loader' is not defined

In [10]:
import torch.nn as nn
import torch.nn.functional as F

# PyTorch models inherit from torch.nn.Module
class GarmentClassifier(nn.Module):
    def __init__(self):
        super(GarmentClassifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 4 * 4)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


model = GarmentClassifier()

In [4]:
loss_fn = torch.nn.CrossEntropyLoss()
dummy_outputs = torch.tensor([[0.7407, 0.2628, 0.1607, 0.2345, 0.9636, 0.6613, 0.5190, 0.5094, 0.7109,
         0.1418],
        [0.7736, 0.7108, 0.8648, 0.1413, 0.3945, 0.9406, 0.6282, 0.6684, 0.0633,
         0.1361],
        [0.0420, 0.9455, 0.6659, 0.9571, 0.2122, 0.6753, 0.6661, 0.7386, 0.3048,
         0.7909],
        [0.7384, 0.4126, 0.5886, 0.7754, 0.2548, 0.9817, 0.2804, 0.9997, 0.7108,
         0.1492]])
dummy_labels = torch.tensor([1, 5, 3, 7])
# NB: Loss functions expect data in batches, so we're creating batches of 4
# Represents the model's confidence in each of the 10 classes for a given input
# dummy_outputs = torch.rand(4, 10)
# Represents the correct class among the 10 being tested
# dummy_labels = torch.tensor([1, 5, 3, 7])

print(dummy_outputs)
print(dummy_labels)

loss = loss_fn(dummy_outputs, dummy_labels)
print('Total loss for this batch: {}'.format(loss.item()))



from torch.distributions import Dirichlet
def classifier_loss(ld_logits, y_l):
        n_classes=10
        alpha = torch.exp(ld_logits)  # / self.p_y.unsqueeze(0).to(self.device)
        # Multiply by class counts for Bayesian update
        print("evidence",alpha)
        # if self.alpha_fix:
        alpha = alpha + 1
        print("alpha",alpha)
        soft_output = F.one_hot(y_l, n_classes)
        print("soft_output",soft_output)
        print("sum",alpha.sum(1))
        alpha_0 = alpha.sum(1).unsqueeze(-1).repeat(1, n_classes)
        print("alpha_0",alpha_0)
        UCE_loss = torch.mean(
            soft_output * (torch.digamma(alpha_0) - torch.digamma(alpha))
        )
        print("UCE_loss_1",UCE_loss)
        dirichlet_weight = 0.0001 * -Dirichlet(alpha).entropy().mean()
        print(dirichlet_weight)
        
        UCE_loss = (
            UCE_loss + 0.0001 * -Dirichlet(alpha).entropy().mean()
        )
        print("UCE_loss_3",UCE_loss)
        # self.log("train/clf_loss", UCE_loss)

        return UCE_loss
loss = classifier_loss(dummy_outputs, dummy_labels)
print('Total loss for this batch: {}'.format(loss.item()))

tensor([[0.7407, 0.2628, 0.1607, 0.2345, 0.9636, 0.6613, 0.5190, 0.5094, 0.7109,
         0.1418],
        [0.7736, 0.7108, 0.8648, 0.1413, 0.3945, 0.9406, 0.6282, 0.6684, 0.0633,
         0.1361],
        [0.0420, 0.9455, 0.6659, 0.9571, 0.2122, 0.6753, 0.6661, 0.7386, 0.3048,
         0.7909],
        [0.7384, 0.4126, 0.5886, 0.7754, 0.2548, 0.9817, 0.2804, 0.9997, 0.7108,
         0.1492]])
tensor([1, 5, 3, 7])
Total loss for this batch: 2.1059627532958984
evidence tensor([[2.0974, 1.3006, 1.1743, 1.2643, 2.6211, 1.9373, 1.6803, 1.6643, 2.0358,
         1.1523],
        [2.1676, 2.0356, 2.3745, 1.1518, 1.4836, 2.5615, 1.8742, 1.9511, 1.0653,
         1.1458],
        [1.0429, 2.5741, 1.9462, 2.6041, 1.2364, 1.9646, 1.9466, 2.0930, 1.3564,
         2.2054],
        [2.0926, 1.5107, 1.8015, 2.1715, 1.2902, 2.6690, 1.3237, 2.7175, 2.0356,
         1.1609]])
alpha tensor([[3.0974, 2.3006, 2.1743, 2.2643, 3.6211, 2.9373, 2.6803, 2.6643, 3.0358,
         2.1523],
        [3.1676, 3.0356, 

In [5]:
import tensorflow as tf
dummy_outputs = tf.Variable([[0.7407, 0.2628, 0.1607, 0.2345, 0.9636, 0.6613, 0.5190, 0.5094, 0.7109,
         0.1418],
        [0.7736, 0.7108, 0.8648, 0.1413, 0.3945, 0.9406, 0.6282, 0.6684, 0.0633,
         0.1361],
        [0.0420, 0.9455, 0.6659, 0.9571, 0.2122, 0.6753, 0.6661, 0.7386, 0.3048,
         0.7909],
        [0.7384, 0.4126, 0.5886, 0.7754, 0.2548, 0.9817, 0.2804, 0.9997, 0.7108,
         0.1492]],dtype=tf.float32)
dummy_labels = tf.Variable([1, 5, 3, 7])

In [8]:
labels =tf.Variable([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])
outputs =tf.Variable([[70.5058365, 17.3287029, 6.09617901, 7.22885942, 26.5094051, 4.77077579, 41.5663223, 19.999897, 5.15687704, 21.2428246]],dtype=tf.float32)
# evidence [[4.17153364e+30 2.98016243e-08 444.15744 1378.64917 3.0698109e-12 118.010757 1.12726083e+18 2.06136597e-09 173.621384 5.94782945e-10]]
# S [[4.17153364e+30]]
# UCE_loss_1 7.10830545
# drweight -0


import tensorflow_probability as tfp
def classifier_loss_tf(labels, outputs):
    n_classes=10
    evidence = tf.exp(outputs)
    # print("evidence",evidence)
    # evidence = exp_evidence(outputs)
    alpha = evidence + 1
    # print(alpha)
    soft_output = tf.one_hot(labels, n_classes)
    print("soft_output",soft_output)
    S = tf.reduce_sum(alpha,axis=1,keepdims=True)
    # print("S",S)
    alpha_0 = S * tf.ones((outputs.shape[-1]))
    # print("alpha_0",alpha_0)
    UCE_loss = tf.reduce_mean(
        tf.cast(soft_output,dtype=tf.float32) * (tf.compat.v1.digamma(alpha_0) - tf.compat.v1.digamma(alpha))
    )
    print("UCE_loss_1",UCE_loss)
    dirichlet_weight = 0.0001 * tf.reduce_mean(-tfp.distributions.Dirichlet(alpha).entropy())
    print("dr",dirichlet_weight)
    UCE_loss = (
        UCE_loss + dirichlet_weight
    )

    return UCE_loss
loss = classifier_loss_tf(dummy_labels,dummy_outputs)
print('Total loss for this batch: {}'.format(loss))

soft_output tf.Tensor(
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]], shape=(4, 10), dtype=float32)
UCE_loss_1 tf.Tensor(0.23098798, shape=(), dtype=float32)
dr tf.Tensor(0.0015009035, shape=(), dtype=float32)
Total loss for this batch: 0.23248888552188873


In [None]:
# Optimizers specified in the torch.optim package
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [None]:
def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0.
    last_loss = 0.

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in enumerate(training_loader):
        # Every data instance is an input + label pair
        inputs, labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(training_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss

In [6]:
import torch
import torch.nn as nn

    
class TinyModel(torch.nn.Module):

    def __init__(self):
        super(TinyModel, self).__init__()

        self.linear1 = torch.nn.Linear(100, 200)
        self.activation = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(200, 10)
        self.softmax = torch.nn.Softmax()
    
    def classifier_loss(ld_logits, y_l):
        n_classes=10
        alpha = torch.exp(ld_logits)  # / self.p_y.unsqueeze(0).to(self.device)
        # Multiply by class counts for Bayesian update

        # if self.alpha_fix:
        alpha = alpha + 1

        soft_output = F.one_hot(y_l, n_classes)
        alpha_0 = alpha.sum(1).unsqueeze(-1).repeat(1, self.n_classes)
        UCE_loss = torch.mean(
            soft_output * (torch.digamma(alpha_0) - torch.digamma(alpha))
        )
        UCE_loss = (
            UCE_loss + 0.0001 * -Dirichlet(alpha).entropy().mean()
        )
        # self.log("train/clf_loss", UCE_loss)

        return UCE_loss
    
    def forward(self, x):
        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)
        x = self.softmax(x)
        return x

tinymodel = TinyModel()

In [2]:

#inspired by https://github.com/selflein/MA-EBM/blob/1e90da47118127ab432d11b9e2b2aa57305386fc/uncertainty_est/models/ebm/mcmc_priornet.py
def MAEBM():
    def loss(labels, outputs):
        # evidence = tf.exp(outputs) #activation function
        evidence = exp_evidence(outputs)
        alpha = evidence + 1
        S = tf.reduce_sum(alpha)
        # tf.print(S)
        alpha_0 = tf.fill(alpha.shape,S)
        # tf.print(alpha_0)
        
        # tf.print(alpha)
        UCE_loss = tf.reduce_mean(labels * (tf.compat.v1.digamma(alpha_0) - tf.compat.v1.digamma(alpha)))
        
        UCE_loss  +=  0.0001  * (tf.reduce_sum(tf.compat.v1.lgamma(alpha)) - tf.compat.v1.lgamma(S) -
                (len(alpha) - S) * tf.compat.v1.digamma(S) -
                ((alpha - 1.0) * tf.reduce_sum(tf.compat.v1.digamma(alpha))))
        # tf.print(UCE_loss)
        return UCE_loss
    return loss 

def classifier_loss(ld_logits, y_l, lg_logits):
        alpha = torch.exp(ld_logits)  # / self.p_y.unsqueeze(0).to(self.device)
        # Multiply by class counts for Bayesian update

        # if self.alpha_fix:
        alpha = alpha + 1
        n_classes = 3
        entropy_reg_weight = 0.0001
        
        soft_output = F.one_hot(y_l, n_classes)
        print("labels",soft_output)
        alpha_0 = alpha.sum(1)
        alpha_0= alpha_0.unsqueeze(-1).repeat(1, n_classes)
        print("alpha_0",alpha_0)
        UCE_loss = torch.mean(
            soft_output * (torch.digamma(alpha_0) - torch.digamma(alpha))
        )
        print("loss_1",UCE_loss)
        UCE_loss = (
            UCE_loss + entropy_reg_weight * -Dirichlet(alpha).entropy().mean()
        )
        # self.log("train/clf_loss", UCE_loss)

        return UCE_loss

def ebm():
    def loss_fn(labels, outputs):
        # evidence = tf.exp(outputs) #activation function
        evidence = tf.nn.log_softmax(outputs)
        loss  =tf.reduce_mean(tf.reduce_sum(-label * evidence)) 
        # loss += 0.0001 * 

        return loss
        
    return loss_fn 


In [3]:
alpha = torch.tensor([[.1,.1,.1]])
classifier_loss(alpha,torch.tensor([1]),None)

labels tensor([[0, 1, 0]])
alpha_0 tensor([[6.3155, 6.3155, 6.3155]])
loss_1 tensor(0.4244)


tensor(0.4245)

In [7]:
alpha = np.array([.1,.1,.1])
alpha = torch.tensor([.1,.1,.1])


print(-Dirichlet(alpha).entropy().mean())


tensor(13.0250)


In [None]:
def train():
    net.train()  # enter train mode
    loss_avg = 0.0

    # start at a random point of the outlier dataset; this induces more randomness without obliterating locality
    train_loader_out.dataset.offset = np.random.randint(len(train_loader_out.dataset))
    for in_set, out_set in zip(train_loader_in, train_loader_out):
        data = torch.cat((in_set[0], out_set[0]), 0) #dataset of the in distrib and out of distrib
        target = in_set[1] # labels for in distrub

        data, target = data.cuda(), target.cuda()

        # forward
        x = net(data) #predictions for all of data

        # backward
        scheduler.step()
        optimizer.zero_grad()

        loss = F.cross_entropy(x[:len(in_set[0])], target) # loss of all in distribution inputs
        # cross-entropy from softmax distribution to uniform distribution
        if args.score == 'energy':
            Ec_out = -torch.logsumexp(x[len(in_set[0]):], dim=1) # logsumexp of all of the OOD inputs
            Ec_in = -torch.logsumexp(x[:len(in_set[0])], dim=1) # logsumexp of all of the IOD inputs
            loss += 0.1*(torch.pow(F.relu(Ec_in-args.m_in), 2).mean() + torch.pow(F.relu(args.m_out-Ec_out), 2).mean()) #loss is equal to 0.1 times the pow2 of the relu of Energy of all IoD + the pow2 of the relu of Energy of all OOD
                            '''
                            so the IoD energy is reduced by 0.1
                            each energy is run through a relu and is minus a threshold, the default is -25 and -7 for IoD and OOD.
                            energy is measured as a negative number. 
                            
                            
                            '''
        elif args.score == 'OE':
            loss += 0.5 * -(x[len(in_set[0]):].mean(1) - torch.logsumexp(x[len(in_set[0]):], dim=1)).mean()

        loss.backward()
        optimizer.step()

        # exponential moving average
        loss_avg = loss_avg * 0.8 + float(loss) * 0.2
    state['train_loss'] = loss_avg


In [13]:
import numpy as np
import scipy
from scipy.special import logsumexp
import tensorflow as tf


In [8]:
x = np.array ([.54,.32,.76])
Ec_in = -logsumexp(x)
print(Ec_in)

-1.6546810021220524


In [15]:
loss =0
loss += 0.1*(pow(tf.nn.relu(Ec_in- (-25)), 2).numpy().mean()) #+ torch.pow(F.relu(args.m_out-Ec_out), 2).mean()) #loss is equal to 0.1 times the pow2 of the relu of Energy of all IoD + the pow2 of the relu of Energy of all OOD
print(loss)

54.500391911268096


In [30]:

import numpy as np
from scipy.special import gammaln, digamma


def dirichlet_prior_network_uncertainty(logits, epsilon=1e-10, alpha_correction=True):
    """
    :param logits:
    :param epsilon:
    :return:
    """

    logits = np.asarray(logits, dtype=np.float64)
    alphas = np.exp(logits)

    alphas = np.clip(alphas, 0, np.finfo(np.dtype("float32")).max)

    if alpha_correction:
        alphas = alphas + 1

    alpha0 = np.sum(alphas, axis=1, keepdims=True)
    probs = alphas / alpha0

    conf = np.max(probs, axis=1)

    entropy_of_exp = -np.sum(probs * np.log(probs + epsilon), axis=1)
    expected_entropy = -np.sum(
        (alphas / alpha0) * (digamma(alphas + 1) - digamma(alpha0 + 1.0)), axis=1
    )

    mutual_info = entropy_of_exp - expected_entropy

    epkl = np.squeeze((alphas.shape[1] - 1.0) / alpha0)

    dentropy = (
        np.sum(
            gammaln(alphas) - (alphas - 1.0) * (digamma(alphas) - digamma(alpha0)),
            axis=1,
            keepdims=True,
        )
        - gammaln(alpha0)
    )

    uncertainty = {
        "confidence_alea_uncert.": conf,
        "entropy_of_expected": -entropy_of_exp,
        "expected_entropy": -expected_entropy,
        "mutual_information": -mutual_info,
        "EPKL": -epkl,
        "differential_entropy": -np.squeeze(dentropy),
    }
    results = {}
    for k,v in uncertainty.items():
        if type(v) is type(np.array(0)):
            results[k] = torch.from_numpy(v).float()
        else: 
            results[k] = torch.from_numpy(np.array(v)).float()
    # k: torch.from_numpy(v).float() for k, v in uncertainty.items()
    return results

In [34]:
dirichlet_prior_network_uncertainty([[1,1,1,1,1,1,1,1,1,1]])
logits = torch.tensor([[1,1,1,1,1,1,1,1,1,1]])
dir_uncert = dirichlet_prior_network_uncertainty(logits)
dir_uncert["p(x)"] = logits.logsumexp(1)
dir_uncert["max p(y|x)"] = logits.float().softmax(1).max(1).values
print(dir_uncert)

{'confidence_alea_uncert.': tensor([0.1000]), 'entropy_of_expected': tensor([-2.3026]), 'expected_entropy': tensor([-2.1875]), 'mutual_information': tensor([-0.1151]), 'EPKL': tensor(-0.2420), 'differential_entropy': tensor(15.8416), 'p(x)': tensor([3.3026]), 'max p(y|x)': tensor([0.1000])}


In [None]:
#comparasion of tf and torch version of this function.
x = {'confidence_alea_uncert.': tensor([0.1000]), 
 'entropy_of_expected': tensor([-2.3026]), 
 'expected_entropy': tensor([-2.1875]), 
 'mutual_information': tensor([-0.1151]), 
 'EPKL': tensor(-0.2420), 
 'differential_entropy': tensor(15.8416), 
 'p(x)': tensor([3.3026]), 
 'max p(y|x)': tensor([0.1000])}


y = {'confidence_alea_uncert.': 0.09999999999999999, 
 'entropy_of_expected': -2.302585091994046, 
 'expected_entropy': -2.1874864443604523, 
 'mutual_information': -0.11509864763359356, 
 'EPKL': -0.2420472792329956, 
 'differential_entropy': 15.84162062462957, 
 'p(x)': <tf.Tensor: shape=(), dtype=float32, numpy=3.3025851>, 
 'max p(y|x)': 0.1}

In [25]:

import numpy as np
from scipy.special import gammaln, digamma


def dirichlet_prior_network_uncertainty(logits, epsilon=1e-10, alpha_correction=True):
    """
    :param logits:
    :param epsilon:
    :return:
    """

    logits = np.asarray(logits, dtype=np.float64)
    alphas = np.exp(logits)

    alphas = np.clip(alphas, 0, np.finfo(np.dtype("float32")).max)

    if alpha_correction:
        alphas = alphas + 1

    alpha0 = np.sum(alphas, axis=1, keepdims=True)
    probs = alphas / alpha0

    conf = np.max(probs, axis=1)

    entropy_of_exp = -np.sum(probs * np.log(probs + epsilon), axis=1)
    expected_entropy = -np.sum(
        (alphas / alpha0) * (digamma(alphas + 1) - digamma(alpha0 + 1.0)), axis=1
    )

    mutual_info = entropy_of_exp - expected_entropy

    epkl = np.squeeze((alphas.shape[1] - 1.0) / alpha0)

    dentropy = (
        np.sum(
            gammaln(alphas) - (alphas - 1.0) * (digamma(alphas) - digamma(alpha0)),
            axis=1,
            keepdims=True,
        )
        - gammaln(alpha0)
    )

    uncertainty = {
        "confidence_alea_uncert.": conf,
        "entropy_of_expected": -entropy_of_exp,
        "expected_entropy": -expected_entropy,
        "mutual_information": -mutual_info,
        "EPKL": -epkl,
        "differential_entropy": -np.squeeze(dentropy),
    }
    results = {}
    
    for k,v in uncertainty.items():
            results[k] =v.mean()
    # k: torch.from_numpy(v).float() for k, v in uncertainty.items()
    return results
# dirichlet_prior_network_uncertainty([[1,1,1,1,1,1,1,1,1,1]])
logits = [[1,1,1,1,1,1,1,1,1,1]]
dir_uncert = dirichlet_prior_network_uncertainty(logits)
dir_uncert["p(x)"] = logits.logsumexp(1)
dir_uncert["max p(y|x)"] = logits.softmax(1).max(1).values
print(dir_uncert)

AttributeError: 'list' object has no attribute 'logsumexp'