## Two-level variational Bayesian inference

In [1]:
import pandas as pd
import torch
from torch.autograd import Variable

import torch.nn as nn
import torch.optim as optim

In [2]:
from torch.distributions.normal import Normal
from torch.distributions.multivariate_normal import MultivariateNormal
from torch.distributions.gamma import Gamma

In [3]:
#mean = torch.Tensor([[1, 2, 3], [4, 5, 6]])
mean = torch.rand(3)
#cov1 = torch.eye(3)
#cov2 = torch.Tensor([[1, 1, 1], [1, 2, 2], [1, 2, 3]])
#cov = torch.stack([cov1, cov2], 0)
cov = torch.diag(torch.exp(torch.rand(3)))
#cov = torch.eye(3)
distrib = MultivariateNormal(loc=mean, covariance_matrix=cov)
distrib.sample().reshape(3,1)
#cov

tensor([[-0.4011],
        [-0.0233],
        [ 1.2526]])

In [4]:
Normal(0,1).sample(torch.Size([4,1]))

tensor([[0.7021],
        [2.1348],
        [0.1397],
        [1.2101]])

In [5]:
mean.sum()

tensor(1.7031)

In [6]:
Gamma(torch.exp(torch.rand(1)), torch.exp(torch.rand(1))).sample()

tensor([0.2100])

In [7]:
class StraightLineLayer(nn.Module):
    def __init__(self, input_dim):

        super().__init__()
        self.input_dim = input_dim
        self.W_mean = nn.Parameter(torch.rand(self.input_dim))
        self.V = nn.Parameter(torch.rand(self.input_dim))
        self.a = nn.Parameter(torch.rand(1))
        self.b = nn.Parameter(torch.rand(1))
        #self.c = nn.Parameter(torch.tensor(1.0))
        #self.d = nn.Parameter(torch.tensor(1.0))

    def forward(self, X):

        #alpha = Gamma(self.c, self.d).sample(torch.Size([1, 1]))
        tau = Gamma(torch.exp(self.a), torch.exp(self.b)).sample() 
        
        assert tau > 0
        #tau = torch.exp(log_tau)
        
        #V = torch.diag(torch.exp(self.V))
        
        z = Normal(0, 1).sample(torch.Size([self.input_dim, 1]))
        #cov = 1/tau * torch.diag(torch.exp(self.V))
        #W = MultivariateNormal(loc = self.W_mean, covariance_matrix = cov).sample().reshape(self.input_dim,1) 
        #W = Normal(self.W_mean, 1/tau * V).sample().reshape(self.input_dim, 1)
        W = self.W_mean.reshape(self.input_dim, 1) + z * (torch.exp(self.V) / tau)**(0.5)
        
        return torch.matmul(X, W), W, tau

In [8]:
class fullVariationalBayes(nn.Module):

    def __init__(self, input_dim, a0=1e-2, b0=1e-4, c0=1e-2, d0=1e-4, num_samples = 100):
        super().__init__()
        
        self.num_samples = num_samples
        self.c = nn.Parameter(torch.tensor(1.0))
        self.d = nn.Parameter(torch.tensor(1.0))
        
        self.a0 = nn.Parameter(torch.tensor(a0))
        self.b0 = nn.Parameter(torch.tensor(b0))
        self.c0 = nn.Parameter(torch.tensor(c0))
        self.d0 = nn.Parameter(torch.tensor(d0))
        
        self.f = StraightLineLayer(input_dim)
        
    def forward(self, x, y):
        
        nLogLik = 0.0
        tmp_data_size = x.shape[0]
        tmp_sigmaN2 = torch.exp(self.f.b - self.f.a)
        
        for i in range(self.num_samples):
            
            pred, W, tau = self.f(x)
            alpha = Gamma(torch.exp(self.c), torch.exp(self.d)).sample(torch.Size([1, 1]))
            nLogLik = nLogLik - Normal(pred, 1/tau**(0.5)).log_prob(y).sum() 
            nLogLik = nLogLik - Normal(0, 1/(tau*alpha)**(0.5)).log_prob(W).sum()
            nLogLik = nLogLik - Gamma(torch.exp(self.a0), torch.exp(self.b0)).log_prob(tau)
            nLogLik = nLogLik - Gamma(torch.exp(self.c0), torch.exp(self.d0)).log_prob(alpha)
                       
        nLogLik = nLogLik / self.num_samples
 
        LogVar_W_tau = (-1.) * tmp_data_size * (self.f.b + self.f.V.sum() + torch.exp(self.f.a - self.f.b) - torch.special.digamma(torch.exp(self.f.a)))
        LogVar_W_tau = LogVar_W_tau - torch.lgamma(torch.exp(self.f.a)) + (torch.exp(self.f.a) - 1) * torch.special.digamma(torch.exp(self.f.a)) + self.f.b - torch.exp(self.f.a)
    
        LogVar_alpha = (-1.) * torch.lgamma(torch.exp(self.c)) + (torch.exp(self.c) - 1) * torch.special.digamma(torch.exp(self.c)) + self.d - torch.exp(self.c)
        
        return LogVar_W_tau + LogVar_alpha + nLogLik 

In [9]:
tensorX = torch.hstack([torch.ones(100, 1), torch.rand(100, 3)])

In [10]:
tensorX.shape

torch.Size([100, 4])

In [11]:
tensory = torch.matmul(tensorX, torch.tensor([1., 2., 3., 5.]).reshape(4, 1))

In [12]:
from tqdm import tqdm

epochs = 1000

#model = nLogLikelyhood_v1()

#model = nLogLikelyhood_v2()

#model = maxPosterior_v1()

learning_rate = 0.02

model = fullVariationalBayes(input_dim = tensorX.shape[1])

optimizer = optim.Adam(model.parameters(), lr = learning_rate)

for epoch in tqdm(range(epochs), desc="Training..."):
    
    optimizer.zero_grad()
    
    #nLogLik = model(x_tensor, y_tensor)
    #e = torch.mean(nLogLik)
    
    nLogLik = model(tensorX, tensory)
    #nLogLik.backward(retain_graph=True)
    nLogLik.backward()
    
    #e.backward()
    optimizer.step()

Training...:  62%|█████████████████████████████████████████▌                         | 620/1000 [00:52<00:32, 11.78it/s]


ValueError: Expected parameter concentration (Tensor of shape (1,)) of distribution Gamma(concentration: tensor([nan], grad_fn=<ExpBackward0>), rate: tensor([871518.0625], grad_fn=<ExpBackward0>)) to satisfy the constraint GreaterThan(lower_bound=0.0), but found invalid values:
tensor([nan], grad_fn=<ExpBackward0>)