In [1]:
from gradient_descent import gradient_descent, minibatch_class
import utils
from utils import Poisson_reg
from utils import sample_PLN

import matplotlib.pyplot as plt
from pandas import read_csv
import math
from timeit import default_timer as timer
from tqdm import tqdm 
import time 
import sys 

from __future__ import print_function
import psutil
import multiprocessing
import concurrent.futures
import threading

import numpy as np
import torch
from torch import nn
import scipy.linalg as SLA 
from scipy.linalg import toeplitz 
torch.set_default_dtype(torch.float64)

In [59]:
class PLN_full():
    def __init__(self, C_init, beta_init, M_init, S_init, requires_tridiag = True): 
        '''
            Initialization : 
            'Y' : the data, size (n,p). n is the number of samples we have and p the number of species. 
                  the entries should be int but dtype = float. 
            'O': offset : additional offset. (not very important for comprehension). size (n,p)
            'covariates' : covariates, size (n,d)
            'C_init' : initialization for C. I plan to do a more advanced initialization. 
            'beta_init ' : Initialization for beta. I plan to do a more advanced initialization. 
            'M_init' : initialization for the variational parameter M
            'S_init ': initialization for the variational parameter S
        '''
        # model parameters
        
        # we start with the initialization of C, that is the most subtle. If you want tridiagonals matrices 
        # at every iteration then the parameters C of the optimizer will be a vector ( we will change this vector 
        # into a matrix at each iteration. It is more convenient this way. 
        #Else, it will be a matrix

        
        if requires_tridiag : 
            self.C = torch.clone(mat2vec_lower_tridiag(C_init))
            self.requires_tridiag = True 
        else : 
            self.C = torch.clone(C_init) 
            self.requires_tridiag = False 
            
        #self.C.requires_grad_(True)
        
        self.beta = torch.clone(beta_init)
        self.beta.requires_grad_(True)
        
        self.Sigma = torch.clone(torch.mm(C_init,C_init.T))
        self.Sigma.requires_grad_(False) # we don't need to have a gradient for Sigma since we will 
                                         # always update it with a closed form (since we have constraints
                                         # of positivity and symmetry)
        #variational parameters
        self.M = torch.clone(M_init)
        self.M.requires_grad_(True)
        self.S = torch.clone(S_init) 
        self.S.requires_grad_(True)
        
        
        
        self.params = {'S' : self.S,'M' : self.M,'C' : self.C, 'beta' : self.beta}
        
        self.optimizer = torch.optim.Adam(self.params.values(), lr = 0.002)
        
        self.params['Sigma'] = self.Sigma
        
        # some list to store some stats
        self.MSE_Sigma_list = list()
        self.MSE_beta_list = list()
        self.ELBO_list = list()
        self.running_times = list()

        
    def extract_data(self,data): 
        '''
        function to extract the data. This function is just here to have a code more compact. 
        
        args : 
              'data': list with 3 elements : Y, O and covariates in this order. 
        '''
        #known variables
        self.Y = data[0];self.O = data[1];self.covariates = data[2]
        self.n, self.p = self.Y.shape
        
    def compute_ELBO(self): 
        return ELBO(self.Y,self.O , self.covariates,self.M ,self.S ,self.C ,self.beta)
    
    def compute_ELBO_tridiag(self): 
        return ELBO_tridiag(self.Y,self.O , self.covariates,self.M ,self.S ,self.C ,self.beta)
    def compute_ELBO_Sigma(self): 
        #we update Sigma
        #self.Sigma = 1/self.n*(torch.sum(torch.stack([torch.outer(self.M[i,:],self.M[i,:]) + torch.diag(torch.multiply(self.S,self.S)[i,:]) 
         #                              for i in range(self.n)]), axis = 0))
        
        
        return ELBO_Sigma(self.Y,self.O , self.covariates,self.M ,self.S ,self.Sigma ,self.beta)
    
    def full_grad_ascent(self,data, lr = 0.1, tolerance = 0, N_epoch = 1000, verbose = True ): 
        self.extract_data(data)
        if self.requires_tridiag : 
            loss = self.compute_ELBO_tridiag
        else : 
            loss = self.compute_ELBO
        
        self.last_params, self.running_times, self.MSE_Sigma_list, self.MSE_beta_list, self.ELBO_list = self.torch_gradient_ascent(self.optimizer, 
                            loss,self.params, lr = lr, tolerance = tolerance, N_epoch = N_epoch, requires_closed_beta= True, 
                            requires_closed_Sigma= False,verbose = verbose)
    def full_grad_ascent_Sigma(self,data, lr = 0.1, tolerance = 0, N_epoch = 1000, verbose = True ): 
        self.extract_data(data)
        self.last_params, self.running_times, self.MSE_Sigma_list, self.MSE_beta_list, self.ELBO_list = self.torch_gradient_ascent(self.optimizer, 
                            self.compute_ELBO_Sigma,self.params, lr = lr, tolerance = tolerance, N_epoch = N_epoch, requires_closed_beta= True, 
                            requires_closed_Sigma= True, verbose = verbose)
        
    def torch_gradient_ascent(self,optimizer, compute_gain,params,  lr , tolerance , N_epoch , requires_closed_beta, requires_closed_Sigma, verbose): 
        '''
        gradient ascent function. We compute the gradients thanks to the autodifferentiation of pytorch. 

        args : 
                'optimizer' : torch.optim.optimizer. the optimizer for the parameters. 

                'compute_gain' : function. It should call the parameters by itself. i.e. we wil call compute_gain() without any parameter 
                                in argument. 

                'lr' : float.  a learning rateM if we want to set the optimizer learning rate to a certain lr. If None, 
                      it will take the actual learning_rate of the optimizer. 
                'tolerance': float. the threshold we set to stop the algorithm. It will stop if the norm of each gradient's parameter 
                             is lower than this threshold, or if we are not improving the loss more than tolerance. 
                'N_epoch': int. the Maximum number of epoch we are ready to do. 

                'Verbose' : bool. if True, will print some messages useful to interpret the gradient ascent. If False, nothing will be printed. 

                we have ot yet implement it so thtat it can takes batches. 


        returns : the parameters optimized. 
        '''

        t0 = time.time()

        MSE_Sigma_list = list()
        MSE_beta_list = list()
        ELBO_list = list()
        running_times = list()
        # we set the gradient to zero just to make sure the gradients are properly calculated
        optimizer.zero_grad()
        if lr is not None : # if we want to set a threshold, we set it. Ohterwise, we skip this condition and keep the actual learning_rate
            optimizer.param_groups[0]['lr'] = lr 


        #if requires_closed_beta : 

        '''
        #if batch_size is None, we take n. 
        if batch_size == None : 
            batch_size = self.Y.shape[0]
        '''

        stop_condition = False 
        i = 0
        old_loss = 1.

        while i < N_epoch and stop_condition == False: 
            optimizer.zero_grad()
            loss = -compute_gain()
            loss.backward()
            with torch.no_grad(): 
                #model.C = mat2vec_lower_tridiag(torch.cholesky(model.Sigma))
                #print('ELBO : ', model.compute_ELBO_tridiag())
                pass
            #print('sanity check : ', torch.norm(params['C'].grad+grad_C(data[0], data[1],data[2] ,params['M'] ,params['S'] ,params['C'] ,params['beta']) ))
            if torch.isnan(loss).item() == True : 
                print('NAN')
            else : last_params = params 
            optimizer.step()
            i += 1


            
    #         print('params[C] before', params['C'])
    #         print('model.C before : ', model.C)
            #params['C'] = mat2vec_lower_tridiag(torch.cholesky(tmp))
            if i % 100 == 0 : 
                with torch.no_grad():
                    tmp = closed_Sigma(params['M'],params['S'])
#                     print('params C avant : ', params['C'])
#                     self.C = mat2vec_lower_tridiag(torch.cholesky(tmp))
#                     print('params C après : ', params['C'])
                    print('self.C avant : ', self.C)
                    closed_form = mat2vec_lower_tridiag(torch.cholesky(tmp))
                    params['C'] = closed_form
                    self.C = closed_form
                    print('self.C apres : ', self.C)
                    
                    #params['C'] = mat2vec_lower_tridiag(torch.cholesky(tmp))
    #         print('model.C after : ', model.C)
            #print('prams[C] after : ', params['C'])
            # condition to see if we have reach the tolerance threshold
            if  abs(loss.item() - old_loss) < tolerance : 
                #if max([torch.norm(param.grad) for param in params]) < tolerance  or abs(loss.item()- old_loss)>  tolerance :
                stop_condition = True 
            old_loss = loss.item()

            running_times.append(time.time()-t0)

            if requires_closed_Sigma : 
                print('formula with Sigma')
                if i % 100 == 0 : 
                    params['Sigma'] = closed_Sigma(params['M'],params['S'])
                MSE_Sigma_list.append(torch.mean((params['Sigma']-true_Sigma)**2).item())

            else : 

                if len(params['C'].shape) > 1 : 
                    #print('formula with C of size 2')
                    MSE_Sigma_list.append(torch.mean((torch.mm(params['C'],params['C'].T)-true_Sigma)**2).item())
                else : 
                    #print('formula with C of size 1')
                    MSE_Sigma_list.append(torch.mean((torch.mm(vec2mat_lower_tridiag(params['C']),
                                                           vec2mat_lower_tridiag(params['C']).T)-true_Sigma)**2).item())


            #print('grad_Sigma: ', grad_Sigma(data[0],data[1],data[2],params['M'],params['S'],params['Sigma'], params['beta']))
            MSE_beta_list.append(torch.mean((params['beta']-true_beta)**2).item())
            ELBO_list.append(-loss.item())

            if i%100 == 0 and verbose : 
                print('iteration number: ', i)
                print('-------UPDATE-------')
                print(' MSE with Sigma : ', np.round(MSE_Sigma_list[-1],5))
                print(' MSE with beta : ', np.round(MSE_beta_list[-1],5))
                print('ELBO : ', np.round(-loss.item(),5))
                print_stats(loss, last_params, optimizer)


        # just print some stats if we want to 
        if stop_condition : 
            print('---------------------------------Tolerance {} reached in {} iterations'.format(tolerance, i))
        else : 
            print('---------------------------------Maximum number of iterations reached : ', N_epoch)
            print_stats(loss, last_params, optimizer)
        print(' MSE with Sigma : ', np.round(MSE_Sigma_list[-1],5))
        print(' MSE with beta : ', np.round(MSE_beta_list[-1],5))

        return last_params, running_times, MSE_Sigma_list, MSE_beta_list, ELBO_list
    

In [60]:
model = PLN_full(C_init, beta_init, M_init, S_init, requires_tridiag = True)
%time model.full_grad_ascent(data, N_epoch = 6000,verbose=True, lr = 0.1)

self.C avant :  tensor([2.3104, 0.0000, 0.3733, 0.0000, 0.0000, 1.0608, 0.0000, 0.0000, 0.0000,
        0.9995, 0.0000, 0.0000, 0.0000, 0.0000, 0.0200, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.2103, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.1772, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8305,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2387,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.7335, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 1.3219, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.3148, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.6870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.4881, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 

self.C avant :  tensor([2.3104, 0.0000, 0.3733, 0.0000, 0.0000, 1.0608, 0.0000, 0.0000, 0.0000,
        0.9995, 0.0000, 0.0000, 0.0000, 0.0000, 0.0200, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.2103, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.1772, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8305,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2387,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.7335, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 1.3219, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.3148, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.6870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.4881, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 

self.C avant :  tensor([2.3104, 0.0000, 0.3733, 0.0000, 0.0000, 1.0608, 0.0000, 0.0000, 0.0000,
        0.9995, 0.0000, 0.0000, 0.0000, 0.0000, 0.0200, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.2103, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.1772, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8305,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2387,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.7335, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 1.3219, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.3148, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.6870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.4881, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 

self.C avant :  tensor([2.3104, 0.0000, 0.3733, 0.0000, 0.0000, 1.0608, 0.0000, 0.0000, 0.0000,
        0.9995, 0.0000, 0.0000, 0.0000, 0.0000, 0.0200, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.2103, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.1772, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8305,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2387,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.7335, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 1.3219, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.3148, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.6870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.4881, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 

self.C avant :  tensor([2.3104, 0.0000, 0.3733, 0.0000, 0.0000, 1.0608, 0.0000, 0.0000, 0.0000,
        0.9995, 0.0000, 0.0000, 0.0000, 0.0000, 0.0200, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.2103, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.1772, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8305,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2387,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.7335, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 1.3219, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.3148, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.6870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.4881, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 

self.C avant :  tensor([2.3104, 0.0000, 0.3733, 0.0000, 0.0000, 1.0608, 0.0000, 0.0000, 0.0000,
        0.9995, 0.0000, 0.0000, 0.0000, 0.0000, 0.0200, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.2103, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.1772, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8305,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2387,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.7335, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 1.3219, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.3148, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.6870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.4881, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 

self.C avant :  tensor([2.3104, 0.0000, 0.3733, 0.0000, 0.0000, 1.0608, 0.0000, 0.0000, 0.0000,
        0.9995, 0.0000, 0.0000, 0.0000, 0.0000, 0.0200, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.2103, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.1772, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8305,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2387,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.7335, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 1.3219, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.3148, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.6870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.4881, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 

self.C avant :  tensor([2.3104, 0.0000, 0.3733, 0.0000, 0.0000, 1.0608, 0.0000, 0.0000, 0.0000,
        0.9995, 0.0000, 0.0000, 0.0000, 0.0000, 0.0200, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.2103, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.1772, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8305,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2387,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.7335, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 1.3219, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.3148, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.6870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.4881, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 

self.C avant :  tensor([2.3104, 0.0000, 0.3733, 0.0000, 0.0000, 1.0608, 0.0000, 0.0000, 0.0000,
        0.9995, 0.0000, 0.0000, 0.0000, 0.0000, 0.0200, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.2103, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.1772, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8305,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2387,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.7335, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 1.3219, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.3148, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.6870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.4881, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 

self.C avant :  tensor([2.3104, 0.0000, 0.3733, 0.0000, 0.0000, 1.0608, 0.0000, 0.0000, 0.0000,
        0.9995, 0.0000, 0.0000, 0.0000, 0.0000, 0.0200, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.2103, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.1772, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8305,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2387,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.7335, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 1.3219, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.3148, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.6870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.4881, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 

self.C avant :  tensor([2.3104, 0.0000, 0.3733, 0.0000, 0.0000, 1.0608, 0.0000, 0.0000, 0.0000,
        0.9995, 0.0000, 0.0000, 0.0000, 0.0000, 0.0200, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.2103, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.1772, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8305,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2387,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.7335, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 1.3219, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.3148, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.6870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.4881, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 

self.C avant :  tensor([2.3104, 0.0000, 0.3733, 0.0000, 0.0000, 1.0608, 0.0000, 0.0000, 0.0000,
        0.9995, 0.0000, 0.0000, 0.0000, 0.0000, 0.0200, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.2103, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.1772, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8305,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2387,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.7335, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 1.3219, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.3148, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.6870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.4881, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 

self.C avant :  tensor([2.3104, 0.0000, 0.3733, 0.0000, 0.0000, 1.0608, 0.0000, 0.0000, 0.0000,
        0.9995, 0.0000, 0.0000, 0.0000, 0.0000, 0.0200, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.2103, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.1772, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8305,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2387,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.7335, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 1.3219, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.3148, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.6870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.4881, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 

self.C avant :  tensor([2.3104, 0.0000, 0.3733, 0.0000, 0.0000, 1.0608, 0.0000, 0.0000, 0.0000,
        0.9995, 0.0000, 0.0000, 0.0000, 0.0000, 0.0200, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.2103, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.1772, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8305,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2387,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.7335, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 1.3219, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.3148, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.6870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.4881, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 

self.C avant :  tensor([2.3104, 0.0000, 0.3733, 0.0000, 0.0000, 1.0608, 0.0000, 0.0000, 0.0000,
        0.9995, 0.0000, 0.0000, 0.0000, 0.0000, 0.0200, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.2103, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.1772, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8305,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2387,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.7335, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 1.3219, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.3148, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.6870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.4881, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 

In [4]:
print(model.optimizer.param_groups[0]['params'][1].shape)
model.params['C'].requires_grad = False

NameError: name 'model' is not defined

In [63]:
def torch_gradient_ascent(optimizer, compute_gain,params,  lr , tolerance , N_epoch , requires_closed_beta, requires_closed_Sigma, verbose): 
    '''
    gradient ascent function. We compute the gradients thanks to the autodifferentiation of pytorch. 

    args : 
            'optimizer' : torch.optim.optimizer. the optimizer for the parameters. 
            
            'compute_gain' : function. It should call the parameters by itself. i.e. we wil call compute_gain() without any parameter 
                            in argument. 

            'lr' : float.  a learning rateM if we want to set the optimizer learning rate to a certain lr. If None, 
                  it will take the actual learning_rate of the optimizer. 
            'tolerance': float. the threshold we set to stop the algorithm. It will stop if the norm of each gradient's parameter 
                         is lower than this threshold, or if we are not improving the loss more than tolerance. 
            'N_epoch': int. the Maximum number of epoch we are ready to do. 

            'Verbose' : bool. if True, will print some messages useful to interpret the gradient ascent. If False, nothing will be printed. 
            
            we have ot yet implement it so thtat it can takes batches. 


    returns : the parameters optimized. 
    '''

    t0 = time.time()
    
    MSE_Sigma_list = list()
    MSE_beta_list = list()
    ELBO_list = list()
    running_times = list()
    # we set the gradient to zero just to make sure the gradients are properly calculated
    optimizer.zero_grad()
    if lr is not None : # if we want to set a threshold, we set it. Ohterwise, we skip this condition and keep the actual learning_rate
        optimizer.param_groups[0]['lr'] = lr 

        
    #if requires_closed_beta : 
        
    '''
    #if batch_size is None, we take n. 
    if batch_size == None : 
        batch_size = self.Y.shape[0]
    '''

    stop_condition = False 
    i = 0
    old_loss = 1.

    while i < N_epoch and stop_condition == False: 
        optimizer.zero_grad()
        loss = -compute_gain()
        loss.backward()
        with torch.no_grad(): 
            #model.C = mat2vec_lower_tridiag(torch.cholesky(model.Sigma))
            #print('ELBO : ', model.compute_ELBO_tridiag())
            pass
        #print('sanity check : ', torch.norm(params['C'].grad+grad_C(data[0], data[1],data[2] ,params['M'] ,params['S'] ,params['C'] ,params['beta']) ))
        if torch.isnan(loss).item() == True : 
            print('NAN')
        else : last_params = params 
        optimizer.step()
        i += 1
        
                
        #tmp = closed_Sigma(params['M'],params['S'])
#         print('params[C] before', params['C'])
#         print('model.C before : ', model.C)
        #params['C'] = mat2vec_lower_tridiag(torch.cholesky(tmp))
        if i % 100 == 1001 : 
            with torch.no_grad():
                model.C = mat2vec_lower_tridiag(torch.cholesky(tmp))
                params['C'] = mat2vec_lower_tridiag(torch.cholesky(tmp))
#         print('model.C after : ', model.C)
        #print('prams[C] after : ', params['C'])
        # condition to see if we have reach the tolerance threshold
        if  abs(loss.item() - old_loss) < tolerance : 
            #if max([torch.norm(param.grad) for param in params]) < tolerance  or abs(loss.item()- old_loss)>  tolerance :
            stop_condition = True 
        old_loss = loss.item()
        
        running_times.append(time.time()-t0)
        
        if requires_closed_Sigma : 
            print('formula with Sigma')
            if i % 100 == 0 : 
                params['Sigma'] = closed_Sigma(params['M'],params['S'])
            MSE_Sigma_list.append(torch.mean((params['Sigma']-true_Sigma)**2).item())
        
        else : 
            
            if len(params['C'].shape) > 1 : 
                #print('formula with C of size 2')
                MSE_Sigma_list.append(torch.mean((torch.mm(params['C'],params['C'].T)-true_Sigma)**2).item())
            else : 
                #print('formula with C of size 1')
                MSE_Sigma_list.append(torch.mean((torch.mm(vec2mat_lower_tridiag(params['C']),
                                                       vec2mat_lower_tridiag(params['C']).T)-true_Sigma)**2).item())
        
        
        #print('grad_Sigma: ', grad_Sigma(data[0],data[1],data[2],params['M'],params['S'],params['Sigma'], params['beta']))
        MSE_beta_list.append(torch.mean((params['beta']-true_beta)**2).item())
        ELBO_list.append(-loss.item())
        
        if i%100 == 0 and verbose : 
            print('iteration number: ', i)
            print('-------UPDATE-------')
            print(' MSE with Sigma : ', np.round(MSE_Sigma_list[-1],5))
            print(' MSE with beta : ', np.round(MSE_beta_list[-1],5))
            print('ELBO : ', np.round(-loss.item(),5))
            print_stats(loss, last_params, optimizer)
            
    
    # just print some stats if we want to 
    if stop_condition : 
        print('---------------------------------Tolerance {} reached in {} iterations'.format(tolerance, i))
    else : 
        print('---------------------------------Maximum number of iterations reached : ', N_epoch)
        print_stats(loss, last_params, optimizer)
    print(' MSE with Sigma : ', np.round(MSE_Sigma_list[-1],5))
    print(' MSE with beta : ', np.round(MSE_beta_list[-1],5))
    
    return last_params, running_times, MSE_Sigma_list, MSE_beta_list, ELBO_list

def print_stats(loss, params, optimizer): 
    '''
    small function that print some stats. 

    It will print the actual learning rate of the optimizer, the actual log likelihood 
    and the norms of each parameter's gradient. The norm of the parameter's gradient should be low
    when we are close to the optimum. 
    '''
    print('---------------------------------lr :', optimizer.param_groups[0]['lr'])
    print('---------------------------------log likelihood :', - loss.item())
    for param_name, param in params.items(): 
        try :
            with torch.no_grad : # to avoid Warnings 
                print('---------------------------------grad_{}_norm : '.format(param_name), round(torch.norm(param.grad).item(), 3))
        except : 
            pass 

def ELBO(Y, O,covariates ,M ,S ,C ,beta): 
    n = Y.shape[0]
    SrondS = torch.multiply(S,S)
    OplusM = O+M
    MmoinsXB = M-torch.mm(covariates, beta) 
    
    tmp = torch.sum(  torch.multiply(Y, OplusM)  -torch.exp(OplusM+SrondS/2) +1/2*torch.log(SrondS))
    
    tmp -= 1/2*torch.trace(  
                            torch.mm(  
                                        torch.inverse(torch.mm(C,C.T)), 
                                        torch.diag(torch.sum(SrondS, dim = 0))+ torch.mm(MmoinsXB.T, MmoinsXB)
                                    )
                          )
    tmp-= n*torch.log(torch.det(C))
    return tmp 

def ELBO_tridiag(Y, O,covariates ,M ,S ,vectC ,beta):
    matC = vec2mat_lower_tridiag(vectC)
    return ELBO(Y, O,covariates ,M ,S ,matC ,beta)

def grad_beta(Y, O, covariates ,M ,S ,C ,beta) : 
    grad = torch.mm(torch.mm(covariates.T, M-torch.mm(covariates, beta) ), torch.inverse(torch.mm(C,C.T)))
    return grad 

def grad_M(Y, O, covariates ,M ,S ,C ,beta):
    grad = Y - torch.exp(O+M+torch.multiply(S,S)/2)-torch.mm(M-torch.mm(covariates,beta), torch.inverse(torch.mm(C,C.T)))
    return grad 
def grad_S(Y, O, covariates ,M ,S ,C ,beta): 
    return torch.div(1,S)-torch.multiply(S, torch.exp(O+M+torch.multiply(S,S)/2))-torch.mm(S, torch.diag(torch.diag(torch.inverse(torch.mm(C,C.T)))))

def grad_C(Y, O, covariates ,M ,S ,C ,beta): 
    n = Y.shape[0]
    CCT = torch.mm(C,C.T)
    MmoinsXB = M-torch.mm(covariates, beta) 
    big_mat = torch.diag(torch.sum(torch.multiply(S,S), dim = 0))+ torch.mm(MmoinsXB.T, MmoinsXB)
    return torch.mm(torch.mm(torch.inverse(C),(big_mat +big_mat.T)/2.),torch.inverse(CCT)).T - Y.shape[0]*torch.inverse(C).T
def grad_Sigma(Y, O, covariates ,M ,S ,Sigma ,beta): 
    n = Y.shape[0]
    inv_Sigma = torch.inverse(Sigma)
    grad = -n/2*(inv_Sigma)
    grad += 1/2*(sum([inv_Sigma@(torch.outer(M[i,:],M[i,:])+ torch.diag(torch.multiply(S,S)[i,:]))@inv_Sigma 
                      for i in range(n)]))
    return grad


def closed_Sigma(M,S):
    n = M.shape[0]
    print
    test = 1/n*(torch.mm(M,M.T) + torch.sum(torch.multiply(S,S), dim = 0)) 
    real = 1/n*(torch.sum(torch.stack([torch.outer(M[i,:],M[i,:]) + torch.diag(torch.multiply(S,S)[i,:]) 
                                       for i in range(n)]), axis = 0))
    print_stats('test-real ', test-real)
    return 1/n*(torch.sum(torch.stack([torch.outer(M[i,:],M[i,:]) + torch.diag(torch.multiply(S,S)[i,:]) 
                                       for i in range(n)]), axis = 0))
                          

def ELBO_Sigma(Y, O,covariates ,M ,S ,Sigma ,beta):
    n = Y.shape[0]
    SrondS = torch.multiply(S,S)
    OplusM = O+M
    MmoinsXB = M-torch.mm(covariates, beta) 
    tmp = torch.sum(  torch.multiply(Y, OplusM)  -torch.exp(OplusM+SrondS/2) +1/2*torch.log(SrondS))
    tmp -= 1/2*torch.trace(  
                            torch.mm(  
                                        torch.inverse(Sigma_init), 
                                        torch.diag(torch.sum(SrondS, dim = 0))+ torch.mm(MmoinsXB.T, MmoinsXB)
                                    )
                          )
    tmp-= n/2*torch.log(torch.det(Sigma))
    return tmp
#grad_test(Y_sampled, O, covariates, M_init,S_init, C_init, beta_init )
print(grad_Sigma(Y_sampled, O, covariates, M_init,S_init, closed_Sigma(M_init,S_init), beta_init))


RuntimeError: The size of tensor a (400) must match the size of tensor b (20) at non-singleton dimension 1


$$\boxed{\begin{align} J_{\theta, q}(Y) &=\mathbb{1}_n^{\top}\left(Y \odot(O+M)-\frac{1}{2} \exp \left(O+M+\frac{S\odot S}{2}\right) + \frac 12 \log (S \odot S) \right)\mathbb{1}_p \\
& \quad  - \frac 12\operatorname{tr}\left((CC^{\top})^{-1}\left(\operatorname{diag}(\mathbb{1}_n^{\top} (S\odot S))+(M-X \beta)^{\top}(M-X \beta)\right)\right) \\
& \quad  - n \log |C|+ cst 
\end{align}}$$

$$\boxed{\begin{align} J_{\theta, q}(Y) &=\mathbb{1}_n^{\top}\left(Y \odot(O+M)-\frac{1}{2} \exp \left(O+M+\frac{S\odot S}{2}\right) + \frac 12 \log (S \odot S) \right)\mathbb{1}_p \\
& \quad  - \frac 12\operatorname{tr}\left(\Sigma^{-1}\left(\operatorname{diag}(\mathbb{1}_n^{\top} (S\odot S))+(M-X \beta)^{\top}(M-X \beta)\right)\right) \\
& \quad  - \frac n2 \log |\Sigma|+ cst 
\end{align}}$$

In [47]:
model = PLN_full(C_init, beta_init, M_init, S_init)
%time model.full_grad_ascent(data, N_epoch = 6000,verbose=True, lr = 0.01)

iteration number:  100
-------UPDATE-------
 MSE with Sigma :  0.06394
 MSE with beta :  0.92219
ELBO :  68927.20315
---------------------------------lr : 0.01
---------------------------------log likelihood : 68927.20314820035
iteration number:  200
-------UPDATE-------
 MSE with Sigma :  0.06683
 MSE with beta :  0.5905
ELBO :  94676.31783
---------------------------------lr : 0.01
---------------------------------log likelihood : 94676.31782619393
iteration number:  300
-------UPDATE-------
 MSE with Sigma :  0.05249
 MSE with beta :  0.37158
ELBO :  111383.41081
---------------------------------lr : 0.01
---------------------------------log likelihood : 111383.41080900468
iteration number:  400
-------UPDATE-------
 MSE with Sigma :  0.04338
 MSE with beta :  0.25187
ELBO :  121869.64466
---------------------------------lr : 0.01
---------------------------------log likelihood : 121869.64465887006
iteration number:  500
-------UPDATE-------
 MSE with Sigma :  0.0426
 MSE with beta 

iteration number:  3700
-------UPDATE-------
 MSE with Sigma :  0.11404
 MSE with beta :  0.08467
ELBO :  135573.72048
---------------------------------lr : 0.01
---------------------------------log likelihood : 135573.72048354792
iteration number:  3800
-------UPDATE-------
 MSE with Sigma :  0.11404
 MSE with beta :  0.08401
ELBO :  135580.68231
---------------------------------lr : 0.01
---------------------------------log likelihood : 135580.68230559537
iteration number:  3900
-------UPDATE-------
 MSE with Sigma :  0.11403
 MSE with beta :  0.08333
ELBO :  135587.39482
---------------------------------lr : 0.01
---------------------------------log likelihood : 135587.39481511613
iteration number:  4000
-------UPDATE-------
 MSE with Sigma :  0.11401
 MSE with beta :  0.08262
ELBO :  135593.87287
---------------------------------lr : 0.01
---------------------------------log likelihood : 135593.8728735441
iteration number:  4100
-------UPDATE-------
 MSE with Sigma :  0.11398
 MSE 

In [49]:
d = 4 # nb of cavariates
n = 400; p = 20

In [50]:
true_Sigma = torch.from_numpy(toeplitz(0.5**np.arange(p)))
true_C = torch.cholesky(true_Sigma)
true_beta = torch.randn(d, p)

covariates = torch.rand((n,d))
O =  1+torch.zeros((n,p))

sample_model = sample_PLN()
Y_sampled = torch.from_numpy(sample_model.sample(true_Sigma,true_beta, O, covariates)) 

data = [Y_sampled.double(), O, covariates]

In [51]:
torch.manual_seed(0)

noise = torch.randn(p) 
Sigma_init =  torch.diag(noise**2)
#Sigma_init = torch.from_numpy(toeplitz(0.4**np.arange(p)))
C_init = torch.cholesky(Sigma_init)
beta_init = torch.rand((d, p))

M_init = torch.ones((n,p))/100# some random values to initialize we divide to avoid nan values 
S_init = torch.ones((n,p))/8 # some random values to initializ. we divise to avoid nan values 

In [3]:
def vec2mat_lower_tridiag(vectC): 
    '''
    transform a vector of size (n(n+1)/2) into a lower tridiagonal matrix of size (n,n). 
    
    args : 'vectC' vector of size (n*(n+1)/2)
    
    returns 'matC' : matrice of size(n,n). Its lower tridiagonal will be computed from 
            vectC as follows : 
            for all i < n : - matC[i,i] = out[i*(i+1)/2]
                            - matC[i+1, 0] = out[i*(i+1)/2 + 1]
            matC[n,n] = out[n*(n+1)/2] 
            
            see torch.tril_indices for more 
            
    Note that if x is an array of size (n*(n+1)/2) for any integer n, 
    then mat2vec_lower_tridiag(vec2mac_lower_trig(x)) = x 
    '''
    
    somme_entiers = vectC.shape[0]
    n = int((-1+math.sqrt(1+8*somme_entiers))/2)
    mask = torch.tril_indices(n,n).unbind()
    matC = torch.zeros((n,n))#, dtype = torch.float)
    matC[mask] = vectC
    return matC

def mat2vec_lower_tridiag(matC): 
    '''
    extract the lower tridiagonal of a matrice. returns a vector. 
    
    args : 'matC' : A squared matrices of float. 
    
    returns : a vector of size(n*(n+1)/2) where n is the first (or second) size of matC. 
              It is computed as the following : 
              for all i < n : -out[i*(i+1)/2] = matC[i,i]
                              -out[i*(i+1)/2 + 1] = matC[i+1, 0]
              out[n*(n+1)/2] = matC[n,n]
              
    Note that if x is an array of size (n*(n+1)/2) for any integer n, 
    then mat2vec_lower_tridiag(vec2mac_lower_trig(x)) = x 
    
    see torch.tril_indices for more 
    '''
    n = matC.shape[0]
    mask = torch.tril_indices(n,n).unbind()
    return matC[mask]


def grad_test(Y_, O_, covariates_,M_ ,S_ ,C_ ,beta_): 
    Y = torch.clone(Y_)
    O = torch.clone(O_)
    covariates = torch.clone(covariates_)
    M = torch.clone(M_)
    S = torch.clone(S_)
    C = torch.clone(C_)
    beta = torch.clone(beta_)
    for i in range(500): 
        grad = grad_C(Y, O, covariates ,M ,S ,C ,beta)
        C+=0.00000015*grad 
        #M+=0.00015*grad 
        #beta+=0.00000015*grad
        if torch.isnan(ELBO(Y, O, covariates ,M ,S ,C ,beta)) == True : 
            print('nan')
        if i % 200 == 0: 
            print('ELBO : ', ELBO(Y, O, covariates ,M ,S ,C ,beta))
        #print('norm grad : ', torch.norm(grad))

Here we have a PLN model described as the following : 

- Consider $n$ sites $(i=1 \ldots n)$

- Measure $x_{i}=\left(x_{i h}\right)_{1 \leq h \leq d}$ :
$x_{i h}=$ given environmental descriptor (covariate) for site $i$
(altitude, temperature, latitude, ...)

- Consider $p$ species $(j=1 \ldots p)$ Measure $Y=\left(Y_{i j}\right)_{1 \leq i \leq n, 1 \leq j \leq p}$ :

- Measure $Y = Y_{i j}=$ number of observed individuals from species $j$ in site $i$ (abundance). 

- Associate a random vector $Z_{i}$ with each site Assume that the unknown $\left(Z_{i}\right)_{1 \leq i \leq n}$ are independant such that:
$$
Z_{i} \sim \mathcal{N}_{p}(x_i \beta, \Sigma) \quad \Sigma = CC^{\top}
$$

and $C$ is a lower triangular matrix. 
- Assume that the observed abundances $\left(Y_{i j}\right)_{1 \leq i \leq n, 1 \leq j \leq p}$ are independent conditionally on the $Z=\left(Z_{i}\right)_{i}$


$$
\left(Y_{i j} \mid Z_{i j}\right) \sim \mathcal{P}\left(\exp \left(o_{i j}+Z_{i j}\right)\right)
$$

Where $O = (o_{ij})_{1\leq i\leq n, 1\leq j\leq p}$ are known offsets. 

The unknown parameter is $\theta = (C,\beta)$.

$Z$ being a latent variable, we want to use the EM algorithm to derive the maximum likelihood estimator. However, it requires to compute 

$$
\mathbb{E}_{\theta}\left[p_{\theta}\left(Z_{i} \mid Y\right)\right]=\mathbb{E}_{\theta}\left[p_{\theta}\left(Z_{i} \mid Y_{i}\right)\right] \propto \int_{\mathbb{R}^{p}} p_{\theta}\left(Z_{i}\right) \prod_{j} p_{\theta}\left(Y_{i j} \mid Z_{i j}\right) \mathrm{d} Z_{i}
$$ which is intractable in practice. 

We thus choose the variationnal approximation. We set

$$ 
q^{\star} = \underset{q \in \mathcal{Q_{gauss}}}{\operatorname{argmax}} J_{\theta,q}(Y) 
$$
<br>
$$
\begin{align} J_{\theta, q}(Y)& =\log p_{\theta}(Y)-K L\left[q(Z) \| p_{\theta}(Z \mid Y)\right]                                    \\ 
                              & = \mathbb{E}_{q}\left[\log p_{\theta}(Y, Z)\right] \underbrace{-\mathbb{E}_{q}[\log q(Z)]}_{\text {entropy } \mathcal{H}(q)}    \end{align}
$$

where 

$$
\mathcal{Q}_{\text {Gauss }}=\{
q=\left(q_{1}, \ldots q_{n}\right), q_{i} \sim \mathcal{N}\left(M_{i}, \operatorname{diag} (S_{i}\odot S_i ), M_i \in \mathbb{S} ^p, S_i \in \mathbb{R} ^p\right)\}
$$


The Variational EM (VEM) consists in alternate between two steps : 
- VE step: update $q$
$$
q^{h+1}=\underset{q \in \mathcal{Q_{gauss}}}{\arg \max } J_{\theta^{h}, q}(Y)=\underset{q \in \mathcal{Q_{gauss}}}{\arg \min } K L\left[q(Z) \| p_{\theta^{h}}(Z \mid Y)\right]
$$
- M step: update $\theta$
$$
\theta^{h+1}=\underset{\theta}{\arg \max } J_{\theta, q^{h+1}}(Y)=\underset{\theta}{\arg \max } \mathbb{E}_{q^{h+1}}\left[\log p_{\theta}(Y, Z)\right]
$$

Let's compute the ELBO $J_{\theta, q}(Y)$


$$
J_{\sigma, q}(Y)=\underbrace{\mathbb{E}_{q}\left[\log p_{\theta}(Y \mid Z)\right]}_{(1)}+\underbrace{E_{q}\left[\log p_{\theta}(Z)\right]}_{(2)}+\underbrace{H(q)}_{(3)}
$$
$$
\begin{align}
(1)& =\sum_{i} \mathbb{E}_{q}\left[\log p_{\theta}\left(Y_{i} \mid Z\right)\right] \\
&=\sum_{i} \mathbb{E}_{q}\left[\log p_{\theta}\left(Y_{i} \mid Z_{i}\right)\right] \\
&=\sum_{i, j} \mathbb{E}_{q}\left[\log p_{\theta}\left(Y_{i j} \mid Z_{i j}\right)\right] \\
& =\sum_{i, j} \mathbb{E}_{q}\left[Y_{i j}\left(o_{i j}+Z_{i j}\right)-\exp \left(o_{i j}+Z_{i j}\right)\right] + cst
\end{align}
$$

We need to compute some moments of $Z$ under $q$. 

$$
\mathbb{E}_{q}\left[Z_{i j}\right]=M_{i j} \quad \quad E_{q}\left[\operatorname{exp}\left(Z_{i j}\right)\right]=\frac{1}{2} \operatorname{exp}\left(M_{i j}+\frac{(S_{ij})^2}{2}\right)
$$

So that 

$$
\begin{aligned}
(1) &=\sum_{i, j} Y_{i j}\left(o_{i j}+M_{i j}\right)-\frac{1}{2} \exp \left(o_{i j}+M_{i j}+\frac{(S_{i_{jj}})^2}{2}\right) + cst \\
&=\mathbb{1}_n^{\top}\left(Y \odot(O+M)-\frac{1}{2} \exp \left(O+M+\frac{S \odot S}{2}\right)\right)\mathbb{1}_p + cst 
\end{aligned}
$$



Where we have denoted $M = (M_{ij})_{1\leq i\leq n, 1\leq j\leq p}$ and $ S = (S_{ij})_{1\leq i\leq n, 1\leq j\leq p}$. The exponential is applied pointwise on the last equation. 

$$
\begin{aligned}
(3)=H(q) &=\sum_{i} H\left(q_{i}\right) \\
&=\sum_{i} \log \left(\sqrt{(2 \pi e)^{p}\left|S_{i}\right|}\right) \\
&=\frac 12\sum_{i} \log \left|(S_{i} \odot S_i)^2 \right|+cst \\
&=\sum_{i j} \log S_{i j}+cst \\
& =  \mathbb{1}_n ^{\top}(\log S )\mathbb{1}_p+cst\\
\end{aligned}
$$

Where the log is applied pointwise at the last equation. 

$$
\begin{aligned}
(2)=\mathbb{E}_{q}\left[\log p_{\theta}(Z)\right] &=\sum_{1} E_{q}\left[\log p_{\theta}\left(Z_{i}\right)\right] \\
&=-\frac{n}{2} \log |\Sigma|+\sum_{i} \mathbb{E}_{q}\left[-\frac{1}{2}\left(Z_{i}-X_{i} \beta\right)^{\top} \Sigma^{-1}\left(Z_{i} - X_{i} \beta\right)\right] +cst
\end{aligned}
$$

$
\text { Let } V \sim \mathcal{N} \left(\mu, \Lambda), \mu \in \mathbb{R}^{p}, \Lambda \in \mathcal S _p ^{++}\right.  \\
\text { Let's compute } \; \mathbb{E}\left[V^{\top} \Sigma^{-1} V\right]
$

We denote $\Sigma ^{-1 / 2}$ the square root Matrix of $\Sigma^{-1}$. It exists since $\Sigma ^{-1} \in \mathcal{S}_p^{++}$. 

$$
\begin{aligned}
\mathbb{E}\left[V^{\top} \Sigma^{-1} V\right] &=\mathbb{E}\left[V^{\top}\Sigma ^{-1 / 2} \Sigma^{-1 / 2} V\right]\\
&=\mathbb{E}\left[\left(\Sigma^{-1 / 2} V\right)^{\top}\left(\Sigma^{-1 / 2} V\right)\right] \\
&=\mathbb{E} \|\Sigma^{-1 / 2} V \|_{2}^{2}
\end{aligned}
$$

$
\text {Let } \tilde{V}=\Sigma^{-1 / 2} V, \quad \tilde{V} \sim \mathcal{N}\left(\Sigma^{-1 / 2} \mu,  \Sigma^{-1 / 2} \Lambda \Sigma^{-\frac{1}{2}}\right)
$

$$
\begin{aligned}
\mathbb{E}\left[V^{\top} \Sigma^{-1} V\right] &=\mathbb{E}\|\widetilde{V}\|_{2}^{2} \\
&=\sum \mathbb E \widetilde{V}_{j}^{2} \\
&=\sum \operatorname{var}\left(\widetilde{V}_{j}\right)^{2}+\mathbb{E}\left[\widetilde{V}_{j}\right]^{2}\\
&=\sum_{j}\left(\Sigma^{-1 / 2} \Lambda \Sigma^{-1 / 2}\right)_{j j}+\left(\Sigma^{-1 / 2} \mu\right)^{2}_j\\
&= \operatorname{tr}\left(\Sigma^{-1 / 2} \Lambda \Sigma^{-1 / 2}\right)+\sum_{j}\left(\left(\Sigma_{j,.}^{-1 / 2}\right)^{\top} \mu\right)^{2}\\
&=\operatorname{tr}\left(\Sigma^{-1} \Lambda\right)+\sum_{j}\left(\left(\Sigma_{j,.}^{-1/2}\right)^{\top} \mu\right)^2
\end{aligned}
$$

Since under $q$, $Z_{i}-X_{i} \beta \sim \mathcal N (M_i - X_i \beta, S_i \odot S_i ) $
$$
\begin{aligned}
(2) &=- \frac 12 \sum_{i} \operatorname{tr}\left(\Sigma^{-1} (S_{i} \odot S_i) \right) - \frac 12 \sum_{i, j}\left(\left(\Sigma_{j,.}^{-1 / 2} \right) ^{\top}\left(M_{i}-X_{i} \beta\right)\right)^2  - \frac n2 \log |\Sigma|+ cst  \\
&= - \frac 12 \operatorname{tr}\left(\Sigma^{-1}\left(\sum_{i} S_{i}\odot S_i \right)\right) - \frac 12 \sum_{i, j}\left(\Sigma^{-1 / 2}(M-X \beta)\right)_{j, i}^{\top}\left(\Sigma
^{-1 / 2}(M-X \beta)\right)_{i, j}  - \frac n2 \log |\Sigma|+ cst  \\
&=- \frac 12 \operatorname{tr}\left(\Sigma^{-1}\left(\sum_{i} S_{i} \odot S_i  \right)\right)- \frac 12 \operatorname{tr}\left(\Sigma^{-1 / 2}(M-X \beta)^{\top}(M-X \beta) \Sigma^{-1 / 2}\right)  - \frac n2 \log |\Sigma|+ cst  \\
&=- \frac 12 \operatorname{tr}\left(\Sigma^{-1}\left(\operatorname{diag}(\mathbb{1}_n^{\top} (S\odot S))+(M-X \beta)^{\top}(M-X \beta)\right)\right)  - \frac n2 \log |\Sigma|+ cst 
\end{aligned}
$$
We then have : 


$$\boxed{\begin{align} J_{\theta, q}(Y) &=\mathbb{1}_n^{\top}\left(Y \odot(O+M)-\frac{1}{2} \exp \left(O+M+\frac{S\odot S}{2}\right) + \frac 12 \log (S \odot S) \right)\mathbb{1}_p \\
& \quad  - \frac 12\operatorname{tr}\left(\Sigma^{-1}\left(\operatorname{diag}(\mathbb{1}_n^{\top} (S\odot S))+(M-X \beta)^{\top}(M-X \beta)\right)\right) \\
& \quad  - \frac n2 \log |\Sigma|+ cst 
\end{align}}$$