In [557]:
from gradient_descent import gradient_descent, minibatch_class
import utils

import matplotlib.pyplot as plt
from pandas import read_csv
import math
from timeit import default_timer as timer

import numpy as np
import torch
from torch import nn
import scipy.linalg as SLA 
from scipy.linalg import toeplitz 
torch.set_default_dtype(torch.float64)

Y = torch.from_numpy(read_csv('trichoptera.csv', sep=',').to_numpy())
O = torch.outer(Y.sum(1), torch.ones(Y.shape[1]))/1000

#data = utils.format_data(counts=Y, offsets=np.log(O))

n,p = Y.shape 
d = 2 # nb of cavariates

sizes : 

$ Y : (n,p)$ 

$O : (n,p)$ 

$\Sigma :  (p,p)$ 

covariates ($x$) : $(n,d)$

$\beta : (d,p)$

$M : (n,p)$

$S : (n,p)$ . Should be seen as $(n,p,p)$ but since all the $n$  matrix $(p,p)$ are diagonal, we only need $p$ points to encode it.

In [558]:
class PLNmodel(): 
    def __init__(self, Y, O,covariates, Sigma_init, beta_init, M_init, S_init): 
        
        '''Initialization : 
            
            Sigma_init is the initilization for Sigma as well as beta_init
            I plan to do an initialization step more advanced. 
        '''
        
        self.Y = Y
        self.O = O
        self.covariates = covariates
        
        self.Sigma = torch.clone(Sigma_init)
        #self.Sigma.requires_grad_(True)
        self.beta = torch.clone(beta_init)
        self.beta.requires_grad_(True)
        self.M = torch.clone(M_init)
        self.M.requires_grad_(True)
        self.S = torch.clone(S_init) 
        self.S.requires_grad_(True)
        
        self.n, self.p = Y.shape
        self.det_Sigma = torch.det(self.Sigma)
        self.inv_Sigma = torch.inverse(self.Sigma)
        
        
    
    def forward(self): 
        ''' 
        computes the ELBO J(theta,q) 
        
        S is of size (n,p) but it size should be (n,p,p). The size is (n,p) since it 
        represents n diagonals matrix of size (p,p), which we can encode with only (n,p) numbers. 
        '''
        
        tmp = -self.n/2*math.log(self.det_Sigma) # premiere formule ou on prend le log du det de Sigma 
    
        tmp -=1/2*( 
            
                    torch.sum(torch.mm(torch.mm(self.M,self.inv_Sigma),self.M.T).diagonal()) # we can simplify here, takes too much time 
                   +                                                  # we should remove the diagonal and do a more efficient multiplication
                   torch.sum(torch.tensor([torch.trace(torch.multiply(self.inv_Sigma,self.S[i,:])) for i in range(self.n)]))
                    ) # formula with the quadratic function and the trace 
        
        
        Gram_matrix = torch.mm(self.covariates,self.beta) # matrix with term (i,j): <x_i,beta_j>
        
        Exp_moment = torch.exp(self.M + torch.pow(self.S,2)/2)
        
        tmp += torch.sum(-torch.exp(self.O + Gram_matrix + self.M + torch.pow(self.S,2)/2) + torch.multiply(self.Y, self.O + Gram_matrix + self.M))
        
        tmp+= 1/2*torch.sum(torch.tensor([math.log(self.S[i,:].prod()) for i in range(self.n)]))
        tmp.backward()
        return tmp
    
    
    def grad_m(self): 
        pass
    
    def update(self,lr = 0.01): 
        '''fait une étape de descente de gradient sur le parametre S. On va généraliser 
        à M,beta et Sigma ensuite '''
        print('current value :',self.forward().item())
        with torch.no_grad():
            self.S += lr*self.S.grad
        self.S.grad.zero_()
    

In [562]:
Sigma_init = torch.from_numpy(toeplitz(0.5**np.arange(p)))
beta_init = torch.ones((d, p))
M_init = torch.ones((n,p))/100# some random values to initialize we divide to avoid nan values 
S_init = torch.ones((n,p))/8 # some random values to initializ. we divise to avoid nan values 

covariates = torch.zeros((n,d))# pas encore de covariates dc je prend 0 pr l'instant 

Model_bad ne marche pas, model_good marche. La seule chose qui change entre les deux modèles est l'initialisation de S. Celle de model_bad est 8 fois plus petite que celle de model_good. Si tu retranches le gradient ( i.e. faire self.S -= lr*self.s.grad dans la fonction update) alors les deux tendances s'échangent. 

J'ai regardé ce qu'il se passait quand on update M, cette fois ci ça marche avec n'importe quelle initialisation. J'ai essayé d'update Sigma et beta mais il y a un bug avec requires_grad que j'ai donc mis en commentaire, je suis entrain de debugger ça. 

In [560]:
model_bad = PLNmodel(Y,O, covariates, Sigma_init, beta_init, M_init, torch.ones((n,p))/8 )

In [561]:
for _ in range(10 ) : 
    model_bad.update()
    #print(torch.min(torch.abs(model.S)))

current value : 10050.185035569646
current value : 10045.94234560788
current value : 10041.263323694968
current value : 10036.28951374982
current value : 10031.112203976729
current value : 10025.791159702494
current value : 10020.366033206612
current value : 10014.863473876114
current value : 10009.30163202494
current value : 10003.693046797816


In [555]:
model_good = PLNmodel(Y,O, covariates, Sigma_init, beta_init, M_init, torch.ones((n,p)) )

In [556]:
for _ in range(10) : 
    model_good.update()

current value : 9495.356539235709
current value : 9671.187273212672
current value : 9746.773648736002
current value : 9797.377524515361
current value : 9836.06876480069
current value : 9867.705341506662
current value : 9894.637753890287
current value : 9918.180509782485
current value : 9939.139796361907
current value : 9958.043173250315
