In [1]:
import numpy as np 
import matplotlib.pyplot as plt
from scipy.linalg import toeplitz
from utils import sample_PLN
import scipy.linalg as SLA
import torch.linalg as TLA
import scipy
from fastPLN import fastPLN
from scipy.special import factorial
import math
import seaborn as sns 
import torch 
from pandas import read_csv

device :  cpu


In [1]:
%load_ext autoreload

%autoreload 2

$$
\begin{aligned}
W_{i} & \sim \mathcal{N}\left(0, I_{q}\right), \text { iid, } \quad i=1, \ldots, n \\
Z_{i} &=\mathbf{x}_{i}\beta +W_{i}\mathbf{C}^{\top}, \quad i \in 1, \ldots, n \\
Y_{i j} \mid Z_{i j} & \sim \mathcal{P}\left(\exp \left(o_{i j}+Z_{i j}\right)\right)
\end{aligned}
$$

We want to maximize the log-likelihood with respect to the model parameters $\theta = (\beta, \mathbf{C})$: 


$$\max _{\theta} \log p_{\theta}(Y)$$


In the MC_PLN.ipynb notebook we tried to maximize the likelihood with a plain Monte Carlo estimate, but the estimated likelihood was numerically equal to 0. Instead, we derive the gradient of the log-likelihood with respect to $\theta$: 


$$
\begin{aligned}
\nabla_{\theta} \log p_{\theta}(Y)&= \frac{\nabla_{\theta} p_{\theta}(Y)}{p_{\theta}(Y)} \\
&=\frac{\nabla_{\theta} \int p_{\theta}(Y \mid W) p(W) d W}{\int p_{\theta}(Y \mid W) p(W) d W}\\
&=\frac{\int \nabla_{\theta} p_{\theta}(Y \mid W) p(W) d W}{\int  p_{\theta}(Y \mid W) p(W) d W}\\
&= \frac{\int\left(\nabla_{\theta} \ln p_{\theta}(Y \mid W)\right) p_{\theta}(Y| W) p(W) d W}{\int p_{\theta}(Y \mid W)p(W) d W}\\
&=\int \nabla_{\theta} \ln p_{\theta}(Y \mid W) \tilde{p}_{\theta}(W) dW \\
\end{aligned}
$$
$$\tilde p_{\theta}(W):=\frac{p_{\theta}(Y \mid W) p(W)}{\int p_{\theta}(Y \mid W) p(W) dW}$$

We only know $\tilde p_{\theta}(W)$ up to its normalizing constant, i.e. we only know its numerator. Thus we need to use importance sampling, which consists of the following: 

Let $g$ be a probability density such that $x \in \operatorname{supp}(\tilde p_{\theta}) \implies g(x)>0$. We denote $(V_i)_i \overset{iid}{\sim} g$. 

We define : 

$$
w_{i}^{(u)}=\frac{p_{\theta}\left(Y \mid V_i\right) p\left(V_i\right)}{g\left(V_{i}\right)}
$$


$$
\tilde{w}_{i}^{(u)}=\frac{w_{i}^{(u)}}{\sum_{\ell=1}^{n} w_{\ell}^{(u)}}
$$


Then, 

$$
\hat{I}_{n}^{I S, u}:=\sum_{i=1}^{n} \tilde{w}_{i}^{(u)} \nabla_{\theta} \ln p_{\theta}(Y \mid V_i)
 \stackrel{\text { Proba }}{\longrightarrow} \nabla_{\theta} \log p_{\theta}(Y)
$$

We need to choose the density $g$ carefully: it should put its mass where $\tilde p_{\theta}(W)\times \nabla_{\theta} \ln p_{\theta}(Y \mid W)$ is large. We can see that: 

$$\tilde p_{\theta}(W):=\frac{p_{\theta}(Y \mid W) p(W)}{\int p_{\theta}(Y \mid W) p(W) dW}=\frac{p_{\theta}(W \mid Y) p_{\theta}(Y)}{\int p_{\theta}(W \mid Y) p_{\theta}(Y) d W}=\frac{p_{\theta}(W \mid Y)}{\int p_{\theta}(W \mid Y) dW }=p_{\theta}(W \mid Y)$$

Since $W|Y$ can be well-approximated with a Gaussian, we will choose a Gaussian for $g$.




In [2]:
def build_block_Sigma(p,k): 
    '''
    Build a (p,p) matrix with a block-diagonal structure plus a dense background.

    The diagonal carries k blocks of size p//k, each a Toeplitz matrix with entries
    0.95**|i-j| multiplied by its own random factor (>= 1). If k does not divide p,
    one extra trailing block of size p%k is added, with entries 0.98**|i-j|.
    Finally 0.1 * Toeplitz(0.95**|i-j|) of size (p,p) is added to the whole matrix.

    The global numpy seed is reset to 0 at every call, so the result is the same
    for a given (p,k).

    args : 
         p : int. size of the matrix. 
         k : int. number of full-size blocks. 
    return : numpy array of size (p,p).
    '''
    np.random.seed(0)
    # one multiplicative factor per block (k regular blocks + the possible trailing one)
    scales = 1 + np.random.randn(k+1)**2
    size, remainder = divmod(p, k)
    blocks = np.zeros((p,p))
    for b in range(k):
        span = slice(b*size, (b+1)*size)
        blocks[span, span] = scales[b]*toeplitz(0.95**np.arange(size))
    if remainder > 0:
        tail = slice(p-remainder, p)
        blocks[tail, tail] = scales[k]*toeplitz(0.98**np.arange(remainder))
    background = 0.1*toeplitz(0.95**np.arange(p))
    return blocks + background


def C_from_Sigma(Sigma,q): 
    '''
    Build a (p,q) factor C from a symmetric (p,p) matrix Sigma such that C@C.T
    keeps only the q leading eigen-components of Sigma.

    Column j of C is an eigenvector of Sigma multiplied by the square root of its
    eigenvalue. Eigenvalues that come out negative (e.g. round-off on a
    rank-deficient covariance matrix) are clipped to 0 so that the square root
    never produces NaN columns.

    args : 
         Sigma : symmetric numpy array of size (p,p). 
         q : int. number of leading eigen-components to keep. 
    return : numpy array of size (p,q).
    '''
    # eigh returns the eigenvalues in ascending order: the q largest are the last q
    w,v = SLA.eigh(Sigma)
    top_w = np.clip(w[-q:], 0, None)
    # scaling the columns is the same as right-multiplying by diag(sqrt(top_w))
    return v[:,-q:]*np.sqrt(top_w)

In [3]:
def log_stirling(n_):
    '''
    Approximate log(n!) elementwise with the Stirling formula 
        log(n!) ~ log(sqrt(2*pi*n)) + n*log(n/e), 
    which stays finite for large n where n! itself would be numerically infinite.
    
    Entries equal to 0 are evaluated at 1 instead (0! = 1!), which avoids log(0).
    The input tensor is not modified.
    
    args : 
         n_ : tensor. 
    return : tensor, an approximation of log(n!)
    '''
    # out-of-place shift of the zeros to one; every other entry is unchanged
    n = n_ + (n_ == 0)
    sqrt_term = torch.log(torch.sqrt(2*np.pi*n))
    power_term = n*torch.log(n/math.exp(1))
    return sqrt_term + power_term

In [None]:
class IMPS_PLNPCA(): 
    def __init__(self,q, batch_size): 
        self.q = q
        self.batch_size = batch_size 
        
    def get_Sigma(self):
        '''
        simple function to get Sigma
        '''
        return self.C@(self.C.T)
    
    def get_batch(self,batch_size): 
        '''
        get the batches required to do a  minibatch gradient ascent.  
        
        args : 
                'batch_size' int.  the batch size you want. 
                
        returns : a generator. Will generate n/batch_size samples of size batch_size (except the last one 
                    since the rest of the division is not always an integer)
                    
        '''
        #np.random.seed(0)
        indices = np.arange(self.n)
        np.random.shuffle(indices)
        self.batch_size = batch_size 
        nb_full_batch, last_batch_size  = self.n//self.batch_size, self.n % self.batch_size  
        for i in range(nb_full_batch): 
            yield   (self.Y[indices[i*self.batch_size: (i+1)*self.batch_size]], 
                    self.covariates[indices[i*self.batch_size: (i+1)*self.batch_size]],
                    self.O[indices[i*self.batch_size: (i+1)*self.batch_size]]) 
                        
                  
        if last_batch_size != 0 : 
            self.batch_size = last_batch_size
            yield   (self.Y[indices[-last_batch_size:]], 
                    self.covariates[indices[-last_batch_size:]],
                    self.O[indices[-last_batch_size:]])
            
    