In [166]:
import numpy as np 
import torch 
import math 
from torch.optim import Optimizer

In [169]:
N_samples = 30
p = 10
d = 4

In [170]:
X = torch.randn(N_samples, d)
true_beta = torch.randn(d,p)
Y = X@true_beta + torch.randn(N_samples, p)

In [176]:
def get_batch(X,Y,batch_size): 
    '''
    get the batches required to do a  minibatch gradient ascent.  

    args : 
            'batch_size' int.  the batch size you want. 

    returns : a generator. Will generate n/batch_size samples of size batch_size (except the last one 
                since the rest of the division is not always 0)
    '''
    #np.random.seed(2)
    n = Y.shape[0]
    indices = np.arange(n)
    np.random.shuffle(indices)
    # get the number of batches and the size of the last one. 
    nb_full_batch, last_batch_size  = n//batch_size, n % batch_size  
    for i in range(nb_full_batch): 
        yield   (X[indices[i*batch_size: (i+1)*batch_size]],
                 Y[indices[i*batch_size: (i+1)*batch_size]], 
                 indices[i*batch_size: (i+1)*batch_size]
                ) 
    if last_batch_size != 0 :
        yield   (X[indices[-last_batch_size:]], Y[indices[-last_batch_size:]], 
                 indices[-last_batch_size:],
                )
        


In [172]:
Y[27]

tensor([-1.0084, -0.3115, -1.2462, -2.4635, -1.0466,  2.0231, -0.2599,  3.8577,
         1.9691, -0.0273])

In [184]:
def f(x,y_i): 
    return 1/2*torch.norm(x@beta-y_i)**2

def F(beta): 
    return 1/2*torch.mean(torch.norm(Y-X@beta, dim = (1))**2)
def grad_f(beta,y_i,x_i): 
    return torch.outer(x_i, y_i-x_i@beta)

def grad_F(beta): 
    return torch.mean(torch.matmul(X.unsqueeze(2), (Y-X@beta).unsqueeze(1)) , axis = 0)
    
def batch_grad(beta,x_batch, y_batch): 
    return torch.matmul(x_batch.unsqueeze(2), (y_batch-x_batch@beta).unsqueeze(1))
def fit(optim, nb_step):
    beta = torch.zeros((d,p),requires_grad = True)
    optim = torch.optim.SGD([beta], lr = 0.5)
    for i in range(nb_step): 
        optim.zero_grad()
        loss = F(beta) 
        loss.backward()
        optim.step()
    print('res :', beta)
    print('grad :', grad_F(beta))
    for x_b,y_b, indices in get_batch(X,Y, 5): 
        print('grad :', batch_grad(beta, x_b, y_b))
fit(torch.optim.Rprop, 1000)




res : tensor([[ 0.6336, -0.2571,  0.4531,  0.7773,  0.0067, -0.6726,  0.6879, -2.3523,
         -0.1955, -0.0462],
        [ 0.4223,  1.1227, -0.2600, -1.0814,  0.4451, -1.4642,  0.4540, -0.9706,
          1.7317, -1.1076],
        [-1.8058,  0.9489,  0.8655,  0.3208, -0.8030,  0.1810,  1.3851,  0.3318,
         -0.8078, -0.9673],
        [-0.2844, -1.1583, -1.1680,  0.3981, -1.1801, -1.0732,  0.5986, -0.2830,
          0.7018, -0.8330]], requires_grad=True)
grad : tensor([[-1.5895e-08, -3.1789e-08,  1.5895e-08,  3.9736e-09, -3.9736e-09,
          9.9341e-09, -6.3578e-08,  9.9341e-08,  3.1789e-08,  3.1789e-08],
        [-1.9868e-08,  1.0331e-07,  1.5895e-08, -5.5631e-08, -1.1921e-08,
         -9.9341e-08,  7.9473e-09, -4.7684e-08,  7.9473e-08, -3.9736e-08],
        [-1.2716e-07,  2.7816e-08,  6.3578e-08,  0.0000e+00,  3.9736e-08,
          1.1921e-08,  1.5895e-08, -3.1789e-08,  1.5895e-08, -5.5631e-08],
        [-3.1789e-08, -6.5565e-08, -8.3447e-08,  0.0000e+00, -8.3447e-08,
         

In [125]:
class SAGA(Optimizer):
    '''
    This class aims at defining the SAGA optimizer in pytorch, deriving from 
    the torch.optim class. We hope that we will be able to call torch.optim.SAGA 
    and the attributes (such as .step, .zero_grad) as we do for torch.optim.SGD
    '''
    def __init__(self, params, init_grads, lr):
        '''
        initialization for the SAGA optimizer. 
        
        args : 
            params : list with each parameters
            lr : learning rate 
            init_grad : for each param in params, for each sample 
            in the samples we have, we need a first gradient to store. 
            We will associate to each sample i the gradient of the corresponding function i  
            evaluated in the starting point. It should be a list of size len(params).
            For each k, the size of init_grad[k] should be : (sample_size, param[k].shape)
        '''
        dims = [param.shape for param in params]
        print('dims :', dims)
            
        # set a warning message if all the sample size are differents. 
        sample_size = init_grads[0].shape[0]
        defaults = dict(lr=lr, table = init_grads)#, momentum=momentum, dampening=dampening,
                        #weight_decay=weight_decay, nesterov=nesterov)
        super(SAGA, self).__init__(params, defaults)
        
    
    def __setstate__(self, state):
        super(SAGA, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)
            
    @torch.no_grad()
    def step(self, batch_grads, selected_indices):
        """Performs a single optimization step.
        Args:
            batch_grads : list that contains the gradients computed for a subset 
            of the samples. should be a list of size nb_params and each item
            should be of size (batch_size, shape of the parameter)
            
            selected_indices : list of indices, the ones we used to estimate the gradient with.
            
        """
        grads = [torch.mean(batch_grad, axis = 0) for batch_grad in batch_grads]
        print('grads :', grads)
        loss = None
        
        for group in self.param_groups:
            params_with_grad = []
            d_p_list = []
            old_grads = []
            #momentum_buffer_list = []
            #weight_decay = group['weight_decay']
            #momentum = group['momentum']
            #dampening = group['dampening']
            #nesterov = group['nesterov']
            lr = group['lr']
            for i,param in enumerate(group['params']):
                if param.grad is not None:
                    params_with_grad.append(param)
                    d_p_list.append(grads[i])
                    print('groupe table', group['table'][i].shape)
                    old_grads.append(group['table'][i][selected_indices])
                    
                    state = self.state[param]
                    #if 'momentum_buffer' not in state:
                    #    momentum_buffer_list.append(None)
                    #else:
                    #    momentum_buffer_list.append(state['momentum_buffer'])
            saga_step(params_with_grad,d_p_list,interval)
            '''
            F.sgd(params_with_grad,
                  d_p_list,
                  momentum_buffer_list,
                  weight_decay=weight_decay,
                  momentum=momentum,
                  lr=lr,
                  dampening=dampening,
                  nesterov=nesterov)
            
            # update momentum_buffers in state
            for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list):
                state = self.state[p]
                state['momentum_buffer'] = momentum_buffer
            '''
        return loss
    

In [128]:
saga = SAGA([y,y_bis],init_grads, 0.1)
y.grad = torch.clone(y)
y_bis.grad = torch.clone(y_bis)
saga.step(1)

dims : [torch.Size([15, 4]), torch.Size([8, 6])]
groupe table torch.Size([3, 15, 4])
groupe table torch.Size([3, 8, 6])


NameError: name 'saga_step' is not defined

In [70]:
y = torch.zeros((15,4), requires_grad = True)
y_bis = torch.zeros((8,6), requires_grad = True)

In [73]:
first_init = torch.cat((2*(y-2).unsqueeze(0), 2*(y-1).unsqueeze(0), 2*(y-4).unsqueeze(0)),0)
second_init = torch.cat((2*(y_bis-2).unsqueeze(0), 2*(y_bis-1).unsqueeze(0), 2*(y_bis-4).unsqueeze(0)),0)
print(first_init.shape)
init_grads = [first_init, second_init]

torch.Size([3, 15, 4])
