In [1]:
import matplotlib.pyplot as plt
import autograd.numpy as np
import scipy.optimize as opt
import scipy as sci
import random
import matplotlib.pyplot as plt
from autograd import grad, jacobian
import sys
import torch

# DDN with Learnable parameters

In this tutorial, we extends robust pooling to adaptive robust pooling; we introduce adaptive feature projections.

Previously, we computed the gradient of robust pooling respect to input x: $Dy(x)$. The parameter $c$ is fixed during proprogation, thus there is no need compute its gradient. Now we relax this restriction, include $c$ as a trainable parameter and compute its gradient $Dy(c)$.

## Closed-form Gradient  for adaptive robust pooling
In this section we compute the closed-form gradient of 3 different robust pooling function: peseudo-Huber, Huber, Welsch.

Recall the gradient for unconstrained function: 

$D(y(c)) = -H^{-1}B$ 

where $H= D^2_{YY}f(c, y)$ and $B= D^2_{c Y} f(c,y) $

### Pseudo- Huber

$y \in  \text{argmin}_y  \sum_i^n c^2 (\sqrt {1+ \frac {(y-x_i)^2} {c^2}}-1)$


###  Huber

$y \in \text{argmin}_y \sum_{i=1}^n $ $\begin{cases} 
\frac {1} {2} (y-x_i)^2 & \text{$ |y-x_i| \leq c $} \\
\alpha(|y-x_i|-\frac {1} {2} c) & \text{otherwise}\\
\end{cases}$


### Welsch

$y \in  \text{argmin}_y  \sum_{i=1}^n (1-exp(- \frac {(y-x_i)^2} {2c^2}))$


Below is the implmentation of $Dy(\alpha)$, I adapted some previous code from DDN repository.

In [2]:
def dyc_closed_form(x,y,c,p='pseudo-huber'):
    c_sq=c**2
    if p=='pseudo-huber':
        dyy = np.array([np.power(1.0 + np.power(y - xi, 2.0) / c_sq, -1.5) for xi in x])
        dyc =  np.sum([np.power(y-xi,3)/(np.power(np.power((y-xi)/c,2)+1,1.5)*np.power(c,3)) for xi in x])
    elif p=='huber':
        dyy = np.array([1.0 if np.abs(y - xi) <= c else 0.0 for xi in x])
        dyc = np.sum(np.array([0.0 if np.abs(y - xi) <= c else (1.0 if y-xi>0 else -1.0) for xi in x]))
    else:
        z = np.power(x - y, 2.0)
        dyy = np.array([(c_sq - zi) / (c_sq * c_sq) * np.exp(-0.5 * zi / c_sq) for zi in z])
        dyc=np.sum(np.array([-np.exp(-0.5 * np.power((y - xi)/c,2))*((2*(y-xi)*c_sq-np.power(y-xi,3))/(c**5)) for xi in x])) 
    return -1.0 * dyc/np.sum(dyy)

We can also compute the gradient using pytorch autograd libray

In [3]:
# the objective function from ddn.basic.node
def f(x, y, c,p='pseudo-huber'):
    c_sq=c**2
    if p=='pseudo-huber':
        phi= lambda z: (c**2) * (np.sqrt(1.0 + np.power(z, 2.0) / (c**2)) - 1.0)
    elif p=='huber':
        phi = lambda z: np.where(np.abs(z) <= c, 0.5 * np.power(z, 2.0), c * np.abs(z) - 0.5 * c_sq)
    elif p=='welsch':
        phi = lambda z: 1.0 - np.exp(-0.5 * np.power(z, 2.0) / c_sq)
    elif p=='trunc-quad':
        phi = lambda z: np.minimum(0.5 * np.power(z, 2.0), 0.5 * c_sq)
    return np.sum([phi(y - xi) for xi in x])

# the solve objective function from ddn.basic.node
def solve(x,c ,f, p='pseudo-huber'):
    result = opt.minimize(lambda y : f(x, y, c,p), np.mean(x))
    return result.x

def dyc(x,y,c,p='pseudo-huber'):
    fY = grad(f, 1)
    fYY = jacobian(fY, 1)
    fCY = jacobian(fY, 2)
    return -1.0 * np.linalg.pinv(fYY(x, y, c,p)).dot(fCY(x, y, c,p))

We can check the correctness of closed form graident by comparing it with autograd gradient

In [4]:

n = 10 # number of input points
y_target = np.array([0.0])
x_init = np.random.rand(n)
# add an outlier
x_init[np.random.randint(len(x_init))] += 100.0 * np.random.rand(1)
x_init=np.array([ 1.4748, -0.0034,  2.1072, -0.0675, -0.7821, -0.9080, -2.0427,
          -1.9460,  1.7862,  0.1601])
c_init = random.uniform(0.1, 10)

In [5]:
print('x:',x_init)
print('c:',c_init)
#valid the analyic gradient is the same as autograd solution
print("error between autograd and closed-form:")
y_init = solve(x_init,c_init,f,'pseudo-huber')
print("pseudo-huber ",abs(dyc_closed_form(x_init,y_init,c_init,'pseudo-huber')-dyc(x_init,y_init,c_init,'pseudo-huber')))
y_init = solve(x_init,c_init,f,'huber')
print("huber ",abs(dyc_closed_form(x_init,y_init,c_init,'huber')-dyc(x_init,y_init,c_init,'huber')))
y_init = solve(x_init,c_init,f,'welsch')
print("welsch ",abs(dyc_closed_form(x_init,y_init,c_init,'welsch')-dyc(x_init,y_init,c_init,'welsch')))

x: [ 1.4748 -0.0034  2.1072 -0.0675 -0.7821 -0.908  -2.0427 -1.946   1.7862
  0.1601]
c: 6.3957496996326855
error between autograd and closed-form:
pseudo-huber  [2.92734587e-18]
huber  [0.]
welsch  [1.08420217e-18]


## Closed-form Gradient  for adaptive sphere and ball projections
Similar to robust pooling, we can compute the gradient respect to ball projection. 


The adaptive sphere and ball projection problems in general is defined:

\begin{array}{lll}
    y \in & \text{argmin}_u & \frac{1}{2} \|u - x\|^2_2 \\
    & \text{subject to} & \|x\|_p = r \\
    && r>0\\
\end{array}

We first define the problem and constraint.

In [6]:
# objective 
def f(r,y,x):
    return 0.5* np.dot(y-x,y-x)

# constraint
def h(r,y,norm):
    if norm=='L1':
        return np.sum(np.abs(y))-r
    if norm=='Ln':
        return np.max(np.abs(y)) - r
    elif norm=='L2':
        return np.dot(y,y) - r**2
# forward solve
def solve_opt(x,r,f,norm):
    result = opt.minimize(lambda y: f(r, y ,x), np.ones(np.shape(x)[0]),constraints=[{'type':'eq', 'fun': lambda y: h(r,y,norm)}] )
    return result.x

# forward solve (L2 norm only)
def solve_analyical(x,r):
    return r / np.sqrt(np.dot(x, x)) * x

We define the gradient for L2, L1 and Ln. We write the gradient by autograd for check too.

In [7]:

def gradient_L2(r,x):
    y = solve_analyical(x,r)
    return r*y/(np.sum(y*y))

def gradient_L1(r,x):
    y= solve_opt(x,r,f,'L1')
    a = np.sign(y)
    return a/(a@a)

# Ln in numpy is not working properly, 
# however it pytorch version is working and tested by GradCheck library.
# please see ddn/pytorch/adaptive_projections
def gradient_Ln(r,x):
    y= solve_opt(x,r,f,'Ln')
    a = np.array([0 if np.abs(yi)<np.max(np.abs(y)) else 1 for yi in y])
    return a/(a@a)

def gradient_by_auto_diff(r,x,norm):
    fY = grad(f, 1)
    hY = grad(h,1)
    hR = grad(h,0)
    frY = jacobian(fY, 0)
    fYY = jacobian(fY, 1)
    hYY = jacobian(hY, 1)
    hrY= jacobian(hY, 0)
    y= solve_opt(x,r,f,norm)
    indx = np.nonzero(hY(r, y,norm))
    if len(indx[0]) == 0:
        nu= 0.0
    nu = fY(r, y, x)[indx[0][0]] / hY(r, y, norm)[indx[0][0]]
    H = fYY(r, y, x) - nu * hYY(r, y, norm)
    a = hY(r, y, norm)
    B = frY(r, y, x) - nu * hrY(r, y, norm)
    C = hR(r, y, norm)
    con = np.stack((a, B), axis=1)
    try:
        v = sci.linalg.solve(H, con, assume_a='pos')
    except:
         return np.full((2, 1), np.nan).squeeze()
    return (np.outer(v[:, 0], (v[:, 0].dot(B) - C) / v[:, 0].dot(a)) - v[:, 1:1 + 1]).squeeze()

Random generate x and r, check error between the gradient computed by different form.

In [8]:
x= np.array([-1.2419,  1.1800, -0.7854, -0.7418])
#r= random.uniform(0.1, 10)
r=np.array(0.6128)
print('x:',x)
print('r:',r)
print("error:")
print("Dy(r) analytical and autograd in L2: ", abs(np.sum(gradient_L2(r,x)-gradient_by_auto_diff(r,x,'L2') )))
print("Dy(r) analytical and autograd in L1: ", abs(np.sum(gradient_L1(r,x)-gradient_by_auto_diff(r,x,'L1') )))
print("Dy(r) analytical and autograd in Ln: ", abs(np.sum(gradient_Ln(r,x)-gradient_by_auto_diff(r,x,'Ln') )))

x: [-1.2419  1.18   -0.7854 -0.7418]
r: 0.6128
error:
Dy(r) analytical and autograd in L2:  1.2798437478145352e-06
Dy(r) analytical and autograd in L1:  0.0
Dy(r) analytical and autograd in Ln:  2.0


