Consider the following (neural network) training optimization problem. 

\begin{equation*}
\begin{aligned}
\text{minimize}_{x:=(z_1,z_2,w_1,w_2)}&\quad \left(\frac{1}{2N_{train}}\sum_{j=1}^{N_{train}}\left(w_1\sigma(u_j^Tz_1)+w_2\sigma(u_j^Tz_2)-v_j\right)^2\right)+\frac{\lambda}{2}\left(\|z_1\|^2+\|z_2\|^2+w_1^2+w_2^2\right).
\end{aligned}
\end{equation*}

The notation is as follows.

- $N_{train}$ denotes the number of training data points.


- $u_j \in \mathbb{R}^{784\times 1}$ is the data of the $j$th training image (equivalent to $j$th column of "U_train" in the code discussed on February 3)


- $v_j \in \{+1,-1\}$ denotes the true label of the $j$th image, indicating whether or not the image is the digit $5$


- $z_1 \in \mathbb{R}^{784\times 1}$ and $z_2 \in \mathbb{R}^{784\times 1}$


- $w_1 \in \mathbb{R}$ and $w_2 \in \mathbb{R}$ 


- $\sigma(y) = \dfrac{e^y-e^{-y}}{e^y+e^{-y}}$ for $y \in \mathbb{R}$


- $\lambda >0$ is a regularization parameter


- Note that the decision vector $x$ is given by $x:=(z_1,z_2,w_1,w_2)$ that is in $\mathbb{R}^{1570}$



Obtain the formula of the gradient mapping of the objective function. You may utilize the chain rule of calculus. Also, note that we have $\nabla_y \sigma (y) = 1-\sigma^2(y)$.

<span style="color:lightseagreen">
    
$\nabla f(x)=\frac{1}{2N}\sum_{j=1}^N \nabla f_j(x)=\frac{1}{2N}\sum_{j=1}^N\begin{bmatrix}
\left(w_1\left(1-\sigma^2(u_j^Tz_1)\right)u_j\times2\left(w_1\sigma(u_j^Tz_1)+w_2\sigma(u_j^Tz_2)-v_j\right)+2\lambda z_1\right)_{784\times1}\\
\left(w_2\left(1-\sigma^2(u_j^Tz_2)\right)u_j\times2\left(w_1\sigma(u_j^Tz_1)+w_2\sigma(u_j^Tz_2)-v_j\right)+2\lambda z_2\right)_{784\times1}\\
\left((\sigma(u_j^Tz_1))\times2\left(w_1\sigma(u_j^Tz_1)+w_2\sigma(u_j^Tz_2)-v_j\right)+2\lambda w_1\right)_{1\times1}\\
\left((\sigma(u_j^Tz_2))\times2\left(w_1\sigma(u_j^Tz_1)+w_2\sigma(u_j^Tz_2)-v_j\right)+2\lambda w_2\right)_{1\times1}\\
\end{bmatrix}_{1570\times1}$
    
</span>

<pre>
</pre>

In [5]:
# Initial iterate: an all-zero column with N0*N1 hidden-weight entries
# followed by N1 output-weight entries.
x_0 = np.zeros(shape=(N0 * N1 + N1, 1))

def convert_x_to_z_and_w(x):
    """Split the stacked decision vector into hidden weights and output weights.

    The decision vector is laid out as x = (z_1, ..., z_N1, w_1, ..., w_N1):
    N1 contiguous blocks of N0 hidden-layer weights, followed by the N1
    output weights.

    Args:
        x: column vector with N0*N1 + N1 rows.

    Returns:
        (z, w) where z is N0-by-N1 with column i equal to the i-th block
        z_i of x, and w is N1-by-1.
    """
    # Column-major ("F") order makes column i of z the contiguous slice
    # x[i*N0:(i+1)*N0]. A row-major reshape would instead interleave the
    # blocks (column i = x[i::N1]), which disagrees with the block order in
    # which stoch_grad_F stacks its gradient.
    z = np.reshape(x[:-N1, :], (N0, N1), order="F")
    w = np.reshape(x[-N1:, :], (N1, 1))
    return z, w

def sigma(x):
    """Hyperbolic-tangent activation: (e^y - e^-y) / (e^y + e^-y).

    Args:
        x: scalar or NumPy array of pre-activation values.

    Returns:
        tanh of ``x``, applied element-wise (same shape as ``x``).
    """
    # np.tanh evaluates the same ratio without forming exp(+/-x) explicitly,
    # so it saturates to +/-1 for large |x| and needs no overflow clipping.
    return np.tanh(x)

def local_obj_fj(x,j):
    """Regularized loss contributed by training sample j.

    Computes half the squared difference between the network output
    sum_i w_i * sigma(u_j^T z_i) and the label v_train[0, j], plus
    0.5 * lamda * (||z||_F^2 + ||w||^2).

    Args:
        x: stacked decision vector, split by convert_x_to_z_and_w.
        j: column index of the sample in U_train / v_train.
    """
    z, w = convert_x_to_z_and_w(x)
    u_j = U_train[:,[j]]

    # Accumulate the network output one hidden unit at a time.
    prediction = 0
    for i in range(N1):
        prediction = prediction + w[[i],:]*sigma(np.dot(u_j.T,z[:,[i]]))

    squared_error = (prediction - v_train[0,j])**2
    ridge = 0.5*lamda*(LA.norm(z,'fro')**2 + LA.norm(w)**2)
    return squared_error/2 + ridge

def obj_f(x):
    """Full objective: the mean of local_obj_fj(x, j) over j = 0, ..., N-1."""
    total = 0
    for j in range(N):
        total = total + local_obj_fj(x, j)
    return total/N

def stoch_grad_F(x,j):
    """Per-sample gradient term grad f_j(x) of the notebook's gradient formula.

    With residual r_j = sum_i w_i * sigma(u_j^T z_i) - v_j, the returned
    column stacks, in the order (z_1, ..., z_N1, w_1, ..., w_N1):

        z_i block:  w_i * (1 - sigma(u_j^T z_i)^2) * 2*r_j * u_j
        w_i entry:  sigma(u_j^T z_i) * 2*r_j

    and then adds 2*lamda*x for the regularizer. This is twice the gradient
    of local_obj_fj(x, j); true_grad supplies the remaining factor 1/2 when
    it averages these terms, as in grad f = 1/(2N) * sum_j grad f_j.

    Args:
        x: stacked decision vector, split by convert_x_to_z_and_w.
        j: column index of the sample in U_train / v_train.

    Returns:
        Column vector with the same number of rows as x.
    """
    z, w = convert_x_to_z_and_w(x)
    u_j = U_train[:,[j]]

    # Evaluate each hidden activation and the residual once and reuse them.
    activations = [sigma(np.dot(u_j.T, z[:,[i]])) for i in range(N1)]
    residual = sum(w[[i],:]*activations[i] for i in range(N1)) - v_train[0,j]

    # Chain rule on residual**2 yields 2*residual times the derivative of
    # the residual -- not residual**2.
    outer = 2*residual

    grad_z = [w[[i],:]*(1 - activations[i]**2)*outer*u_j for i in range(N1)]
    grad_w = [activations[i]*outer for i in range(N1)]
    stocha_grad = np.concatenate(grad_z + grad_w, axis=0)

    # Regularizer term of grad f_j is 2*lamda*x (halved later by true_grad).
    return stocha_grad + 2*lamda*x

def true_grad(x):
    """Average stoch_grad_F(x, j) over j = 0, ..., N-1 and halve the result.

    Mirrors the notebook's formula grad f(x) = 1/(2N) * sum_j grad f_j(x).
    """
    accumulated = 0
    for j in range(N):
        accumulated = accumulated + stoch_grad_F(x, j)
    mean_term = accumulated/N
    return mean_term/2

NameError: name 'N1' is not defined

In [6]:
# Lambda can be 10 ** -1 or -2