In [1]:
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
#from scipy.optimize import minimizeini

In [2]:
# Load the MATLAB-format dataset; the 'X' and 'y' entries are kept as the raw
# (unmodified) inputs and labels for the later cells.
data = sio.loadmat("ex4data1.mat")
raw_X = data['X']
raw_y = data['y']

In [3]:
# Insert a column of ones as the first column of the array so that the bias
# term b is picked up by the vectorized matrix product.
X = np.insert(raw_X,0,values=1,axis=1)

In [4]:
X.shape

(5000, 401)

In [5]:
raw_y

array([[10],
       [10],
       [10],
       ...,
       [ 9],
       [ 9],
       [ 9]], dtype=uint8)

## 1.对y进行one-hot编码处理

In [6]:
def one_hot_encoder(raw_y, num_labels=10):
    """One-hot encode 1-based integer class labels.

    Label ``k`` (``1 <= k <= num_labels``) becomes a row that is 1.0 in
    column ``k - 1`` and 0.0 everywhere else.

    Parameters
    ----------
    raw_y : array-like of int
        One label per sample, e.g. a column vector of shape (m, 1) or a flat
        vector of shape (m,). The input is flattened before encoding.
    num_labels : int, optional
        Number of classes, i.e. the width of each encoded row. Defaults to 10.

    Returns
    -------
    numpy.ndarray
        Float array of shape (m, num_labels).

    Raises
    ------
    ValueError
        If any label lies outside ``1..num_labels``.
    """
    # Widen to int64 first: the labels may be stored as uint8, where
    # ``label - 1`` would wrap around instead of going negative.
    labels = np.asarray(raw_y).astype(np.int64).ravel()
    if labels.size and (labels.min() < 1 or labels.max() > num_labels):
        raise ValueError(
            "labels must lie in 1..%d, got values in %d..%d"
            % (num_labels, labels.min(), labels.max())
        )
    encoded = np.zeros((labels.size, num_labels))
    # Fancy indexing sets exactly one entry per row: (row r, column label_r - 1).
    encoded[np.arange(labels.size), labels - 1] = 1
    return encoded

In [7]:
# Replace each integer label with its one-hot row for use in the cost/gradient.
y = one_hot_encoder(raw_y)

### y现在一行只有1和0两个数字，1所在的位置代表对应的数字

In [8]:
y

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [25]:
y.shape

(5000, 10)

## 2.序列化权重参数

In [9]:
# Load the weight file and pull out the two weight matrices stored under
# the 'Theta1' and 'Theta2' keys.
theta = sio.loadmat("ex4weights.mat")
theta1,theta2 = theta['Theta1'],theta['Theta2']

In [10]:
theta1,theta2

(array([[-2.25623899e-02, -1.05624163e-08,  2.19414684e-09, ...,
         -1.30529929e-05, -5.04175101e-06,  2.80464449e-09],
        [-9.83811294e-02,  7.66168682e-09, -9.75873689e-09, ...,
         -5.60134007e-05,  2.00940969e-07,  3.54422854e-09],
        [ 1.16156052e-01, -8.77654466e-09,  8.16037764e-09, ...,
         -1.20951657e-04, -2.33669661e-06, -7.50668099e-09],
        ...,
        [-1.83220638e-01, -8.89272060e-09, -9.81968100e-09, ...,
          2.35311186e-05, -3.25484493e-06,  9.02499060e-09],
        [-7.02096331e-01,  3.05178374e-10,  2.56061008e-09, ...,
         -8.61759744e-04,  9.43449909e-05,  3.83761998e-09],
        [-3.50933229e-01,  8.85876862e-09, -6.57515140e-10, ...,
         -1.80365926e-06, -8.14464807e-06,  8.79454531e-09]]),
 array([[-0.76100352, -1.21244498, -0.10187131, -2.36850085, -1.05778129,
         -2.20823629,  0.56383834,  1.21105294,  2.21030997,  0.44456156,
         -1.18244872,  1.04289112, -1.60558756,  1.30419943,  1.37175046,
       

In [11]:
theta1.shape,theta2.shape

((25, 401), (10, 26))

### 这个函数是一个序列化函数，它用于将两个数组 a 和 b 扁平化后进行串联，返回一个串联后的一维数组。

In [12]:
def serialize(a,b):
    """Flatten two arrays and join them into one 1-D array.

    The elements of ``a`` (row-major order) come first, followed by the
    elements of ``b``. A new array is returned; the inputs are not modified.
    """
    flat_parts = (a.ravel(), b.ravel())
    return np.concatenate(flat_parts)

In [13]:
# Pack both weight matrices into a single flat parameter vector and show its length.
theta_serialize = serialize(theta1,theta2)
theta_serialize.shape

(10285,)

## 3.解序列化操作

In [14]:
def deserialize(theta_serialize, input_size=400, hidden_size=25, num_labels=10):
    """Split a flat parameter vector back into the two weight matrices.

    This is the inverse of ``serialize(theta1, theta2)``: the first
    ``hidden_size * (input_size + 1)`` entries form ``theta1`` and the
    remaining entries form ``theta2``.

    Parameters
    ----------
    theta_serialize : array-like
        1-D vector of length
        ``hidden_size * (input_size + 1) + num_labels * (hidden_size + 1)``.
    input_size : int, optional
        Number of input features, not counting the bias column. Defaults to 400.
    hidden_size : int, optional
        Number of hidden units, not counting the bias unit. Defaults to 25.
    num_labels : int, optional
        Number of output units. Defaults to 10.

    Returns
    -------
    (theta1, theta2) : tuple of numpy.ndarray
        ``theta1`` has shape (hidden_size, input_size + 1) and ``theta2`` has
        shape (num_labels, hidden_size + 1). With the defaults these are
        (25, 401) and (10, 26).

    Raises
    ------
    ValueError
        If the vector length does not match the requested layer sizes.
    """
    flat = np.asarray(theta_serialize)
    theta1_shape = (hidden_size, input_size + 1)
    theta2_shape = (num_labels, hidden_size + 1)
    split = theta1_shape[0] * theta1_shape[1]
    expected = split + theta2_shape[0] * theta2_shape[1]
    if flat.size != expected:
        raise ValueError(
            "expected %d parameters for shapes %s and %s, got %d"
            % (expected, theta1_shape, theta2_shape, flat.size)
        )
    theta1 = flat[:split].reshape(theta1_shape)
    theta2 = flat[split:].reshape(theta2_shape)
    return theta1,theta2

In [15]:
# Round-trip check: rebuild the two weight matrices from the flat vector.
theta1,theta2 = deserialize(theta_serialize)

In [16]:
theta1.shape,theta2.shape

((25, 401), (10, 26))

## 4.前向传播

In [17]:
def sigmod(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``, applied element-wise.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    numpy.ndarray or numpy scalar
        Values in [0, 1] with the same shape as ``z``.
    """
    # expit evaluates the same function without overflowing exp(-z) for
    # large negative z, which the literal formula does (RuntimeWarning).
    from scipy.special import expit
    return expit(z)

In [18]:
def feed_forward(theta_serialize,X):
    """Run the forward pass of the two-layer network.

    Parameters
    ----------
    theta_serialize : numpy.ndarray
        Flat parameter vector; it is split into the hidden-layer and
        output-layer weight matrices by ``deserialize``.
    X : numpy.ndarray
        2-D input matrix, one sample per row, with the bias column already
        included (its column count must match the hidden-layer weights).

    Returns
    -------
    tuple
        ``(a1, z2, a2, z3, h)``: the input, the hidden pre-activation, the
        hidden activation with a leading column of ones, the output
        pre-activation, and the output activation.
    """
    w_hidden, w_out = deserialize(theta_serialize)
    a1 = X
    z2 = a1 @ w_hidden.T
    hidden_act = sigmod(z2)
    # Prepend the bias unit (a column of ones) to the hidden activations.
    bias_col = np.ones((hidden_act.shape[0], 1), dtype=hidden_act.dtype)
    a2 = np.concatenate((bias_col, hidden_act), axis=1)
    z3 = a2 @ w_out.T
    return a1, z2, a2, z3, sigmod(z3)

In [22]:
# Forward pass over X with the loaded weights; h holds the output-layer activations.
a1,z2,a2,z3,h = feed_forward(theta_serialize,X)

In [34]:
h

array([[1.12661530e-04, 1.74127856e-03, 2.52696959e-03, ...,
        4.01468105e-04, 6.48072305e-03, 9.95734012e-01],
       [4.79026796e-04, 2.41495958e-03, 3.44755685e-03, ...,
        2.39107046e-03, 1.97025086e-03, 9.95696931e-01],
       [8.85702310e-05, 3.24266731e-03, 2.55419797e-02, ...,
        6.22892325e-02, 5.49803551e-03, 9.28008397e-01],
       ...,
       [5.17641791e-02, 3.81715020e-03, 2.96297510e-02, ...,
        2.15667361e-03, 6.49826950e-01, 2.42384687e-05],
       [8.30631310e-04, 6.22003774e-04, 3.14518512e-04, ...,
        1.19366192e-02, 9.71410499e-01, 2.06173648e-04],
       [4.81465717e-05, 4.58821829e-04, 2.15146201e-05, ...,
        5.73434571e-03, 6.96288990e-01, 8.18576980e-02]])

In [24]:
h.shape

(5000, 10)

## 5.损失函数

### 5-1.不带正则化的损失函数

In [19]:
def cost(theta_serialize,X,y):
    """Unregularized cross-entropy cost of the network, averaged over samples.

    Parameters
    ----------
    theta_serialize : numpy.ndarray
        Flat parameter vector passed on to ``feed_forward``.
    X : numpy.ndarray
        Input matrix, one sample per row.
    y : numpy.ndarray
        Targets with the same shape as the network output (one-hot rows).

    Returns
    -------
    float
        ``-sum(y*log(h) + (1-y)*log(1-h)) / len(X)``.
    """
    a1,z2,a2,z3,h = feed_forward(theta_serialize,X)
    # Keep h strictly inside (0, 1): a saturated sigmoid output of exactly
    # 0 or 1 would otherwise give log(0) = -inf and a nan/inf cost.
    eps = np.finfo(h.dtype).eps
    h = np.clip(h, eps, 1 - eps)
    J = -np.sum(y*np.log(h) + (1-y)*np.log(1-h))/len(X)
    return J

In [20]:
# Unregularized cost of the loaded weights on the full dataset.
J = cost(theta_serialize,X,y)

In [21]:
J

0.2876291651613189

### 5-2.带正则化的损失函数

In [30]:
def reg_cost(theta_serialize,X,y,lamda):
    """Cross-entropy cost plus an L2 penalty on the non-bias weights.

    Parameters
    ----------
    theta_serialize : numpy.ndarray
        Flat parameter vector; both the data term and the penalty are
        computed from this argument.
    X : numpy.ndarray
        Input matrix, one sample per row.
    y : numpy.ndarray
        One-hot targets.
    lamda : float
        Regularization strength.

    Returns
    -------
    float
        ``cost(theta_serialize, X, y) + lamda / (2 * len(X)) * sum(w**2)``,
        where the sum runs over every weight except the bias columns.
    """
    # Rebuild the matrices from the argument rather than reading notebook
    # globals, so the penalty tracks the parameters being evaluated.
    theta1,theta2 = deserialize(theta_serialize)
    # Skip the first column of each weight matrix: those are the bias
    # terms, which are not regularized.
    sum1 = np.sum(np.power(theta1[:,1:],2))
    sum2 = np.sum(np.power(theta2[:,1:],2))
    reg = (sum1 + sum2)*lamda / (2*len(X))
    return cost(theta_serialize,X,y) + reg

In [31]:
# Regularized cost of the loaded weights with lamda = 1.
reg_cost(theta_serialize,X,y,1)

0.38376985909092365

## 6.反向传播

### 6.1 无正则化的梯度

In [35]:
def gra_sigmoid(z):
    """Derivative of the sigmoid at ``z``: ``s * (1 - s)`` with ``s = sigmod(z)``."""
    s = sigmod(z)
    return s*(1-s)

In [None]:
def gradient(theta_serialize,X,y):
    """Unregularized backpropagation gradient of the cost.

    Parameters
    ----------
    theta_serialize : numpy.ndarray
        Flat parameter vector.
    X : numpy.ndarray
        Input matrix, one sample per row.
    y : numpy.ndarray
        One-hot targets with the same shape as the network output.

    Returns
    -------
    numpy.ndarray
        Flat vector holding the gradient for the first weight matrix followed
        by the gradient for the second, in the layout produced by ``serialize``.
    """
    m = len(X)
    _, w_out = deserialize(theta_serialize)
    a1,z2,a2,z3,h = feed_forward(theta_serialize,X)
    delta_out = h - y
    # Propagate the output error back through the non-bias output weights and
    # scale by sigmoid'(z2). (The original note here wondered whether a2
    # belongs in this spot; sigmoid'(z2) equals a2[:,1:]*(1-a2[:,1:]) because
    # a2 is sigmod(z2) with a bias column prepended, so z2 is correct.)
    delta_hidden = (delta_out @ w_out[:,1:]) * gra_sigmoid(z2)
    grad_theta1 = (delta_hidden.T @ a1) / m
    grad_theta2 = (delta_out.T @ a2) / m
    return serialize(grad_theta1, grad_theta2)

### 6.2 带正则化的梯度