In [168]:
import numpy as np
import sys

sys.path.append('./book_material')
from dataset.mnist import *
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label = True)

##wb是模型本身的参数因此放在构造函数中无需手动进行更改 其他则由输入的x决定
class Affine:
    def __init__(self,w,b):
        self.w=w
        self.b=b
        self.x=None
        self.dw=None
        self.db=None
        self.dx=None
    def forward(self,x):
        if x.ndim==1:
            x = x.reshape(1,-1)
        self.x=x
        output=np.dot(self.x, self.w)+self.b
        return output
    def backward(self,d_out):

        self.dx=np.dot(d_out,self.w.T)
        self.dw=np.dot(self.x.T,d_out)
        self.db=np.sum(d_out,axis=0)
        return self.dx

class Relu:
    def __init__(self):
        self.mask=None
    def forward(self,x):
        self.mask=x>0
        out=self.mask*x
        return out
    def backward(self,d_out):
        dx=self.mask*d_out
        return dx



class Softmaxwithloss:
    def __init__(self):
        self.t=None
        self.x=None
        self.y=None
        self.dx=None
        self.batch_size = None
        self.w_rate=0.1
        self.w=None
    def forward(self,x):
        if x.ndim==1:
            x = x.reshape(1,-1)
        self.x=x-np.max(x,axis=-1,keepdims=True)
        self.batch_size = x.shape[0]
        self.y=np.exp(self.x)/np.sum(np.exp(self.x),axis=-1,keepdims=True)

        return self.y
    def loss(self,x,t):



        out=self.forward(x)
        self.t=t
        if t.ndim!=1:
            loss_rate=np.sum(-self.t*np.log(out+1e-7))/self.batch_size
        else:
            loss_rate=-np.sum(np.log(out[np.arange(len(t)),t]+1e-7))/self.batch_size


        return  loss_rate
    
    def backward(self,t=None):

        self.t=t
        if self.t.ndim!=1:
            dx = (self.y - self.t) / self.batch_size
        else:
            y_c=self.y.copy()

            y_c[np.arange(len(self.t)),self.t] -=1
            dx=y_c/self.batch_size

        self.dx=dx

        return self.dx

class MultiLayersNetwork:
    def __init__(self, input_size, output_size, hidden_size_list=None,weight=1):
        if hidden_size_list is None:
            self.hidden_size_list = [100, 100, 100]
        else:
            self.hidden_size_list = hidden_size_list
        self.input_size = input_size

        self.output_size = output_size
        self.params=dict()
        self.sourcedata=None
        self.layers= dict()
        self.weight=weight
        self.w_dict=None
        #生成层
        parameter_size_list=self.hidden_size_list
        parameter_size_list.insert(0,input_size)
        parameter_size_list.append(self.output_size)
        self.hidden_size_list=parameter_size_list

        for i in range(len(parameter_size_list) - 1): # 遍历所有 Affine 层
            scale = np.sqrt(self.weight / parameter_size_list[i])
            self.params['W' + str(i)] = np.random.randn(parameter_size_list[i], parameter_size_list[i + 1]) * scale # 使用 randn 初始化
            self.params['b' + str(i)] = np.zeros(parameter_size_list[i + 1]) # 偏置初始化为 0
            self.layers['affine' + str(i)] = Affine(self.params['W' + str(i)], self.params['b' + str(i)])

            if i < len(parameter_size_list) - 2: # 除了最后一层 Affine，都添加 Relu
                self.layers['relu' + str(i)] = Relu()
            else: # 最后一层 Affine 之后添加 Softmaxwithloss
                self.layers['Activation_function'] = Softmaxwithloss()


        # #倒数第二层之前全部用relu
        # for i in range(len(parameter_size_list)-2):
        #     scale = np.sqrt(weight / parameter_size_list[i])
        #     self.params['W'+str(i)]=np.random.randn(parameter_size_list[i],parameter_size_list[i+1])*scale
        #     self.params['b'+str(i)]=np.zeros(parameter_size_list[i+1])
        #     #每一层就自己层的权重和偏置
        #     self.layers['affine'+str(i)]=Affine(self.params['W'+str(i)],self.params['b'+str(i)])
        #     self.layers['relu'+str(i)]=Relu()
        #
        # if i==len(parameter_size_list)-3:
        #     i+=1
        #     scale = np.sqrt(1.0 / parameter_size_list[i])
        #     self.params['W'+str(i)]=np.random.randn(parameter_size_list[i],parameter_size_list[i+1])*scale
        #     self.params['b'+str(i)]=np.zeros(parameter_size_list[i+1])
        #     self.layers['affine'+str(i)]=Affine(self.params['W'+str(i)],self.params['b'+str(i)])
        #     self.layers['Activation_function']=Softmaxwithloss()


    def predict(self, x):
        inputs=x
        for key,func in self.layers.items():
            inputs=func.forward(inputs)
        return inputs

    def loss(self,x,t,weight_decay_lambda=0):
        w_decay=0
        for w_key in self.params.keys():
            if 'W' in w_key:
                w_decay += 0.5*weight_decay_lambda * np.sum(self.params[w_key]**2)

        inputs=x
        for key,func in self.layers.items():
            if key=='Activation_function':
                loss_value=func.loss(inputs,t)+w_decay
            else:
                inputs=func.forward(inputs)



        return loss_value

    def backward(self,t,d_out=1):

        back_list=list(self.layers.keys())
        back_list.reverse()
        d_out=d_out
        for key in back_list:
            if key=='Activation_function':

                d_out=self.layers[key].backward(t=t)

            else:
                d_out=self.layers[key].backward(d_out)

    def gradient(self,t,weight_decay_lambda=0):

        self.backward(d_out=1,t=t)
        grads=dict()
        for idx in range(len(self.hidden_size_list)-1):
            grads['W'+str(idx)]=self.layers['affine'+str(idx)].dw+weight_decay_lambda*(self.params['W'+str(idx)])
            grads['b'+str(idx)]=self.layers['affine'+str(idx)].db
        return grads
    def accuracy(self,x,t):
        if t.ndim!=1: t=np.argmax(t,axis=1)
        y=np.argmax(self.predict(x),axis=1)
        return np.sum(y==t)/y.shape[0]



mln7=MultiLayersNetwork(input_size=784,output_size=10,hidden_size_list=[100,100,100,100,100,100,100],weight=2)

mask=np.random.choice(60000,size=100)
x_mask=x_train[mask]
t_mask=t_train[mask]
loss_list=[]
for i in range(500):

    loss_list.append(mln7.loss(x_mask,t_mask,weight_decay_lambda=0.01))
    grads=mln7.gradient(t=t_mask)
    for key in grads.keys():
        mln7.params[key] -=0.1*grads[key]
loss_list

[np.float64(9.65615195198184),
 np.float64(9.25664739601985),
 np.float64(9.122695708252897),
 np.float64(9.002863552343008),
 np.float64(8.938651284807008),
 np.float64(9.04997517320442),
 np.float64(9.418782175594025),
 np.float64(9.180413841401636),
 np.float64(8.964067641950509),
 np.float64(8.775051381526435),
 np.float64(8.596785851279913),
 np.float64(8.45672937168652),
 np.float64(8.516599017098716),
 np.float64(8.734937388584669),
 np.float64(8.68459239325705),
 np.float64(8.399438371247133),
 np.float64(8.241906022998153),
 np.float64(8.025905368804668),
 np.float64(8.271424529095164),
 np.float64(9.43672435490759),
 np.float64(9.119920572314744),
 np.float64(8.822409227480213),
 np.float64(8.433838161428925),
 np.float64(8.228495044214675),
 np.float64(8.035107269102461),
 np.float64(7.92579565673385),
 np.float64(7.839317136037325),
 np.float64(7.951216295782973),
 np.float64(8.172951401387486),
 np.float64(8.202536158314503),
 np.float64(7.9394347124722415),
 np.float64(7.

In [169]:
mln7.accuracy(x_mask,t_mask)
mln7.accuracy(x_test,t_test)

np.float64(0.6856)

In [108]:
#0.01方差正态分布
mln=MultiLayersNetwork(input_size=784,output_size=10,hidden_size_list=[100,100],weight=0.1)
mln.accuracy(x_test,t_test)
mln.loss(x_test,t_test)
loss_list=[]
for i in range(20):
    loss_list.append(mln.loss(x_test,t_test))
    grads=mln.gradient(t=t_test)
    for key in grads.keys():
        mln.params[key] -=0.1*grads[key]


In [82]:
###Xavier
mln=MultiLayersNetwork(input_size=784,output_size=10,hidden_size_list=[100,100],weight=1)
mln.accuracy(x_test,t_test)
mln.loss(x_test,t_test)
loss_list=[]
for i in range(20):
    loss_list.append(mln.loss(x_train,t_train))
    grads=mln.gradient(t=t_train)
    for key in grads.keys():
        mln.params[key] -=0.1*grads[key]


KeyboardInterrupt: 

In [112]:
###HE
mln=MultiLayersNetwork(input_size=784,output_size=10,hidden_size_list=[100,100],weight=2)
mln.accuracy(x_test,t_test)
mln.loss(x_test,t_test)
loss_list=[]
for i in range(20):
    loss_list.append(mln.loss(x_train,t_train))
    grads=mln.gradient(t=t_train)
    for key in grads.keys():
        mln.params[key] -=0.1*grads[key]


KeyboardInterrupt: 

### 当训练量较少，但是层数过多时发生的过拟合 300样本，七层网络

In [125]:
mask=np.random.choice(300,size=60000)
x_mask=x_train[mask]
t_mask=t_train[mask]
mln7=MultiLayersNetwork(input_size=784,output_size=10,hidden_size_list=[100,100,100],weight=1)
mln7.accuracy(x_mask,t_mask)
mln7.loss(x_mask,t_mask)
loss_list=[]
for i in range(100):
    loss_list.append(mln7.loss(x_mask,t_mask,w_dict=True))
    grads=mln7.gradient(t=t_mask)
    for key in grads.keys():
        mln7.params[key] -=0.1*grads[key]

{'W3': array([[-1.67502582e-02, -5.90624451e-02,  6.97634106e-02,
        -9.31957455e-02, -8.74001981e-02, -9.97067515e-02,
        -1.84763995e-01, -8.54078224e-02, -1.24587972e-02,
         3.61153356e-02],
       [-1.28506720e-03, -4.24574173e-02,  1.20171001e-02,
        -1.27603745e-04, -2.21936955e-01, -7.22236839e-03,
         8.91941187e-02, -4.99702711e-02,  2.97199307e-02,
         2.78473541e-02],
       [ 9.40518284e-02, -5.47133746e-02, -5.22683086e-02,
         1.07896146e-01, -4.28396229e-02,  7.25810323e-02,
         8.81298181e-03,  1.84915710e-03,  5.36693744e-02,
         3.52941006e-02],
       [ 5.84891742e-02,  5.54133672e-02,  5.75297086e-02,
        -1.30892451e-02,  9.63717764e-02, -3.02854902e-01,
         1.21401109e-01,  6.71452935e-02, -1.40310108e-01,
        -1.66677052e-02],
       [-5.64932528e-02,  5.81929962e-02,  1.82648688e-01,
         1.15810209e-01, -3.13321170e-02, -7.21777394e-02,
        -1.06246194e-01,  1.74962642e-02, -4.85901639e-02,
    

In [129]:
mln7.accuracy(x_mask,t_mask)

np.float64(0.11416666666666667)

### 权值衰减

In [128]:
loss_list

[np.float64(2.8045440728959727),
 np.float64(553039.7270136209),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),
 np.float64(5.532166632608758e+35),