In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from tqdm import tqdm

In [2]:
from LeNet import LeNet
from data_process import load_data,data_convert
from evaluate import softmax,cal_accuracy

In [3]:
mnist_dir = "./mnist_data/"
train_data_dir = "train-images.idx3-ubyte"
train_label_dir = "train-labels.idx1-ubyte"
test_data_dir = "t10k-images.idx3-ubyte"
test_label_dir = "t10k-labels.idx1-ubyte"

In [4]:
train_images, train_labels, test_images, test_labels = load_data(mnist_dir, train_data_dir, train_label_dir, test_data_dir, test_label_dir)
print("Got data. ") 

Loading MNIST data from files...
./mnist_data/train-images.idx3-ubyte
Load images from ./mnist_data/train-images.idx3-ubyte, number: 60000, data shape: (60000, 784)
Load images from ./mnist_data/train-labels.idx1-ubyte, number: 60000, data shape: (60000, 1)
Load images from ./mnist_data/t10k-images.idx3-ubyte, number: 10000, data shape: (10000, 784)
Load images from ./mnist_data/t10k-labels.idx1-ubyte, number: 10000, data shape: (10000, 1)
Got data. 


In [None]:
for i in range(2):
    img = np.reshape(train_images [i, :], (28, 28))
    label = np.argmax(train_images [i, :])
    plt.matshow(img, cmap = plt.get_cmap('gray'))
    plt.figure(figsize=(1,1))
    plt.show()


In [5]:
x,y = data_convert(train_images, train_labels,60000,10)

In [6]:
def shuffle_batch(batch_size):

    index = np.random.randint(0,len(x),batch_size)
    return x[index],y.T[index].T

In [56]:
def cal_accuracy(params,x_val,y_val):
    model = LeNet()
    model.set_params(params)
    y_pred = model.fit(x_val,x_val.shape[0])
    y_pred = y_pred.argmax(axis=1)
    # TODO: Compute the accuracy among the test set and store it in acc
    y_val=y_val.reshape(len(y_val))
    res = y_pred-y_val    #相减，相同的项相减后为0
    incorrect = np.count_nonzero(res)   #计算有多少个不为0的项，即y_pred和y不相同的项的个数
    acc = 1-incorrect/len(y_val)         #计算正确率
    
    return acc

In [24]:
acc = cal_accuracy(model,test_images[0:1000],test_labels[0:1000])

In [25]:
acc

0.10799999999999998

In [9]:
model = LeNet()

In [10]:
y_pred = model.fit(x[0:500],500)

In [None]:
model1 = LeNet5()

In [40]:
y_pred.sum()

18109420.93518278

In [43]:
loss,grad,y_pred = softmax(y_pred,y[:,0:500])

In [45]:
grad.shape

(500, 10)

In [46]:
model.Output.input.shape

(500, 84)

In [47]:
model.back_prop(grad)

In [48]:
X_train,y_train = shuffle_batch(batch_size)

In [49]:
X_train.shape

(256, 784)

In [50]:
y_train.shape

(10, 256)

In [68]:
y_pred = model.fit(X_train,batch_size)

In [75]:
loss, grad, y_pred = softmax(y_pred, y_train)

In [76]:
loss

2.296080310884536

In [78]:
model.fit(test_images[0:1000],1000)

array([[ -311038.26092776,  2347997.88797649, -2974666.4590697 , ...,
         5313881.22390467, -1448695.98336833, -3506334.66723409],
       [ -758438.50679917,  4092248.24306851, -4572812.96937503, ...,
         3992572.43368527,  -453619.59396196, -3663667.28934029],
       [-3473010.34364727,  5395064.43221974, -3528840.92945717, ...,
         1462421.31534039, -1940125.20047131, -2705539.86555242],
       ...,
       [  559300.09698867,  3857175.45142322, -6556484.41845315, ...,
         7009696.7194365 ,  -842266.61207645, -6413441.87842566],
       [-2857349.19972392,  5163445.26056176, -6140983.65689367, ...,
         5135980.56540975, -1590087.40999815,  -781562.56580953],
       [ 1360533.60968179,  1731876.33502661,   680191.8657714 , ...,
         5275267.33554773, -2118652.76435024, -1849100.40695828]])

In [12]:
batch_size = 256

In [93]:
X_train,y_train = shuffle_batch(batch_size)
y_pred = model.fit(X_train,batch_size)

loss, grad, y_pred = softmax(y_pred, y_train)
model.back_prop(grad)
model.update(0.01)

In [12]:
from model import LeNet5

In [20]:
X_train,y_train = shuffle_batch(batch_size)

In [68]:
test = LeNet5()
test1 = LeNet()

In [21]:
test.forward(X_train.reshape(batch_size,1,28,28))

array([[ 1.86463489e-12,  4.84332626e-12, -1.15880767e-11, ...,
        -2.62081587e-11,  3.42113586e-11,  9.51039950e-12],
       [ 1.21048123e-11,  2.07958146e-12, -8.72917644e-12, ...,
        -2.31779807e-11,  1.35778951e-11,  3.66870632e-12],
       [ 1.50951152e-11,  1.85397744e-12, -2.43616250e-11, ...,
        -3.49754897e-11,  2.95744043e-11, -5.33663434e-12],
       ...,
       [ 1.81419571e-11, -1.07726864e-11, -8.94751678e-12, ...,
        -3.65070293e-11,  2.81650830e-11,  2.83181743e-11],
       [ 6.49723682e-12,  5.89989282e-13, -9.60876514e-12, ...,
        -1.40868091e-11,  2.50974280e-11,  1.24962576e-11],
       [ 1.08262102e-11,  1.54681872e-11, -2.49178731e-11, ...,
        -4.12261834e-11,  2.76279702e-11,  7.06280136e-12]])

In [22]:
test1.set_params(test.get_params())

get


In [23]:
test1.fit(X_train,batch_size)

array([[ 1.86463489e-12,  4.84332626e-12, -1.15880767e-11, ...,
        -2.62081587e-11,  3.42113586e-11,  9.51039950e-12],
       [ 1.21048123e-11,  2.07958146e-12, -8.72917644e-12, ...,
        -2.31779807e-11,  1.35778951e-11,  3.66870632e-12],
       [ 1.50951152e-11,  1.85397744e-12, -2.43616250e-11, ...,
        -3.49754897e-11,  2.95744043e-11, -5.33663434e-12],
       ...,
       [ 1.81419571e-11, -1.07726864e-11, -8.94751678e-12, ...,
        -3.65070293e-11,  2.81650830e-11,  2.83181743e-11],
       [ 6.49723682e-12,  5.89989282e-13, -9.60876514e-12, ...,
        -1.40868091e-11,  2.50974280e-11,  1.24962576e-11],
       [ 1.08262102e-11,  1.54681872e-11, -2.49178731e-11, ...,
        -4.12261834e-11,  2.76279702e-11,  7.06280136e-12]])

In [69]:
y_pred = test1.fit(X_train,batch_size)

In [72]:
y_pred

array([[ 11493.5363824 ,  25461.11848911,  14904.68775691, ...,
          6208.57045711, -13391.94105291,  23778.64055244],
       [  5109.38427061,  24979.63484023,   5357.09949956, ...,
         -9106.51029215,  -6597.68246864,  30087.41446535],
       [ 10427.09133908,  18355.39163483,  12541.82215144, ...,
          8359.48601334, -15055.07925902,  29390.6863899 ],
       ...,
       [ 28533.82198856,  22153.44759396,  13839.07520864, ...,
          3781.66221755, -22845.75581953,  37344.15831072],
       [ 17578.66646593,   9612.40138327,   9372.31154885, ...,
         -2378.65114922, -16464.27826381,  25525.14221269],
       [-11683.7889507 ,  31807.87485058,  22716.97984874, ...,
         -2773.98723976, -29644.3436814 ,  23438.2849941 ]])

In [71]:
y_pred += 1e-5

In [59]:
np.exp(y_pred)

array([[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        2.55877435e-90, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [57]:
y_pred = y_pred - y_pred.max(axis=1)[:,None] #防止溢出

In [26]:
loss, grad, acc = softmax(y_pred, y_train)

In [29]:
loss, grad, acc = softmax_loss(y_pred, y_train.argmax(axis=0))

In [30]:
loss

2.3025850929926275

In [31]:
grad

array([[ 0.00039063, -0.00351562,  0.00039062, ...,  0.00039062,
         0.00039063,  0.00039063],
       [ 0.00039063, -0.00351563,  0.00039062, ...,  0.00039062,
         0.00039063,  0.00039063],
       [ 0.00039063,  0.00039063,  0.00039062, ...,  0.00039062,
         0.00039063,  0.00039062],
       ...,
       [ 0.00039063,  0.00039062, -0.00351563, ...,  0.00039062,
         0.00039063,  0.00039063],
       [ 0.00039063, -0.00351563,  0.00039062, ...,  0.00039062,
         0.00039063,  0.00039063],
       [ 0.00039063,  0.00039063,  0.00039062, ...,  0.00039062,
         0.00039063,  0.00039063]])

In [91]:
grad1 = grad/256

In [92]:
grad1

array([[ 0.00039062,  0.00039062,  0.00039063, ...,  0.00039062,
         0.00039062,  0.00039063],
       [-0.00351563,  0.00039062,  0.00039062, ...,  0.00039062,
         0.00039062,  0.00039063],
       [ 0.00039062,  0.00039062,  0.00039062, ...,  0.00039062,
         0.00039062,  0.00039063],
       ...,
       [ 0.00039062,  0.00039062,  0.00039062, ...,  0.00039062,
         0.00039062,  0.00039063],
       [-0.00351562,  0.00039062,  0.00039063, ...,  0.00039062,
         0.00039062,  0.00039063],
       [ 0.00039062,  0.00039062,  0.00039062, ...,  0.00039062,
        -0.00351563,  0.00039063]])

In [165]:
test.backward(grad)

In [166]:
test1.back_prop(grad)

In [95]:
acc = cal_accuracy(model.get_params(),test_images[0:1000],test_labels[0:1000])

In [96]:
acc

0.09899999999999998

In [73]:
def softmax(y_pred,y):
    batch_size ,_ = y_pred.shape
    y_pred = y_pred / y_pred.max(axis=1)[:,None] #防止溢出
    y_pred +=1e-5
    y_pred = np.exp(y_pred)
    y_sum = y_pred.sum(axis = 1)
    y_pred = y_pred/y_sum[:,None]
    loss = -np.log(y_pred).T * y
    loss = loss.sum()/batch_size
    grad = y_pred - y.T
    grad /= batch_size
    acc = (y_pred.argmax(axis=1) == y.argmax(axis=0)).mean()
    return loss,grad,acc

In [18]:
def softmax_loss(y_pred, y):
    # y_pred: (N, C)
    # y: (N, 1)
    N = y_pred.shape[0]
    ex = np.exp(y_pred)
    sumx = np.sum(ex, axis=1)
    loss = np.mean(np.log(sumx)-y_pred[range(N), list(y)])
    grad = ex/sumx.reshape(N, 1)
    grad[range(N), list(y)] -= 1
    grad /= N
    acc = np.mean(np.argmax(ex/sumx.reshape(N, 1), axis=1) == y.reshape(1, y.shape[0]))
    return loss, grad, acc

In [152]:
(y_pred.argmax(axis=1) == y_train.argmax(axis=0)).mean()

0.1171875

In [140]:
loss,grad,_ = softmax(y_pred,y_train)

In [147]:
y_train.shape

(10, 256)

In [141]:
loss

2.3106604049807493

In [142]:
grad.sum()

1.3183898417423734e-15

In [143]:
loss,grad,_ = softmax_loss(y_pred,y_train.argmax(axis=0))

In [144]:
loss

2.2982749070385187

In [145]:
grad.sum()

-5.149960319306146e-18

In [None]:
pbar = tqdm(range(0, int(x.shape[0]/batch_size)), ncols=100)
for i in pbar:
    X_train,y_train = shuffle_batch(batch_size)
    y_pred = model.fit(X_train,batch_size)

    loss, grad, acc = softmax(y_pred, y_train)
    model.back_prop(grad)
    model.update(0.1)
    pbar.set_postfix(loss=loss, acc=acc)

In [14]:
def cal_accuracy(model,x_val,y_val):
    y_pred = model.fit(x_val,x_val.shape[0])
    y_pred = y_pred.argmax(axis=1)
    # TODO: Compute the accuracy among the test set and store it in acc
    y_val=y_val.reshape(len(y_val))
    res = y_pred-y_val    #相减，相同的项相减后为0
    incorrect = np.count_nonzero(res)   #计算有多少个不为0的项，即y_pred和y不相同的项的个数
    acc = 1-incorrect/len(y_val)         #计算正确率
    
    return acc

In [13]:
X_train,y_train = shuffle_batch(batch_size)
y_pred = model.fit(X_train,batch_size)

loss, grad, y_pred = softmax(y_pred, y_train)
#acc = cal_accuracy(model,test_images,test_labels)

In [18]:
loss

2.397517375194011

In [28]:
acc = cal_accuracy(model,test_images[:1000],test_labels[:1000])

In [47]:
acc

0.15200000000000002

In [9]:
class Adam:
    def __init__(self, model, lr=1e-3, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.model = model
        self.m = None
        self.v = None
        self.params = None
        self.grad = None

    def step(self):
        self.params = model.get_params()
        self.grad = model.get_grad()
        if self.m is None:
            self.m, self.v = [], []
            for param in self.params:
                self.m.append(np.zeros_like(param))
            for g in self.grad:
                self.v.append(np.zeros_like(g))
            assert(len(self.m) == len(self.params))
            assert(len(self.v) == len(self.grad))

        self.iter += 1
        lr_t = self.lr * np.sqrt(1.0 - self.beta2 ** self.iter) / (1.0 - self.beta1 ** self.iter)

        for i in range(len(self.params)):
            self.m[i] += (1 - self.beta1) * (self.grad[i] - self.m[i])
            self.v[i] += (1 - self.beta2) * (self.grad[i] ** 2 - self.v[i])
            self.params[i] -= lr_t * self.m[i] / (np.sqrt(self.v[i]) + 1e-7)

In [10]:
class Adam1:
    def __init__(self, params, lr=1e-3, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.m = None
        self.v = None
        self.params_grad = params

    def step(self):
        if self.m is None:
            self.m, self.v = [], []
            for param in self.params_grad:
                self.m.append(np.zeros_like(param['value']))
                self.v.append(np.zeros_like(param['grad']))

        self.iter += 1
        lr_t = self.lr * np.sqrt(1.0 - self.beta2 ** self.iter) / (1.0 - self.beta1 ** self.iter)

        for i in range(len(self.params_grad)):
            self.m[i] += (1 - self.beta1) * (self.params_grad[i]['grad'] - self.m[i])
            self.v[i] += (1 - self.beta2) * (self.params_grad[i]['grad'] ** 2 - self.v[i])
            self.params_grad[i]['value'] -= lr_t * self.m[i] / (np.sqrt(self.v[i]) + 1e-7)



In [33]:
batch_size = 256


model = LeNet()
optimizer = Adam(model)

In [44]:
X_train,y_train = shuffle_batch(batch_size)
y_pred = model.fit(X_train,batch_size)

loss, grad, acc = softmax(y_pred, y_train)
model.back_prop(grad)
#model.update(0.001)
optimizer.step()

In [74]:
batch_size = 256


model = LeNet()
optimizer = Adam(model,0.2)
for e in range(10):
    pbar = tqdm(range(0, int(x.shape[0]/batch_size)), ncols=100)
    for i in pbar:
        X_train,y_train = shuffle_batch(batch_size)
        y_pred = model.fit(X_train,batch_size)

        loss, grad, acc = softmax(y_pred, y_train)
        model.back_prop(grad)
        #model.update(0.001)
        optimizer.step()
        pbar.set_postfix(loss=loss, acc=acc)

# val_X = data["X_val"]
# val_y = data["y_val"]
# y_pred = model.forward(val_X)
# y_pred = np.argmax(y_pred, axis=1)
# acc = np.mean(y_pred == val_y.reshape(1, val_y.shape[0]))
# if acc > best_acc:
#     best_acc = acc
#     best_weight = model.get_params()
# pbar.set_postfix(val_acc=acc)

  loss = -np.log(y_pred).T * y
  loss = -np.log(y_pred).T * y
 14%|█████▉                                    | 33/234 [00:33<03:21,  1.00s/it, acc=0.52, loss=nan]


KeyboardInterrupt: 

In [16]:
model1 = LeNet5()
batch_size = 256
epochs = 1
lr = 0.75

optimizer1 = Adam1(model1.get(), lr)

for e in range(10):
        # add tqdm
        pbar = tqdm(range(0, int(x.shape[0]/batch_size)), ncols=100)
        for i in pbar:
            X_train,y_train = shuffle_batch(batch_size)
            y_pred = model1.forward(X_train.reshape(batch_size,1,28,28))
            #loss, grad, acc = softmax_loss(y_pred, y_train.argmax(axis=0))
            loss, grad, acc = softmax(y_pred, y_train)
            model1.backward(grad)
            #model.update(0.5)
            optimizer1.step()
            pbar.set_postfix(loss=loss, acc=acc)

 44%|█████████████████▎                     | 104/234 [01:13<01:31,  1.42it/s, acc=0.109, loss=2.33]


KeyboardInterrupt: 

In [193]:
X_train, y_train = shuffle_batch(256)

In [194]:
X_train.shape

(256, 784)

In [197]:
y_train.argmax(axis=0).shape

(256,)

In [9]:
import dataloader
from optimizer import Adam

from loss import softmax_loss
epochs = 1
lr = 0.001
batch_size = 256
modell = LeNet5()
optimizer = Adam(modell.get_params(), lr)

pbar = tqdm(range(0, int(x.shape[0]/batch_size)), ncols=150)
for i in pbar:
    X_train, y_train = shuffle_batch(256)
    y_pred = modell.forward(X_train.reshape(256,1,28,28))
    loss, grad, acc = softmax_loss(y_pred, y_train.argmax(axis=0))
    modell.backward(grad)
    modell.update()
    pbar.set_postfix(loss=loss, acc=acc)

get


100%|██████████████████████████████████████████████████████████████████████████████████████████| 234/234 [03:26<00:00,  1.13it/s, acc=0.082, loss=2.3]


In [433]:
m, n = train_images.shape #(60000,784),28*28 image
    # data processing
x, y = data_convert(train_images, train_labels, m, 10) # x:[m,n], y:[1,m]