this is the first multilayer network, muiltlayer perceptron
aka MLP

In [8]:
## not use models in torch
import torch
import torchvision
from torch.utils import data
from torchvision import transforms
from torch import nn
def get_fashion_mnist_labels(labels):
    """返回Fashion-MNIST数据集的文本标签。"""
    text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
                   'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
    return [text_labels[int(i)] for i in labels]

batch_size = 256

def get_dataloader_workers():
    return 8
def load_data_fashion_mnist(batch_size ,resize = None):
    trans = [transforms.ToTensor()]
    if resize:
        trans.insert(0,transforms.Resize(resize))
    trans = transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(root='../data',train=True,transform=trans, download=True)
    mnist_test = torchvision.datasets.FashionMNIST(root='../data',train=False,transform=trans, download=True)

    return (
        data.DataLoader(mnist_train,batch_size,shuffle=True,num_workers=get_dataloader_workers()),
        data.DataLoader(mnist_test,batch_size,shuffle=False,num_workers=get_dataloader_workers())
    )
class Accumulator:  #@save
    """vector adder。"""
    '''
    zip函数用于将可迭代的对象作为参数，将对象中对应的元素打包成一个个元组，
    然后返回由这些元组组成的列表。
    如果各个迭代器的元素个数不一致，则返回列表长度与最短的对象相同，
    利用 * 号操作符，可以将元组解压为列表。
    '''
    def __init__(self, n):
        self.data = [0.0] * n

    def add(self, *args):
        self.data = [a + float(b) for a, b in zip(self.data, args)]

    def reset(self):
        self.data = [0.0] * len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
def accuracy(y_hat,y):
    '''
    :param y_hat: the possibilities of every type in examples
    :param y: the index of corre type
    :return:
    '''
    if len(y_hat.shape) >1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1) # the max of a line => index or type(in this case)
        # or get the predict index of correct type
        cmp = y_hat.type(y.dtype) == y
        # it may out a vector like [f,t,t,t,t,f,f,t]
        return float(cmp.type(y.dtype).sum())

def evaluate_accuracy(net,data_iter): #@save
    if isinstance(net, torch.nn.Module):
        net.eval()
    metric = Accumulator(2)
    for X, y in data_iter: # add the correct number of every batch
        metric.add(accuracy(net(X),y),y.numel()) # correct number, data number
    return metric[0]/metric[1]
def train_epoch(net, train_iter, loss, updater):
    """
    train for one epoch
    :param net:
    :param train_iter:
    :param loss:
    :param updater:
    :return:
    """
    if isinstance(net, torch.nn.Module):
        net.train()
    metric = Accumulator(3)
    for X, y in train_iter:
        y_hat = net(X) # calculate result
        l = loss(y_hat,y) # the loss value of every example
        if isinstance(updater, torch.optim.Optimizer):
            # the updater should get the grad of every params and then updates those params
            updater.zero_grad() # set grad to be zero
            l.backward() # the backword function well calculate the grad of every node in calculating graph
            # the l is the sum of the loss value of every example
            updater.step()
            metric.add(float(l)*len(y), accuracy(y_hat,y),y.size().numel())
        else:
            l.sum().backward() # the loss value of all example
            updater(X.shape[0])
            metric.add(float(l.sum()),accuracy(y_hat,y),y.numel())
    return metric[0] / metric[2], metric[1]/metric[2]

def train(net, train_iter, test_iter, loss, num_epochs, updater):
    for epoch in range(num_epochs):
        train_matrics = train_epoch(net,train_iter,loss,updater)
        test_acc = evaluate_accuracy(net,test_iter)
        print('train: loss:{},acc:{}'.format(train_matrics[0],train_matrics[1]))
        print('test: acc:{}'.format(test_acc))

In [9]:
batch_size = 256
train_iter, test_iter = load_data_fashion_mnist(batch_size)
num_inputs, num_outputs, num_hiddens = 784,10,1024
W1 = nn.Parameter(torch.randn(num_inputs,num_hiddens,requires_grad=True)*0.01)
b1 = nn.Parameter(torch.zeros(num_hiddens,requires_grad=True))
W2 = nn.Parameter(torch.randn(num_hiddens,num_outputs,requires_grad=True)*0.01)
b2 = nn.Parameter(torch.zeros(num_outputs,requires_grad=True))

params=[W1,b1,W2,b2]

def relu(X):
    a = torch.zeros_like(X)
    return torch.max(X,a)

def net(X):
    X = X.reshape((-1,num_inputs))
    H = relu(X@W1+b1)
    return H @ W2 + b2
loss = nn.CrossEntropyLoss()

num_eoochs, lr = 10,0.1
updater = torch.optim.SGD(params,lr=lr)

train(net,train_iter,test_iter,loss,num_eoochs,updater)


train: loss:0.9274468812306722,acc:0.6921833333333334
test: acc:0.7593
train: loss:0.5683494225184123,acc:0.8025333333333333
test: acc:0.8059
train: loss:0.502886217546463,acc:0.8246666666666667
test: acc:0.816
train: loss:0.46216569860776263,acc:0.83795
test: acc:0.8326
train: loss:0.44189897316296894,acc:0.8454833333333334
test: acc:0.8272
train: loss:0.4202599067846934,acc:0.852
test: acc:0.8348
train: loss:0.40500399600664777,acc:0.8577666666666667
test: acc:0.8331
train: loss:0.3952644912560781,acc:0.8610833333333333
test: acc:0.8453
train: loss:0.3811689414024353,acc:0.8643166666666666
test: acc:0.8363
train: loss:0.3731047079722087,acc:0.8688166666666667
test: acc:0.855


In [None]:
## use model in torch

net = nn.Sequential(nn.Flatten(),nn.Linear(784,256),nn.ReLU(),nn.Dropout(0.4),nn.Linear(256,10))
net_decay = nn.Sequential(nn.Flatten(),nn.Linear(784,256),nn.ReLU(),nn.Linear(256,10))
net_dropout = nn.Sequential(
    nn.Flatten(),
    nn.Linear(784,256),
    nn.ReLU(),
    # nn.Dropout(0.2),
    nn.Linear(256,256),
    nn.ReLU(),
    # nn.Dropout(0.5),
    nn.Linear(256,10))

def init_weight(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight,std=0.01)

net.apply(init_weight)
net_decay.apply(init_weight)
net_dropout.apply(init_weight)

batch_size,lr,num_eoochs = 256,0.1,10
loss = nn.CrossEntropyLoss()
trainer = torch.optim.SGD(net.parameters(),lr=lr) #
# params = list(net.parameters())# n
# print(len(list(params)))# o decay
trainer_decay = torch.optim.SGD([{"params":net_decay[1].weight,'wight_decay':10},{"params":net_decay[1].bias},{"params":net_decay[3].weight,'wight_decay':5},{"params":net_decay[3].bias}],lr=lr)
# for p in net.parameters():
#     print(p)
train_iter, test_iter = load_data_fashion_mnist(batch_size)
train(net,train_iter,test_iter,loss,num_eoochs,trainer)
print(f"NET L2:{net[1].weight.norm().item()},{net[4].weight.norm().item()}")
print("===")_iter, test_iter = load_data_fashion_mnist(batch_size)
train(net_decay,train_iter,test_iter,loss,num_eoochs,trainer_decay)
print(f"NET_decay L2:{net_decay[1].weight.norm().item()},{net_decay[3].weight.norm().item()}")
train_dropout =  torch.optim.SGD(net_dropout.parameters(),lr=lr) #
train(net_decay,train_iter,test_iter,loss,num_eoochs,train_dropout)
print(f"NET_dropout L2:{net_dropout[1].weight.norm().item()},{net_dropout[4].weight.norm().item()}")

train: loss:0.5007064084688823,acc:0.8249166666666666
test: acc:0.8231
train: loss:0.4700194479147593,acc:0.83475
test: acc:0.8319
train: loss:0.44932981621424356,acc:0.8427
test: acc:0.8292
train: loss:0.4345707261244456,acc:0.8454166666666667
test: acc:0.8323
train: loss:0.4195893929640452,acc:0.8518166666666667
test: acc:0.8389
train: loss:0.4094060695807139,acc:0.8548166666666667
test: acc:0.8486
train: loss:0.4008685843785604,acc:0.8568166666666667
test: acc:0.8519
NET L2:8.291254043579102,6.659847736358643
===
train: loss:1.0455683462778727,acc:0.6364166666666666
test: acc:0.7219
train: loss:0.6006122962474822,acc:0.7889
test: acc:0.7829
train: loss:0.518011437590917,acc:0.8180333333333333
test: acc:0.8171
train: loss:0.48089712238311766,acc:0.8315
test: acc:0.8241
train: loss:0.45446574199994405,acc:0.8401166666666666
test: acc:0.8208
train: loss:0.4330106850624085,acc:0.8470166666666666
test: acc:0.8375
train: loss:0.41847219700813293,acc:0.8532666666666666
test: acc:0.839
trai

AttributeError: 'ReLU' object has no attribute 'weight'

#### why the L2 of net parameters has increased after add weight decay???

In [61]:
# dropout
# let some param in a set of params to be zero so as the result would not rely on some params
batch_size,lr,num_eoochs = 256,0.5,10
net_dropout = nn.Sequential(
    nn.Flatten(),
    nn.Linear(784,256),
    nn.ReLU(),
    # nn.Dropout(0.2),
    nn.Linear(256,256),
    nn.ReLU(),
    # nn.Dropout(0.5),
    nn.Linear(256,10))
net_dropout.apply(init_weight)
trainer_d = torch.optim.SGD(net_dropout.parameters(),lr=lr) #
train_dropout =  torch.optim.SGD(net_dropout.parameters(),lr=lr) #
train(net_decay,train_iter,test_iter,loss,num_eoochs,trainer_d)
print(f"NET_dropout L2:{net_dropout[1].weight.norm().item()},{net_dropout[4].weight.norm().item()}")

train: loss:0.3649889024098714,acc:0.8715
test: acc:0.8535
train: loss:0.3649889111359914,acc:0.8715
test: acc:0.8535
train: loss:0.36498890368143716,acc:0.8715
test: acc:0.8535
train: loss:0.3649889089743296,acc:0.8715
test: acc:0.8535
train: loss:0.3649889001051585,acc:0.8715
test: acc:0.8535
train: loss:0.3649889039198558,acc:0.8715
test: acc:0.8535
train: loss:0.3649889044602712,acc:0.8715
test: acc:0.8535
train: loss:0.3649889039993286,acc:0.8715
test: acc:0.8535
train: loss:0.36498890665372213,acc:0.8715
test: acc:0.8535
train: loss:0.36498890754381813,acc:0.8715
test: acc:0.8535


AttributeError: 'ReLU' object has no attribute 'weight'