## 创建网络
本节解释gluon是如何工作的。之前我们一直在使用nn.Sequential，它是nn.Block的一个简单形式，但我们并没有深入了解过它们。

本教程和接下来几个教程，我们将详细解释如何使用这两个类来定义神经网络、初始化参数、以及保存和读取模型。

我们重新把多层感知机 — 使用Gluon里的网络定义搬到这里作为开始的例子（为了简单起见，这里我们丢掉了Flatten层）

In [3]:
from mxnet import nd
from mxnet.gluon import nn

# Two-layer perceptron assembled with the stock container.
net = nn.Sequential()
with net.name_scope():
    # add() accepts several blocks at once; they are stacked in argument order.
    net.add(
        nn.Dense(256, activation='relu'),
        nn.Dense(10),
    )
print(net)

Sequential(
  (0): Dense(None -> 256, Activation(relu))
  (1): Dense(None -> 10, linear)
)


## 使用 `nn.Block` 定义
`nn.Sequential`是`nn.Block`的简单形式，如下用`nn.Block`实现同样的网络

In [4]:
class MLP(nn.Block):
    """Two-layer perceptron written directly on top of ``nn.Block``.

    ``__init__`` creates the layers (the ``Dense`` layers own the parameters);
    ``forward`` describes the computation.
    """

    def __init__(self, **kwargs):
        """Create the two ``Dense`` layers.

        ``**kwargs`` is handed straight to ``nn.Block.__init__``, which offers
        ``prefix`` (name prefix) and ``params`` (model parameters).
        """
        super(MLP, self).__init__(**kwargs)
        # The layers are created inside name_scope() so that every layer and
        # parameter name receives this block's prefix and stays unique.  The
        # prefix is generated automatically unless one is given at construction
        # time.  Recommended: keep every layer inside at least one name_scope().
        with self.name_scope():
            self.dense0 = nn.Dense(256)
            self.dense1 = nn.Dense(10)

    def forward(self, x):
        """Forward computation: dense0 -> ReLU -> dense1."""
        hidden = nd.relu(self.dense0(x))
        return self.dense1(hidden)

In [8]:
# Build the custom MLP and inspect it before any data has flowed through it.
net2 = MLP()
print(net2)
# The weight is printed before initialisation; its input dimension is still
# unknown at this point (it shows up as 0 in the printed shape).
print(net2.dense0.weight)
net2.initialize()
# Use the nd.random.uniform spelling, matching the other sampling calls in
# this file, instead of the flat nd.random_uniform name.
x = nd.random.uniform(shape=(4, 20))
print(net2(x))

MLP(
  (dense1): Dense(None -> 10, linear)
  (dense0): Dense(None -> 256, linear)
)
Parameter mlp3_dense0_weight (shape=(256, 0), dtype=<class 'numpy.float32'>)

[[-0.00280519  0.05682168  0.00845101 -0.07698126 -0.06098332  0.05909928
   0.03550563 -0.01684654 -0.02854338  0.08916292]
 [-0.00262184  0.04471714  0.04032315 -0.07430363 -0.02478844  0.03125754
   0.01593504 -0.06641655  0.02483857  0.01304245]
 [ 0.0029594   0.0288551  -0.03639418 -0.11928524 -0.0427849   0.03652045
   0.08650892 -0.01271457 -0.02150599  0.09514914]
 [ 0.03211017 -0.00701749 -0.05499509 -0.068024   -0.05132047  0.03421191
   0.04185638  0.00668136 -0.02798838  0.06224423]]
<NDArray 4x10 @cpu(0)>


In [10]:
# Without an explicit prefix the layer name carries an auto-generated one.
print('default prefix:', net2.dense0.name)

# Passing prefix=... replaces the auto-generated prefix.
net3 = MLP(prefix='another_mlp_')
print('customized prefix:', net3.dense0.name)

default prefix: mlp3_dense0
customized prefix: another_mlp_dense0


## nn.Block到底是什么东西？
在gluon里，nn.Block是一个一般化的部件。整个神经网络可以是一个nn.Block，单个层也是一个nn.Block。我们可以（近似）无限地嵌套nn.Block来构建新的nn.Block。

nn.Block主要提供这三个东西：

1. 存储参数
2. 描述forward如何执行
3. 自动求导
## 那么现在可以解释nn.Sequential了吧
nn.Sequential是一个nn.Block容器，它通过add来添加nn.Block。它自动生成forward()函数，其就是把加进来的nn.Block逐一运行。

一个简单的实现是这样的:

In [12]:
class Sequential(nn.Block):
    """Minimal re-implementation of ``nn.Sequential``.

    Blocks are added one at a time with ``add`` and executed in insertion
    order by ``forward``.
    """

    def __init__(self, **kwargs):
        super(Sequential, self).__init__(**kwargs)
        # Insertion-ordered record of the added blocks, used by forward().
        self._layers = []

    def add(self, block):
        """Append ``block`` to the end of the stack.

        Parameters
        ----------
        block : nn.Block
            The block to run after all previously added blocks.
        """
        # Go through the public register_child() hook rather than mutating the
        # private ``_children`` container, whose concrete type is an
        # implementation detail of nn.Block.
        self.register_child(block)
        self._layers.append(block)

    def forward(self, x):
        """Feed ``x`` through every added block in order and return the result."""
        for layer in self._layers:
            x = layer(x)
        return x
# The custom class can be used exactly like nn.Sequential()
net4 = Sequential()
with net4.name_scope():
    net4.add(nn.Dense(256, activation= 'relu'))
    net4.add(nn.Dense(10))
net4.initialize()
# x is the input batch created in an earlier cell
print(net4(x))


[[ 0.00104139  0.00639622  0.03454763 -0.01168697  0.07835874 -0.02142669
  -0.00592941 -0.05849914  0.04525637  0.00457406]
 [ 0.02241309 -0.04296242  0.03766069 -0.00279771  0.0500275  -0.03436268
  -0.00224496 -0.02926439  0.04133548  0.0105134 ]
 [-0.03016665  0.01821524  0.04116601 -0.01005435  0.04715824 -0.01262928
  -0.00871191 -0.05950979  0.00829933 -0.00059525]
 [ 0.0063363   0.00877344  0.0233789   0.0061211   0.03473819 -0.02619518
  -0.00732469 -0.01876338  0.05770727 -0.00276955]]
<NDArray 4x10 @cpu(0)>


## nn.Block和nn.Sequential的嵌套使用
现在我们知道了nn下面的类基本都是nn.Block的子类，他们可以很方便地嵌套使用

In [18]:
class RecMLP(nn.Block):
    """Block that nests an ``nn.Sequential`` next to a plain ``Dense`` layer."""

    def __init__(self, **kwargs):
        super(RecMLP, self).__init__(**kwargs)
        # The inner container is created first; its layers and the extra Dense
        # layer are then built inside this block's name scope.
        self.net = nn.Sequential()
        with self.name_scope():
            for units in (256, 128):
                self.net.add(nn.Dense(units, activation="relu"))
            self.dense = nn.Dense(64)

    def forward(self, x):
        """Run the nested container, then the Dense layer, then ReLU."""
        features = self.net(x)
        projected = self.dense(features)
        return nd.relu(projected)

# Nest the custom block inside a stock container, followed by an output layer.
rec_mlp = nn.Sequential()
rec_mlp.add(RecMLP(), nn.Dense(10))
rec_mlp.initialize()
print(rec_mlp)
print(rec_mlp(x))

Sequential(
  (0): RecMLP(
    (net): Sequential(
      (0): Dense(None -> 256, Activation(relu))
      (1): Dense(None -> 128, Activation(relu))
    )
    (dense): Dense(None -> 64, linear)
  )
  (1): Dense(None -> 10, linear)
)

[[  3.62937967e-03  -3.64664244e-03  -3.40712070e-03  -3.43851116e-03
   -3.74851632e-03   1.42319086e-05  -9.07513604e-04  -2.00890796e-03
    2.85684969e-03  -4.77109279e-04]
 [  1.67747529e-03  -4.78566950e-03  -3.25752934e-03  -1.41125685e-03
   -3.41535360e-03   1.90782256e-03   2.87884723e-05  -1.88271201e-03
    1.25917548e-03  -5.82630892e-05]
 [  2.97360495e-03  -6.60494808e-03  -4.10671020e-03  -2.74736341e-03
   -8.19559395e-03   1.49177841e-03  -1.15507864e-03  -3.66006698e-03
    1.57480477e-03   1.19225704e-03]
 [  4.24453150e-03  -6.48297556e-03  -4.90886392e-03  -5.40651660e-03
   -5.40666375e-03  -6.01428968e-04  -5.18403889e-04  -3.52386502e-03
    2.24238355e-03  -2.10732245e-03]]
<NDArray 4x10 @cpu(0)>


In [21]:
class RecMLP1(nn.Block):
    """Three stacked ``Dense`` layers held as individual attributes."""

    def __init__(self, **kwargs):
        super(RecMLP1, self).__init__(**kwargs)

        with self.name_scope():
            self.dense0 = nn.Dense(256, activation="relu")
            self.dense1 = nn.Dense(128, activation="relu")
            self.dense2 = nn.Dense(64)
            # Author's note: an earlier draft kept the three layers in a plain
            # list (``self.denses = [...]``) and looped over it in forward().
            # It was dropped because the list itself is not a Block.

    def forward(self, x):
        """Apply dense0, dense1 and dense2 in order, then ReLU."""
        for layer in (self.dense0, self.dense1, self.dense2):
            x = layer(x)
        return nd.relu(x)

# Instantiate, initialise and run RecMLP1 on the input batch x from an earlier cell.
rec_mlp1 = RecMLP1()
rec_mlp1.initialize()
print(rec_mlp1)
print(rec_mlp1(x))

RecMLP1(
  (dense1): Dense(None -> 128, Activation(relu))
  (dense0): Dense(None -> 256, Activation(relu))
  (dense2): Dense(None -> 64, linear)
)

[[ 0.0094601   0.01150595  0.0043      0.01612011  0.00722826  0.          0.
   0.          0.00676135  0.          0.          0.00687655  0.          0.
   0.02170436  0.00075789  0.00781166  0.          0.          0.
   0.01066998  0.01752731  0.03945627  0.          0.00320272  0.
   0.00817525  0.          0.          0.          0.          0.00442885
   0.01928487  0.01963375  0.          0.          0.          0.00038812
   0.01876653  0.01475956  0.00426263  0.0241504   0.01523386  0.00143459
   0.          0.02752425  0.00350364  0.          0.01664411  0.00395365
   0.00881247  0.02001468  0.00148547  0.00085829  0.          0.00322197
   0.          0.          0.00403514  0.          0.00884278  0.          0.
   0.00168418]
 [ 0.          0.00558753  0.00849366  0.0035863   0.          0.
   0.00692048  0.01070072  0.      

## 初始化模型参数


In [22]:
from mxnet.gluon import nn
from mxnet import nd

def get_net():
    """Return a fresh, uninitialised network: Dense(4, relu) followed by Dense(2)."""
    model = nn.Sequential()
    with model.name_scope():
        model.add(
            nn.Dense(4, activation="relu"),
            nn.Dense(2),
        )
    return model

# Sample input requested with shape (3, 5); reused by the cells below.
x = nd.random.uniform(shape=(3,5))

In [24]:
import sys
# Running the network before initialize() raises RuntimeError; the error text
# is written to stderr (see the message printed below this cell).
try:
    net = get_net()
    net(x)
except RuntimeError as err:
    sys.stderr.write(str(err))

Parameter sequential13_dense0_bias has not been initialized. Note that you should initialize parameters and create Trainer with Block.collect_params() instead of Block.params because the later does not include Parameters of nested child Blocks

In [25]:
# After initialize() the same forward pass succeeds.
net.initialize()
net(x)


[[-0.00058652  0.00016143]
 [-0.00042701  0.00025122]
 [ 0.00015206  0.00101848]]
<NDArray 3x2 @cpu(0)>

## 访问模型参数
之前我们提到过可以通过`weight`和`bias`访问`Dense`的参数，它们是`Parameter`这个类的实例。

然后我们可以通过`data`来访问参数，`grad`来访问对应的梯度

我们也可以通过<font color=red> collect_params</font>来访问`Block`里面所有的参数（这个会包括所有的子`Block`）。它会返回一个名字到对应`Parameter`的`dict`。既可以用正常[]来访问参数，也可以用`get()`，它不需要填写名字的前缀。

In [27]:
# weight and bias of the first layer are Parameter objects.
w = net[0].weight
b = net[0].bias
print('name: ', net[0].name, '\nweight: ', w, '\nbias: ', b)
# data() yields the parameter values, grad() the matching gradient.
print('weight data: ', w.data(), '\nweight grad: ', w.grad())

name:  sequential13_dense0 
weight:  Parameter sequential13_dense0_weight (shape=(4, 5), dtype=<class 'numpy.float32'>) 
bias:  Parameter sequential13_dense0_bias (shape=(4,), dtype=<class 'numpy.float32'>)
weight data:  
[[ 0.04835007 -0.01382367  0.00507843  0.0601008   0.02523782]
 [-0.05605391  0.01528487  0.06234222 -0.05621308  0.0517284 ]
 [-0.05711614 -0.00641727 -0.06216478 -0.02426187 -0.05788545]
 [-0.03741582 -0.03679574  0.01602506  0.04753181 -0.06536956]]
<NDArray 4x5 @cpu(0)> 
weight grad:  
[[ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]]
<NDArray 4x5 @cpu(0)>


In [32]:
# collect_params() gathers the parameters of the block and of all its children.
params = net.collect_params()
print(params)
# Indexing with [] needs the full parameter name.  The prefix is generated
# automatically (e.g. "sequential13_"), so it is read from net.prefix
# instead of being hard-coded, which would raise KeyError whenever the
# generated prefix differs.
print(params[net.prefix + 'dense0_bias'].data())
# get() prepends the prefix itself, so the short name is enough.
print(params.get('dense0_weight').data())
print(params.get('dense1_bias').data())

sequential13_ (
  Parameter sequential13_dense0_weight (shape=(4, 5), dtype=<class 'numpy.float32'>)
  Parameter sequential13_dense0_bias (shape=(4,), dtype=<class 'numpy.float32'>)
  Parameter sequential13_dense1_weight (shape=(2, 4), dtype=<class 'numpy.float32'>)
  Parameter sequential13_dense1_bias (shape=(2,), dtype=<class 'numpy.float32'>)
)

[ 0.  0.  0.  0.]
<NDArray 4 @cpu(0)>

[[ 0.04835007 -0.01382367  0.00507843  0.0601008   0.02523782]
 [-0.05605391  0.01528487  0.06234222 -0.05621308  0.0517284 ]
 [-0.05711614 -0.00641727 -0.06216478 -0.02426187 -0.05788545]
 [-0.03741582 -0.03679574  0.01602506  0.04753181 -0.06536956]]
<NDArray 4x5 @cpu(0)>

[ 0.  0.]
<NDArray 2 @cpu(0)>


## 使用不同的初始函数来初始化
我们一直在使用默认的initialize来初始化权重（除了指定GPU ctx外）。它会把所有权重初始化成在[-0.07, 0.07]之间均匀分布的随机数。我们可以使用别的初始化方法。例如使用均值为0，标准差为0.02的正态分布（即`init.Normal(sigma=0.02)`）。
## 共享模型参数


In [34]:
from mxnet import init
# The parameters were already initialised, hence force_reinit=True to
# overwrite them with the new initializer.
params.initialize(init=init.Normal(sigma=0.02), force_reinit=True)
print(net[0].weight.data(), net[0].bias.data())


[[ 0.01000567  0.00703028 -0.0004781   0.00348184 -0.01082114]
 [-0.00994558  0.02120297  0.02790563 -0.00359711 -0.01372356]
 [ 0.01808388  0.02528431 -0.0129267   0.02320206  0.01585069]
 [ 0.02222755 -0.00252373 -0.01147721  0.01738829  0.03108984]]
<NDArray 4x5 @cpu(0)> 
[ 0.  0.  0.  0.]
<NDArray 4 @cpu(0)>


## 共享模型参数
有时候我们想在层之间共享同一份参数，我们可以在创建层时通过Block的`params`参数来手动指定参数，而不是让系统自动生成。

In [35]:
net = nn.Sequential()
with net.name_scope():
    net.add(nn.Dense(4, activation="relu"))
    net.add(nn.Dense(4, activation="relu"))
    # net[-1] is the layer added on the previous line; the new layer is handed
    # that layer's params instead of creating its own.
    net.add(nn.Dense(4, activation="relu", params=net[-1].params))
    net.add(nn.Dense(2))
    
net.initialize()
net(x)
# Layers 1 and 2 print the same weight values (see the output below).
print(net[1].weight.data())
print(net[2].weight.data())


[[-0.0514059  -0.01693203 -0.01760576 -0.01759853]
 [-0.02458332  0.03483035  0.02521617 -0.03670699]
 [ 0.04137486 -0.04594057  0.0005507  -0.00709917]
 [-0.02852607 -0.02737442  0.05403472  0.04748648]]
<NDArray 4x4 @cpu(0)>

[[-0.0514059  -0.01693203 -0.01760576 -0.01759853]
 [-0.02458332  0.03483035  0.02521617 -0.03670699]
 [ 0.04137486 -0.04594057  0.0005507  -0.00709917]
 [-0.02852607 -0.02737442  0.05403472  0.04748648]]
<NDArray 4x4 @cpu(0)>


##  自定义初始化方法
下面我们自定义一个初始化方法。它通过重载_init_weight来实现不同的初始化方法。（注意到Gluon里面bias都是默认初始化成0）

In [43]:
class MyInit(init.Initializer):
    """Custom initializer that fills weights via ``nd.random.uniform(low=5, high=10)``."""
    def __init__(self):
        super(MyInit, self).__init__()
        # NOTE(review): this sets a private flag of the base class; its effect
        # is not visible in this file -- confirm it is needed.
        self._verbose = True
    def _init_weight(self, _, arr):
        """Fill ``arr`` in place; the first argument is unused."""
        # Initialise the weight; with out=arr the shape does not need to be given
        print('init weight', arr.shape)
        nd.random.uniform(low=5, high=10, out=arr)
        
net = get_net()
net.initialize(MyInit())
print(x.shape)
# In the recorded output the "init weight" lines come after x.shape, i.e. they
# are printed during this first forward pass rather than at initialize().
net(x)
net[0].weight.data()

(3, 5)
init weight (4, 5)
init weight (2, 4)



[[ 7.5135479   7.02971172  5.76813793  5.12156582  7.1275382 ]
 [ 6.71305466  9.53085899  8.11115551  5.04255772  6.39533997]
 [ 9.8443203   6.04874992  8.45446873  5.57851601  5.49708176]
 [ 7.88570118  6.44887924  8.47634983  7.69536781  8.35978508]]
<NDArray 4x5 @cpu(0)>

In [42]:
# Show the built-in documentation of the sampler used by MyInit.
help(nd.random.uniform)

Help on function uniform in module mxnet.ndarray.random:

uniform(low=0, high=1, shape=_Null, dtype=_Null, ctx=None, out=None, **kwargs)
    Draw random samples from a uniform distribution.
    
    Samples are uniformly distributed over the half-open interval *[low, high)*
    (includes *low*, but excludes *high*).
    
    Parameters
    ----------
    low : float or NDArray
        Lower boundary of the output interval. All values generated will be
        greater than or equal to low. The default value is 0.
    high : float or NDArray
        Upper boundary of the output interval. All values generated will be
        less than high. The default value is 1.0.
    shape : int or tuple of ints
        The number of samples to draw. If shape is, e.g., `(m, n)` and `low` and
        `high` are scalars, output shape will be `(m, n)`. If `low` and `high`
        are NDArrays with shape, e.g., `(x, y)`, then output will have shape
        `(x, y, m, n)`, where `m*n` samples are drawn for 

当然我们也可以通过Parameter.set_data来直接改写权重。注意到由于有延后初始化，所以我们通常可以先调用一次net(x)来确定权重的形状。

In [37]:
net = get_net()
net.initialize()
# Run one forward pass first so that the weight shapes are determined.
net(x)

print('default weight:', net[1].weight.data())

# Overwrite the second layer's weight with ones of the same shape.
w = net[1].weight
w.set_data(nd.ones(w.shape))

print('init to all 1s:', net[1].weight.data())

default weight: 
[[ 0.06581051 -0.0137601  -0.06643037 -0.03522212]
 [ 0.05432612  0.0008213   0.02920524 -0.02654669]]
<NDArray 2x4 @cpu(0)>
init to all 1s: 
[[ 1.  1.  1.  1.]
 [ 1.  1.  1.  1.]]
<NDArray 2x4 @cpu(0)>


In [65]:
"""
 |  params
 |      Returns this :py:class:`Block`'s parameter dictionary (does not include its
 |      children's parameters)
"""
# Nest a second network inside the existing one.
net0 = get_net()
net.add(net0)
# params = net.collect_params()
# params.initialize(init=init.Uniform(),force_reinit= True)
# Per the help excerpt above, Block.params does not include the children's
# parameters; the dictionary displayed below this cell is empty.
net.params

sequential19_ (

)

In [61]:
# Show the built-in documentation of Block.initialize.
help(net.initialize)

Help on method initialize in module mxnet.gluon.block:

initialize(init=<mxnet.initializer.Uniform object at 0x00000000046946A0>, ctx=None, verbose=False) method of mxnet.gluon.nn.basic_layers.Sequential instance
    Initializes :py:class:`Parameter` s of this :py:class:`Block` and its children.
    
    Equivalent to ``block.collect_params().initialize(...)``



In [39]:
"""
 |  collect_params(self)
 |      Returns a :py:class:`ParameterDict` containing this :py:class:`Block` and all of its
 |      children's Parameters.
"""
# NOTE(review): the method is referenced without being called, so the cell
# displays the bound method object rather than a ParameterDict.
net.collect_params

<bound method Block.collect_params of Sequential(
  (0): Dense(5 -> 4, Activation(relu))
  (1): Dense(4 -> 2, linear)
)>

In [56]:
# net.params('dense1_weight').data()
# Look the parameter up through collect_params(); get() takes the name without the prefix.
print(net.collect_params().get('dense1_weight').data())
# Trailing expression: the cell displays the network's name.
net.name



[[ 8.62073898  9.74430466  6.94433689  5.01351595]
 [ 6.13541985  8.23598289  7.27429962  8.00196075]]
<NDArray 2x4 @cpu(0)>


'sequential19'

In [57]:
# Show the built-in documentation of the Sequential instance.
help(net)

Help on Sequential in module mxnet.gluon.nn.basic_layers object:

class Sequential(mxnet.gluon.block.Block)
 |  Stacks Blocks sequentially.
 |  
 |  Example::
 |  
 |      net = nn.Sequential()
 |      # use net's name_scope to give child Blocks appropriate names.
 |      with net.name_scope():
 |          net.add(nn.Dense(10, activation='relu'))
 |          net.add(nn.Dense(20))
 |  
 |  Method resolution order:
 |      Sequential
 |      mxnet.gluon.block.Block
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getitem__(self, key)
 |  
 |  __init__(self, prefix=None, params=None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __len__(self)
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  add(self, *blocks)
 |      Adds block on top of the stack.
 |  
 |  forward(self, x)
 |      Overrides to implement forward computation using :py:class:`NDArray`. Only
 |      accepts positional arguments.
 |      
 |      Parameters
 |     