In [1]:
#use Block to build model
from mxnet import nd
from mxnet.gluon import nn

class MLP(nn.Block):
    """Multilayer perceptron assembled by subclassing ``nn.Block``.

    Two fully connected layers holding model parameters are declared: a
    256-unit hidden layer with ReLU activation and a 10-unit output layer.
    """

    def __init__(self, **kwargs):
        # Hand every keyword argument to the parent Block constructor so that
        # callers can still pass its options (for example ``params``) when
        # they build an instance.
        super().__init__(**kwargs)
        self.hidden = nn.Dense(units=256, activation='relu')  # hidden layer
        self.output = nn.Dense(units=10)  # output layer

    def forward(self, x):
        """Forward computation: hidden layer first, then the output layer."""
        hidden_out = self.hidden(x)
        return self.output(hidden_out)



In [2]:
# Random input batch: 2 examples with 20 features each.
X = nd.random.uniform(shape=(2, 20))
net = MLP()
# Set up the model parameters before the first forward pass.
net.initialize()
# Run the model on X; the output shown below is a 2x10 NDArray.
net(X)




[[ 0.09543004  0.04614332 -0.00286655 -0.07790346 -0.05130241  0.02942038
   0.08696645 -0.0190793  -0.04122177  0.05088576]
 [ 0.0769287   0.03099706  0.00856576 -0.044672   -0.06926838  0.09132431
   0.06786592 -0.06187843 -0.03436674  0.04234696]]
<NDArray 2x10 @cpu(0)>

In [3]:
class MySequential(nn.Block):
    """Minimal sequential container that runs its children in insertion order."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def add(self, block):
        """Store ``block`` as a child, keyed by its ``name`` attribute.

        ``block`` is an instance of a Block subclass and its name is assumed
        to be unique. It is kept in ``_children``, the Block member that the
        original author describes as an OrderedDict; per that note, calling
        ``initialize`` on this container initializes every member of
        ``_children``.
        """
        child_name = block.name
        self._children[child_name] = block

    def forward(self, x):
        """Feed ``x`` through each stored child in turn and return the result."""
        out = x
        # Children are visited in the order in which they were added.
        for child in self._children.values():
            out = child(out)
        return out



In [4]:
# Build a two-layer network with the hand-written MySequential container.
net = MySequential()
net.add(nn.Dense(256, activation='relu'))
net.add(nn.Dense(10))
net.initialize()
# X is the input batch created in an earlier cell.
net(X)



[[ 0.00362229  0.00633331  0.03201145 -0.01369375  0.10336448 -0.0350802
  -0.00032165 -0.01676024  0.06978628  0.01303309]
 [ 0.03871717  0.02608212  0.03544958 -0.02521311  0.11005436 -0.01430663
  -0.03052467 -0.03852826  0.06321152  0.0038594 ]]
<NDArray 2x10 @cpu(0)>

In [5]:
class FancyMLP(nn.Block):
    """Block demonstrating constant parameters, layer reuse and control flow.

    The forward pass applies one dense layer twice (so both applications
    share the same parameters), multiplies by a constant random weight in
    between, rescales the result based on its norm and returns its sum.
    """

    def __init__(self, **kwargs):
        super(FancyMLP, self).__init__(**kwargs)
        # A random weight created with get_constant is not updated during
        # training, i.e. it is a constant parameter.
        self.rand_weight = self.params.get_constant(
            'rand_weight', nd.random.uniform(shape=(20, 20)))
        self.dense = nn.Dense(20, activation='relu')

    def forward(self, x):
        """Return the sum of the transformed and rescaled input as an NDArray."""
        x = self.dense(x)
        # Use the constant parameter together with NDArray's relu and dot.
        x = nd.relu(nd.dot(x, self.rand_weight.data()) + 1)
        # Reuse the dense layer; equivalent to two dense layers that share
        # their parameters.
        x = self.dense(x)
        # Control flow: asscalar() returns a Python scalar for the comparison.
        # The rescaling is written out-of-place (x = x / 2, not x /= 2)
        # because MXNet does not support in-place operations on arrays that
        # are being recorded by autograd; the computed values are identical.
        while x.norm().asscalar() > 1:
            x = x / 2
        if x.norm().asscalar() < 0.8:
            x = x * 10
        return x.sum()



In [6]:
net = FancyMLP()
net.initialize()
# forward() ends with x.sum(), so the output below holds a single value.
net(X)




[18.571953]
<NDArray 1 @cpu(0)>

In [7]:
class NestMLP(nn.Block):
    """Block that nests a ``Sequential`` of two dense layers before a third one.

    The nested part holds a 64-unit and a 32-unit dense layer; it is followed
    by a 16-unit dense layer. All three layers use ReLU activation.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        stacked = nn.Sequential()
        stacked.add(nn.Dense(units=64, activation='relu'))
        stacked.add(nn.Dense(units=32, activation='relu'))
        self.net = stacked
        self.dense = nn.Dense(units=16, activation='relu')

    def forward(self, x):
        """Run ``x`` through the nested layers, then through the last dense layer."""
        features = self.net(x)
        return self.dense(features)

# Chain a NestMLP, a plain Dense layer and a FancyMLP in one Sequential.
net = nn.Sequential()
net.add(NestMLP(), nn.Dense(20), FancyMLP())

net.initialize()
# The last stage is a FancyMLP, so the output below holds a single value.
net(X)




[24.86621]
<NDArray 1 @cpu(0)>

In [8]:
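# A model can be constructed by subclassing the Block class.
# The Sequential class itself inherits from the Block class.
# Although Sequential makes model construction simpler, subclassing Block directly greatly extends the flexibility of model construction.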



In [9]:
#model init
from mxnet import init, nd
from mxnet.gluon import nn

# Two-layer network used by the following cells to demonstrate parameter
# access and initialization.
net = nn.Sequential()
net.add(nn.Dense(256, activation='relu'))
net.add(nn.Dense(10))
net.initialize()  # use the default initialization method

X = nd.random.uniform(shape=(2, 20))
Y = net(X)  # forward computation



In [10]:
# Parameters of the first layer and the type of the container holding them.
net[0].params, type(net[0].params)



(dense10_ (
   Parameter dense10_weight (shape=(256, 20), dtype=float32)
   Parameter dense10_bias (shape=(256,), dtype=float32)
 ),
 mxnet.gluon.parameter.ParameterDict)

In [12]:
# Look up the weight by its full name and through the layer's `weight`
# attribute; the output below shows the same parameter both times.
# NOTE(review): the 'dense10' prefix appears to be assigned automatically per
# kernel session, so this hard-coded name may differ on a fresh run - confirm.
net[0].params['dense10_weight'], net[0].weight


(Parameter dense10_weight (shape=(256, 20), dtype=float32),
 Parameter dense10_weight (shape=(256, 20), dtype=float32))

In [13]:
# Values of the first layer's weight parameter (a 256x20 NDArray, see output).
net[0].weight.data()




[[-0.06046963  0.00624272 -0.03472826 ... -0.01759475  0.0686483
  -0.06360765]
 [-0.01273243 -0.02659053 -0.04718638 ...  0.02570021  0.02275064
  -0.0166979 ]
 [-0.03555115  0.01875034  0.02322027 ...  0.06564643  0.04601197
  -0.01915742]
 ...
 [ 0.03173313  0.01789995  0.02519771 ... -0.06176154 -0.03986754
  -0.04898471]
 [ 0.00564718  0.04665586 -0.00028374 ...  0.05332779  0.02100175
  -0.06427249]
 [ 0.0438781   0.05357236  0.02753124 ...  0.04084889 -0.01963295
   0.05668835]]
<NDArray 256x20 @cpu(0)>

In [14]:
# Gradient of the first layer's weight; it is all zeros in the output below.
net[0].weight.grad()


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
<NDArray 256x20 @cpu(0)>

In [15]:
# Bias values of the second (output) layer.
net[1].bias.data()



[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 10 @cpu(0)>

In [16]:
# Gather the parameters of every layer in net into a single dictionary.
net.collect_params()

sequential2_ (
  Parameter dense10_weight (shape=(256, 20), dtype=float32)
  Parameter dense10_bias (shape=(256,), dtype=float32)
  Parameter dense11_weight (shape=(10, 256), dtype=float32)
  Parameter dense11_bias (shape=(10,), dtype=float32)
)

In [17]:
# Select parameters by a name pattern; with '.*weight' the output below lists
# only the two weight parameters.
net.collect_params('.*weight')



sequential2_ (
  Parameter dense10_weight (shape=(256, 20), dtype=float32)
  Parameter dense11_weight (shape=(10, 256), dtype=float32)
)

In [18]:
# Initializing a model that has already been initialized requires
# force_reinit=True.
net.initialize(init=init.Normal(sigma=0.01), force_reinit=True)
# First row of the first layer's weight after the re-initialization.
net[0].weight.data()[0]




[ 0.00456489 -0.00413096 -0.00670578 -0.01211046 -0.01173558 -0.00717132
 -0.00955144  0.00219873 -0.001119    0.00037409 -0.0045823   0.00580286
  0.00251145 -0.01799514  0.00045524 -0.00941019 -0.00045153 -0.0007181
 -0.00303942  0.00052552]
<NDArray 20 @cpu(0)>

In [19]:
# Re-initialize with a constant value of 1; the row shown below is all ones.
net.initialize(init=init.Constant(1), force_reinit=True)
net[0].weight.data()[0]



[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
<NDArray 20 @cpu(0)>

In [20]:
# Re-initialize one specific parameter (the first layer's weight) with Xavier
# by calling initialize on the parameter itself.
net[0].weight.initialize(init=init.Xavier(), force_reinit=True)
net[0].weight.data()[0]




[-0.10389185  0.07822403 -0.1289716  -0.1410463  -0.07610903 -0.10696874
 -0.01996909 -0.07058676  0.00648634  0.10942626  0.08052795 -0.09453681
  0.13527533 -0.01967503 -0.11284603 -0.05156991 -0.11588816  0.02459455
  0.02644953  0.12870744]
<NDArray 20 @cpu(0)>

In [21]:
#define init function
class MyInit(init.Initializer):
    """Custom initializer that zeroes every sampled weight of small magnitude."""

    def _init_weight(self, name, data):
        """Fill ``data`` in place and report which parameter is being set.

        Values are sampled uniformly between -10 and 10; every entry whose
        absolute value is below 5 is then replaced by zero.
        """
        shape = data.shape
        print('Init', name, shape)
        data[:] = nd.random.uniform(low=-10, high=10, shape=shape)
        # 1 where the magnitude is at least 5, 0 elsewhere.
        keep_mask = data.abs() >= 5
        data *= keep_mask

# Apply the custom initializer; it prints one "Init" line per weight parameter
# (see the output below).
net.initialize(MyInit(), force_reinit=True)
net[0].weight.data()[0]



Init dense10_weight (256, 20)
Init dense11_weight (10, 256)



[ 7.142498  -9.206991  -7.6536884 -0.        -0.        -0.
 -0.        -5.294743  -0.        -8.822595   0.        -7.5231113
 -0.         7.135105   0.        -0.         0.        -0.
 -0.        -9.560527 ]
<NDArray 20 @cpu(0)>

In [22]:
# Overwrite the parameter's values directly: every weight is shifted by 1.
net[0].weight.set_data(net[0].weight.data() + 1)
net[0].weight.data()[0]




[ 8.142498  -8.206991  -6.6536884  1.         1.         1.
  1.        -4.294743   1.        -7.8225946  1.        -6.5231113
  1.         8.135105   1.         1.         1.         1.
  1.        -8.560527 ]
<NDArray 20 @cpu(0)>

In [23]:
net = nn.Sequential()
# Layer whose parameters are reused by another layer below.
shared = nn.Dense(8, activation='relu')
# The third layer is constructed with shared's parameters (params=shared.params)
# instead of being given parameters of its own.
net.add(nn.Dense(8, activation='relu'),
        shared,
        nn.Dense(8, activation='relu', params=shared.params),
        nn.Dense(10))
net.initialize()

X = nd.random.uniform(shape=(2, 20))
net(X)

# Element-wise comparison of the first weight rows of layers 1 and 2; the
# output below is all ones, i.e. the rows are equal.
net[1].weight.data()[0] == net[2].weight.data()[0]



[1. 1. 1. 1. 1. 1. 1. 1.]
<NDArray 8 @cpu(0)>

In [1]:
#lazy init
from mxnet import init, nd
from mxnet.gluon import nn

class MyInit(init.Initializer):
    """Initializer that only reports which parameter it was asked to set."""

    def _init_weight(self, name, data):
        """Print the parameter's name and shape; ``data`` is left untouched."""
        shape = data.shape
        print('Init', name, shape)
        # The actual initialization logic is omitted here.

net = nn.Sequential()
net.add(nn.Dense(256, activation='relu'),
        nn.Dense(10))

# The layers are created without in_units. This cell prints no "Init" line;
# the messages first appear after the forward pass in the next cell.
net.initialize(init=MyInit())



In [2]:
X = nd.random.uniform(shape=(2, 20))
# The first forward pass is where the "Init" lines, with complete shapes, are
# printed (see the output below).
Y = net(X)


Init dense0_weight (256, 20)
Init dense1_weight (10, 256)


In [3]:
# When all parameter shapes are already known, initialization is not deferred,
# e.g. when re-initializing a model that has already been initialized.
net.initialize(init=MyInit(), force_reinit=True)


Init dense0_weight (256, 20)
Init dense1_weight (10, 256)


In [4]:
# When in_units is specified for every layer, initialization happens as soon
# as initialize() is called (see the "Init" output below).
net = nn.Sequential()
net.add(nn.Dense(256, in_units=20, activation='relu'))
net.add(nn.Dense(10, in_units=256))

net.initialize(init=MyInit())


Init dense2_weight (256, 20)
Init dense3_weight (10, 256)


In [5]:
#no model parameters layer
class CenteredLayer(nn.Block):
    """Custom layer without model parameters that subtracts the input's mean."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, x):
        """Return ``x`` with its mean subtracted from every element."""
        centre = x.mean()
        return x - centre


In [6]:
layer = CenteredLayer()
# The mean of the input is 3, so the result below is [-2, -1, 0, 1, 2].
layer(nd.array([1, 2, 3, 4, 5]))




[-2. -1.  0.  1.  2.]
<NDArray 5 @cpu(0)>

In [7]:
# Use the custom layer as one stage of a larger model.
net = nn.Sequential()
net.add(nn.Dense(128),
        CenteredLayer())



In [8]:
net.initialize()
y = net(nd.random.uniform(shape=(4, 8)))
# Mean of the centered output as a Python scalar; the value printed below is
# very close to zero.
y.mean().asscalar()



4.3655746e-11

In [9]:
#layer has parameters
from mxnet import gluon
# Create an empty parameter dictionary and use get() to add a parameter named
# 'param2' with shape (2, 3).
params = gluon.ParameterDict()
params.get('param2', shape=(2, 3))
params


(
  Parameter param2 (shape=(2, 3), dtype=<class 'numpy.float32'>)
)

In [10]:
class MyDense(nn.Block):
    """Custom fully connected layer with ReLU that owns its weight and bias.

    Args:
        units: number of outputs of this layer.
        in_units: number of inputs of this layer.
        **kwargs: forwarded to the ``nn.Block`` constructor.
    """

    def __init__(self, units, in_units, **kwargs):
        super().__init__(**kwargs)
        weight_shape = (in_units, units)
        bias_shape = (units,)
        self.weight = self.params.get('weight', shape=weight_shape)
        self.bias = self.params.get('bias', shape=bias_shape)

    def forward(self, x):
        """Return ``relu(dot(x, weight) + bias)``."""
        weight = self.weight.data()
        bias = self.bias.data()
        return nd.relu(nd.dot(x, weight) + bias)



In [11]:
dense = MyDense(units=3, in_units=5)
# The two parameters declared in __init__; the output below shows them with
# the block's name prefix.
dense.params



mydense0_ (
  Parameter mydense0_weight (shape=(5, 3), dtype=<class 'numpy.float32'>)
  Parameter mydense0_bias (shape=(3,), dtype=<class 'numpy.float32'>)
)

In [12]:
dense.initialize()
# Forward pass on a random batch of 2 examples with 5 features each.
dense(nd.random.uniform(shape=(2, 5)))




[[0.         0.         0.08819557]
 [0.04655819 0.         0.04173457]]
<NDArray 2x3 @cpu(0)>

In [13]:
# Stack two custom layers; the 8 outputs of the first layer match the
# in_units of the second.
net = nn.Sequential()
net.add(MyDense(8, in_units=64),
        MyDense(1, in_units=8))
net.initialize()
net(nd.random.uniform(shape=(2, 64)))




[[0.09766567]
 [0.08787445]]
<NDArray 2x1 @cpu(0)>

In [14]:
x = nd.ones(3)
# Store the NDArray in a file named 'x'.
nd.save('x', x)


In [15]:
# Read the file back; the result shown below is a list containing the stored
# array.
x2 = nd.load('x')
x2


[
 [1. 1. 1.]
 <NDArray 3 @cpu(0)>]

In [16]:
y = nd.zeros(4)
# A list of NDArrays is saved to one file and read back in the same order.
nd.save('xy', [x, y])
x2, y2 = nd.load('xy')
(x2, y2)


(
 [1. 1. 1.]
 <NDArray 3 @cpu(0)>,
 
 [0. 0. 0. 0.]
 <NDArray 4 @cpu(0)>)

In [17]:
# A dictionary mapping strings to NDArrays is saved and loaded the same way.
mydict = {'x': x, 'y': y}
nd.save('mydict', mydict)
mydict2 = nd.load('mydict')
mydict2


{'x': 
 [1. 1. 1.]
 <NDArray 3 @cpu(0)>,
 'y': 
 [0. 0. 0. 0.]
 <NDArray 4 @cpu(0)>}

In [18]:
class MLP(nn.Block):
    """Two-layer perceptron: 256-unit ReLU hidden layer, 10-unit output layer."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.hidden = nn.Dense(units=256, activation='relu')
        self.output = nn.Dense(units=10)

    def forward(self, x):
        """Apply the hidden layer to ``x`` and feed the result to the output layer."""
        hidden_out = self.hidden(x)
        return self.output(hidden_out)

net = MLP()
net.initialize()
X = nd.random.uniform(shape=(2, 20))
# Run one forward pass; Y is compared with the reloaded model's output in a
# later cell.
Y = net(X)



In [19]:
filename = 'mlp.params'
# Write the model's parameters to the file.
net.save_parameters(filename)



In [20]:
# Build a second MLP and read its parameters from the saved file instead of
# calling initialize().
net2 = MLP()
net2.load_parameters(filename)



In [21]:
Y2 = net2(X)
# Element-wise comparison; the all-ones output below means both models give
# the same result for X.
Y2 == Y




[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]
<NDArray 2x10 @cpu(0)>

In [22]:
#use GPU 

import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn

# Context objects for the CPU, the first GPU (index 0) and the GPU with
# index 1.
mx.cpu(), mx.gpu(), mx.gpu(1)


(cpu(0), gpu(0), gpu(1))

In [23]:
# No ctx is given; the output below shows the array on cpu(0).
x = nd.array([1, 2, 3])
x



[1. 2. 3.]
<NDArray 3 @cpu(0)>

In [24]:
# Device on which x is stored.
x.context


cpu(0)

In [25]:
# Pass ctx to create the array on a GPU directly.
a = nd.array([1, 2, 3], ctx=mx.gpu())
a



[1. 2. 3.]
<NDArray 3 @gpu(0)>

In [28]:
# Using mx.gpu(1) here would need a second GPU; this cell uses gpu(0) instead.
B = nd.random.uniform(shape=(2, 3), ctx=mx.gpu(0))
B



[[0.6686509  0.17409194 0.3850025 ]
 [0.24678314 0.35134333 0.8404298 ]]
<NDArray 2x3 @gpu(0)>

In [27]:
# Copy x (stored on the CPU) to the GPU.
y = x.copyto(mx.gpu())
y



[1. 2. 3.]
<NDArray 3 @gpu(0)>

In [29]:
# as_in_context is another way to obtain x's values on the GPU.
z = x.as_in_context(mx.gpu())
z




[1. 2. 3.]
<NDArray 3 @gpu(0)>

In [30]:
# y already lives on gpu(0); here as_in_context returns y itself (output: True).
y.as_in_context(mx.gpu()) is y



True

In [31]:
# copyto returns a different array object even though y is already on gpu(0)
# (output: False).
y.copyto(mx.gpu()) is y

False

In [32]:
# Both operands are on gpu(0), and so is the result shown below.
(z + 2).exp() * y


[ 20.085537 109.1963   445.2395  ]
<NDArray 3 @gpu(0)>

In [33]:
net = nn.Sequential()
net.add(nn.Dense(1))
# Pass ctx to initialize() so that the model parameters are placed on the GPU.
net.initialize(ctx=mx.gpu())



In [34]:
# The input y is on gpu(0); the output shown below is on gpu(0) as well.
net(y)




[[-0.04389585]
 [-0.0877917 ]
 [-0.13168755]]
<NDArray 3x1 @gpu(0)>

In [35]:
# Confirm that the layer's weight is stored on gpu(0).
net[0].weight.data()




[[-0.04389585]]
<NDArray 1x1 @gpu(0)>

In [None]:
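# MXNet lets you specify the device used for storage and computation, such as the CPU with main memory or a GPU with its video memory. By default, MXNet creates data in main memory and uses the CPU for computation.
# MXNet requires all input data of a computation to reside either in main memory or in the video memory of the same GPU.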

