In [1]:
# Example of model parallelism, using MXNet compiled with USE_DIST_KVSTORE=1
import mxnet as mx
from mxnet import gluon, init, nd, autograd
from mxnet.gluon import nn, rnn
from mxnet.contrib import text
import numpy as np
import time
import os
import logging

In [2]:
class LR1(nn.Block):
    """First model stage: a ``Dense(2)`` layer followed by another ``Dense(2)``."""

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to ``nn.Block`` and create the two dense layers."""
        super(LR1, self).__init__(**kwargs)
        self.fc1 = nn.Dense(2)
        self.fc2 = nn.Dense(2)

    def forward(self, inputs):
        """Return ``fc2(fc1(inputs))``."""
        return self.fc2(self.fc1(inputs))
    
class LR2(nn.Block):
    """Second model stage: a ``Dense(2)`` layer followed by a ``Dense(1)`` output layer."""

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to ``nn.Block`` and create the two dense layers."""
        super(LR2, self).__init__(**kwargs)
        self.fc1 = nn.Dense(2)
        self.fc2 = nn.Dense(1)

    def forward(self, inputs):
        """Return ``fc2(fc1(inputs))``."""
        return self.fc2(self.fc1(inputs))
    
class LinearRegression(object):
    """Regression model split into two halves that live on different contexts.

    ``net1`` (an ``LR1``) is initialized on ``ctx1`` and ``net2`` (an ``LR2``)
    on ``ctx2``.  During training the output of ``net1`` is copied to ``ctx2``
    before being fed to ``net2``.
    """

    def __init__(self, ctx1, ctx2):
        """Create both sub-networks and initialize each on its own context.

        ctx1 -- context on which ``net1`` is initialized
        ctx2 -- context on which ``net2`` is initialized
        """
        self.ctx1 = ctx1
        self.ctx2 = ctx2
        self.net1 = LR1()
        self.net1.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx1)
        self.net2 = LR2()
        self.net2.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx2)

    def accuracy(self, X, Y):
        """Return the mean squared error between ``X`` and ``Y``.

        Despite the method name this is an error metric, and it is the plain
        MSE (no square root is taken).  ``fit`` calls this with the
        predictions as ``X`` and the targets as ``Y``, so they are handed to
        the metric in those roles; the MSE value itself does not depend on
        which of the two is treated as the label.

        Returns whatever ``mx.metric.MSE().get()`` returns after one update.
        """
        mse = mx.metric.MSE()
        mse.update(labels=Y, preds=X)
        return mse.get()

    def _make_trainer(self, net, optimisation, learning_rate):
        """Build a ``gluon.Trainer`` over ``net``'s parameters (gradient clip 5, 'device' kvstore)."""
        return gluon.Trainer(
            net.collect_params(),
            optimisation,
            {'learning_rate': learning_rate, 'clip_gradient': 5},
            kvstore='device',
        )

    def fit(self, X, Y, epochs=10, learning_rate=0.001, every=1, optimisation='sgd'):
        """Train both halves on the full data set for ``epochs`` passes.

        X             -- input array fed to ``net1``
        Y             -- target array compared against ``net2``'s output
        epochs        -- number of full-batch update steps
        learning_rate -- learning rate given to both trainers
        every         -- print ``(epoch, metric)`` every this many epochs;
                         0 disables reporting
        optimisation  -- optimizer name passed to ``gluon.Trainer``

        Stores ``X``, ``Y``, the two trainers and the loss on ``self``.
        """
        self.X = X
        self.Y = Y
        self.trainer1 = self._make_trainer(self.net1, optimisation, learning_rate)
        self.trainer2 = self._make_trainer(self.net2, optimisation, learning_rate)
        self.loss = gluon.loss.L2Loss()

        # The whole data set is one batch, so the step size must follow the
        # number of rows actually passed in rather than a fixed constant.
        batch_size = X.shape[0]

        for epoch in range(1, epochs + 1):
            with autograd.record():
                output1 = self.net1(self.X)
                # Move the intermediate activations to the second context.
                output2 = output1.copyto(self.ctx2)
                predicted = self.net2(output2)
                l = self.loss(predicted, self.Y)
            # The metric is only needed for reporting, so it is computed
            # outside the recording scope and only on reporting epochs.
            if every and epoch % every == 0:
                print(epoch, self.accuracy(predicted, self.Y))
            l.backward()
            self.trainer1.step(batch_size, ignore_stale_grad=False)
            self.trainer2.step(batch_size, ignore_stale_grad=False)

In [3]:
def synthetic_data(w, b, num_examples):
    """Generate ``y = X w + b + noise``.

    ``X`` is drawn from a normal distribution with scale 1 and shape
    ``(num_examples, len(w))``; the noise is normal with scale 0.01.
    Returns the pair ``(X, y)``.
    """
    num_features = len(w)
    inputs = nd.random.normal(scale=1, shape=(num_examples, num_features))
    targets = nd.dot(inputs, w) + b
    noise = nd.random.normal(scale=0.01, shape=targets.shape)
    targets += noise
    return inputs, targets

# Ground-truth weights and bias of the linear model used to generate the data.
true_w = nd.array([2, -3.4])
true_b = 4.2
# Draw 1000 noisy examples from that model.
features, labels = synthetic_data(w=true_w, b=true_b, num_examples=1000)

In [4]:
# One context per model half; copying data to gpu(1) needs a second GPU.
ctx1 = mx.gpu(0)
ctx2 = mx.gpu(1)
# -1 lets reshape infer the number of examples, so this cell keeps working
# when the data set size in the previous cell is changed.
y = labels.copyto(ctx2).reshape(-1, 1)
x = features.copyto(ctx1).reshape(-1, 2)

In [5]:
# Build the two-half model: net1 is initialized on ctx1 and net2 on ctx2.
lr = LinearRegression(ctx1=ctx1, ctx2=ctx2)

In [None]:
# Train for 100 full-batch steps, printing the metric after every epoch.
lr.fit(x, y, epochs=100, learning_rate=0.01, every=1)

In [7]:
# Predict for the first five rows: run them through net1, copy the result to
# lr.ctx2, then run it through net2.
lr.net2(lr.net1(x[:5]).copyto(lr.ctx2))


[[ 5.9986997]
 [ 2.2654257]
 [12.184246 ]
 [ 2.188433 ]
 [11.668544 ]]
<NDArray 5x1 @gpu(1)>

In [8]:
# First five rows of the target array y.
y[:5]


[[ 6.000587 ]
 [ 2.2676215]
 [12.192286 ]
 [ 2.1933131]
 [11.677933 ]]
<NDArray 5x1 @gpu(1)>

In [7]:
# NOTE(review): `pr` is not assigned in any visible cell, so on a fresh kernel
# this line raises NameError (as the recorded output shows). The key
# 'dense4_weight' also looks like it depends on how many Dense layers were
# created earlier in the session — TODO confirm, then either define `pr`
# explicitly or remove this scratch cell.
# As written, the statement overwrites the parameter's values in place with the
# `.grad` attribute of its data array.
pr['dense4_weight'].data()[:] = pr['dense4_weight'].data().grad

NameError: name 'pr' is not defined

In [32]:
# NOTE(review): depends on `pr`, which is not assigned in any visible cell —
# TODO define it or drop this cell.
# Displays the current values of the 'dense4_weight' parameter.
pr['dense4_weight'].data()


[[0. 0.]
 [0. 0.]]
<NDArray 2x2 @gpu(0)>