In [2]:
import torch
import torchvision
import torchvision.transforms as T
import time
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch.utils.data import sampler
import torchvision.datasets as dset
from torch.utils.data import DataLoader
from torch.autograd import Variable
import numpy as np
import copy
import networkx as nx 
import random
import torch.optim as optim
from scipy.stats import t

import os
# Restrict this process to physical GPU 3; the notebook later addresses the
# selected GPU as "cuda:0".
# NOTE(review): this only takes effect if CUDA has not been initialised yet in
# this kernel — confirm the cell runs before any CUDA call.
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [80]:
# Use the first visible GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

manualSeed = 1

# Seed NumPy, Python's `random` and every torch generator with the same value,
# in this fixed order.
for seed_rng in (np.random.seed,
                 random.seed,
                 torch.manual_seed,
                 torch.cuda.manual_seed,
                 torch.cuda.manual_seed_all):
    seed_rng(manualSeed)

# cuDNN is switched off completely; the remaining two flags additionally
# request non-benchmarked, deterministic kernels.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False

cpu


In [12]:
# --- topology -------------------------------------------------------------
DEVICE_NUM = 100                               # total number of simulated edge devices
SERVER_NUM = 10                                # number of edge servers
DEVICE_PER_SERVER = DEVICE_NUM // SERVER_NUM   # devices attached to each server
ACTIVE_PER_SERVER = 3                          # devices selected per server in each epoch

# --- local data -----------------------------------------------------------
# 60000 is presumably the MNIST training-set size — TODO confirm.
DATASIZE_LOCAL = 60000 // DEVICE_NUM           # training samples kept per device
LABEL_DIVERSITY = 6                            # distinct digit labels sampled per device
BATCH_SIZE = 32                                # mini-batch size of the per-device loaders

# --- optimisation schedule ------------------------------------------------
STEP_NUM = 5                                   # upper bound on local steps per active device
CLOUD_STEP_NUM = 1                             # not referenced by the visible cells

In [83]:
# NOTE(review): only NumPy is reseeded here; the per-device label sets below
# come from Python's `random`, so re-running this cell on its own draws a
# different partition.
np.random.seed(seed=0)

# generate non-IID datasets stored on edge devices
# Both splits use the same tensor conversion followed by normalisation.
mnist_transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.1307,), (0.3081,)),
])

trainset = torchvision.datasets.MNIST('.data/', train=True, download=True,
                                      transform=mnist_transform)
testset = torchvision.datasets.MNIST('.data/', train=False, download=True,
                                     transform=mnist_transform)


def _indices_with_labels(dataset, wanted_labels):
    """Return the positions in `dataset.targets` whose label is in `wanted_labels`."""
    keep = torch.zeros_like(dataset.targets, dtype=torch.bool)
    for wanted in wanted_labels:
        keep |= dataset.targets == wanted
    return np.flatnonzero(keep.numpy())


trainloader = []      # one shuffled training loader per device
testloader_sub = []   # one test loader per device, limited to that device's labels
for device_ID in range(DEVICE_NUM):
    # Each device only sees LABEL_DIVERSITY of the ten digit classes.
    label_set = random.sample(range(0, 10), LABEL_DIVERSITY)

    # Training shard: a random DATASIZE_LOCAL-sized draw from the matching samples.
    train_candidates = _indices_with_labels(trainset, label_set)
    indx = np.random.permutation(train_candidates)[:DATASIZE_LOCAL]
    trainloader.append(
        DataLoader(torch.utils.data.Subset(trainset, indx),
                   batch_size=BATCH_SIZE, shuffle=True, num_workers=2))

    # Test shard: every test sample carrying one of the device's labels.
    test_candidates = _indices_with_labels(testset, label_set)
    testloader_sub.append(
        DataLoader(torch.utils.data.Subset(testset, test_candidates),
                   batch_size=BATCH_SIZE, shuffle=True, num_workers=2))

# Global evaluation loader over the full test split.
testloader = DataLoader(testset, batch_size=10000,
                        shuffle=False, num_workers=8)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to .data/MNIST/raw/train-images-idx3-ubyte.gz


ImportError: FloatProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [84]:
def avg_dict(para_set):
    """Average a sequence of dicts key by key.

    Args:
        para_set: non-empty sequence of dicts; the keys of the first dict
            decide which entries are averaged, and every dict must hold them.

    Returns:
        A new dict mapping each key to the mean of its values. The input is
        deep-copied first and is never modified.

    Raises:
        IndexError: if `para_set` is empty.
    """
    snapshots = copy.deepcopy(para_set)
    count = float(len(snapshots))
    averaged = {}
    for key in snapshots[0]:
        total = 0
        for snapshot in snapshots:
            total = total + snapshot[key]
        averaged[key] = total / count
    return averaged

def weighted_dict(para_set, weight):
    """Compute a key-wise weighted sum of a sequence of dicts.

    Args:
        para_set: non-empty sequence of dicts sharing the keys of the first one.
        weight: indexable of factors; `weight[i]` scales every value of
            `para_set[i]`.

    Returns:
        A new dict mapping each key of the first dict to
        `sum(para_set[i][key] * weight[i])`. The input is deep-copied first
        and is never modified.
    """
    snapshots = copy.deepcopy(para_set)
    scaled = [
        {name: value * weight[position] for name, value in snapshot.items()}
        for position, snapshot in enumerate(snapshots)
    ]
    combined = {}
    for name in scaled[0]:
        combined[name] = sum(entry[name] for entry in scaled)
    return combined

def sub_dict(primal_set1, primal_set2):
    """Subtract one dict from another key by key.

    Args:
        primal_set1: dict whose keys define the result.
        primal_set2: dict of values to subtract; a key missing here is
            treated as 0.

    Returns:
        A new dict with `primal_set1[k] - primal_set2.get(k, 0)` for every key
        of `primal_set1`. Both inputs are deep-copied and left untouched.
    """
    minuend = copy.deepcopy(primal_set1)
    subtrahend = copy.deepcopy(primal_set2)
    difference = {}
    for key, value in minuend.items():
        difference[key] = value - subtrahend.get(key, 0)
    return difference

def mul_dict(primal_set, ratio):
    """Scale every value of a dict by `ratio`.

    Args:
        primal_set: dict of values supporting `value * ratio`.
        ratio: multiplicative factor.

    Returns:
        A new dict with the same keys and scaled values; the input is
        deep-copied and left untouched.
    """
    scaled = {}
    for key, value in copy.deepcopy(primal_set).items():
        scaled[key] = value * ratio
    return scaled

def l2_reg_para(primal_set):
    """Return the sum of squared norms of a collection of tensors.

    Args:
        primal_set: non-empty, deep-copyable iterable of tensors.

    Returns:
        A zero-dimensional tensor holding `sum(torch.norm(x) ** 2)`. The
        computation runs on deep copies of the inputs.
    """
    tensors = copy.deepcopy(primal_set)
    squared_norms = []
    for tensor in tensors:
        squared_norms.append(torch.norm(tensor) ** 2)
    return torch.stack(squared_norms).sum()

def sub_para(primal_set1, primal_set2):
    """Subtract two sequences element by element.

    Args:
        primal_set1: deep-copyable sequence of minuends.
        primal_set2: deep-copyable sequence of subtrahends.

    Returns:
        A list of `a - b` for each aligned pair; pairing stops at the shorter
        input. Both inputs are deep-copied and left untouched.
    """
    minuends = copy.deepcopy(primal_set1)
    subtrahends = copy.deepcopy(primal_set2)
    differences = []
    for minuend, subtrahend in zip(minuends, subtrahends):
        differences.append(minuend - subtrahend)
    return differences

In [85]:
input_size = 784
hidden_sizes = [128, 64]
output_size = 10


def _make_mlp():
    """Build the two-hidden-layer ReLU MLP with a log-softmax output layer."""
    first_hidden, second_hidden = hidden_sizes
    return nn.Sequential(
        nn.Linear(input_size, first_hidden),
        nn.ReLU(),
        nn.Linear(first_hidden, second_hidden),
        nn.ReLU(),
        nn.Linear(second_hidden, output_size),
        nn.LogSoftmax(dim=1),
    )


# `net` is the trainable model; `net_const` has the same architecture but its
# parameters are frozen so it can hold reference weights without tracking
# gradients.
net = _make_mlp()
net_const = _make_mlp()
for frozen_param in net_const.parameters():
    frozen_param.requires_grad_(False)

# Move both models to the GPU (as float tensors) whenever one is visible.
if torch.cuda.device_count() != 0:
    gpu_dtype = torch.cuda.FloatTensor
    net = net.cuda().type(gpu_dtype)
    net_const = net_const.cuda().type(gpu_dtype)

In [86]:
# Every replica below starts from an independent deep copy of the freshly
# initialised `net` weights.
para = net.state_dict()

# One state dict per edge device.
para_device = [copy.deepcopy(para) for _ in range(DEVICE_NUM)]

# One state dict per edge server, plus a same-shaped per-server buffer.
para_server = [copy.deepcopy(para) for _ in range(SERVER_NUM)]
mean_device2server = [copy.deepcopy(para) for _ in range(SERVER_NUM)]

In [13]:
# Simulated hierarchical training: in every epoch each server selects a few
# devices, each selected device runs a random number of proximal SGD steps
# from its own weights, the server weights are then pulled towards the device
# average, and the mean of the server weights is evaluated on the test split.
np.random.seed(seed=1)
learning_rate = 0.005
num_epochs = 500
criterion = nn.NLLLoss()
# NOTE(review): this single optimizer (momentum=0.9) is reused for every
# simulated device, so its momentum buffers carry over from one device's
# update to the next — confirm this is intended.
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0001)
RHO = 1   # weight of the proximal penalty ||w - z||^2
lrz = 1   # step size of the server update, decayed once per epoch
print('(sync-cloud architecture) Device number: %d, server number: %d, training for %d epochs with learning rate %f, RHO %f, lrz %f'
      % (DEVICE_NUM, SERVER_NUM, num_epochs, learning_rate, RHO, lrz))
runtime_record = 0                            # accumulated simulated runtime
wake_up_time_server = np.zeros(SERVER_NUM)    # simulated finish time of each server
# Queried once instead of on every batch; the value cannot change mid-run.
use_cuda = torch.cuda.device_count() != 0
for epoch in range(num_epochs):
    print('Starting epoch %d / %d' % (epoch+1, num_epochs))
    start_time = time.time()

    for server_ID in range(SERVER_NUM):
        # Draw a simulated wake-up time per device and keep the
        # ACTIVE_PER_SERVER devices with the smallest draws.
        wake_up_time = np.random.exponential(scale=0.5, size=DEVICE_PER_SERVER)
        idx = np.argpartition(wake_up_time, ACTIVE_PER_SERVER)
        ACTIVE_DEVICE = idx[:ACTIVE_PER_SERVER]
        # `net_const` holds this server's weights; z_n is the fixed anchor of
        # the proximal penalty.
        net_const.load_state_dict(copy.deepcopy(para_server[server_ID]))
        z_n = list(net_const.parameters())
        # Devices are visited in ascending index order, which fixes the order
        # in which the random generators are consumed.
        for device_ind in range(DEVICE_PER_SERVER):
            if device_ind not in ACTIVE_DEVICE:
                continue
            device_ID = device_ind + server_ID * DEVICE_PER_SERVER
            # Each active device performs between 1 and STEP_NUM local steps.
            stopping_iter = random.randint(1, STEP_NUM)
            net.load_state_dict(copy.deepcopy(para_device[device_ID]))
            for iter_count, data in enumerate(trainloader[device_ID], 1):
                if use_cuda:
                    inputs, labels = data[0].cuda(), data[1].cuda()
                else:
                    inputs, labels = data[0], data[1]
                # Flatten each image into a vector for the fully connected net.
                inputs = inputs.view(inputs.shape[0], -1)
                optimizer.zero_grad()
                outputs = net(inputs)
                fitting_loss = criterion(outputs, labels)
                # Squared distance between the device weights and the server anchor.
                penalty = sum(torch.norm(Ww - Zz) ** 2
                              for Ww, Zz in zip(net.parameters(), z_n))
                loss = fitting_loss + RHO * penalty
                loss.backward()
                optimizer.step()
                if iter_count == stopping_iter:
                    break
            para_device[device_ID] = copy.deepcopy(net.state_dict())
            # Add a simulated compute time proportional to the number of steps.
            wake_up_time[device_ind] += stopping_iter*np.random.exponential(scale=0.5)
        # A server is done once its slowest active device is done.
        wake_up_time_server[server_ID] = max(wake_up_time[ACTIVE_DEVICE])

    # The epoch's simulated duration is set by the slowest server.
    runtime_record += max(wake_up_time_server)
    lrz = lrz*(1-0.0001)
    # Server update: z <- z - lrz * (z - mean of device weights).
    # The device average does not depend on the server, so it is computed once
    # per epoch rather than once per server.
    # NOTE(review): the average runs over ALL devices, not only the devices
    # attached to each server — confirm this is intended.
    device_avg = avg_dict(para_device)
    for server_ID in range(SERVER_NUM):
        para_server[server_ID] = sub_dict(
            para_server[server_ID],
            mul_dict(sub_dict(para_server[server_ID], device_avg), lrz))

    # Evaluate the mean of all server weights on the full test split.
    para_update = avg_dict(para_server)
    net_const.load_state_dict(para_update)
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            if use_cuda:
                images, labels = data[0].cuda(), data[1].cuda()
            else:
                images, labels = data[0], data[1]
            images = images.view(images.shape[0], -1)
            outputs = net_const(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        print('[%d, %d] server test accuracy: %.2f %% , runtime: %.2f'% (num_epochs + 1, epoch + 1, 100 * float(correct) / total, runtime_record))

    print("--- %s seconds ---" % (time.time() - start_time))

Training for 250 epochs with learning rate 0.010000
Starting epoch 1 / 250
[251, 1] server test accuracy: 12.60 %


AssertionError: Torch not compiled with CUDA enabled