# Single-node deep learning sample

- Databricks Runtime 5.1 ML, GPU

## prepare storage
https://docs.azuredatabricks.net/applications/deep-learning/distributed-deep-learning/ddl-storage.html#ddl-fuse

In [5]:
# Root directory for this notebook's outputs; LOG_DIR (where checkpoints are
# written) is created beneath it.
# NOTE(review): this is a relative path -- confirm it resolves to the intended
# FUSE mount described in the storage guide linked above.
FUSE_DIR = 'horovod_pytorch'

## prepare network with pytorch

### simple CNN

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Small CNN classifier: two conv layers followed by two linear layers.

    ``forward`` flattens the convolutional features to 320 values per sample,
    so the input must produce a 20 x 4 x 4 feature map after the second
    pooling step (e.g. single-channel 28 x 28 images).
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, 10)``."""
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        # x is 2-D here (see the view above), so the classes live on dim 1.
        # Passing dim explicitly avoids the deprecated implicit-dim choice
        # and the warning it emits on every forward pass.
        return F.log_softmax(x, dim=1)

### Deliberately oversized network to stress the limits of GPU memory

In [7]:
class BigNet(nn.Module):
    """Same convolutional front end as the small CNN, followed by a stack of
    nine linear layers (widths up to 10000) instead of two.

    ``forward`` flattens the convolutional features to 320 values per sample,
    so the input must produce a 20 x 4 x 4 feature map after the second
    pooling step (e.g. single-channel 28 x 28 images).
    """

    def __init__(self):
        super(BigNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        # The layers keep their individual attribute names (fc1..fc9) so that
        # state_dict keys -- and therefore saved checkpoints -- stay unchanged.
        self.fc1 = nn.Linear(320, 1000)
        self.fc2 = nn.Linear(1000, 5000)
        self.fc3 = nn.Linear(5000, 10000)
        self.fc4 = nn.Linear(10000, 5000)
        self.fc5 = nn.Linear(5000, 1000)
        self.fc6 = nn.Linear(1000, 500)
        self.fc7 = nn.Linear(500, 100)
        self.fc8 = nn.Linear(100, 50)
        self.fc9 = nn.Linear(50, 10)

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, 10)``."""
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        # Every hidden layer is followed by ReLU and then dropout, in the
        # same order as fc1..fc8 were declared.
        hidden_layers = (
            self.fc1, self.fc2, self.fc3, self.fc4,
            self.fc5, self.fc6, self.fc7, self.fc8,
        )
        for layer in hidden_layers:
            x = F.relu(layer(x))
            x = F.dropout(x, training=self.training)
        x = self.fc9(x)
        # x is 2-D here (see the view above), so the classes live on dim 1.
        # Passing dim explicitly avoids the deprecated implicit-dim choice
        # and the warning it emits on every forward pass.
        return F.log_softmax(x, dim=1)

## prepare functions to train

In [21]:
def train_one_epoch(model, device, data_loader, optimizer, epoch, log_interval=None):
    """Run one optimisation pass over every batch yielded by ``data_loader``.

    Args:
        model: module whose output is fed to ``F.nll_loss`` (so it is
            expected to return log-probabilities).
        device: device that each batch is moved to before the forward pass.
        data_loader: iterable of ``(data, target)`` batches. When logging is
            enabled it must also support ``len()`` and expose ``.dataset``.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number; only used in the progress message.
        log_interval: if set to a positive integer, print the loss every
            ``log_interval`` batches. ``None`` or ``0`` (the default)
            disables logging, which matches the previous behaviour.

    Returns:
        None. ``model`` and ``optimizer`` are updated in place.
    """
    model.train()
    for batch_idx, (data, target) in enumerate(data_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        # The truthiness check also guards against a modulo-by-zero.
        if log_interval and batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(data_loader.dataset),
                100. * batch_idx / len(data_loader), loss.item()))

In [9]:
import os
import time

# Each execution of this cell creates its own checkpoint directory, named
# after the timestamp at which the cell ran.
# `time` is imported as a module (rather than `from time import time`) because
# a later cell runs `import time`: with the function import, re-running this
# cell after that one would fail with "'module' object is not callable".
LOG_DIR = os.path.join(FUSE_DIR, str(time.time()), 'MNISTDemo')
os.makedirs(LOG_DIR)

In [10]:
def save_checkpoint(model, optimizer, epoch):
    """Serialise the model and optimizer state dicts for ``epoch``.

    The file is written with ``torch.save`` to
    ``LOG_DIR/checkpoint-<epoch>.pth.tar`` and contains a dict with the keys
    ``'model'`` and ``'optimizer'``.
    """
    snapshot = dict(
        model=model.state_dict(),
        optimizer=optimizer.state_dict(),
    )
    filename = '/checkpoint-{epoch}.pth.tar'.format(epoch=epoch)
    torch.save(snapshot, LOG_DIR + filename)

In [19]:
import torch.optim as optim
from torchvision import datasets, transforms
import tqdm

def train(learning_rate, small=True):
    """Train a network on the MNIST training split, checkpointing every epoch.

    The hyper-parameters ``batch_size``, ``momentum`` and ``num_epochs`` are
    read from module-level globals at call time, so they must be defined
    before this function is invoked.

    Args:
        learning_rate: learning rate passed to the SGD optimizer.
        small: when true, train ``Net``; otherwise train ``BigNet``.

    Returns:
        None. One checkpoint per epoch is written via ``save_checkpoint``.
    """
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    use_gpu = torch.cuda.is_available()
    device = torch.device('cuda' if use_gpu else 'cpu')

    preprocessing = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    # download=True fetches the dataset into ./data when it is not there yet.
    train_dataset = datasets.MNIST(
        'data',
        train=True,
        download=True,
        transform=preprocessing,
    )
    data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
    )

    network_cls = Net if small else BigNet
    model = network_cls().to(device)

    optimizer = optim.SGD(
        model.parameters(),
        lr=learning_rate,
        momentum=momentum,
    )

    # Epochs are numbered from 1 so checkpoint file names start at 1.
    final_epoch = num_epochs
    for epoch in tqdm.trange(1, final_epoch + 1):
        train_one_epoch(model, device, data_loader, optimizer, epoch)
        save_checkpoint(model, optimizer, epoch)

## train simple CNN

In [13]:
# Hyper-parameters for the simple-CNN run. With these values training takes
# roughly 40 minutes and may over-fit.
# train() reads batch_size, num_epochs and momentum as module-level globals,
# so this cell must run before train() is called.
batch_size = 100
num_epochs = 300
momentum = 0.5
# log_interval is not currently read by train().
log_interval = 100

In [22]:
import time

# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments during the long training run.
start = time.perf_counter()
train(learning_rate = 0.001)
process_time = time.perf_counter() - start

print("processing time:"+str(process_time))



  0%|          | 1/300 [00:08<44:38,  8.96s/it][A
  1%|          | 2/300 [00:17<44:28,  8.95s/it][A
  1%|          | 3/300 [00:26<44:18,  8.95s/it][A
  1%|▏         | 4/300 [00:35<43:59,  8.92s/it][A
  2%|▏         | 5/300 [00:44<43:57,  8.94s/it][A
  2%|▏         | 6/300 [00:53<43:46,  8.94s/it][A
  2%|▏         | 7/300 [01:02<43:39,  8.94s/it][A
  3%|▎         | 8/300 [01:11<43:29,  8.94s/it][A
  3%|▎         | 9/300 [01:20<43:20,  8.94s/it][A
  3%|▎         | 10/300 [01:29<43:09,  8.93s/it][A
  4%|▎         | 11/300 [01:38<43:03,  8.94s/it][A
  4%|▍         | 12/300 [01:47<42:53,  8.94s/it][A
  4%|▍         | 13/300 [01:56<42:44,  8.93s/it][A
  5%|▍         | 14/300 [02:05<42:36,  8.94s/it][A
  5%|▌         | 15/300 [02:14<42:27,  8.94s/it][A
  5%|▌         | 16/300 [02:22<42:19,  8.94s/it][A
  6%|▌         | 17/300 [02:31<42:12,  8.95s/it][A
  6%|▌         | 18/300 [02:40<42:05,  8.95s/it][A
  6%|▋         | 19/300 [02:49<41:57,  8.96s/it][A
  7%|▋         | 20

processing time:2671.546637058258


## train the deliberately oversized network

In [23]:
# Hyper-parameters for the BigNet run.
# train() reads batch_size, num_epochs and momentum as module-level globals,
# so this cell must run before train() is called.
batch_size = 300
num_epochs = 30
momentum = 0.5
# log_interval is not currently read by train().
log_interval = 100

In [24]:
# Relies on the `time` module imported by the earlier timing cell.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments while training runs.
start = time.perf_counter()
train(learning_rate = 0.001,small=False)
process_time = time.perf_counter() - start

print("processing time:"+str(process_time))



  3%|▎         | 1/30 [00:17<08:19, 17.21s/it][A
  7%|▋         | 2/30 [00:34<08:00, 17.18s/it][A
 10%|█         | 3/30 [00:52<07:50, 17.44s/it][A
 13%|█▎        | 4/30 [01:09<07:30, 17.34s/it][A
 17%|█▋        | 5/30 [01:27<07:15, 17.42s/it][A
 20%|██        | 6/30 [01:44<06:59, 17.48s/it][A
 23%|██▎       | 7/30 [01:59<06:22, 16.65s/it][A
 27%|██▋       | 8/30 [02:14<05:53, 16.08s/it][A
 30%|███       | 9/30 [02:28<05:26, 15.54s/it][A
 33%|███▎      | 10/30 [02:43<05:06, 15.31s/it][A
 37%|███▋      | 11/30 [02:57<04:45, 15.01s/it][A
 40%|████      | 12/30 [03:12<04:27, 14.87s/it][A
 43%|████▎     | 13/30 [03:27<04:15, 15.01s/it][A
 47%|████▋     | 14/30 [03:42<03:59, 14.99s/it][A
 50%|█████     | 15/30 [03:57<03:47, 15.15s/it][A
 53%|█████▎    | 16/30 [04:13<03:33, 15.24s/it][A
 57%|█████▋    | 17/30 [04:28<03:18, 15.27s/it][A
 60%|██████    | 18/30 [04:44<03:04, 15.37s/it][A
 63%|██████▎   | 19/30 [04:59<02:49, 15.38s/it][A
 67%|██████▋   | 20/30 [05:14<02:31, 1

processing time:466.7347915172577
