## Running and renewing a dataloader as an iterator
#### It seemed like this might have broken shuffling, but this small example shows otherwise

In [1]:
from   torch.utils.data import Dataset, DataLoader
import random

In [2]:
# The dataset and the meta dataset. We look at another alternative below

class Test_dataset(Dataset):
    """Toy map-style dataset holding the integers 1..max_int in ascending order."""

    def __init__(self, max_int):
        # The upper bound is stored as-is; __len__ reports it directly.
        self._max_int = max_int
        self._samples = [value for value in range(1, max_int + 1)]

    def __len__(self):
        """Return the ``max_int`` value given at construction."""
        return self._max_int

    def __getitem__(self, index):
        """Return the integer stored at position ``index``."""
        samples = self._samples
        return samples[index]

class Meta_test_dataset(Dataset):
    """Dataset whose items are whole mini-batches pulled from a lower-level DataLoader.

    Every item is a tuple ``(lower_batch, meta_label)``: ``lower_batch`` is the
    next batch produced by a shuffled DataLoader over ``Test_dataset(max_int)``,
    and ``meta_label`` is drawn at random from ``['a', 'b', 'c', 'd']``.

    Args:
        max_int: upper bound passed on to ``Test_dataset``.
        lower_batch_size: batch size of the lower-level DataLoader.
    """

    def __init__(self, max_int, lower_batch_size):
        self._max_int          = max_int
        self._lower_batch_size = lower_batch_size

        self._meta_data = ['a', 'b', 'c', 'd']

        self._lower_dataset    = Test_dataset(max_int)
        self._lower_dataloader = DataLoader(self._lower_dataset, batch_size=lower_batch_size, shuffle=True)
        self._lower_iter       = iter(self._lower_dataloader)

        # One meta item per lower-level batch; a trailing partial batch counts as one more.
        num_full_x_batches, remainder = divmod(len(self._lower_dataset), self._lower_batch_size)
        self._len_this_meta_dataset = num_full_x_batches + (1 if remainder else 0)

    def __len__(self):
        """Return the number of lower-level batches in one pass over the lower dataset."""
        return self._len_this_meta_dataset

    def __getitem__(self, index):
        """Return ``(next lower-level batch, random meta label)``.

        ``index`` is not used: batches come out in the order the lower-level
        iterator yields them, regardless of which index is requested.
        """
        # Use the built-in next() rather than the iterator's ``.next()`` method:
        # ``.next()`` is a Python 2 style alias that is not part of the iterator
        # protocol and is missing from recent PyTorch loader iterators.
        try:
            lower_data = next(self._lower_iter)
        except StopIteration:
            # The lower loader is exhausted: restart it (with shuffle=True this
            # draws a fresh permutation) and take the first batch of the new pass.
            self._lower_iter = iter(self._lower_dataloader)
            lower_data = next(self._lower_iter)

        return (lower_data, random.choice(self._meta_data))

In [4]:
# Meta dataset over the integers 1..10 with lower-level batches of 2,
# wrapped in a shuffled loader that yields one meta item at a time.
test_meta_dataset = Meta_test_dataset(10, 2)
test_meta_dataloader = DataLoader(test_meta_dataset, batch_size=1, shuffle=True)

# Two passes over the meta loader, to compare the batches across epochs.
for epoch in range(2):
    epoch_tag = "EPOCH: " + str(epoch) + " ||| Data --> "
    for batch in test_meta_dataloader:
        print(epoch_tag + str(batch))
    print()

EPOCH: 0 ||| Data --> [tensor([[7, 8]]), ('a',)]
EPOCH: 0 ||| Data --> [tensor([[4, 5]]), ('c',)]
EPOCH: 0 ||| Data --> [tensor([[ 6, 10]]), ('c',)]
EPOCH: 0 ||| Data --> [tensor([[3, 9]]), ('d',)]
EPOCH: 0 ||| Data --> [tensor([[1, 2]]), ('d',)]

EPOCH: 1 ||| Data --> [tensor([[4, 5]]), ('d',)]
EPOCH: 1 ||| Data --> [tensor([[2, 7]]), ('a',)]
EPOCH: 1 ||| Data --> [tensor([[10,  8]]), ('b',)]
EPOCH: 1 ||| Data --> [tensor([[1, 9]]), ('a',)]
EPOCH: 1 ||| Data --> [tensor([[6, 3]]), ('d',)]



##### Second attempt

In [36]:
# The dataset and the meta dataset alternative

class Test_dataset(Dataset):
    """Minimal dataset whose samples are the integers from 1 up to ``max_int``."""

    def __init__(self, max_int):
        self._max_int = max_int
        # Shift a zero-based range by one so the samples run 1..max_int.
        self._samples = [position + 1 for position in range(max_int)]

    def __len__(self):
        """Report the ``max_int`` supplied to the constructor."""
        return self._max_int

    def __getitem__(self, index):
        """Look up the sample at ``index``."""
        return self._samples[index]

class Meta_test_dataset(Dataset):
    """Meta dataset that serves lower-level batches from an endless generator.

    Each item is ``(lower_batch, meta_label)``, where ``lower_batch`` is the next
    batch of a shuffled DataLoader over ``Test_dataset(max_int)`` and
    ``meta_label`` is a random pick from ``['a', 'b', 'c', 'd']``.
    """

    def __init__(self, max_int, lower_batch_size):
        self._max_int = max_int
        self._lower_batch_size = lower_batch_size

        self._meta_data = ['a', 'b', 'c', 'd']

        self._lower_dataset = Test_dataset(max_int)
        self._lower_dataloader = DataLoader(
            self._lower_dataset,
            batch_size=lower_batch_size,
            shuffle=True,
        )
        # The generator is created here, but nothing is drawn until the first next().
        self._infinite_lower_dl = self._get_subdataset_batch()

        # Number of lower-level batches per pass, counting a trailing partial batch.
        full_batches, leftover = divmod(len(self._lower_dataset), self._lower_batch_size)
        self._len_this_meta_dataset = (full_batches + 1) if leftover != 0 else full_batches

    def __len__(self):
        """Return the number of lower-level batches in one pass."""
        return self._len_this_meta_dataset

    def _get_subdataset_batch(self):
        """Yield lower-level batches forever, restarting the loader after each pass."""
        while True:
            yield from self._lower_dataloader

    def __getitem__(self, index):
        """Return the next lower-level batch with a random meta label; ``index`` is unused."""
        lower_batch = next(self._infinite_lower_dl)
        meta_label = random.choice(self._meta_data)
        return (lower_batch, meta_label)

In [37]:
# Same experiment as the first attempt, now with the generator-based meta dataset.
test_meta_dataset = Meta_test_dataset(10, 2)
test_meta_dataloader = DataLoader(test_meta_dataset, batch_size=1, shuffle=True)

# Three passes, so the lower-level loader is restarted more than once.
for epoch in range(3):
    epoch_tag = "EPOCH: " + str(epoch) + " ||| Data --> "
    for batch in test_meta_dataloader:
        print(epoch_tag + str(batch))
    print()

EPOCH: 0 ||| Data --> [tensor([[10,  3]]), ('c',)]
EPOCH: 0 ||| Data --> [tensor([[1, 6]]), ('a',)]
EPOCH: 0 ||| Data --> [tensor([[7, 9]]), ('c',)]
EPOCH: 0 ||| Data --> [tensor([[8, 2]]), ('a',)]
EPOCH: 0 ||| Data --> [tensor([[5, 4]]), ('a',)]

EPOCH: 1 ||| Data --> [tensor([[5, 1]]), ('d',)]
EPOCH: 1 ||| Data --> [tensor([[9, 6]]), ('d',)]
EPOCH: 1 ||| Data --> [tensor([[ 3, 10]]), ('b',)]
EPOCH: 1 ||| Data --> [tensor([[8, 4]]), ('b',)]
EPOCH: 1 ||| Data --> [tensor([[7, 2]]), ('c',)]

EPOCH: 2 ||| Data --> [tensor([[3, 9]]), ('a',)]
EPOCH: 2 ||| Data --> [tensor([[8, 6]]), ('a',)]
EPOCH: 2 ||| Data --> [tensor([[2, 7]]), ('d',)]
EPOCH: 2 ||| Data --> [tensor([[1, 5]]), ('d',)]
EPOCH: 2 ||| Data --> [tensor([[ 4, 10]]), ('a',)]



## Gradient Accumulation
#### We start by testing only the accumulation of gradients. Further on we will test the accumulated mini-batch normalisation

In [5]:
import sys
sys.path.append("..") # So it's possible to retrieve packages at a higher top level. (than the directory where the notebook is running)

import torch
import torch.nn as nn
from   torch.utils.data import Dataset, DataLoader
from   models.mlp import MLP

In [10]:
class Test_dataset(Dataset):
    """Four fixed (x, y) pairs: x is 1..4 as float32 scalars, y is 0..3 as long scalars."""

    def __init__(self):
        self._x = list(range(1, 5))
        self._y = list(range(4))

    def __len__(self):
        """Return the number of stored x values."""
        return len(self._x)

    def __getitem__(self, index):
        """Return the pair at ``index`` as ``(float32 tensor, long tensor)``."""
        feature = torch.tensor(self._x[index], dtype=torch.float32)
        target = torch.tensor(self._y[index], dtype=torch.long)
        return (feature, target)

# Two MLPs built with the same arguments; model2 then copies model1's state
# so that both start from identical parameters.
model1 = MLP([1, 4], [nn.ReLU()])
model2 = MLP([1, 4], [nn.ReLU()])
model2.load_state_dict(model1.state_dict())

# Print model1's parameters, then model2's, to confirm they match.
for model in (model1, model2):
    for name, param in model.named_parameters():
        print(name, param.data)

# Sum reduction: the loss over a batch is the sum of the per-sample losses.
loss_function = torch.nn.CrossEntropyLoss(reduction='sum')

optimiser1 = torch.optim.Adam(model1.parameters())
optimiser2 = torch.optim.Adam(model2.parameters())

# Fixed order (no shuffling) so both experiments see the same batches.
dl = DataLoader(Test_dataset(), batch_size=2, shuffle=False)

_layers.0.weight tensor([[ 0.8378],
        [-0.3861],
        [-0.7101],
        [-0.8737]])
_layers.0.bias tensor([ 0.5251,  0.0785,  0.9374, -0.5521])
_layers.0.weight tensor([[ 0.8378],
        [-0.3861],
        [-0.7101],
        [-0.8737]])
_layers.0.bias tensor([ 0.5251,  0.0785,  0.9374, -0.5521])


In [11]:
accumulate = 2

# Start from clean gradients; every backward() call below adds into .grad.
optimiser1.zero_grad()
for batch_num, batch in enumerate(dl):
    inputs, labels = batch
    print(inputs, labels)

    # unsqueeze(1) turns the 1-D batch of scalars into shape (batch, 1).
    # NOTE(review): the [0] presumably picks the logits out of a sequence
    # returned by MLP - confirm against models/mlp.py.
    model_output = model1(inputs.unsqueeze(1))
    preds = model_output[0]
    loss = loss_function(preds, labels)
    print("loss", loss)
    loss.backward()

# Gradients accumulated over all batches; no optimiser step is taken here.
for name, param in model1.named_parameters():
    print(name, param.grad)
optimiser1.zero_grad()

tensor([1., 2.]) tensor([0, 1])
loss tensor(3.0935, grad_fn=<NllLossBackward>)
tensor([3., 4.]) tensor([2, 3])
loss tensor(7.1096, grad_fn=<NllLossBackward>)
_layers.0.weight tensor([[7.4358],
        [0.0000],
        [0.1752],
        [0.0000]])
_layers.0.bias tensor([2.1120, 0.0000, 0.1752, 0.0000])


In [12]:
optimiser2.zero_grad()

# Reference computation: add up the batch losses first and call backward() once.
total_loss = 0
for batch_num, batch in enumerate(dl):
    inputs, labels = batch
    print(inputs, labels)

    model_output = model2(inputs.unsqueeze(1))
    preds = model_output[0]
    loss = loss_function(preds, labels)
    total_loss += loss
    print("total_loss", total_loss)
    print("loss", loss)

# A single backward pass through the summed loss.
total_loss.backward()
print()
for name, param in model2.named_parameters():
    print(name, param.grad)

tensor([1., 2.]) tensor([0, 1])
total_loss tensor(3.0935, grad_fn=<AddBackward0>)
loss tensor(3.0935, grad_fn=<NllLossBackward>)
tensor([3., 4.]) tensor([2, 3])
total_loss tensor(10.2031, grad_fn=<AddBackward0>)
loss tensor(7.1096, grad_fn=<NllLossBackward>)

_layers.0.weight tensor([[7.4358],
        [0.0000],
        [0.1752],
        [0.0000]])
_layers.0.bias tensor([2.1120, 0.0000, 0.1752, 0.0000])


#### Now we test the accumulated mini-batch normalisation, by changing the learning rate

In [35]:
class Test_dataset(Dataset):
    """Tiny fixed dataset: inputs 1..4 (float32 scalars) paired with labels 0..3 (long scalars)."""

    def __init__(self):
        self._x = [1, 2, 3, 4]
        self._y = [0, 1, 2, 3]

    def __len__(self):
        """Return how many x values are stored."""
        inputs = self._x
        return len(inputs)

    def __getitem__(self, index):
        """Return ``(x, y)`` at ``index`` as scalar tensors."""
        x_value, y_value = self._x[index], self._y[index]
        return (
            torch.tensor(x_value, dtype=torch.float32),
            torch.tensor(y_value, dtype=torch.long),
        )

# Two MLPs built with the same arguments; model2 copies model1's state so both
# start from identical parameters (printed below for comparison).
model1 = MLP([1, 4], [nn.ReLU()])
for name, param in model1.named_parameters():
    print(name, param.data)
model2 = MLP([1, 4], [nn.ReLU()])
model2.load_state_dict(model1.state_dict())
for name, param in model2.named_parameters():
    print(name, param.data)

# loss_function1 averages over the samples of a batch; loss_function2 sums them.
loss_function1 = torch.nn.CrossEntropyLoss(reduction='mean')
loss_function2 = torch.nn.CrossEntropyLoss(reduction='sum')

mini_batch_size = 2
accumulate = 2


# optimiser2's learning rate is divided by the number of samples in one
# accumulated step (accumulate * mini_batch_size). It appears intended for use
# with the sum-reduced loss, so that its SGD step matches optimiser1's step on
# the mean-reduced loss.
optimiser1 = torch.optim.SGD(model1.parameters(), lr=0.01)
optimiser2 = torch.optim.SGD(model2.parameters(), lr=0.01/(accumulate*mini_batch_size))

# Tie the loader's batch size to mini_batch_size so the learning-rate scaling
# above cannot drift away from the batch size actually used.
dl = DataLoader(Test_dataset(), shuffle=False, batch_size=mini_batch_size)

_layers.0.weight tensor([[-0.2644],
        [-0.8340],
        [-0.9433],
        [-0.0489]])
_layers.0.bias tensor([0.4230, 0.7725, 0.5209, 0.2457])
_layers.0.weight tensor([[-0.2644],
        [-0.8340],
        [-0.9433],
        [-0.0489]])
_layers.0.bias tensor([0.4230, 0.7725, 0.5209, 0.2457])


In [36]:
print(optimiser1)
optimiser1.zero_grad()

for batch_num, batch in enumerate(dl):
    inputs, labels = batch
    print(inputs, labels)

    model_output = model1(inputs.unsqueeze(1))
    preds = model_output[0]
    # Mean loss of this mini-batch, divided by the number of accumulated batches.
    mean_loss = loss_function1(preds, labels)
    loss = mean_loss / accumulate
    print("loss", loss)
    loss.backward()

print()
for name, param in model1.named_parameters():
    print("GRAD:", name, param.grad)

# One optimiser step on the accumulated gradients, then reset them.
optimiser1.step()
optimiser1.zero_grad()
print()
for name, param in model1.named_parameters():
    print("VALUE:", name, param.data)

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.01
    momentum: 0
    nesterov: False
    weight_decay: 0
)
tensor([1., 2.]) tensor([0, 1])
loss tensor(0.6865, grad_fn=<DivBackward0>)
tensor([3., 4.]) tensor([2, 3])
loss tensor(0.6902, grad_fn=<DivBackward0>)

GRAD: _layers.0.weight tensor([[-0.1833],
        [ 0.0000],
        [ 0.0000],
        [-0.3299]])
GRAD: _layers.0.bias tensor([-0.1833,  0.0000,  0.0000,  0.0212])

VALUE: _layers.0.weight tensor([[-0.2626],
        [-0.8340],
        [-0.9433],
        [-0.0456]])
VALUE: _layers.0.bias tensor([0.4248, 0.7725, 0.5209, 0.2455])


In [37]:
print(optimiser2)
optimiser2.zero_grad()

for batch_num, batch in enumerate(dl):

    inputs, labels = batch
    print(inputs, labels)

    preds = model2(inputs.unsqueeze(1))[0]
    # Use the sum-reduced loss defined in this section's setup cell
    # (loss_function2), not the `loss_function` left over from the earlier
    # gradient-accumulation section, which only exists if that cell was run.
    loss = loss_function2(preds, labels)
    # Second value: the summed loss divided by the total number of samples in
    # the accumulated step, for comparison with the mean-based run.
    print("loss", loss, loss/(accumulate*mini_batch_size))
    print("loss check", loss)
    # Backpropagate the un-normalised sum; the scaling lives in optimiser2's learning rate.
    loss.backward()

print()
for name, param in model2.named_parameters():
    print("GRAD:", name, param.grad)

# Perform optimiser step
optimiser2.step()
optimiser2.zero_grad()
print()
for name, param in model2.named_parameters():
    print("VALUE:", name, param.data)

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.0025
    momentum: 0
    nesterov: False
    weight_decay: 0
)
tensor([1., 2.]) tensor([0, 1])
loss tensor(2.7460, grad_fn=<NllLossBackward>) tensor(0.6865, grad_fn=<DivBackward0>)
loss check tensor(2.7460, grad_fn=<NllLossBackward>)
tensor([3., 4.]) tensor([2, 3])
loss tensor(2.7608, grad_fn=<NllLossBackward>) tensor(0.6902, grad_fn=<DivBackward0>)
loss check tensor(2.7608, grad_fn=<NllLossBackward>)

GRAD: _layers.0.weight tensor([[-0.7330],
        [ 0.0000],
        [ 0.0000],
        [-1.3197]])
GRAD: _layers.0.bias tensor([-0.7330,  0.0000,  0.0000,  0.0848])

VALUE: _layers.0.weight tensor([[-0.2626],
        [-0.8340],
        [-0.9433],
        [-0.0456]])
VALUE: _layers.0.bias tensor([0.4248, 0.7725, 0.5209, 0.2455])


# AllenNLP fix

In [2]:
def a(test_path: str = None) -> None:
    """Print ``test_path``; when called without an argument, the default ``None`` is printed."""
    # print() applies str() to its argument; the conversion is spelled out here.
    print(str(test_path))

# Call without an argument: the default value None is printed.
a()
# Call with an explicit string argument.
a('test')

None
test
