**Import CIFAR-10 dataset**

In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np

In [2]:
def create_CIFAR_data():
    '''
    Build the CIFAR-10 train/test datasets and their data loaders.

    The training pipeline applies a padded random crop and a random horizontal
    flip before tensor conversion and normalisation; the test pipeline only
    converts and normalises. The data is downloaded into ./data if needed.

    Returns:
        (trainset, trainloader, testset, testloader)
    '''
    # Per-channel normalisation constants used by both pipelines.
    channel_mean = (0.4914, 0.4822, 0.4465)
    channel_std = (0.2023, 0.1994, 0.2010)

    train_steps = [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
    test_steps = [
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
    transform_train = transforms.Compose(train_steps)
    transform_test = transforms.Compose(test_steps)

    trainset = torchvision.datasets.CIFAR10(
        root='./data', train=True, download=True, transform=transform_train)
    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=128, shuffle=True, num_workers=2)

    testset = torchvision.datasets.CIFAR10(
        root='./data', train=False, download=True, transform=transform_test)
    testloader = torch.utils.data.DataLoader(
        testset, batch_size=100, shuffle=False, num_workers=2)

    return trainset, trainloader, testset, testloader

**Process CIFAR-10 dataset**

In [3]:
def get_binary_label(targets, index):
    '''
    Turn multi-class targets into one-vs-rest labels for class `index`.

    Entries equal to `index` become 1 and all others 0; the result has the
    same shape and dtype as `targets`. (Cats have index 3, dogs have index 5.)
    '''
    is_selected_class = targets == index
    return torch.where(is_selected_class,
                       torch.ones_like(targets),
                       torch.zeros_like(targets))

In [40]:
import random

def create_unbalanced_CIFAR10(trainset, class_sizes = (625,625,625,5000,625,5000,625,625,625,625)):
    '''
    Subsample `trainset` in place so that class `c` keeps `class_sizes[c]` examples.

    Examples are drawn without replacement using the `random` module, so seed
    `random` beforehand for a reproducible subset. The per-class counts are
    printed before and after subsampling.

    Args:
        trainset: dataset object exposing `targets` (sequence of integer class
            labels) and `data` (array indexable by a list of positions).
        class_sizes: number of examples to keep for each class, indexed by the
            class label.

    Returns:
        The same `trainset` object, with `targets` and `data` replaced by the
        selected subset (`targets` becomes a NumPy array).

    Raises:
        ValueError: if a label present in the data has no entry in
            `class_sizes`, or more examples are requested than are available.
    '''
    labels = np.array(trainset.targets)
    classes, sizes = np.unique(labels, return_counts=True)
    print(sizes)

    imbalanced_indices = []

    # Iterate over the labels that actually occur rather than assuming they
    # are exactly 0..len(classes)-1.
    for label, available in zip(classes, sizes):
        label = int(label)
        if not 0 <= label < len(class_sizes):
            raise ValueError(
                'class_sizes has no entry for class label {}'.format(label))

        class_size = class_sizes[label]
        if class_size < 0 or class_size > available:
            raise ValueError(
                'cannot sample {} examples of class {}: only {} available'.format(
                    class_size, label, int(available)))

        indices = np.flatnonzero(labels == label).tolist()
        imbalanced_indices.extend(random.sample(indices, class_size))

    trainset.targets = labels[imbalanced_indices]
    trainset.data = trainset.data[imbalanced_indices]
    classes, sizes = np.unique(trainset.targets, return_counts=True)
    print(sizes)

    return trainset

**Creating ResNet model**

In [10]:
from typing import Any, Callable, List, Optional, Tuple, Type, Union

import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

In [11]:
def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d:
    '''
    3x3 convolution with padding.

    The padding equals the dilation and no bias term is used.

    Implementation is taken from the PyTorch GitHub repository
    https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
    '''
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=dilation,
        dilation=dilation,
        groups=groups,
        bias=False,
    )


def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
    '''
    1x1 convolution without a bias term.

    Implementation is taken from the PyTorch GitHub repository
    https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
    '''
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        bias=False,
    )

In [12]:
class BasicBlock(nn.Module):
    '''
    Residual block made of two 3x3 convolutions, each followed by a norm
    layer, with a shortcut connection added before the final ReLU.

    Implementation is taken from the PyTorch GitHub repository
    https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
    '''

    # Output channels are `planes * expansion`.
    expansion: int = 1

    def __init__(self, inplanes: int, planes: int, stride: int = 1, downsample: Optional[nn.Module] = None, groups: int = 1, base_width: int = 64, dilation: int = 1, norm_layer: Optional[Callable[..., nn.Module]] = None) -> None:
        super().__init__()

        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        # This block has no grouped / widened / dilated variant.
        if not (groups == 1 and base_width == 64):
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")

        # When stride != 1, both self.conv1 and self.downsample reduce the
        # spatial size so the two paths stay shape-compatible.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        '''Apply the two conv/norm stages and add the (optionally downsampled) input.'''
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Use the raw input as the shortcut unless a projection was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

In [13]:
class Bottleneck(nn.Module):
    '''
    Residual block made of a 1x1, a 3x3 and a 1x1 convolution, each followed
    by a norm layer, with a shortcut connection added before the final ReLU.

    Implementation is taken from the PyTorch GitHub repository
    https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
    '''

    # Bottleneck in torchvision places the stride for downsampling at the 3x3 convolution (self.conv2),
    # while the original implementation places the stride at the first 1x1 convolution (self.conv1)
    # according to "Deep residual learning for image recognition" https://arxiv.org/abs/1512.03385.
    # This variant is also known as ResNet V1.5 and improves accuracy according to
    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.

    # Output channels are `planes * expansion`.
    expansion: int = 4

    def __init__(self, inplanes: int, planes: int, stride: int = 1, downsample: Optional[nn.Module] = None, groups: int = 1, base_width: int = 64, dilation: int = 1, norm_layer: Optional[Callable[..., nn.Module]] = None) -> None:
        super().__init__()

        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        # Channel count of the inner 3x3 convolution.
        width = int(planes * (base_width / 64.)) * groups
        out_planes = planes * self.expansion

        # When stride != 1, both self.conv2 and self.downsample reduce the
        # spatial size so the two paths stay shape-compatible.
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, out_planes)
        self.bn3 = norm_layer(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        '''Apply the three conv/norm stages and add the (optionally downsampled) input.'''
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Use the raw input as the shortcut unless a projection was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

In [14]:
class ResNetSplitShared(nn.Module):
    '''
    ResNet with a shared trunk (conv1, bn1, layer1, layer2) that feeds two
    independent branches, each with its own layer3, layer4 and linear
    classifier.

    Branching approach:
    https://stackoverflow.com/questions/66786787/pytorch-multiple-branches-of-a-model

    Args:
        block: residual block class (BasicBlock or Bottleneck).
        layers: number of blocks in each of the four stages.
        num_classes: output size of each branch classifier.
        zero_init_residual: zero-initialise the last norm layer of every block.
        groups, width_per_group: forwarded to the blocks.
        replace_stride_with_dilation: three flags; a True flag replaces the
            stride of layer2 / layer3 / layer4 with dilation.
        norm_layer: normalisation layer factory (defaults to nn.BatchNorm2d).

    Raises:
        ValueError: if `replace_stride_with_dilation` does not have 3 elements.
    '''

    def __init__(self, block: Type[Union[BasicBlock, Bottleneck]], layers: List[int], num_classes: int = 10, zero_init_residual: bool = False, groups: int = 1, width_per_group: int = 64, replace_stride_with_dilation: Optional[List[bool]] = None, norm_layer: Optional[Callable[..., nn.Module]] = None) -> None:
        super(ResNetSplitShared, self).__init__()

        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1

        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]

        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))

        self.groups = groups
        self.base_width = width_per_group

        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        ##### SHARED LAYERS #####
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.layer1 = self._make_shared_layer(block, 64, layers[0])
        self.layer2 = self._make_shared_layer(block, 128, layers[1], stride=2,
                                              dilate=replace_stride_with_dilation[0])

        # Both branches start from the output of the shared trunk, so they
        # inherit its channel count (128 * block.expansion) and dilation.
        # Each branch tracks its own copy so building branch 1 cannot change
        # what branch 2 is built with.
        trunk_planes = self.inplanes
        trunk_dilation = self.dilation

        ##### BRANCH 1 LAYERS #####
        self.branch1_inplanes = trunk_planes
        self.branch1_dilation = trunk_dilation
        self.branch1layer3 = self._make_branch1_layer(block, 256, layers[2], stride=2,
                                                      dilate=replace_stride_with_dilation[1])
        self.branch1layer4 = self._make_branch1_layer(block, 512, layers[3], stride=2,
                                                      dilate=replace_stride_with_dilation[2])
        self.branch1fc = nn.Linear(512 * block.expansion, num_classes)

        ##### BRANCH 2 LAYERS #####
        self.branch2_inplanes = trunk_planes
        self.branch2_dilation = trunk_dilation
        self.branch2layer3 = self._make_branch2_layer(block, 256, layers[2], stride=2,
                                                      dilate=replace_stride_with_dilation[1])
        self.branch2layer4 = self._make_branch2_layer(block, 512, layers[3], stride=2,
                                                      dilate=replace_stride_with_dilation[2])
        self.branch2fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)  # type: ignore[arg-type]
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)  # type: ignore[arg-type]

    def _make_layer(self, scope: str, block: Type[Union[BasicBlock, Bottleneck]], planes: int, blocks: int,
                    stride: int = 1, dilate: bool = False) -> nn.Sequential:
        '''
        Build one stage of `blocks` residual blocks for the given scope.

        `scope` is the attribute prefix ('' for the shared trunk, 'branch1_'
        or 'branch2_' for a branch). The running channel count and dilation
        are read from and written back to `self.<scope>inplanes` and
        `self.<scope>dilation`.
        '''
        inplanes_attr = scope + 'inplanes'
        dilation_attr = scope + 'dilation'

        norm_layer = self._norm_layer
        inplanes = getattr(self, inplanes_attr)
        previous_dilation = getattr(self, dilation_attr)
        dilation = previous_dilation
        out_planes = planes * block.expansion
        downsample = None

        if dilate:
            # Trade the stride for dilation: resolution is kept.
            dilation *= stride
            stride = 1

        if stride != 1 or inplanes != out_planes:
            # Project the shortcut so it matches the block output shape.
            downsample = nn.Sequential(
                conv1x1(inplanes, out_planes, stride),
                norm_layer(out_planes),
            )

        # Only the first block changes resolution / channel count.
        stage = [block(inplanes, planes, stride, downsample, self.groups,
                       self.base_width, previous_dilation, norm_layer)]
        stage.extend(
            block(out_planes, planes, groups=self.groups,
                  base_width=self.base_width, dilation=dilation,
                  norm_layer=norm_layer)
            for _ in range(1, blocks)
        )

        setattr(self, inplanes_attr, out_planes)
        setattr(self, dilation_attr, dilation)

        return nn.Sequential(*stage)

    def _make_shared_layer(self, block: Type[Union[BasicBlock, Bottleneck]], planes: int, blocks: int,
                    stride: int = 1, dilate: bool = False) -> nn.Sequential:
        '''Build a stage of the shared trunk (tracks self.inplanes / self.dilation).'''
        return self._make_layer('', block, planes, blocks, stride=stride, dilate=dilate)

    def _make_branch1_layer(self, block: Type[Union[BasicBlock, Bottleneck]], planes: int, blocks: int,
                    stride: int = 1, dilate: bool = False) -> nn.Sequential:
        '''Build a stage of branch 1 (tracks self.branch1_inplanes / self.branch1_dilation).'''
        return self._make_layer('branch1_', block, planes, blocks, stride=stride, dilate=dilate)

    def _make_branch2_layer(self, block: Type[Union[BasicBlock, Bottleneck]], planes: int, blocks: int,
                    stride: int = 1, dilate: bool = False) -> nn.Sequential:
        '''Build a stage of branch 2 (tracks self.branch2_inplanes / self.branch2_dilation).'''
        return self._make_layer('branch2_', block, planes, blocks, stride=stride, dilate=dilate)

    def get_branch_params(self):
        '''
        Return optimizer-style parameter groups for the trunk and each branch.

        Returns:
            (shared_params, branch1_params, branch2_params): three lists of
            `{'params': [...]}` dicts, one dict per sub-module. The lists are
            also stored on the instance under the same names.
        '''
        def as_groups(*modules):
            # Materialise the parameters so the groups can be reused;
            # a bare `.parameters()` generator is exhausted after one pass.
            return [{'params': list(module.parameters())} for module in modules]

        self.shared_params = as_groups(self.conv1, self.bn1, self.layer1, self.layer2)
        self.branch1_params = as_groups(self.branch1layer3, self.branch1layer4, self.branch1fc)
        self.branch2_params = as_groups(self.branch2layer3, self.branch2layer4, self.branch2fc)

        return self.shared_params, self.branch1_params, self.branch2_params

    def _forward_shared_branch(self, x: Tensor) -> Tensor:
        '''Run the shared trunk: conv1 -> bn1 -> ReLU -> max-pool -> layer1 -> layer2.'''
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.maxpool(out)
        out = self.layer1(out)
        out = self.layer2(out)

        return out

    def _forward_branch_1(self, shared_out: Tensor) -> Tensor:
        '''Run branch 1 on the trunk output and return its classifier output.'''
        branch1_out = self.branch1layer3(shared_out)
        branch1_out = self.branch1layer4(branch1_out)
        branch1_out = self.avgpool(branch1_out)
        branch1_out = torch.flatten(branch1_out, 1)
        branch1_out = self.branch1fc(branch1_out)

        return branch1_out

    def _forward_branch_2(self, shared_out: Tensor) -> Tensor:
        '''Run branch 2 on the trunk output and return its classifier output.'''
        branch2_out = self.branch2layer3(shared_out)
        branch2_out = self.branch2layer4(branch2_out)
        branch2_out = self.avgpool(branch2_out)
        branch2_out = torch.flatten(branch2_out, 1)
        branch2_out = self.branch2fc(branch2_out)

        return branch2_out

    def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]:
        '''Return `(branch_one_out, branch_two_out)` computed from one shared trunk pass.'''
        shared = self._forward_shared_branch(x)
        branch_one_out = self._forward_branch_1(shared)
        branch_two_out = self._forward_branch_2(shared)

        return branch_one_out, branch_two_out


In [15]:
def ResNetSplit18Shared():
    '''Build a ResNetSplitShared from BasicBlock with two blocks in each of the four stages.'''
    blocks_per_stage = [2] * 4
    return ResNetSplitShared(BasicBlock, blocks_per_stage)

**Congestion Avoidance scheduler**


*   Based on accumulated gradients (multiplied by the lr in each epoch)
*   Because it is based on lr * grad, it is only truly suitable for standard SGD
*   Gradients are reset after a congestion on one branch (for that branch only) -- this treats the new position as a new starting point from which to accumulate gradients



In [16]:
def _roll_back_branch(model, accumulated_grads, mult):
    '''Add `mult * accumulated_grads[name]` in place to every parameter that has an entry.'''
    with torch.no_grad():
        for name, value in model.named_parameters():
            grad_sum = accumulated_grads.get(name)
            if grad_sum is None:
                # This parameter never received a gradient from that branch,
                # so there is nothing to undo for it.
                continue
            value += mult * grad_sum


def congestion_avoid(model, optimizer, branch1_acc, branch2_acc, condition, branch_one_grads, branch_two_grads, min_epochs, mult):
    '''
    Roll back the accumulated updates of the stronger branch when the other branch lags behind.

    Branch 1 is considered congested when `branch1_acc < condition * branch2_acc`
    and branch 2 has accumulated for at least `min_epochs` epochs (tracked by
    the global `epoch_count_two`); `mult * branch_two_grads` is then added back
    onto the model parameters and `epoch_count_two` is reset to 0. The
    symmetric rule applies to branch 2. Branch 1 is checked first and at most
    one rollback is applied per call.

    Args:
        model: model whose parameters are modified in place.
        optimizer: returned unchanged.
        branch1_acc, branch2_acc: accuracies of the two branches.
        condition: accuracy ratio below which a branch counts as congested.
        branch_one_grads, branch_two_grads: dicts mapping parameter name to
            the accumulated lr * grad contribution of each branch.
        min_epochs: minimum number of accumulated epochs before a rollback.
        mult: multiplier applied to the accumulated gradients.

    Returns:
        (optimizer, model, boolean_one, boolean_two) where the booleans tell
        whether the branch 1 / branch 2 condition fired.
    '''
    global epoch_count_one
    global epoch_count_two

    boolean_one = False
    boolean_two = False

    branch1_cond = (branch1_acc < condition * branch2_acc) and (epoch_count_two >= min_epochs)
    branch2_cond = (branch2_acc < condition * branch1_acc) and (epoch_count_one >= min_epochs)

    if branch1_cond:
        boolean_one = True
        print('Branch 1 condition has been met ..... : ' + str(100*condition) + '%')
        _roll_back_branch(model, branch_two_grads, mult)
        epoch_count_two = 0

    elif branch2_cond:
        boolean_two = True
        print('Branch 2 condition has been met ..... : ' + str(100*condition) + '%')
        _roll_back_branch(model, branch_one_grads, mult)
        epoch_count_one = 0

    else:
        print('No condition is met ..... : ' + str(100*condition) + '%')

    return optimizer, model, boolean_one, boolean_two

**Training the ResNet model**


*   Accumulate the gradient * lr
*   Reset the accumulated gradients to zero on a branch if in the previous epoch we had to roll it back



In [17]:
def train_congestion_avoider(trainloader, device, model, optimizer, branch_one_criterion, branch_two_criterion, branch_one_class, branch_two_class, boolean_one, boolean_two):
    '''
    Train the two-branch model for one epoch and accumulate each branch's lr * grad.

    For every batch the two branch losses are back-propagated separately so
    the contribution of each branch to every parameter can be recorded in the
    global dicts `branch_one_grads` / `branch_two_grads` (parameter name ->
    running sum of lr * grad). The optimizer step itself uses the gradient of
    the summed loss.

    Args:
        trainloader: iterable of `(inputs, targets)` batches that supports `len()`.
        device: device the batches are moved to.
        model: model returning `(branch_one_outputs, branch_two_outputs)`.
        optimizer: optimizer stepping the model; the learning rate of its
            first parameter group is used to scale the recorded gradients.
        branch_one_criterion, branch_two_criterion: loss functions per branch.
        branch_one_class, branch_two_class: class index each branch detects.
        boolean_one: True if branch 1 was congested in the previous epoch;
            resets `branch_two_grads` and `epoch_count_two`.
        boolean_two: True if branch 2 was congested in the previous epoch;
            resets `branch_one_grads` and `epoch_count_one`.

    Returns:
        (branch_one_acc, branch_two_acc, branch_one_grads, branch_two_grads)
        with accuracies in percent.

    Raises:
        ValueError: if `trainloader` yields no batches.
    '''
    global branch_one_grads
    global branch_two_grads
    global epoch_count_one
    global epoch_count_two

    # Imported locally so the function does not depend on another cell having imported it.
    import time

    model.train()
    branch_one_train_loss = 0
    branch_two_train_loss = 0
    branch_one_correct = 0
    branch_two_correct = 0
    branch_one_total = 0
    branch_two_total = 0
    num_batches = 0
    start_time = time.time()

    # After a rollback the accumulation for the rolled-back branch restarts.
    # TODO: open question from the author -- reset here or keep a rolling sum?
    if boolean_two:
        branch_one_grads = {}
        epoch_count_one = 0
    if boolean_one:
        branch_two_grads = {}
        epoch_count_two = 0

    for inputs, targets in trainloader:
        num_batches += 1
        branch_one_targets = get_binary_label(targets, index=branch_one_class)
        branch_two_targets = get_binary_label(targets, index=branch_two_class)
        inputs, branch_one_targets, branch_two_targets = inputs.to(device), branch_one_targets.to(device), branch_two_targets.to(device)
        optimizer.zero_grad()
        branch_one_outputs, branch_two_outputs = model(inputs)
        branch_one_loss = branch_one_criterion(branch_one_outputs, branch_one_targets)
        branch_two_loss = branch_two_criterion(branch_two_outputs, branch_two_targets)

        lr = optimizer.param_groups[0]['lr']

        # Back-propagate the loss due to 'cats' (branch one) on its own.
        branch_one_loss.backward(retain_graph=True)
        # lr * grad of branch one for this batch; subtracted below to isolate branch two.
        branch_one_step = {}
        with torch.no_grad():
            for name, parameter in model.named_parameters():
                if parameter.grad is None:
                    # Parameter not reached by branch one's loss.
                    continue
                scaled = parameter.grad * lr
                branch_one_step[name] = scaled
                if name in branch_one_grads:
                    branch_one_grads[name] += scaled
                else:
                    branch_one_grads[name] = scaled.clone()

        # Gradients accumulate, so .grad now holds branch one + branch two.
        branch_two_loss.backward(retain_graph=True)
        with torch.no_grad():
            for name, parameter in model.named_parameters():
                if parameter.grad is None:
                    continue
                scaled = parameter.grad * lr
                if name in branch_one_step:
                    scaled = scaled - branch_one_step[name]
                if name in branch_two_grads:
                    branch_two_grads[name] += scaled
                else:
                    branch_two_grads[name] = scaled
        optimizer.zero_grad()

        total_loss = branch_one_loss + branch_two_loss
        total_loss.backward()
        optimizer.step()

        branch_one_train_loss += branch_one_loss.item()
        branch_two_train_loss += branch_two_loss.item()
        _, branch_one_predicted = branch_one_outputs.max(1)
        _, branch_two_predicted = branch_two_outputs.max(1)
        branch_one_total += branch_one_targets.size(0)
        branch_two_total += branch_two_targets.size(0)
        branch_one_correct += branch_one_predicted.eq(branch_one_targets).sum().item()
        branch_two_correct += branch_two_predicted.eq(branch_two_targets).sum().item()

    if num_batches == 0:
        raise ValueError('trainloader yielded no batches')

    epoch_count_one += 1
    epoch_count_two += 1

    branch_one_acc = 100.*branch_one_correct/branch_one_total
    branch_two_acc = 100.*branch_two_correct/branch_two_total

    print("total train iters ", len(trainloader), '| time: %.3f sec Cat Loss: %.3f | Cat Acc: %.3f%% (%d/%d) | Dog Loss: %.3f | Dog Acc: %.3f%% (%d/%d)'
        % ((time.time()-start_time), branch_one_train_loss/num_batches,
           branch_one_acc, branch_one_correct, branch_one_total,
           branch_two_train_loss/num_batches, branch_two_acc,
           branch_two_correct, branch_two_total))

    return branch_one_acc, branch_two_acc, branch_one_grads, branch_two_grads

In [18]:
def train_congestion_avoider_debug(trainloader, device, model, optimizer, branch_one_criterion, branch_two_criterion, branch_one_class, branch_two_class, boolean_one, boolean_two, num_debug_batches=10):
    '''
    Debug variant of the training step: runs on only a few batches and also
    records the lr * grad of the summed loss.

    For each of the first `num_debug_batches` batches of `trainloader` the two
    branch losses are back-propagated separately to accumulate their lr * grad
    contributions in the global dicts `branch_one_grads` / `branch_two_grads`;
    the summed loss is then back-propagated, recorded in `total_grads`, and
    the optimizer is stepped.

    The reported losses and accuracies are those of the LAST processed batch
    only.

    Args:
        trainloader: iterable of `(inputs, targets)` batches that supports `len()`.
            Note from the author: it should reference the imbalanced dataset
            (maybe only 2 classes).
        device: device the batches are moved to.
        model: model returning `(branch_one_outputs, branch_two_outputs)`.
        optimizer: optimizer stepping the model; the learning rate of its
            first parameter group is used to scale the recorded gradients.
        branch_one_criterion, branch_two_criterion: loss functions per branch.
        branch_one_class, branch_two_class: class index each branch detects.
        boolean_one: True resets `branch_two_grads` and `epoch_count_two`.
        boolean_two: True resets `branch_one_grads` and `epoch_count_one`.
        num_debug_batches: maximum number of batches to process.

    Returns:
        (branch_one_acc, branch_two_acc, branch_one_grads, branch_two_grads,
        total_grads) with accuracies in percent.

    Raises:
        ValueError: if no batch was processed.
    '''
    global branch_one_grads
    global branch_two_grads
    global epoch_count_one
    global epoch_count_two

    # Imported locally so the function does not depend on another cell having imported them.
    import time
    from itertools import islice

    model.train()
    branch_one_train_loss = 0
    branch_two_train_loss = 0
    branch_one_correct = 0
    branch_two_correct = 0
    branch_one_total = 0
    branch_two_total = 0
    total_grads = {}
    num_batches = 0
    start_time = time.time()

    # After a rollback the accumulation for the rolled-back branch restarts.
    # TODO: open question from the author -- reset here or keep a rolling sum?
    if boolean_two:
        branch_one_grads = {}
        epoch_count_one = 0
    if boolean_one:
        branch_two_grads = {}
        epoch_count_two = 0

    # A single iterator is consumed so successive batches are distinct and
    # the loader is only started once.
    for index, (batch_inputs, batch_targets) in enumerate(islice(trainloader, num_debug_batches)):
        num_batches += 1
        print('\nIMAGE ', index+1)
        branch_one_targets = get_binary_label(batch_targets, index=branch_one_class)
        branch_two_targets = get_binary_label(batch_targets, index=branch_two_class)
        batch_inputs, branch_one_targets, branch_two_targets = batch_inputs.to(device), branch_one_targets.to(device), branch_two_targets.to(device)
        optimizer.zero_grad()
        branch_one_outputs, branch_two_outputs = model(batch_inputs)
        branch_one_loss = branch_one_criterion(branch_one_outputs, branch_one_targets)
        branch_two_loss = branch_two_criterion(branch_two_outputs, branch_two_targets)

        lr = optimizer.param_groups[0]['lr']

        # Back-propagate the loss due to 'cats' (branch one) on its own.
        branch_one_loss.backward(retain_graph=True)
        # lr * grad of branch one for this batch; subtracted below to isolate branch two.
        branch_one_step = {}
        with torch.no_grad():
            for name, parameter in model.named_parameters():
                if parameter.grad is None:
                    # Parameter not reached by branch one's loss.
                    continue
                scaled = parameter.grad * lr
                branch_one_step[name] = scaled
                if name in branch_one_grads:
                    branch_one_grads[name] += scaled
                else:
                    branch_one_grads[name] = scaled.clone()

        # Gradients accumulate, so .grad now holds branch one + branch two.
        branch_two_loss.backward(retain_graph=True)
        with torch.no_grad():
            for name, parameter in model.named_parameters():
                if parameter.grad is None:
                    continue
                scaled = parameter.grad * lr
                if name in branch_one_step:
                    scaled = scaled - branch_one_step[name]
                if name in branch_two_grads:
                    branch_two_grads[name] += scaled
                else:
                    branch_two_grads[name] = scaled
        optimizer.zero_grad()

        total_loss = branch_one_loss + branch_two_loss
        total_loss.backward()
        with torch.no_grad():
            for name, parameter in model.named_parameters():
                if parameter.grad is None:
                    continue
                scaled = parameter.grad * lr
                if name in total_grads:
                    total_grads[name] += scaled
                else:
                    total_grads[name] = scaled
        optimizer.step()

    if num_batches == 0:
        raise ValueError('trainloader yielded no batches')

    # Metrics below describe the last processed batch only.
    branch_one_train_loss += branch_one_loss.item()
    branch_two_train_loss += branch_two_loss.item()
    _, branch_one_predicted = branch_one_outputs.max(1)
    _, branch_two_predicted = branch_two_outputs.max(1)
    branch_one_total += branch_one_targets.size(0)
    branch_two_total += branch_two_targets.size(0)
    branch_one_correct += branch_one_predicted.eq(branch_one_targets).sum().item()
    branch_two_correct += branch_two_predicted.eq(branch_two_targets).sum().item()

    epoch_count_one += 1
    epoch_count_two += 1

    branch_one_acc = 100.*branch_one_correct/branch_one_total
    branch_two_acc = 100.*branch_two_correct/branch_two_total

    print("total train iters ", len(trainloader), '| time: %.3f sec Cat Loss: %.3f | Cat Acc: %.3f%% (%d/%d) | Dog Loss: %.3f | Dog Acc: %.3f%% (%d/%d)'
        % ((time.time()-start_time), branch_one_train_loss/(1),
           branch_one_acc, branch_one_correct, branch_one_total,
           branch_two_train_loss/(1), branch_two_acc,
           branch_two_correct, branch_two_total))

    return branch_one_acc, branch_two_acc, branch_one_grads, branch_two_grads, total_grads

**Testing the ResNet model**


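*   Both branches are evaluated on the test set (per-branch loss and accuracy)
*   The congestion condition is increased linearly from `min_cond` to `max_cond` over the epochs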



In [19]:
def congestion_condition(min_cond, max_cond, epoch, max_epochs):
    '''Linearly interpolate the congestion threshold for the given epoch.

    The result equals min_cond at epoch 0 and moves towards max_cond in
    proportion to epoch / max_epochs, so max_cond itself is only reached
    when epoch == max_epochs.
    '''
    spread = max_cond - min_cond
    progress = epoch / max_epochs
    return min_cond + spread * progress

In [20]:
def test_congestion_avoider(start_time, testloader, device, model, optimizer, scheduler, branch_one_grads, branch_two_grads, branch_one_class, branch_two_class, branch_one_criterion, branch_two_criterion, epoch, max_epochs, min_cond, max_cond, min_epochs, mult):
    '''Same as original with additional function to increase the congestion condition linearly over the epochs'''

    model.eval()
    branch_one_test_loss = 0
    branch_two_test_loss = 0
    branch_one_correct = 0
    branch_two_correct = 0
    branch_one_total = 0
    branch_two_total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            branch_one_targets = get_binary_label(targets, index=branch_one_class)
            branch_two_targets = get_binary_label(targets, index=branch_two_class)
            inputs, branch_one_targets, branch_two_targets = inputs.to(device), branch_one_targets.to(device), branch_two_targets.to(device)
            branch_one_outputs, branch_two_outputs = model(inputs)
            branch_one_loss = branch_one_criterion(branch_one_outputs, branch_one_targets)
            branch_two_loss = branch_two_criterion(branch_two_outputs, branch_two_targets)

            branch_one_test_loss += branch_one_loss.item()
            branch_two_test_loss += branch_two_loss.item()
            _, branch_one_predicted = branch_one_outputs.max(1)
            _, branch_two_predicted = branch_two_outputs.max(1)
            branch_one_total += branch_one_targets.size(0)
            branch_two_total += branch_two_targets.size(0)
            branch_one_correct += branch_one_predicted.eq(branch_one_targets).sum().item()
            branch_two_correct += branch_two_predicted.eq(branch_two_targets).sum().item()

        branch_one_val_acc = branch_one_correct / branch_one_total
        branch_two_val_acc = branch_two_correct / branch_two_total

        condition = congestion_condition(min_cond, max_cond, epoch, max_epochs)

        optimizer, model, boolean_one, boolean_two = congestion_avoid(model, optimizer, branch_one_val_acc, branch_two_val_acc, condition, branch_one_grads, branch_two_grads, min_epochs, mult)
        scheduler.step()

        print("total test iters ", len(testloader), '| time: %.3f sec Cat Loss: %.3f | Cat Acc: %.3f%% (%d/%d) | Dog Loss: %.3f | Dog Acc: %.3f%% (%d/%d)'
        % ((time.time()-start_time), branch_one_test_loss/(batch_idx+1), 
           100.*branch_one_correct/branch_one_total, branch_one_correct, branch_one_total, 
           branch_two_test_loss/(batch_idx+1), 100.*branch_two_correct/branch_two_total, 
           branch_two_correct, branch_two_total))

    # RE-EVALUATE THE MODEL ON THE TEST SET AFTER THE WEIGHTS HAVE BEEN UPDATED
    model.eval()
    branch_one_test_loss = 0
    branch_two_test_loss = 0
    branch_one_correct = 0
    branch_two_correct = 0
    branch_one_total = 0
    branch_two_total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            branch_one_targets = get_binary_label(targets, index=branch_one_class)
            branch_two_targets = get_binary_label(targets, index=branch_two_class)
            inputs, branch_one_targets, branch_two_targets = inputs.to(device), branch_one_targets.to(device), branch_two_targets.to(device)
            branch_one_outputs, branch_two_outputs = model(inputs)
            branch_one_loss = branch_one_criterion(branch_one_outputs, branch_one_targets)
            branch_two_loss = branch_two_criterion(branch_two_outputs, branch_two_targets)

            branch_one_test_loss += branch_one_loss.item()
            branch_two_test_loss += branch_two_loss.item()
            _, branch_one_predicted = branch_one_outputs.max(1)
            _, branch_two_predicted = branch_two_outputs.max(1)
            branch_one_total += branch_one_targets.size(0)
            branch_two_total += branch_two_targets.size(0)
            branch_one_correct += branch_one_predicted.eq(branch_one_targets).sum().item()
            branch_two_correct += branch_two_predicted.eq(branch_two_targets).sum().item()

        print("total test iters ", len(testloader), '| time: %.3f sec Cat Loss: %.3f | Cat Acc: %.3f%% (%d/%d) | Dog Loss: %.3f | Dog Acc: %.3f%% (%d/%d)'
        % ((time.time()-start_time), branch_one_test_loss/(batch_idx+1), 
           100.*branch_one_correct/branch_one_total, branch_one_correct, branch_one_total, 
           branch_two_test_loss/(batch_idx+1), 100.*branch_two_correct/branch_two_total, 
           branch_two_correct, branch_two_total))


    return optimizer, branch_one_val_acc, branch_two_val_acc, boolean_one, boolean_two

**Producing the results of the training process**

*   The standard LR schedule is the Cyclic LR with decaying peaks



In [21]:
import time
import torch.optim as optim
from torch.optim.lr_scheduler import CyclicLR
import torch.backends.cudnn as cudnn

In [22]:
def get_cong_avoidance_results(branch_one_class=3, branch_two_class=5, epochs=100, min_cond=0.95, max_cond = 0.99, mult_factor=1, lr=0.1, min_epochs = 5):
    '''Train the two-branch ResNet on class-imbalanced CIFAR-10 with congestion avoidance.

    Every epoch runs train_congestion_avoider followed by
    test_congestion_avoider; the congestion threshold used by the latter
    changes linearly from min_cond towards max_cond over the epochs.

    Args:
        branch_one_class: CIFAR-10 class index detected by branch one (3 = cat).
        branch_two_class: CIFAR-10 class index detected by branch two (5 = dog).
        epochs: number of train/test epochs to run.
        min_cond: congestion threshold at the first epoch.
        max_cond: congestion threshold the schedule moves towards.
        mult_factor: forwarded to test_congestion_avoider as `mult`.
        lr: initial SGD learning rate and peak (max_lr) of the cyclic schedule.
        min_epochs: forwarded to test_congestion_avoider.

    Returns:
        Six per-epoch lists: branch one / branch two training accuracies (as
        returned by train_congestion_avoider), branch one / branch two test
        accuracies (fractions, measured before congestion avoidance is applied)
        and the branch one / branch two flags returned by
        test_congestion_avoider.
    '''
    # IMPORT DATA
    trainset, trainloader, testset, testloader = create_CIFAR_data()

    # CREATE DATASET WITH CLASS SIZES
    # The loaders are rebuilt because create_unbalanced_CIFAR10 replaces the dataset contents.
    trainset = create_unbalanced_CIFAR10(trainset)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)
    testset = create_unbalanced_CIFAR10(testset, [125,125,125,1000,125,1000,125,125,125,125])
    testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=True, num_workers=2)

    # CREATE MODEL
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = ResNetSplit18Shared()
    model = model.to(device)
    if device == 'cuda':
        print('CUDA device used...')
        model = torch.nn.DataParallel(model)
        cudnn.benchmark = True
    # DataParallel exposes the wrapped network as .module; on CPU the model is
    # not wrapped, so read the diagnostic weights from the model itself.
    net = model.module if isinstance(model, torch.nn.DataParallel) else model

    # CREATE LOSS OF EACH BRANCH
    branch_one_criterion = nn.CrossEntropyLoss()
    branch_two_criterion = nn.CrossEntropyLoss()
    # CREATE MODEL OPTIMIZER
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0, weight_decay=5e-4)
    scheduler = CyclicLR(optimizer, base_lr=0.0001, max_lr=lr, step_size_up=10, mode="triangular2")

    # BEGIN RECORDING THE TIME
    start_time = time.time()

    branch_one_train_accuracies = []
    branch_two_train_accuracies = []
    branch_one_test_accuracies = []
    branch_two_test_accuracies = []
    branch_one_condition = []
    branch_two_condition = []

    # Flags from the previous epoch's test step; nothing has been flagged before epoch 1.
    boolean_one = False
    boolean_two = False

    for epoch in range(epochs):
        print('\n********** EPOCH {} **********'.format(epoch + 1))
        print('Learning rate: ', optimizer.param_groups[0]['lr'])
        train_results = train_congestion_avoider(trainloader, device, model, optimizer, branch_one_criterion, branch_two_criterion, branch_one_class, branch_two_class, boolean_one, boolean_two)
        # Only the first four values are used. Slicing also accepts a trailing
        # total_grads entry.
        # NOTE(review): the training function defined just above in the notebook
        # returns five values - confirm it is the one called here.
        branch_one_train_acc, branch_two_train_acc, branch_one_grads, branch_two_grads = train_results[:4]

        branch_one_train_accuracies.append(branch_one_train_acc)
        branch_two_train_accuracies.append(branch_two_train_acc)
        print('Weight after training (SHARED): ', torch.sum(net.conv1.weight))
        print('Weight after training (BRANCH 1): ', torch.sum(net.branch1layer3[0].conv1.weight))
        print('Weight after training (BRANCH 2): ', torch.sum(net.branch2layer3[0].conv1.weight))

        optimizer, branch_one_val_acc, branch_two_val_acc, boolean_one, boolean_two = test_congestion_avoider(start_time, testloader, device, model, optimizer, scheduler, branch_one_grads, branch_two_grads, branch_one_class, branch_two_class, branch_one_criterion, branch_two_criterion, epoch, epochs, min_cond, max_cond, min_epochs, mult_factor)

        # Same three weight sums again, now after the congestion-avoidance / scheduler step.
        print('Weight after scheduler (SHARED): ', torch.sum(net.conv1.weight))
        print('Weight after scheduler (BRANCH 1): ', torch.sum(net.branch1layer3[0].conv1.weight))
        print('Weight after scheduler (BRANCH 2): ', torch.sum(net.branch2layer3[0].conv1.weight))
        branch_one_test_accuracies.append(branch_one_val_acc)
        branch_two_test_accuracies.append(branch_two_val_acc)
        branch_one_condition.append(boolean_one)
        branch_two_condition.append(boolean_two)

    return branch_one_train_accuracies, branch_two_train_accuracies, branch_one_test_accuracies, branch_two_test_accuracies, branch_one_condition, branch_two_condition


*   Cyclic LR
*   Full gradient rollback (mult = 1)
* 95% congestion condition that linearly increases to 99%
* Minimum of 10 epochs between congestion conditions on a branch


In [None]:
# Fresh notebook-level timer, gradient stores and epoch counters for this run.
start_time = time.time()
branch_one_grads, branch_two_grads = {}, {}
epoch_count_one = epoch_count_two = 0

# Cat (3) vs dog (5); threshold rises linearly from 0.95 towards 0.99;
# mult_factor 1; min_epochs 10.
(b1_train, b2_train, b1_test, b2_test,
 b1_condition, b2_condition) = get_cong_avoidance_results(
    branch_one_class=3,
    branch_two_class=5,
    epochs=100,
    min_cond=0.95,
    max_cond=0.99,
    mult_factor=1,
    lr=0.1,
    min_epochs=10,
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=0.0, max=170498071.0), HTML(value='')))




KeyboardInterrupt: ignored

In [None]:
import pickle

# Store the six per-epoch series as consecutive pickle records, in this order.
with open('950_990cond_mult100_min10epoch_CORRECT.pickle', 'wb') as file:
    for series in (b1_train, b2_train, b1_test, b2_test, b1_condition, b2_condition):
        pickle.dump(series, file)


*   Cyclic LR
*   Half gradient rollback (mult = 0.5)
* congestion condition increases linearly from 95% to 99%
* Minimum of 10 epochs between congestions on a branch

In [None]:
# Fresh notebook-level timer, gradient stores and epoch counters for this run.
start_time = time.time()
branch_one_grads, branch_two_grads = {}, {}
epoch_count_one = epoch_count_two = 0

# Cat (3) vs dog (5); threshold rises linearly from 0.95 towards 0.99;
# mult_factor 0.5; min_epochs 10.
(b1_train, b2_train, b1_test, b2_test,
 b1_condition, b2_condition) = get_cong_avoidance_results(
    branch_one_class=3,
    branch_two_class=5,
    epochs=100,
    min_cond=0.95,
    max_cond=0.99,
    mult_factor=0.5,
    lr=0.1,
    min_epochs=10,
)

Files already downloaded and verified
Files already downloaded and verified
[5000 5000 5000 5000 5000 5000 5000 5000 5000 5000]
[ 625  625  625 5000  625 5000  625  625  625  625]
[1000 1000 1000 1000 1000 1000 1000 1000 1000 1000]
[ 125  125  125 1000  125 1000  125  125  125  125]
CUDA device used...

********** EPOCH 1 **********
Learning rate:  0.0001
total train iters  118 | time: 12.113 sec Cat Loss: 1.096 | Cat Acc: 56.007% (8401/15000) | Dog Loss: 0.959 | Dog Acc: 62.340% (9351/15000)
Weight after training (SHARED):  tensor(2.3779, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 1):  tensor(1.4230, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 2):  tensor(16.5077, device='cuda:0', grad_fn=<SumBackward0>)
No condition is met ..... : 95.0%
total test iters  30 | time: 12.974 sec Cat Loss: 0.697 | Cat Acc: 65.933% (1978/3000) | Dog Loss: 0.689 | Dog Acc: 65.167% (1955/3000)
total test iters  30 | time: 13.830 sec Cat Loss: 0.697 | Ca

In [None]:
# Store the six per-epoch series as consecutive pickle records, in this order.
with open('950_990cond_mult050_min10epoch_CORRECT.pickle', 'wb') as file:
    for series in (b1_train, b2_train, b1_test, b2_test, b1_condition, b2_condition):
        pickle.dump(series, file)



*   Cyclic LR
*   Full weight rollback (Mult=1)
* Condition increases from 95% to 99%
* Minimum of 5 epochs between congestion events



In [None]:
# Fresh notebook-level timer, gradient stores and epoch counters for this run.
start_time = time.time()
branch_one_grads, branch_two_grads = {}, {}
epoch_count_one = epoch_count_two = 0

# Cat (3) vs dog (5); threshold rises linearly from 0.95 towards 0.99;
# mult_factor 1; min_epochs 5.
(b1_train, b2_train, b1_test, b2_test,
 b1_condition, b2_condition) = get_cong_avoidance_results(
    branch_one_class=3,
    branch_two_class=5,
    epochs=100,
    min_cond=0.95,
    max_cond=0.99,
    mult_factor=1,
    lr=0.1,
    min_epochs=5,
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=0.0, max=170498071.0), HTML(value='')))


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
[5000 5000 5000 5000 5000 5000 5000 5000 5000 5000]
[ 625  625  625 5000  625 5000  625  625  625  625]
[1000 1000 1000 1000 1000 1000 1000 1000 1000 1000]
[ 125  125  125 1000  125 1000  125  125  125  125]
CUDA device used...

********** EPOCH 1 **********
Learning rate:  0.0001


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


total train iters  118 | time: 15.607 sec Cat Loss: 1.012 | Cat Acc: 59.140% (8871/15000) | Dog Loss: 1.022 | Dog Acc: 58.560% (8784/15000)
Weight after training (SHARED):  tensor(-6.4050, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 1):  tensor(-17.2955, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 2):  tensor(13.3547, device='cuda:0', grad_fn=<SumBackward0>)
No condition is met ..... : 95.0%
total test iters  30 | time: 16.630 sec Cat Loss: 0.704 | Cat Acc: 63.333% (1900/3000) | Dog Loss: 0.681 | Dog Acc: 66.633% (1999/3000)
total test iters  30 | time: 17.382 sec Cat Loss: 0.704 | Cat Acc: 63.333% (1900/3000) | Dog Loss: 0.681 | Dog Acc: 66.633% (1999/3000)
Weight after scheduler (SHARED):  tensor(-6.4050, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 1):  tensor(-17.2955, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 2):  tensor(13.3547, device='cuda:0', grad_fn=<SumBackward0>)

**

In [None]:
import pickle

# Number of epochs in which each branch's flag was True.
for flags in (b1_condition, b2_condition):
    print(sum(flags))

# Store the six per-epoch series as consecutive pickle records, in this order.
with open('950_990cond_mult100_min5epoch_CORRECT.pickle', 'wb') as file:
    for series in (b1_train, b2_train, b1_test, b2_test, b1_condition, b2_condition):
        pickle.dump(series, file)

18
1




*   Cyclic LR
*   Half-weight rollback (mult = 0.5)
* condition increases linearly from 95% to 99%
* Minimum of 5 epochs between congestion conditions



In [None]:
# Fresh notebook-level timer, gradient stores and epoch counters for this run.
start_time = time.time()
branch_one_grads, branch_two_grads = {}, {}
epoch_count_one = epoch_count_two = 0

# Cat (3) vs dog (5); threshold rises linearly from 0.95 towards 0.99;
# mult_factor 0.5; min_epochs 5.
(b1_train, b2_train, b1_test, b2_test,
 b1_condition, b2_condition) = get_cong_avoidance_results(
    branch_one_class=3,
    branch_two_class=5,
    epochs=100,
    min_cond=0.95,
    max_cond=0.99,
    mult_factor=0.5,
    lr=0.1,
    min_epochs=5,
)

Files already downloaded and verified
Files already downloaded and verified
[5000 5000 5000 5000 5000 5000 5000 5000 5000 5000]
[ 625  625  625 5000  625 5000  625  625  625  625]
[1000 1000 1000 1000 1000 1000 1000 1000 1000 1000]
[ 125  125  125 1000  125 1000  125  125  125  125]
CUDA device used...

********** EPOCH 1 **********
Learning rate:  0.0001


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


total train iters  118 | time: 12.145 sec Cat Loss: 0.978 | Cat Acc: 59.267% (8890/15000) | Dog Loss: 0.963 | Dog Acc: 59.747% (8962/15000)
Weight after training (SHARED):  tensor(-0.9756, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 1):  tensor(-22.7949, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 2):  tensor(37.6078, device='cuda:0', grad_fn=<SumBackward0>)
No condition is met ..... : 95.0%
total test iters  30 | time: 13.029 sec Cat Loss: 0.705 | Cat Acc: 64.033% (1921/3000) | Dog Loss: 0.690 | Dog Acc: 65.867% (1976/3000)
total test iters  30 | time: 13.764 sec Cat Loss: 0.705 | Cat Acc: 64.033% (1921/3000) | Dog Loss: 0.690 | Dog Acc: 65.867% (1976/3000)
Weight after scheduler (SHARED):  tensor(-0.9756, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 1):  tensor(-22.7949, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 2):  tensor(37.6078, device='cuda:0', grad_fn=<SumBackward0>)

**

In [None]:
import pickle

# Number of epochs in which each branch's flag was True.
for flags in (b1_condition, b2_condition):
    print(sum(flags))

# Store the six per-epoch series as consecutive pickle records, in this order.
with open('950_990cond_mult050_min5epoch_CORRECT.pickle', 'wb') as file:
    for series in (b1_train, b2_train, b1_test, b2_test, b1_condition, b2_condition):
        pickle.dump(series, file)

17
1


In [None]:
# Fresh notebook-level timer, gradient stores and epoch counters for this run.
start_time = time.time()
branch_one_grads, branch_two_grads = {}, {}
epoch_count_one = epoch_count_two = 0

# Cat (3) vs dog (5); threshold fixed at 0.95 (min_cond == max_cond);
# mult_factor 1; min_epochs 10.
(b1_train, b2_train, b1_test, b2_test,
 b1_condition, b2_condition) = get_cong_avoidance_results(
    branch_one_class=3,
    branch_two_class=5,
    epochs=100,
    min_cond=0.95,
    max_cond=0.95,
    mult_factor=1,
    lr=0.1,
    min_epochs=10,
)

Files already downloaded and verified
Files already downloaded and verified
[5000 5000 5000 5000 5000 5000 5000 5000 5000 5000]
[ 625  625  625 5000  625 5000  625  625  625  625]
[1000 1000 1000 1000 1000 1000 1000 1000 1000 1000]
[ 125  125  125 1000  125 1000  125  125  125  125]
CUDA device used...

********** EPOCH 1 **********
Learning rate:  0.0001
total train iters  118 | time: 12.194 sec Cat Loss: 1.039 | Cat Acc: 58.760% (8814/15000) | Dog Loss: 1.006 | Dog Acc: 60.173% (9026/15000)
Weight after training (SHARED):  tensor(1.0200, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 1):  tensor(-1.8053, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 2):  tensor(-23.3772, device='cuda:0', grad_fn=<SumBackward0>)
No condition is met ..... : 95.0%
total test iters  30 | time: 12.998 sec Cat Loss: 0.720 | Cat Acc: 65.567% (1967/3000) | Dog Loss: 0.675 | Dog Acc: 67.167% (2015/3000)
total test iters  30 | time: 13.805 sec Cat Loss: 0.720 | 

In [None]:
# Number of epochs in which each branch's flag was True.
for flags in (b1_condition, b2_condition):
    print(sum(flags))

# Store the six per-epoch series as consecutive pickle records, in this order.
with open('950cond_mult100_min10epoch_CORRECT.pickle', 'wb') as file:
    for series in (b1_train, b2_train, b1_test, b2_test, b1_condition, b2_condition):
        pickle.dump(series, file)

3
0


In [None]:
# Fresh notebook-level timer, gradient stores and epoch counters for this run.
start_time = time.time()
branch_one_grads, branch_two_grads = {}, {}
epoch_count_one = epoch_count_two = 0

# Cat (3) vs dog (5); threshold fixed at 0.975 (min_cond == max_cond);
# mult_factor 1; min_epochs 10.
(b1_train, b2_train, b1_test, b2_test,
 b1_condition, b2_condition) = get_cong_avoidance_results(
    branch_one_class=3,
    branch_two_class=5,
    epochs=100,
    min_cond=0.975,
    max_cond=0.975,
    mult_factor=1,
    lr=0.1,
    min_epochs=10,
)

Files already downloaded and verified
Files already downloaded and verified
[5000 5000 5000 5000 5000 5000 5000 5000 5000 5000]
[ 625  625  625 5000  625 5000  625  625  625  625]
[1000 1000 1000 1000 1000 1000 1000 1000 1000 1000]
[ 125  125  125 1000  125 1000  125  125  125  125]
CUDA device used...

********** EPOCH 1 **********
Learning rate:  0.0001
total train iters  118 | time: 11.795 sec Cat Loss: 0.971 | Cat Acc: 59.647% (8947/15000) | Dog Loss: 0.966 | Dog Acc: 59.467% (8920/15000)
Weight after training (SHARED):  tensor(0.0249, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 1):  tensor(-20.1638, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 2):  tensor(11.3020, device='cuda:0', grad_fn=<SumBackward0>)
No condition is met ..... : 97.5%
total test iters  30 | time: 12.640 sec Cat Loss: 0.712 | Cat Acc: 65.133% (1954/3000) | Dog Loss: 0.683 | Dog Acc: 65.867% (1976/3000)
total test iters  30 | time: 13.484 sec Cat Loss: 0.712 | 

In [None]:
# Number of epochs in which each branch's flag was True.
for flags in (b1_condition, b2_condition):
    print(sum(flags))

# Store the six per-epoch series as consecutive pickle records, in this order.
with open('975cond_mult100_min10epoch_CORRECT.pickle', 'wb') as file:
    for series in (b1_train, b2_train, b1_test, b2_test, b1_condition, b2_condition):
        pickle.dump(series, file)

9
0


In [23]:
# Fresh notebook-level timer, gradient stores and epoch counters for this run.
start_time = time.time()
branch_one_grads, branch_two_grads = {}, {}
epoch_count_one = epoch_count_two = 0

# Cat (3) vs dog (5); threshold fixed at 0.95 (min_cond == max_cond);
# mult_factor 0.5; min_epochs 10.
(b1_train, b2_train, b1_test, b2_test,
 b1_condition, b2_condition) = get_cong_avoidance_results(
    branch_one_class=3,
    branch_two_class=5,
    epochs=100,
    min_cond=0.95,
    max_cond=0.95,
    mult_factor=0.5,
    lr=0.1,
    min_epochs=10,
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=0.0, max=170498071.0), HTML(value='')))


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
[5000 5000 5000 5000 5000 5000 5000 5000 5000 5000]
[ 625  625  625 5000  625 5000  625  625  625  625]
[1000 1000 1000 1000 1000 1000 1000 1000 1000 1000]
[ 125  125  125 1000  125 1000  125  125  125  125]
CUDA device used...

********** EPOCH 1 **********
Learning rate:  0.0001


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


total train iters  118 | time: 12.626 sec Cat Loss: 0.963 | Cat Acc: 59.333% (8900/15000) | Dog Loss: 1.073 | Dog Acc: 57.100% (8565/15000)
Weight after training (SHARED):  tensor(-1.1707, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 1):  tensor(18.1598, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 2):  tensor(-2.2815, device='cuda:0', grad_fn=<SumBackward0>)
No condition is met ..... : 95.0%
total test iters  30 | time: 13.539 sec Cat Loss: 0.710 | Cat Acc: 65.300% (1959/3000) | Dog Loss: 0.699 | Dog Acc: 65.700% (1971/3000)
total test iters  30 | time: 14.302 sec Cat Loss: 0.710 | Cat Acc: 65.300% (1959/3000) | Dog Loss: 0.699 | Dog Acc: 65.700% (1971/3000)
Weight after scheduler (SHARED):  tensor(-1.1707, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 1):  tensor(18.1598, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 2):  tensor(-2.2815, device='cuda:0', grad_fn=<SumBackward0>)

****

In [25]:
import pickle

# Number of epochs in which each branch's flag was True.
for flags in (b1_condition, b2_condition):
    print(sum(flags))

# Store the six per-epoch series as consecutive pickle records, in this order.
with open('950cond_mult050_min10epoch_CORRECT.pickle', 'wb') as file:
    for series in (b1_train, b2_train, b1_test, b2_test, b1_condition, b2_condition):
        pickle.dump(series, file)

4
0


In [41]:
# Fresh notebook-level timer, gradient stores and epoch counters for this run.
start_time = time.time()
branch_one_grads, branch_two_grads = {}, {}
epoch_count_one = epoch_count_two = 0

# Cat (3) vs dog (5); threshold fixed at 0.975 (min_cond == max_cond);
# mult_factor 0.5; min_epochs 10.
(b1_train, b2_train, b1_test, b2_test,
 b1_condition, b2_condition) = get_cong_avoidance_results(
    branch_one_class=3,
    branch_two_class=5,
    epochs=100,
    min_cond=0.975,
    max_cond=0.975,
    mult_factor=0.5,
    lr=0.1,
    min_epochs=10,
)

Files already downloaded and verified
Files already downloaded and verified
[5000 5000 5000 5000 5000 5000 5000 5000 5000 5000]
[ 625  625  625 5000  625 5000  625  625  625  625]
[1000 1000 1000 1000 1000 1000 1000 1000 1000 1000]
[ 125  125  125 1000  125 1000  125  125  125  125]
CUDA device used...

********** EPOCH 1 **********
Learning rate:  0.0001
total train iters  118 | time: 11.736 sec Cat Loss: 1.034 | Cat Acc: 57.700% (8655/15000) | Dog Loss: 0.946 | Dog Acc: 60.140% (9021/15000)
Weight after training (SHARED):  tensor(2.5897, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 1):  tensor(6.8862, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 2):  tensor(-0.9703, device='cuda:0', grad_fn=<SumBackward0>)
No condition is met ..... : 97.5%
total test iters  30 | time: 12.478 sec Cat Loss: 0.706 | Cat Acc: 65.067% (1952/3000) | Dog Loss: 0.677 | Dog Acc: 66.500% (1995/3000)
total test iters  30 | time: 13.221 sec Cat Loss: 0.706 | Ca

In [42]:
# Number of epochs in which each branch's flag was True.
for flags in (b1_condition, b2_condition):
    print(sum(flags))

# Store the six per-epoch series as consecutive pickle records, in this order.
with open('975cond_mult050_min10epoch_CORRECT.pickle', 'wb') as file:
    for series in (b1_train, b2_train, b1_test, b2_test, b1_condition, b2_condition):
        pickle.dump(series, file)

8
0


In [43]:
# Fresh notebook-level timer, gradient stores and epoch counters for this run.
start_time = time.time()
branch_one_grads, branch_two_grads = {}, {}
epoch_count_one = epoch_count_two = 0

# Cat (3) vs dog (5); threshold fixed at 0.95 (min_cond == max_cond);
# mult_factor 1; min_epochs 5.
(b1_train, b2_train, b1_test, b2_test,
 b1_condition, b2_condition) = get_cong_avoidance_results(
    branch_one_class=3,
    branch_two_class=5,
    epochs=100,
    min_cond=0.95,
    max_cond=0.95,
    mult_factor=1,
    lr=0.1,
    min_epochs=5,
)

Files already downloaded and verified
Files already downloaded and verified
[5000 5000 5000 5000 5000 5000 5000 5000 5000 5000]
[ 625  625  625 5000  625 5000  625  625  625  625]
[1000 1000 1000 1000 1000 1000 1000 1000 1000 1000]
[ 125  125  125 1000  125 1000  125  125  125  125]
CUDA device used...

********** EPOCH 1 **********
Learning rate:  0.0001
total train iters  118 | time: 11.365 sec Cat Loss: 0.978 | Cat Acc: 58.767% (8815/15000) | Dog Loss: 0.974 | Dog Acc: 59.067% (8860/15000)
Weight after training (SHARED):  tensor(1.2051, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 1):  tensor(-5.2548, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 2):  tensor(39.4779, device='cuda:0', grad_fn=<SumBackward0>)
No condition is met ..... : 95.0%
total test iters  30 | time: 12.119 sec Cat Loss: 0.712 | Cat Acc: 65.267% (1958/3000) | Dog Loss: 0.664 | Dog Acc: 67.633% (2029/3000)
total test iters  30 | time: 12.880 sec Cat Loss: 0.712 | C

In [44]:
# Number of epochs in which each branch's flag was True.
for flags in (b1_condition, b2_condition):
    print(sum(flags))

# Store the six per-epoch series as consecutive pickle records, in this order.
with open('950cond_mult100_min5epoch_CORRECT.pickle', 'wb') as file:
    for series in (b1_train, b2_train, b1_test, b2_test, b1_condition, b2_condition):
        pickle.dump(series, file)

5
1


In [45]:
# Fresh notebook-level timer, gradient stores and epoch counters for this run.
start_time = time.time()
branch_one_grads, branch_two_grads = {}, {}
epoch_count_one = epoch_count_two = 0

# Cat (3) vs dog (5); threshold fixed at 0.975 (min_cond == max_cond);
# mult_factor 1; min_epochs 5.
(b1_train, b2_train, b1_test, b2_test,
 b1_condition, b2_condition) = get_cong_avoidance_results(
    branch_one_class=3,
    branch_two_class=5,
    epochs=100,
    min_cond=0.975,
    max_cond=0.975,
    mult_factor=1,
    lr=0.1,
    min_epochs=5,
)

Files already downloaded and verified
Files already downloaded and verified
[5000 5000 5000 5000 5000 5000 5000 5000 5000 5000]
[ 625  625  625 5000  625 5000  625  625  625  625]
[1000 1000 1000 1000 1000 1000 1000 1000 1000 1000]
[ 125  125  125 1000  125 1000  125  125  125  125]
CUDA device used...

********** EPOCH 1 **********
Learning rate:  0.0001
total train iters  118 | time: 11.362 sec Cat Loss: 1.090 | Cat Acc: 56.393% (8459/15000) | Dog Loss: 0.961 | Dog Acc: 59.200% (8880/15000)
Weight after training (SHARED):  tensor(-3.0995, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 1):  tensor(0.8510, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 2):  tensor(12.6923, device='cuda:0', grad_fn=<SumBackward0>)
No condition is met ..... : 97.5%
total test iters  30 | time: 12.168 sec Cat Loss: 0.689 | Cat Acc: 65.200% (1956/3000) | Dog Loss: 0.669 | Dog Acc: 67.300% (2019/3000)
total test iters  30 | time: 12.915 sec Cat Loss: 0.689 | C

In [46]:
# Number of epochs in which each branch's flag was True.
for flags in (b1_condition, b2_condition):
    print(sum(flags))

# Store the six per-epoch series as consecutive pickle records, in this order.
with open('975cond_mult100_min5epoch_CORRECT.pickle', 'wb') as file:
    for series in (b1_train, b2_train, b1_test, b2_test, b1_condition, b2_condition):
        pickle.dump(series, file)

19
1


In [47]:
# Fresh notebook-level timer, gradient stores and epoch counters for this run.
start_time = time.time()
branch_one_grads, branch_two_grads = {}, {}
epoch_count_one = epoch_count_two = 0

# Cat (3) vs dog (5); threshold fixed at 0.95 (min_cond == max_cond);
# mult_factor 0.5; min_epochs 5.
(b1_train, b2_train, b1_test, b2_test,
 b1_condition, b2_condition) = get_cong_avoidance_results(
    branch_one_class=3,
    branch_two_class=5,
    epochs=100,
    min_cond=0.95,
    max_cond=0.95,
    mult_factor=0.5,
    lr=0.1,
    min_epochs=5,
)

Files already downloaded and verified
Files already downloaded and verified
[5000 5000 5000 5000 5000 5000 5000 5000 5000 5000]
[ 625  625  625 5000  625 5000  625  625  625  625]
[1000 1000 1000 1000 1000 1000 1000 1000 1000 1000]
[ 125  125  125 1000  125 1000  125  125  125  125]
CUDA device used...

********** EPOCH 1 **********
Learning rate:  0.0001
total train iters  118 | time: 11.311 sec Cat Loss: 1.046 | Cat Acc: 58.380% (8757/15000) | Dog Loss: 1.070 | Dog Acc: 56.847% (8527/15000)
Weight after training (SHARED):  tensor(0.1501, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 1):  tensor(-7.8947, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 2):  tensor(11.1994, device='cuda:0', grad_fn=<SumBackward0>)
No condition is met ..... : 95.0%
total test iters  30 | time: 12.074 sec Cat Loss: 0.692 | Cat Acc: 65.833% (1975/3000) | Dog Loss: 0.684 | Dog Acc: 65.700% (1971/3000)
total test iters  30 | time: 12.818 sec Cat Loss: 0.692 | C

In [48]:
# Number of epochs in which each branch's flag was True.
for flags in (b1_condition, b2_condition):
    print(sum(flags))

# Store the six per-epoch series as consecutive pickle records, in this order.
with open('950cond_mult050_min5epoch_CORRECT.pickle', 'wb') as file:
    for series in (b1_train, b2_train, b1_test, b2_test, b1_condition, b2_condition):
        pickle.dump(series, file)

5
0


In [49]:
# Fresh notebook-level timer, gradient stores and epoch counters for this run.
start_time = time.time()
branch_one_grads, branch_two_grads = {}, {}
epoch_count_one = epoch_count_two = 0

# Cat (3) vs dog (5); threshold fixed at 0.975 (min_cond == max_cond);
# mult_factor 0.5; min_epochs 5.
(b1_train, b2_train, b1_test, b2_test,
 b1_condition, b2_condition) = get_cong_avoidance_results(
    branch_one_class=3,
    branch_two_class=5,
    epochs=100,
    min_cond=0.975,
    max_cond=0.975,
    mult_factor=0.5,
    lr=0.1,
    min_epochs=5,
)

Files already downloaded and verified
Files already downloaded and verified
[5000 5000 5000 5000 5000 5000 5000 5000 5000 5000]
[ 625  625  625 5000  625 5000  625  625  625  625]
[1000 1000 1000 1000 1000 1000 1000 1000 1000 1000]
[ 125  125  125 1000  125 1000  125  125  125  125]
CUDA device used...

********** EPOCH 1 **********
Learning rate:  0.0001
total train iters  118 | time: 12.442 sec Cat Loss: 0.929 | Cat Acc: 60.467% (9070/15000) | Dog Loss: 1.021 | Dog Acc: 57.847% (8677/15000)
Weight after training (SHARED):  tensor(-6.1227, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 1):  tensor(0.5554, device='cuda:0', grad_fn=<SumBackward0>)
Weight after training (BRANCH 2):  tensor(-11.9341, device='cuda:0', grad_fn=<SumBackward0>)
No condition is met ..... : 97.5%
total test iters  30 | time: 13.307 sec Cat Loss: 0.711 | Cat Acc: 64.500% (1935/3000) | Dog Loss: 0.687 | Dog Acc: 66.267% (1988/3000)
total test iters  30 | time: 14.120 sec Cat Loss: 0.711 | 

In [50]:
# Number of epochs in which each branch's flag was True.
for flags in (b1_condition, b2_condition):
    print(sum(flags))

# Store the six per-epoch series as consecutive pickle records, in this order.
with open('975cond_mult050_min5epoch_CORRECT.pickle', 'wb') as file:
    for series in (b1_train, b2_train, b1_test, b2_test, b1_condition, b2_condition):
        pickle.dump(series, file)

19
1
