
**Install requirements**

In [0]:
!pip3 install 'torch==1.3.1'
!pip3 install 'torchvision==0.4.2'
!pip3 install 'Pillow-SIMD'
!pip3 install 'tqdm'

**Import libraries**

In [0]:
import os
import logging

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Subset, DataLoader
from torch.backends import cudnn

import torchvision
from torchvision import transforms
from torchvision.models import alexnet

from PIL import Image
from tqdm import tqdm

**Set Arguments**

In [0]:
# Global run settings and hyperparameters read by the cells below.
DEVICE = 'cuda' # Device the network and the batches are moved to: 'cuda' or 'cpu'

NUM_CLASSES = 7 # Seven classes split among the various domains; size of the replacement output layer

BATCH_SIZE = 64     # Larger batch sizes allow for larger learning rates. An empirical heuristic suggests that, when changing
                     # the batch size, the learning rate should change by the same factor to get comparable results

LR = 1e-2            # The initial learning rate
MOMENTUM = 0.9       # Hyperparameter for SGD, keep this at 0.9 when using SGD
WEIGHT_DECAY = 5e-5  # Regularization, you can keep this at the default

NUM_EPOCHS = 25      # Total number of training epochs (iterations over dataset)
STEP_SIZE = 15       # How many epochs before decreasing learning rate (if using a step-down policy)
GAMMA = 0.1          # Multiplicative factor for learning rate step-down

LOG_FREQUENCY = 10   # The training loop prints its losses every LOG_FREQUENCY steps

**Define Data Preprocessing**

In [0]:
# The network is initialized from torchvision's ImageNet-pretrained AlexNet weights, which expect inputs
# normalized with the ImageNet channel statistics (not a generic 0.5 / 0.5 normalization)
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

# Define transforms for training phase
train_transform = transforms.Compose([transforms.Resize(256),      # Resizes short side of the PIL image to 256
                                      transforms.CenterCrop(224),  # Crops a central square patch of the image
                                                                   # 224 because torchvision's AlexNet needs a 224x224 input!
                                                                   # Remember this when applying different transformations, otherwise you get an error
                                      transforms.ToTensor(), # Turn PIL Image to torch.Tensor
                                      transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD) # Normalizes tensor with ImageNet mean and standard deviation
])
# Define transforms for the evaluation phase (same deterministic pipeline as training)
eval_transform = transforms.Compose([transforms.Resize(256),
                                      transforms.CenterCrop(224),
                                      transforms.ToTensor(),
                                      transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
])

**Prepare Dataset**

In [0]:
# Clone the GitHub repository with the data (skipped when the folder already exists)
if not os.path.isdir('./Homework3-PACS'):
  !git clone https://github.com/MachineLearning2020/Homework3-PACS.git

DATA_DIR = 'Homework3-PACS/PACS'

# Read each domain as a PyTorch Dataset with ImageFolder (one sub-folder per domain)
P_DS = torchvision.datasets.ImageFolder(DATA_DIR+"/photo", transform=train_transform) # SOURCE DOMAIN
A_DS = torchvision.datasets.ImageFolder(DATA_DIR+"/art_painting", transform=eval_transform) # TARGET DOMAIN, also used as the test set
C_DS = torchvision.datasets.ImageFolder(DATA_DIR+"/cartoon", transform=train_transform) # Extra point: only referenced by the commented-out validation cells
S_DS = torchvision.datasets.ImageFolder(DATA_DIR+"/sketch", transform=train_transform) # Extra point: only referenced by the commented-out validation cells

**Prepare Dataloaders**

In [0]:
# Dataloaders wrap the datasets and take care of batching, shuffling and parallel loading.

# Training loaders: reshuffled at every epoch; the last incomplete batch is dropped
source_train_dataloader = DataLoader(
    dataset=P_DS,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    drop_last=True,
)
target_train_dataloader = DataLoader(
    dataset=A_DS,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    drop_last=True,
)

# Evaluation loader: fixed order, every sample is kept
test_dataloader = DataLoader(
    dataset=A_DS,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
)

**AlexNet source code modified to create the Domain Classifier**

In [0]:
import torch
import torch.nn as nn
from torch.hub import load_state_dict_from_url
from torch.autograd import Function

# Names this cell is meant to expose: the model class and its factory function
__all__ = ['AlexNet', 'alexnet']


# Download location of the pretrained weights, looked up by alexnet(pretrained=True)
model_urls = {
    'alexnet': 'https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth',
}

class ReverseLayerF(Function):
    """Gradient reversal layer.

    The forward pass returns its input unchanged; the backward pass multiplies
    the incoming gradient by ``-alpha``.
    """

    @staticmethod
    def forward(ctx, x, alpha):
        # alpha is a hyperparameter of the algorithm and has to be tuned
        ctx.alpha = alpha  # remembered for the backward pass
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        reversed_grad = -grad_output * ctx.alpha
        # Second slot is the gradient w.r.t. alpha, which is not differentiated
        return reversed_grad, None

class AlexNet(nn.Module):
    """AlexNet with an extra domain-classifier head for DANN training.

    The convolutional ``features`` (followed by ``avgpool`` and a flatten) are
    shared by two fully connected heads:

    * ``classifier``: the label predictor, with ``num_classes`` outputs;
    * ``dann_classifier``: the domain discriminator, with 2 outputs, fed through
      a gradient reversal layer.

    Args:
        num_classes (int): number of outputs of the label-predictor head.
    """

    def __init__(self, num_classes=1000):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        # Brings any feature map to 6x6 so the flattened size is always 256 * 6 * 6
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))

        # Label predictor
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            # Honors the constructor argument (was hard-coded to 1000)
            nn.Linear(4096, num_classes),
        )

        # Domain discriminator: same layout as the label predictor, 2 outputs
        self.dann_classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, 2),
        )

    def forward(self, x, alpha=None):
        """Run the label predictor or the domain discriminator.

        Args:
            x: batch of input images.
            alpha: when ``None`` the label predictor is used. Otherwise the
                features go through the gradient reversal layer (backward
                gradients are multiplied by ``-alpha``) and the domain
                discriminator is used.

        Returns:
            The outputs of ``classifier`` when ``alpha`` is ``None``, the
            outputs of ``dann_classifier`` otherwise.
        """
        features = self.features(x)
        # Both heads share the pooling + flatten step. With a 6x6 feature map the
        # pooling leaves the values unchanged; with any other size it avoids the
        # reshape error (or silent batch mix-up) of a raw view(-1, 256 * 6 * 6).
        features = self.avgpool(features)
        features = torch.flatten(features, 1)

        # No alpha: supervised training / inference with the label predictor
        if alpha is None:
            return self.classifier(features)

        # alpha given: train the discriminator through the gradient reversal layer
        reversed_features = ReverseLayerF.apply(features, alpha)
        return self.dann_classifier(reversed_features)


def alexnet(pretrained=False, progress=True, **kwargs):
    r"""AlexNet model architecture from the
    `"One weird trick..." <https://arxiv.org/abs/1404.5997>`_ paper,
    built with the additional domain-classifier head of ``AlexNet``.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
        **kwargs: forwarded to the ``AlexNet`` constructor

    Returns:
        The constructed model. When ``pretrained`` is True, the downloaded
        weights are loaded and the first fully connected layer of
        ``dann_classifier`` is initialized with a copy of the corresponding
        ``classifier`` layer.
    """
    model = AlexNet(**kwargs)
    if pretrained:
        state_dict = load_state_dict_from_url(model_urls['alexnet'],
                                              progress=progress)
        # strict=False: keys that do not match between checkpoint and model are tolerated
        model.load_state_dict(state_dict, strict=False)
        # Copy (not alias) the pretrained weights into the domain head. Assigning
        # `.data = other.data` would make both layers share one storage, so every
        # update to one layer would also change the other.
        model.dann_classifier[1].load_state_dict(model.classifier[1].state_dict())
        # NOTE(review): only layer [1] is initialized from the classifier, as before;
        # dann_classifier[4] keeps its random initialization - confirm this is intended.
    return model

**Prepare Network**

In [0]:
# Two separately constructed pretrained models: `net` is the one trained below,
# `buffnet` is a spare copy (the commented-out validation cells refer to this name)
net, buffnet = alexnet(pretrained=True), alexnet(pretrained=True)

# The pretrained network ends with 1000 output neurons (one per ImageNet class);
# this homework needs NUM_CLASSES outputs, so the last fully connected layer
# (nn.Linear; convolutional layers are nn.Conv2d) is swapped for a fresh one
net.classifier[6] = nn.Linear(in_features=4096, out_features=NUM_CLASSES)
buffnet.classifier[6] = nn.Linear(in_features=4096, out_features=NUM_CLASSES)
# Studying the torchvision.models.alexnet source code is mandatory to see which layer this is
# We just changed the last layer of AlexNet with a new fully connected layer with 7 outputs
# It is mandatory to study torchvision.models.alexnet source code

**Prepare Training**

In [0]:
# Loss function: cross entropy, the usual choice for classification
criterion = nn.CrossEntropyLoss()

# Parameters handed to the optimizer. Here: every parameter of the network.
# nn.Module objects implement the Composite Pattern, so a subset can be selected
# through the submodules, e.g. net.classifier.parameters() for the fully connected
# layers (see the AlexNet source for the convolutional ones).
parameters_to_optimize = net.parameters()

# Optimizer: updates the weights from the gradients of the loss (SGD with momentum)
optimizer = optim.SGD(
    parameters_to_optimize,
    lr=LR,
    momentum=MOMENTUM,
    weight_decay=WEIGHT_DECAY,
)

# Scheduler: changes the learning rate during training. Step-down policy:
# the learning rate is multiplied by GAMMA every STEP_SIZE epochs.
scheduler = optim.lr_scheduler.StepLR(
    optimizer,
    step_size=STEP_SIZE,
    gamma=GAMMA,
)

**Train**

In [0]:
# # By default, everything is loaded to cpu
# net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda

# cudnn.benchmark # Calling this optimizes runtime

# current_step = 0
# # Start iterating over the epochs
# for epoch in range(NUM_EPOCHS):
#   print('Starting epoch {}/{}, LR = {}'.format(epoch+1, NUM_EPOCHS, scheduler.get_lr()))

#   # Iterate over the dataset
#   for images, labels in source_train_dataloader:
#     # Bring data over the device of choice
#     images = images.to(DEVICE)
#     labels = labels.to(DEVICE)

#     net.train() # Sets module in training mode

#     # PyTorch, by default, accumulates gradients after each backward pass
#     # We need to manually set the gradients to zero before starting a new iteration
#     optimizer.zero_grad() # Zero-ing the gradients

#     # Forward pass to the network
#     outputs = net(images)

#     # Compute loss based on output and ground truth
#     loss = criterion(outputs, labels)

#     # Log loss
#     if current_step % LOG_FREQUENCY == 0:
#       print('Step {}, Loss {}'.format(current_step, loss.item()))

#     # Compute gradients for each layer and update weights
#     loss.backward()  # backward pass: computes gradients
#     optimizer.step() # update weights based on accumulated gradients

#     current_step += 1

#   # Step the scheduler
#   scheduler.step() 

***DANN Training***


In [0]:
import numpy as np

# DANN training: at every step the gradients of three losses are accumulated
# (label prediction on source, domain discrimination on source and on target)
# and applied with a single optimizer step.

# By default, everything is loaded to cpu
net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda

# Enable the cuDNN auto-tuner. This must be an assignment: the bare expression
# `cudnn.benchmark` only reads the flag and has no effect.
cudnn.benchmark = True

# Weight of the reversed gradient sent from the discriminator to the feature extractor
alpha = 0.05

current_step = 0
# Start iterating over the epochs
for epoch in range(NUM_EPOCHS):
  # Read the learning rate straight from the optimizer: this is the value actually used
  current_lr = [group['lr'] for group in optimizer.param_groups]
  print('Starting epoch {}/{}, LR = {}'.format(epoch+1, NUM_EPOCHS, current_lr))

  # Set network in training mode (nothing inside the epoch switches it back)
  net.train()

  # Iterate over source and target batches in parallel; zip stops at the shorter loader
  for src_data, tgt_data in zip(source_train_dataloader, target_train_dataloader):

    src_img, src_lab = src_data
    tgt_img, tgt_lab = tgt_data  # target class labels are never used for training

    # Bring data over the device of choice
    src_img = src_img.to(DEVICE)
    src_lab = src_lab.to(DEVICE)
    tgt_img = tgt_img.to(DEVICE)

    # Domain labels for steps 2 and 3: 0 for source samples, 1 for target samples
    src_domain_lab = torch.zeros(src_lab.shape, dtype=torch.long, device=DEVICE)
    tgt_domain_lab = torch.ones(tgt_lab.shape, dtype=torch.long, device=DEVICE)

    optimizer.zero_grad() # Zeroing gradients

    ### STEP 1 ###
    # Train label predictor on source labels
    out1 = net(src_img)
    loss1 = criterion(out1, src_lab)
    loss1.backward()

    ### STEP 2 ###
    # Train discriminator on source data (all labels to 0)
    out2 = net(src_img, alpha)
    loss2 = criterion(out2, src_domain_lab)
    loss2.backward()

    ### STEP 3 ###
    # Train discriminator on target data (all labels to 1)
    out3 = net(tgt_img, alpha)
    loss3 = criterion(out3, tgt_domain_lab)
    loss3.backward()

    # One update with the gradients accumulated by the three backward passes
    optimizer.step()

    current_step+=1

    # Log loss
    if current_step % LOG_FREQUENCY == 0:
      print('Step {}\nSrc Label Predictor Loss {}\nSrc Domain Discriminator Loss {}\nTgt Domain Discriminator Loss {}\n'.format(current_step, loss1.item(), loss2.item(), loss3.item()))

  # Step the scheduler
  scheduler.step()

**Test**

In [0]:
# Evaluate the trained network on the target domain and print its accuracy.
net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda
net.train(False) # Set Network to evaluation mode

running_corrects = 0
# Inference only: no_grad avoids building the autograd graph for every batch
with torch.no_grad():
  for images, labels in tqdm(test_dataloader):
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    # Forward Pass (alpha omitted: label predictor)
    outputs = net(images)

    # Get predictions: index of the highest score for each sample
    _, preds = torch.max(outputs, 1)

    # Update Corrects
    running_corrects += torch.sum(preds == labels).item()

# Calculate Accuracy over the dataset the loader actually iterated
accuracy = running_corrects / float(len(test_dataloader.dataset))

print('Test Accuracy: {}'.format(accuracy))

***Cross Domain Validation part 1***

In [0]:
# import copy
# import math
# # Run an hyperparameter search method on Photo to Cartoon and Photo to Sketch, without Domain Adaptation, and average results for each set of hyperparameters
# # Set values for hyperparameter to iterate over
# batch_sizes = [64, 128, 256, 512]
# learning_rates = [0.001, 0.01, 0.1, 0.2]
# alphas = [0.1, 0.01, 0.05, 0.03]

# best_acc = -1
# best_loss = 141289347192834981271

# buffnet = copy.deepcopy(net)

# best_avg_acc=0
# best_avg_param=[-1, -1]

# # Without domain adaptation - alpha is not needed
# for bs in batch_sizes:
#   for lr1 in learning_rates:

#     NE = 10

#     bp1_no_DANN = [-1, -1, -1]
#     bp2_no_DANN = [-1, -1, -1]

#     # Average accuracies for Cartoon and Sketch plus their counters and the best parameters for next step.
#     avg_acc_C=0
#     avg_acc_S=0

#     avg_cnt_C=0
#     avg_cnt_S=0


#     # Reset the network for each set of hyperparameters
#     net = copy.deepcopy(buffnet)

#     # Prepare Dataloaders & Prepare training phases 
#     source_train_dataloader = DataLoader(P_DS, batch_size=bs, shuffle=True, num_workers=4, drop_last=True)
#     target_train_dataloader_C = DataLoader(C_DS, batch_size=bs, shuffle=True, num_workers=4, drop_last=True)
#     test_dataloader_C= DataLoader(C_DS, batch_size=bs, shuffle=False, num_workers=4)
#     test_dataloader_S= DataLoader(S_DS, batch_size=bs, shuffle=False, num_workers=4)

#     criterion = nn.CrossEntropyLoss() # for classification, we use Cross Entropy
#     parameters_to_optimize = net.parameters() # In this case we optimize over all the parameters of AlexNet

#     optimizer = optim.SGD(parameters_to_optimize, lr=lr1, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)

#     scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)

#     # By default, everything is loaded to cpu
#     net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda

#     cudnn.benchmark # Calling this optimizes runtime

#     current_step = 0
#     # Start iterating over the epochs
#     for epoch in range(NE):

#       # Iterate over the dataset
#       for images, labels in source_train_dataloader:
#         # Bring data over the device of choice
#         images = images.to(DEVICE)
#         labels = labels.to(DEVICE)

#         net.train() # Sets module in training mode

#         # PyTorch, by default, accumulates gradients after each backward pass
#         # We need to manually set the gradients to zero before starting a new iteration
#         optimizer.zero_grad() # Zero-ing the gradients

#         # Forward pass to the network
#         outputs = net(images)

#         # Compute loss based on output and ground truth
#         loss = criterion(outputs, labels)

#         # Compute gradients for each layer and update weights
#         loss.backward()  # backward pass: computes gradients
#         optimizer.step() # update weights based on accumulated gradients

#         current_step += 1

#       ### Validation phase on Cartoon ###
#       net.train(False)
#       best_acc = -1
#       best_loss = 141289347192834981271 
#       running_corrects=0
#       for images_v, labels_v in test_dataloader_C:
#         images_v = images_v.to(DEVICE)
#         labels_v = labels_v.to(DEVICE)

#         outputs_v = net(images_v)

#         _, preds = torch.max(outputs_v.data, 1)

#         running_corrects += torch.sum(preds == labels_v.data).data.item()

#       accuracy = running_corrects / float(len(C_DS))

#       avg_acc_C += accuracy
#       avg_cnt_C += 1

#       if accuracy > best_acc:
#         bp1_no_DANN = [bs, lr1, accuracy]
#         best_acc = accuracy   

#       ### Validation phase on Sketch ###
#       best_acc = -1
#       best_loss = 141289347192834981271 
#       running_corrects=0
#       for images_v, labels_v in test_dataloader_S:
#         images_v = images_v.to(DEVICE)
#         labels_v = labels_v.to(DEVICE)

#         outputs_v = net(images_v)

#         _, preds = torch.max(outputs_v.data, 1)

#         running_corrects += torch.sum(preds == labels_v.data).data.item()

#       accuracy = running_corrects / float(len(S_DS))

#       avg_acc_S += accuracy
#       avg_cnt_S +=1

#       if accuracy > best_acc:
#         bp2_no_DANN = [bs, lr1, accuracy]
#         best_acc = accuracy   

#       # Step the scheduler
#       scheduler.step()

#     print("Total average accuracy score with batch_size = {} and learning_rate = {} is: {}\n".format(bp1_no_DANN[0], bp1_no_DANN[1], (avg_acc_S+avg_acc_C)/(avg_cnt_S+avg_cnt_C)))

#     x = (avg_acc_S+avg_acc_C)/(avg_cnt_S+avg_cnt_C)

#     if x > best_avg_acc:
#       best_avg_acc = x
#       best_avg_param = [bs, lr1]


# # Implementing step 3A with the best hyperparameters found previously, which are saved into the best_avg_param array

# source_train_dataloader = DataLoader(P_DS, batch_size=best_avg_param[0], shuffle=True, num_workers=4, drop_last=True)
# target_train_dataloader = DataLoader(A_DS, batch_size=best_avg_param[0], shuffle=True, num_workers=4, drop_last=True)
# test_dataloader = DataLoader(A_DS, batch_size=best_avg_param[0], shuffle=False, num_workers=4)

# net = copy.deepcopy(buffnet)

# net = net.to(DEVICE)

# criterion = nn.CrossEntropyLoss() # for classification, we use Cross Entropy
# parameters_to_optimize = net.parameters() # In this case we optimize over all the parameters of AlexNet

# optimizer = optim.SGD(parameters_to_optimize, lr=best_avg_param[1], momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)

# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)

# cudnn.benchmark # Calling this optimizes runtime

# current_step = 0
# # Start iterating over the epochs
# for epoch in range(NUM_EPOCHS):
#   print('Starting epoch {}/{}, LR = {}'.format(epoch+1, NUM_EPOCHS, scheduler.get_lr()))

#   # Iterate over the dataset
#   for images, labels in source_train_dataloader:
#     # Bring data over the device of choice
#     images = images.to(DEVICE)
#     labels = labels.to(DEVICE)

#     net.train() # Sets module in training mode

#     # PyTorch, by default, accumulates gradients after each backward pass
#     # We need to manually set the gradients to zero before starting a new iteration
#     optimizer.zero_grad() # Zero-ing the gradients

#     # Forward pass to the network
#     outputs = net(images)

#     # Compute loss based on output and ground truth
#     loss = criterion(outputs, labels)

#     # Log loss
#     if current_step % LOG_FREQUENCY == 0:
#       print('Step {}, Loss {}'.format(current_step, loss.item()))

#     # Compute gradients for each layer and update weights
#     loss.backward()  # backward pass: computes gradients
#     optimizer.step() # update weights based on accumulated gradients

#     current_step += 1

#   # Step the scheduler
#   scheduler.step() 


# # Test phase 
# net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda
# net.train(False) # Set Network to evaluation mode

# running_corrects = 0
# for images, labels in tqdm(test_dataloader):
#   images = images.to(DEVICE)
#   labels = labels.to(DEVICE)

#   # Forward Pass
#   outputs = net(images)

#   # Get predictions
#   _, preds = torch.max(outputs.data, 1)

#   # Update Corrects
#   running_corrects += torch.sum(preds == labels.data).data.item()

# # Calculate Accuracy
# accuracy = running_corrects / float(len(A_DS))

# print('Test Accuracy: {}'.format(accuracy))

***Cross Domain Validation part 2***

In [0]:
# import copy
# import math
# # Run a hyperparameter search on Photo to Cartoon and Photo to Sketch, with Domain Adaptation (DANN), and average results for each set of hyperparameters
# # Set values for hyperparameter to iterate over
# batch_sizes = [64, 128, 256, 512]
# learning_rates = [0.001, 0.01, 0.1, 0.2]
# alphas = [0.1, 0.01, 0.05, 0.03]

# best_acc = -1
# best_loss = 141289347192834981271

# buffnet = copy.deepcopy(net)

# # With domain adaptation - alpha is searched together with batch size and learning rate
# for bs in batch_sizes:
#   for lr1 in learning_rates:
#       for als in alphas:
#         NE = 8

#         bp1_DANN = [-1, -1, -1, -1]
#         bp2_DANN = [-1, -1, -1, -1]

#         # Average accuracies for Cartoon and Sketch plus their counters and the best parameters for next step.
#         avg_acc_C=0
#         avg_acc_S=0

#         avg_cnt_C=0
#         avg_cnt_S=0

#         best_avg_acc=0
#         best_avg_param=[-1, -1, -1]


#         # Reset the network for each set of hyperparameters
#         net = copy.deepcopy(buffnet)

#         # Prepare Dataloaders & Prepare training phases
#         source_train_dataloader = DataLoader(P_DS, batch_size=bs, shuffle=True, num_workers=4, drop_last=True)
#         target_train_dataloader_C = DataLoader(C_DS, batch_size=bs, shuffle=True, num_workers=4, drop_last=True)
#         target_train_dataloader_S = DataLoader(S_DS, batch_size=bs, shuffle=True, num_workers=4, drop_last=True)
#         test_dataloader_C= DataLoader(C_DS, batch_size=bs, shuffle=False, num_workers=4)
#         test_dataloader_S= DataLoader(S_DS, batch_size=bs, shuffle=False, num_workers=4)

#         criterion = nn.CrossEntropyLoss() # for classification, we use Cross Entropy
#         parameters_to_optimize = net.parameters() # In this case we optimize over all the parameters of AlexNet

#         optimizer = optim.SGD(parameters_to_optimize, lr=lr1, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)

#         scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)

#         # By default, everything is loaded to cpu
#         net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda

#         cudnn.benchmark # Calling this optimizes runtime

#         current_step = 0
#         # Start iterating over the epochs
#         for epoch in range(NE):
#           # Iterate over the dataset
#           for i, (src_data, tgt_data) in enumerate(zip(source_train_dataloader, target_train_dataloader_C)):
#             alpha = als

#             # Bring data over the device of choice
#             src_img, src_lab = src_data
#             tgt_img, tgt_lab = tgt_data

#             src_img = src_img.to(DEVICE)
#             src_lab = src_lab.to(DEVICE)
#             tgt_img = tgt_img.to(DEVICE)

#             # Create tensors labels for Steps 2 and 3 with labels rispectively equal to 0 and 1
#             z = torch.zeros(src_lab.shape, dtype=torch.long).to(DEVICE)
#             o = torch.ones(tgt_lab.shape, dtype=torch.long).to(DEVICE)

#             # Set network in training mode
#             net.train()

#             optimizer.zero_grad() #Zeroing gradients

#             ### STEP 1 ###
#             # Train Label predictor on source labels
#             out1 = net(src_img)
#             loss1 = criterion(out1, src_lab)

#             loss1.backward()

#             ### STEP 2 ###
#             # Train discriminator on source data (all labels to 0)
#             out2 = net(src_img, alpha)
#             loss2 = criterion(out2, z)

#             loss2.backward()

#             ### STEP 3 ###
#             # Train discriminator on target data (all labels to 1)
#             out3 = net(tgt_img, alpha)
#             loss3 = criterion(out3, o)

#             loss3.backward()


#             # We can call the optimizer.step after the 3 sequential operations
#             optimizer.step()

#             current_step+=1

#             ### Validation phase on Cartoon ###
#         net.train(False)
#         best_acc = -1
#         best_loss = 141289347192834981271
#         running_corrects=0
#         for images_v, labels_v in test_dataloader_C:
#           images_v = images_v.to(DEVICE)
#           labels_v = labels_v.to(DEVICE)

#           outputs_v = net(images_v)

#           _, preds = torch.max(outputs_v.data, 1)

#           running_corrects += torch.sum(preds == labels_v.data).data.item()

#         accuracy = running_corrects / float(len(C_DS))

#         avg_acc_C += accuracy
#         avg_cnt_C += 1

#         if accuracy > best_acc:
#           bp1_DANN = [bs, lr1, als, accuracy]
#           best_acc = accuracy

#         scheduler.step()

#         for epoch in range(NE):
#           # Iterate over the dataset
#           for i, (src_data, tgt_data) in enumerate(zip(source_train_dataloader, target_train_dataloader_S)):
#             alpha = als

#             # Bring data over the device of choice
#             src_img, src_lab = src_data
#             tgt_img, tgt_lab = tgt_data

#             src_img = src_img.to(DEVICE)
#             src_lab = src_lab.to(DEVICE)
#             tgt_img = tgt_img.to(DEVICE)

#             # Create tensors labels for Steps 2 and 3 with labels rispectively equal to 0 and 1
#             z = torch.zeros(src_lab.shape, dtype=torch.long).to(DEVICE)
#             o = torch.ones(tgt_lab.shape, dtype=torch.long).to(DEVICE)

#             # Set network in training mode
#             net.train()

#             optimizer.zero_grad() #Zeroing gradients

#             ### STEP 1 ###
#             # Train Label predictor on source labels
#             out1 = net(src_img)
#             loss1 = criterion(out1, src_lab)

#             loss1.backward()

#             ### STEP 2 ###
#             # Train discriminator on source data (all labels to 0)
#             out2 = net(src_img, alpha)
#             loss2 = criterion(out2, z)

#             loss2.backward()

#             ### STEP 3 ###
#             # Train discriminator on target data (all labels to 1)
#             out3 = net(tgt_img, alpha)
#             loss3 = criterion(out3, o)

#             loss3.backward()


#             # We can call the optimizer.step after the 3 sequential operations
#             optimizer.step()

#             current_step+=1


#             ### Validation phase on Sketch ###
#         best_acc = -1
#         best_loss = 141289347192834981271
#         running_corrects=0
#         for images_v, labels_v in test_dataloader_S:
#           images_v = images_v.to(DEVICE)
#           labels_v = labels_v.to(DEVICE)

#           outputs_v = net(images_v)

#           _, preds = torch.max(outputs_v.data, 1)

#           running_corrects += torch.sum(preds == labels_v.data).data.item()

#         accuracy = running_corrects / float(len(S_DS))

#         avg_acc_S += accuracy
#         avg_cnt_S +=1

#         if accuracy > best_acc:
#           bp2_DANN = [bs, lr1, als, accuracy]
#           best_acc = accuracy

#         # Step the scheduler
#         scheduler.step()

#         print("Total average accuracy score with batch_size = {} and learning_rate = {} and alpha = {} is: {}\n".format(bs, lr1, als, (avg_acc_S+avg_acc_C)/(avg_cnt_S+avg_cnt_C)))

#         x = (avg_acc_S+avg_acc_C)/(avg_cnt_S+avg_cnt_C)

#         if x > best_avg_acc:
#           best_avg_acc = x
#           best_avg_param = [bs, lr1, als]


# # Implementing 3B with the best parameters i found here
# source_train_dataloader = DataLoader(P_DS, batch_size=best_avg_param[0], shuffle=True, num_workers=4, drop_last=True)
# target_train_dataloader = DataLoader(A_DS, batch_size=best_avg_param[0], shuffle=True, num_workers=4, drop_last=True)
# test_dataloader = DataLoader(A_DS, batch_size=best_avg_param[0], shuffle=False, num_workers=4)

# net = copy.deepcopy(buffnet)

# net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda

# criterion = nn.CrossEntropyLoss() # for classification, we use Cross Entropy
# parameters_to_optimize = net.parameters() # In this case we optimize over all the parameters of AlexNet

# optimizer = optim.SGD(parameters_to_optimize, lr=best_avg_param[1], momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)

# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)

# cudnn.benchmark # Calling this optimizes runtime

# current_step = 0
# # Start iterating over the epochs
# for epoch in range(NUM_EPOCHS):
#   print('Starting epoch {}/{}, LR = {}'.format(epoch+1, NUM_EPOCHS, scheduler.get_lr()))

#   # Iterate over the dataset
#   for i, (src_data, tgt_data) in enumerate(zip(source_train_dataloader, target_train_dataloader)):

#     alpha = best_avg_param[2]

#     # Bring data over the device of choice
#     src_img, src_lab = src_data
#     tgt_img, tgt_lab = tgt_data

#     src_img = src_img.to(DEVICE)
#     src_lab = src_lab.to(DEVICE)
#     tgt_img = tgt_img.to(DEVICE)

#     # Create tensors labels for Steps 2 and 3 with labels rispectively equal to 0 and 1
#     z = torch.zeros(src_lab.shape, dtype=torch.long).to(DEVICE)
#     o = torch.ones(tgt_lab.shape, dtype=torch.long).to(DEVICE)

#     # Set network in training mode
#     net.train()

#     optimizer.zero_grad() #Zeroing gradients

#     ### STEP 1 ###
#     # Train Label predictor on source labels
#     out1 = net(src_img)
#     loss1 = criterion(out1, src_lab)

#     loss1.backward()

#     ### STEP 2 ###
#     # Train discriminator on source data (all labels to 0)
#     out2 = net(src_img, alpha)
#     loss2 = criterion(out2, z)

#     loss2.backward()

#     ### STEP 3 ###
#     # Train discriminator on target data (all labels to 1)
#     out3 = net(tgt_img, alpha)
#     loss3 = criterion(out3, o)

#     loss3.backward()


#     # We can call the optimizer.step after the 3 sequential operations
#     optimizer.step()

#     current_step+=1

#     # Log loss
#     if current_step % LOG_FREQUENCY == 0:
#       print('Step {}\nSrc Label Predictor Loss {}\nSrc Domain Discriminator Loss {}\nTgt Domain Discriminator Loss {}\n'.format(current_step, loss1.item(), loss2.item(), loss3.item()))

#   # Step the scheduler
#   scheduler.step()

# # Test phase
# net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda
# net.train(False) # Set Network to evaluation mode

# running_corrects = 0
# for images, labels in tqdm(test_dataloader):
#   images = images.to(DEVICE)
#   labels = labels.to(DEVICE)

#   # Forward Pass
#   outputs = net(images)

#   # Get predictions
#   _, preds = torch.max(outputs.data, 1)

#   # Update Corrects
#   running_corrects += torch.sum(preds == labels.data).data.item()

# # Calculate Accuracy
# accuracy = running_corrects / float(len(A_DS))

# print('Test Accuracy: {}'.format(accuracy))
