<a href="https://colab.research.google.com/github/Nicordaro/Project_MLDL/blob/master/LWF_by_Alb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
# Clone github repository with data
# if os.path.isdir('./Project_MLDL'):
!rm -rf Project_MLDL
if not os.path.isdir('./CIFAR100_tError'):
  !git clone https://github.com/Nicordaro/Project_MLDL

In [0]:
import copy
import gc
import logging
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from PIL import Image
from torch.backends import cudnn
from torch.utils.data import Subset, DataLoader
from torchvision import transforms
from torchvision.models import resnet18
from torchvision.models import resnet34
from tqdm import tqdm

from Project_MLDL.CIFAR100_tError import CIFAR100_tError
from Project_MLDL.model_finetuning import ResNet18

# Garbage collector: make sure automatic collection is switched on.
# NOTE(review): CPython enables it by default, so this is presumably only a safeguard.
gc.enable()

In [0]:
DEVICE = 'cuda' # 'cuda' or 'cpu'; input batches are moved to this device before the forward pass

# Init at 10 because first train is on 10 classes
NUM_CLASSES = 10

# Seed for `random`, fixed so the shuffled class order is the same on every run
SEED = 12

BATCH_SIZE = 128     # Higher batch sizes allow for larger learning rates. An empirical heuristic suggests that, when changing
                     # the batch size, the learning rate should change by the same factor to have comparable results

LR = 2         # The initial learning rate
MOMENTUM = 0.9       # Hyperparameter for SGD, keep this at 0.9 when using SGD
WEIGHT_DECAY = 1e-5  # Regularization, you can keep this at the default

NUM_EPOCHS = 70     # Total number of training epochs (full passes over the dataset) per group of classes
MILESTONES = [48, 62]  # Epochs at which the learning rate is multiplied by GAMMA (passed to MultiStepLR)
GAMMA = 0.2          # Multiplicative factor for learning rate step-down

# Number of training steps between two loss log lines (no active code reads it at the moment)
LOG_FREQUENCY = 50

In [0]:
# Per-channel mean / std applied to every image, in training and evaluation alike.
# Reference for computing dataset statistics:
# https://gist.github.com/weiaicunzai/e623931921efefd4c331622c344d8151
# NOTE(review): these values look like the usual ImageNet statistics rather than
# CIFAR-100 ones - confirm this is intended.
_CHANNEL_MEAN = (0.485, 0.456, 0.406)
_CHANNEL_STD = (0.229, 0.224, 0.225)

# Training pipeline: light augmentation first, then tensor conversion + normalization.
_train_steps = [
    transforms.RandomHorizontalFlip(),  # mirror the image with probability 0.5
    transforms.Pad(4),                  # pad every border by 4 pixels
    transforms.RandomCrop(32),          # cut a random 32x32 window out of the padded image
    transforms.ToTensor(),              # PIL Image -> torch.Tensor
    transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD),
]
train_transform = transforms.Compose(_train_steps)

# Evaluation pipeline: no augmentation, only tensor conversion + normalization.
_eval_steps = [
    transforms.ToTensor(),
    transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD),
]
eval_transform = transforms.Compose(_eval_steps)

In [0]:
DATA_DIR = './CIFAR100'

# Class ids 0..99, shuffled once with a fixed seed so that the order in which
# classes are presented to the network is reproducible.
random.seed(SEED)
lbls = list(range(100))
random.shuffle(lbls)

def make_data_labels(lbls, n_new=10):
    """Split off the next group of classes from the front of a label list.

    Args:
        lbls: sequence of class labels that have not been used yet
            (the caller shuffles it beforehand). It is not modified.
        n_new: how many labels to take from the front. Defaults to 10,
            the group size used throughout this notebook.

    Returns:
        A pair ``(remaining, new_labels)``: ``remaining`` is ``lbls`` without
        its first ``n_new`` items, ``new_labels`` is a list with those items.
        If fewer than ``n_new`` labels are left, all of them are returned and
        ``remaining`` is empty.

    Raises:
        ValueError: if ``n_new`` is negative.
    """
    if n_new < 0:
        raise ValueError("n_new must be non-negative, got {}".format(n_new))

    new_labels = list(lbls[:n_new])
    remaining = lbls[n_new:]

    return remaining, new_labels

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

class iCaRL(nn.Module):
  """ResNet-18 classifier trained incrementally with a distillation term.

  The network is a ResNet18 backbone whose last layer is replaced by a
  512 -> 2048 linear layer, followed by ReLU and a 2048 -> 100 classifier.
  Each call to `update_representation` trains on a new group of classes;
  from the second call on, a frozen copy of the network taken before training
  provides soft targets for the classes learned so far (distillation).

  Despite the class name, no exemplar memory is implemented here.

  Attributes:
    known_classes: sorted list of the label values seen in completed calls to
      `update_representation`.
    n_known: number of entries in `known_classes`.
  """

  def __init__(self):
    super(iCaRL, self).__init__()

    self.feature_extractor = ResNet18()
    self.feature_extractor.linear = nn.Linear(512, 2048)
    self.relu = nn.ReLU()
    self.fc = nn.Linear(2048, 100)

    # Kept for backward compatibility; the training loop below uses
    # cross-entropy for classification and soft-target cross-entropy for distillation.
    self.cls_loss = nn.BCEWithLogitsLoss()
    self.dist_loss = nn.BCELoss()

    self.optimizer = optim.SGD(self.parameters(), lr=2, weight_decay=1e-5, momentum=0.9)

    # Bookkeeping of the classes learned in previous training rounds.
    self.known_classes = []
    self.n_known = 0

  def forward(self, x):
    """Return the raw class scores (logits) for a batch of images."""
    x = self.feature_extractor(x)
    x = self.relu(x)
    x = self.fc(x)
    return x

  def classify(self, x):
    """Predict one class index per input sample.

    The network is switched to evaluation mode for the forward pass (no
    gradient tracking) and put back in its previous mode afterwards.

    Args:
      x: batch of inputs accepted by `forward`.

    Returns:
      1-D tensor with the index of the highest-scoring output per sample.
    """
    was_training = self.training
    self.eval()
    try:
      with torch.no_grad():
        logits = self.forward(x)
    finally:
      self.train(was_training)

    # Softmax is monotonic, so the arg-max of the logits is the arg-max of the probabilities.
    _, labels = torch.max(logits, dim=1, keepdim=False)
    return labels

  @staticmethod
  def _distillation_loss(logits, target_logits, temperature):
    """Cross-entropy between temperature-softened targets and predictions, averaged over the batch."""
    log_probs = torch.log_softmax(logits / temperature, dim=1)
    soft_targets = torch.softmax(target_logits / temperature, dim=1)
    return -(soft_targets * log_probs).sum(dim=1).mean()

  def update_representation(self, X):
    """Train the network on a new group of classes.

    Args:
      X: training dataset, or a list/tuple of datasets which are concatenated.
        Every batch must provide the images as first element and the integer
        class labels as second element; further elements (e.g. sample
        indices) are ignored.

    Side effects:
      Moves the model to DEVICE, leaves it in training mode, updates its
      weights and extends `known_classes` / `n_known` with the labels seen.
    """
    # TODO: revisit the loss choice and the class-index mapping.
    if isinstance(X, (list, tuple)):
      X = torch.utils.data.ConcatDataset(X)

    self.to(DEVICE)
    self.train()

    # Frozen snapshot of the current network: it provides the distillation
    # targets. Not needed on the very first group of classes.
    previous_model = None
    known_idx = None
    if self.known_classes:
      # Detach the optimizer while copying so its state is not duplicated.
      optimizer_backup = self.optimizer
      self.optimizer = None
      try:
        previous_model = copy.deepcopy(self)
      finally:
        self.optimizer = optimizer_backup
      previous_model.to(DEVICE)
      previous_model.eval()
      for param in previous_model.parameters():
        param.requires_grad_(False)
      known_idx = torch.tensor(self.known_classes, dtype=torch.long, device=DEVICE)

    train_dataloader = DataLoader(X, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, drop_last=True)

    optimizer = self.optimizer
    # The optimizer is shared across calls: restore the initial learning rate,
    # otherwise a new round would start from the value decayed by the previous one.
    for group in optimizer.param_groups:
      group['lr'] = group.get('initial_lr', group['lr'])
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=MILESTONES, gamma=GAMMA, last_epoch=-1)
    cudnn.benchmark = True # Lets cuDNN pick the fastest kernels for fixed-size inputs

    criterion = nn.CrossEntropyLoss()
    # Marks which output indices appear as labels during this round (kept on the device to avoid syncs).
    seen_mask = torch.zeros(self.fc.out_features, dtype=torch.bool, device=DEVICE)

    # Start iterating over the epochs
    for epoch in range(NUM_EPOCHS):
      print('Starting epoch {}/{}, LR = {}'.format(epoch+1, NUM_EPOCHS, scheduler.get_last_lr()))

      # Iterate over the dataset
      for batch in train_dataloader:
        # Bring data over the device of choice
        images = batch[0].to(DEVICE)
        labels = batch[1].to(DEVICE)
        seen_mask[labels] = True

        # PyTorch accumulates gradients by default: reset them at every iteration
        optimizer.zero_grad()

        # Forward pass to the network
        outputs = self.forward(images)
        loss = criterion(outputs, labels)

        # From the second group of classes on, add the distillation term
        # computed on the outputs of the already known classes.
        if previous_model is not None:
          with torch.no_grad():
            dist_target = previous_model.forward(images).index_select(1, known_idx)
          logits_dist = outputs.index_select(1, known_idx)
          loss = loss + self._distillation_loss(logits_dist, dist_target, 2)

        loss.backward()  # backward pass: computes gradients
        optimizer.step() # update weights based on accumulated gradients

      # Step the scheduler once per epoch
      scheduler.step()

    newly_seen = seen_mask.nonzero(as_tuple=True)[0].tolist()
    self.known_classes = sorted(set(self.known_classes) | set(newly_seen))
    self.n_known = len(self.known_classes)


In [0]:
# Work in progress: this whole cell still needs to be reviewed.
# Incremental training driver: 10 rounds, each adding 10 new classes, followed
# by an evaluation on the test samples of all classes seen so far.
# No exemplar sets are built here: the model class defines no exemplar methods.

start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
net = iCaRL().to(DEVICE)
t = 0  # index passed to increment() for the next new class
remaining_labels = list(lbls)  # work on a copy so that re-running the cell starts from the full class list
test_dataset = CIFAR100_tError(DATA_DIR, train=False, transform=eval_transform, download=True)
for i in range(0,10): # batches of 10
  print(f"processing batch {i+1}")
  # Create datasets: one training dataset per new class
  train_datasets = []
  # take 10 new classes
  remaining_labels, new_labels = make_data_labels(remaining_labels)
  for label in new_labels:
    train_dataset = CIFAR100_tError(DATA_DIR, train=True, transform=train_transform, download=True)
    train_dataset.increment([label],[t])
    test_dataset.increment([label],[t])  # the test set accumulates every class seen so far
    train_datasets.append(train_dataset)
    t += 1

  # A DataLoader cannot batch a plain list of datasets: merge them into one first.
  net.update_representation(torch.utils.data.ConcatDataset(train_datasets))

  # Test on Test set
  test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
  running_corrects = 0
  for batch in test_dataloader:
    # Only the first two elements are used, in case the dataset also returns sample indices.
    images = batch[0].to(DEVICE)
    labels = batch[1].to(DEVICE)
    preds = net.classify(images)
    running_corrects += torch.sum(preds == labels).item()
  accuracy = running_corrects / float(len(test_dataset))
  print(f"Test Accuracy: {accuracy}")