<a href="https://colab.research.google.com/github/ShakutaiGit/Final_Project_Applied_Data_science/blob/main/Improvement_Of_source_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -p makes the cell safe to re-run: no error when the directory already exists
!mkdir -p outputs

mkdir: cannot create directory ‘outputs’: File exists


In [None]:
%%writefile models.py
from torchvision import models as models
import torch.nn as nn

def resnet34(pretrained, requires_grad):
    """Build a torchvision ResNet-34 with a 10-class output layer.

    Args:
        pretrained: forwarded unchanged to ``models.resnet34`` as its
            ``pretrained`` argument.
        requires_grad: if it compares equal to ``False`` every parameter of
            the loaded network is frozen; if it compares equal to ``True``
            every parameter is made trainable; any other value leaves the
            flags as torchvision set them.

    Returns:
        The network with its ``fc`` layer replaced by a new
        ``nn.Linear(512, 10)`` (10 classes in total for the CIFAR10 dataset).
    """
    net = models.resnet34(pretrained=pretrained, progress=True)
    # freeze (False) or unfreeze (True) the hidden layers
    for flag in (False, True):
        if requires_grad == flag:
            for weight in net.parameters():
                weight.requires_grad = flag
            break
    # the classification layer is replaced after the flags are set
    net.fc = nn.Linear(512, 10)
    return net

Writing models.py


In [None]:
%%writefile train.py
import argparse
import os

import joblib
import matplotlib
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from tqdm import tqdm

import models

matplotlib.style.use('ggplot')

# command line interface: selects the LR schedule and the number of epochs
parser = argparse.ArgumentParser(
    description='Train ResNet34 on CIFAR10 with either the default '
                'MultiStepLR schedule or cosine annealing with warm '
                'restarts.')
parser.add_argument('-wr', '--warm-restart', dest='warm_restart',
                    action='store_true',
                    help='use the CosineAnnealingWarmRestarts scheduler '
                         'instead of the default MultiStepLR scheduler')
parser.add_argument('-t0', '--t-zero', dest='t_zero', type=int,
                    default=50,
                    help='number of epochs until the first restart, passed '
                         'as T_0 to the scheduler (default: 50)')
parser.add_argument('-tm', '--t-mult', dest='t_mult', type=int,
                    default=1,
                    help='multiplicative factor passed as T_mult to the '
                         'scheduler (default: 1)')
parser.add_argument('-e', '--epochs', type=int, default=100,
                    help='number of training epochs (default: 100)')
# a plain dict, so the rest of the script reads options as args['name']
args = vars(parser.parse_args())

# use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"[INFO]: Computation device: {device}")
epochs = args['epochs']
batch_size = 128 # same as the original paper

# we will apply the same transforms as described in the paper;
# both pipelines normalize with the same mean / std values
norm_mean = (0.4914, 0.4822, 0.4465)
norm_std = (0.247, 0.243, 0.261)

# training images: random flip + reflect-padded random crop, then normalize
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(size=(32, 32), padding=4, padding_mode='reflect'),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])
# validation images: tensor conversion and normalization only
val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

# training split, reshuffled by the loader
train_dataset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=train_transform)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True)

# test split used for validation, iterated in a fixed order
val_dataset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=val_transform)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False)

# build the network without pretrained weights and with all layers trainable,
# i.e. everything is trained from scratch
model = models.resnet34(pretrained=False, requires_grad=True)
model = model.to(device)

# collect (size, trainable?) once, then report total and trainable counts
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_sizes)
print(f"[INFO]: {total_params:,} total parameters.")
total_trainable_params = sum(
    size for size, trainable in param_sizes if trainable)
print(f"[INFO]: {total_trainable_params:,} trainable parameters.")

# cross-entropy loss optimized with SGD (momentum + weight decay)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    model.parameters(),
    lr=0.05,
    momentum=0.9,
    weight_decay=0.0005,
)

# choose the LR scheduler and the output file names that go with it
if not args['warm_restart']:
    # default run: step the LR down by `gamma` at fixed epoch milestones
    print('[INFO]: Using default Multi Step LR scheduler')
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[60, 120, 160], gamma=0.2)
    loss_plot_name = 'loss'
    train_loss_list = 'train_loss'
    val_loss_list = 'val_loss'
else:
    # warm-restart run: cosine annealing configured from the CLI options
    print('[INFO]: Initializing Cosine Annealing with Warm Restart Scheduler')
    steps, mult = args['t_zero'], args['t_mult']
    print(f"[INFO]: Number of epochs for first restart: {steps}")
    print(f"[INFO]: Multiplicative factor: {mult}")
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=steps, T_mult=mult, verbose=True)
    # file names carry the schedule settings so runs do not overwrite each other
    loss_plot_name = f"wr_loss_s{steps}_m{mult}"
    train_loss_list = f"wr_train_loss_s{steps}_m{mult}"
    val_loss_list = f"wr_val_loss_s{steps}_m{mult}"

# training
def train(model, trainloader, optimizer, criterion, scheduler, epoch):
    """Run one training epoch over ``trainloader``.

    Reads the module-level ``args`` and ``device``. When warm restarts are
    enabled the scheduler is stepped after every batch with a fractional
    epoch value; otherwise the scheduler is not stepped here.

    Args:
        model: network to optimize; switched to train mode.
        trainloader: iterable of ``(image, labels)`` batches supporting ``len``.
        optimizer: optimizer updating ``model``'s parameters.
        criterion: loss called as ``criterion(outputs, labels)``.
        scheduler: LR scheduler; only used when warm restarts are enabled.
        epoch: zero-based index of the current epoch.

    Returns:
        ``(lrs, epoch_loss)``: the learning rate read before each batch
        (empty list without warm restarts) and the summed batch loss divided
        by the number of batches.
    """
    model.train()
    print('Training')
    warm_restart = args['warm_restart']
    num_batches = len(trainloader)
    # learning rates recorded during this epoch (one per batch)
    lr_history = []
    loss_total = 0.0
    batches_done = 0
    for batch_idx, batch in tqdm(enumerate(trainloader), total=num_batches):
        batches_done = batch_idx + 1
        if warm_restart:
            current_lr = scheduler.get_last_lr()
            lr_history.append(current_lr[0])
            # print the LR after each 500 iterations
            if batches_done % 500 == 0:
                print(f"[INFO]: LR at iteration {batches_done}: {current_lr}")

        image, labels = batch
        image, labels = image.to(device), labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(image), labels)
        loss.backward()
        optimizer.step()
        # with warm restarts the LR is updated after each batch iteration
        if warm_restart:
            scheduler.step(epoch + batch_idx / num_batches)

        loss_total += loss.item()

    return lr_history, loss_total / batches_done

# validation
def validate(model, testloader, criterion):
    """Evaluate ``model`` on ``testloader`` and return the mean batch loss.

    Reads the module-level ``device``. The model is switched to eval mode
    and the forward passes run without gradient tracking.

    Args:
        model: network to evaluate.
        testloader: iterable of ``(image, labels)`` batches supporting ``len``.
        criterion: loss called as ``criterion(outputs, labels)``.

    Returns:
        The summed batch loss divided by the number of batches.

    Raises:
        ZeroDivisionError: if ``testloader`` yields no batches.
    """
    model.eval()
    print('Validation')
    val_running_loss = 0.0
    counter = 0
    # no backward pass happens here, so skip building the autograd graph
    with torch.no_grad():
        for image, labels in tqdm(testloader, total=len(testloader)):
            counter += 1

            image = image.to(device)
            labels = labels.to(device)

            outputs = model(image)
            loss = criterion(outputs, labels)

            val_running_loss += loss.item()

    epoch_loss = val_running_loss / counter
    return epoch_loss

# start the training: per-epoch losses and per-batch learning rates are
# accumulated here for plotting and serialization afterwards
train_loss = []
val_loss = []
learning_rate_plot = []
for epoch in range(epochs):
    print(f"[INFO]: Epoch {epoch+1} of {epochs}")

    print(f"[INFO]: Current LR [Epoch Begin]: {scheduler.get_last_lr()}")
    lrs, train_epoch_loss = train(
        model, train_dataloader, optimizer, criterion, scheduler, epoch)
    val_epoch_loss = validate(model, val_dataloader, criterion)

    learning_rate_plot += lrs
    train_loss.append(train_epoch_loss)
    val_loss.append(val_epoch_loss)

    # MultiStepLR advances once per epoch; the warm-restart scheduler has
    # already been stepped per batch inside train()
    if not args['warm_restart']:
        scheduler.step() # take default MultiStepLR
    print(f"[INFO]: Current LR [Epoch end]: {scheduler.get_last_lr()}")
    print(f"Training loss: {train_epoch_loss:.3f}")
    print(f"Validation loss: {val_epoch_loss:.3f}")
    print('------------------------------------------------------------')
print('Finished Training')

# make sure the target directory exists so a finished run cannot fail
# at the very end just because 'outputs' is missing
os.makedirs('outputs', exist_ok=True)

# the per-batch LR curve is only recorded for warm-restart runs
if args['warm_restart']:
    plt.figure(figsize=(10, 7))
    plt.plot(learning_rate_plot, color='blue', label='lr')
    plt.xlabel('Iterations')
    plt.ylabel('lr')
    plt.legend()
    plt.savefig(f"outputs/lr_schedule_s{steps}_m{mult}.jpg")

# training vs. validation loss per epoch
plt.figure(figsize=(10, 7))
plt.plot(train_loss, color='orange', label='train loss')
plt.plot(val_loss, color='red', label='validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.savefig(f"outputs/{loss_plot_name}.jpg")

# serialize the loss lists to disk; the file names were already chosen
# per schedule when the scheduler was set up, so no branching is needed
joblib.dump(train_loss, f"outputs/{train_loss_list}.pkl")
joblib.dump(val_loss, f"outputs/{val_loss_list}.pkl")

print('\n\n')

Writing train.py


In [None]:
%%writefile run.sh
# baseline: 200 epochs with the default MultiStepLR scheduler
python train.py --epochs 200

# warm restarts, first restart after 50 epochs
python train.py --epochs 200 --warm-restart --t-zero 50

# warm restarts, first restart after 200 epochs
python train.py --epochs 200 --warm-restart --t-zero 200

Writing run.sh


In [None]:
# run the three training configurations listed in run.sh one after another
!sh run.sh

[INFO]: Computation device: cpu
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz
170499072it [00:11, 14920308.48it/s]                   
Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
[INFO]: 21,289,802 total parameters.
[INFO]: 21,289,802 trainable parameters.
[INFO]: Using default Multi Step LR scheduler
[INFO]: Epoch 1 of 200
[INFO]: Current LR [Epoch Begin]: [0.05]
Training
  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
  6% 25/391 [01:09<16:40,  2.73s/it]Traceback (most recent call last):
  File "train.py", line 160, in <module>
    criterion, scheduler, epoch)
  File "train.py", line 118, in train
    outputs = model(image)
  File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/torchvision/models/resnet.py", line 249, 

IndentationError: ignored