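### Note: Do not run both training cells one after another in the same session. `nvidia-smi` may still report the GPU memory held by the previous run, which can make it look as though gradient checkpointing consumes the same amount of memory. Restart the runtime (or otherwise free the GPU memory) before starting the second run.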

In [1]:
!pip install nvidia-ml-py3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!git clone https://github.com/laxmimerit/dog-cat-full-dataset.git

fatal: destination path 'dog-cat-full-dataset' already exists and is not an empty directory.


In [1]:
#All library imports.
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import cv2
import nvidia_smi
import copy
from PIL import Image
from torch.utils.data import Dataset,DataLoader
import torch.utils.checkpoint as checkpoint
from tqdm import tqdm
import shutil
from torch.utils.checkpoint import checkpoint_sequential
# Device string used by the training/evaluation functions below when moving
# batches and the model: "cuda" when PyTorch can see a GPU, otherwise "cpu".
device="cuda" if torch.cuda.is_available() else "cpu"
%matplotlib inline
import random

# Initialise NVML once up front; the training functions below query the memory
# usage of GPU 0 through nvidia_smi after every epoch.
nvidia_smi.nvmlInit()


In [2]:
#Datasets and dataloaders for the dog/cat images (expected under /content).

# Per-channel mean / std normalisation shared by the training and validation pipelines.
normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

# Training pipeline: light augmentation (rotation, flip, near-full random crop
# resized to 224x224) followed by tensor conversion and normalisation.
train_transform = transforms.Compose([
    transforms.RandomRotation(30),
    transforms.RandomHorizontalFlip(),
    transforms.RandomResizedCrop(224, scale=(0.96, 1.0), ratio=(0.95, 1.05)),
    transforms.ToTensor(),
    normalize,
])

# Validation pipeline: deterministic resize to 224x224, no augmentation.
val_transform = transforms.Compose([
    transforms.Resize([224, 224]),
    transforms.ToTensor(),
    normalize,
])

train_dataset = datasets.ImageFolder(
    root="/content/dog-cat-full-dataset/data/train",
    transform=train_transform,
)
val_dataset = datasets.ImageFolder(
    root="/content/dog-cat-full-dataset/data/test",
    transform=val_transform,
)

# Both loaders use the same settings (the validation loader shuffles as well).
loader_options = dict(batch_size=64, shuffle=True, num_workers=2)

train_dataloader = DataLoader(train_dataset, **loader_options)
val_dataloader = DataLoader(val_dataset, **loader_options)

In [3]:
def train_with_grad_checkpointing(model,loss_func,optimizer,train_dataloader,val_dataloader,epochs=10,segments=2):
    """Train ``model`` with gradient (activation) checkpointing and log GPU memory.

    The forward pass is executed through ``checkpoint_sequential`` over the
    model's top-level sub-modules, split into ``segments`` chunks. After every
    epoch the weights are saved under ``checkpoints/``, the model is evaluated
    on both dataloaders with ``test_model`` and the memory in use on GPU 0 (as
    reported by NVML) is printed.

    Args:
        model: Model whose top-level sub-modules are applied one after another
            (e.g. an ``nn.Sequential``); it is trained in place.
        loss_func: Criterion called as ``loss_func(output, target)`` for the
            training step.
        optimizer: Optimizer holding the parameters of ``model``.
        train_dataloader: Iterable of ``(images, target)`` training batches.
        val_dataloader: Iterable of ``(images, target)`` validation batches.
        epochs: Number of passes over ``train_dataloader``.
        segments: Number of chunks handed to ``checkpoint_sequential``
            (defaults to 2, the previously hard-coded value).

    Returns:
        The trained ``model`` (the same object that was passed in), so callers
        may write ``model = train_with_grad_checkpointing(...)``.

    Notes:
        * ``test_model`` is called in its two-argument form, so the reported
          losses come from the notebook-level ``loss_func``, which is not
          necessarily the ``loss_func`` argument given here.
        * Weights go to ``checkpoints/`` while ``train_model`` writes to
          ``grad_checkpoints/``; the directory names look swapped relative to
          what each run does. They are kept as-is so existing files are found.
    """
    # Top-level sub-modules in registration (= execution) order. The list does
    # not change during training, so it is built once instead of once per batch.
    modules = list(model._modules.values())

    # Fail fast, before spending an epoch on training, if the output directory
    # cannot be created or GPU 0 cannot be queried.
    os.makedirs('checkpoints', exist_ok=True)
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)

    #Training loop.
    for epoch in range(epochs):
        # test_model() switches to eval mode, so training mode is restored every epoch.
        model.train()
        for images, target in tqdm(train_dataloader):
            images, target = images.to(device), target.to(device)
            # The reentrant checkpoint implementation only back-propagates through
            # a checkpointed segment when at least one of its inputs requires grad
            # (see the torch.utils.checkpoint documentation).
            images.requires_grad=True
            optimizer.zero_grad()
            # Activations inside checkpointed segments are recomputed during
            # backward instead of being stored.
            output = checkpoint_sequential(modules, segments, images)
            loss = loss_func(output, target)
            loss.backward()
            optimizer.step()

        torch.save(model.state_dict(), 'checkpoints/epoch_'+str(epoch)+'.pt')

        #Test the model on training and validation data.
        train_acc,train_loss=test_model(model,train_dataloader)
        val_acc,val_loss=test_model(model,val_dataloader)

        #Check memory: bytes in use on GPU 0, converted to MB.
        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        memory_used=(info.used/1024)/1024

        print(f"Epoch={epoch} Train Accuracy={train_acc} Train loss={train_loss} Validation accuracy={val_acc} Validation loss={val_loss} Memory used={memory_used} MB")

    return model


def train_model(model,loss_func,optimizer,train_dataloader,val_dataloader,epochs=10):
    """Train ``model`` with a plain forward/backward pass and log GPU memory.

    This is the baseline without gradient checkpointing. After every epoch the
    weights are saved under ``grad_checkpoints/``, the model is evaluated on
    both dataloaders with ``test_model`` and the memory in use on GPU 0 (as
    reported by NVML) is printed.

    Args:
        model: Model to train; it is updated in place.
        loss_func: Criterion called as ``loss_func(output, target)`` for the
            training step.
        optimizer: Optimizer holding the parameters of ``model``.
        train_dataloader: Iterable of ``(images, target)`` training batches.
        val_dataloader: Iterable of ``(images, target)`` validation batches.
        epochs: Number of passes over ``train_dataloader``.

    Returns:
        The trained ``model`` (the same object that was passed in), so callers
        may write ``model = train_model(...)``.

    Notes:
        * ``test_model`` is called in its two-argument form, so the reported
          losses come from the notebook-level ``loss_func``, which is not
          necessarily the ``loss_func`` argument given here.
        * Weights go to ``grad_checkpoints/`` although this run does not use
          gradient checkpointing (``train_with_grad_checkpointing`` writes to
          ``checkpoints/``). The name is kept so existing files are found.
    """
    # Fail fast, before spending an epoch on training, if the output directory
    # cannot be created or GPU 0 cannot be queried.
    os.makedirs('grad_checkpoints', exist_ok=True)
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)

    #Training loop.
    for epoch in range(epochs):
        # test_model() switches to eval mode, so training mode is restored every epoch.
        model.train()
        for images, target in tqdm(train_dataloader):
            images, target = images.to(device), target.to(device)
            # NOTE(review): input gradients are not needed for a plain training
            # step; presumably kept so the baseline does the same work as the
            # checkpointed variant - confirm before removing.
            images.requires_grad=True
            optimizer.zero_grad()
            output = model(images)
            loss = loss_func(output, target)
            loss.backward()
            optimizer.step()

        torch.save(model.state_dict(), 'grad_checkpoints/epoch_'+str(epoch)+'.pt')

        #Test the model on training and validation data.
        train_acc,train_loss=test_model(model,train_dataloader)
        val_acc,val_loss=test_model(model,val_dataloader)

        #Check memory usage: bytes in use on GPU 0, converted to MB.
        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        memory_used=(info.used/1024)/1024

        print(f"Epoch={epoch} Train Accuracy={train_acc} Train loss={train_loss} Validation accuracy={val_acc} Validation loss={val_loss} Memory used={memory_used} MB")

    return model



def test_model(model,val_dataloader):
  model.eval()
  test_loss = 0
  correct = 0
  with torch.no_grad():
      for images, target in val_dataloader:
          images, target = images.to(device), target.to(device)
          output = model(images)
          test_loss += loss_func(output, target).data.item()
          _, predicted = torch.max(output, 1)
          correct += (predicted == target).sum().item()
  
  test_loss /= len(val_dataloader.dataset)

  return int(correct / len(val_dataloader.dataset) * 100),test_loss

    


In [None]:
#Baseline run: define the model, loss and optimizer and train WITHOUT gradient checkpointing.

# Seed before the layers are created so the weight initialisation is repeatable.
torch.manual_seed(0)

# Learning rate actually used by the optimizer below (the former `lr = 0.003`
# was never read; SGD had 0.001 hard-coded).
lr = 0.001
num_epochs=10

# torchvision VGG16 without its last top-level child (the stock classifier head),
# followed by an explicit Flatten and a fresh 3-layer classifier with 2 outputs.
# Building the model as a flat nn.Sequential keeps the layers in execution order.
vgg16=models.vgg16()
vgg_layers_list=list(vgg16.children())[:-1] + [
    nn.Flatten(),
    nn.Linear(25088,4096),
    nn.ReLU(),
    nn.Dropout(0.5,inplace=False),
    nn.Linear(4096,4096),
    nn.ReLU(),
    nn.Dropout(0.5,inplace=False),
    nn.Linear(4096,2),
]
model = nn.Sequential(*vgg_layers_list)
model=model.to(device)

#Loss
loss_func = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.SGD(params=model.parameters(), lr=lr, momentum=0.9)

#Fitting the model.
# The model is trained in place, so the return value is deliberately not assigned
# back to `model` (doing so would discard the model whenever the training
# function returns nothing).
train_model(model, loss_func, optimizer,
            train_dataloader,val_dataloader,num_epochs)



100%|██████████| 313/313 [04:47<00:00,  1.09it/s]


Epoch=0 Train Accuracy=58 Train loss=0.010537974041700364 Validation accuracy=60 Validation loss=0.010591725468635559 Memory used=14222.125 MB


100%|██████████| 313/313 [05:02<00:00,  1.03it/s]


Epoch=1 Train Accuracy=56 Train loss=0.010523867833614349 Validation accuracy=59 Validation loss=0.01050753378868103 Memory used=14222.125 MB


100%|██████████| 313/313 [05:04<00:00,  1.03it/s]


Epoch=2 Train Accuracy=63 Train loss=0.009880178460478783 Validation accuracy=60 Validation loss=0.010314488422870636 Memory used=14222.125 MB


100%|██████████| 313/313 [05:01<00:00,  1.04it/s]


Epoch=3 Train Accuracy=69 Train loss=0.009104096573591233 Validation accuracy=67 Validation loss=0.009390122538805008 Memory used=14222.125 MB


100%|██████████| 313/313 [05:01<00:00,  1.04it/s]


Epoch=4 Train Accuracy=72 Train loss=0.008648773723840713 Validation accuracy=70 Validation loss=0.008878046506643296 Memory used=14222.125 MB


100%|██████████| 313/313 [05:02<00:00,  1.03it/s]


Epoch=5 Train Accuracy=66 Train loss=0.009526893058419228 Validation accuracy=62 Validation loss=0.01051809189915657 Memory used=14222.125 MB


100%|██████████| 313/313 [05:01<00:00,  1.04it/s]


Epoch=6 Train Accuracy=74 Train loss=0.008127348774671554 Validation accuracy=75 Validation loss=0.00794955143928528 Memory used=14222.125 MB


100%|██████████| 313/313 [05:02<00:00,  1.04it/s]


Epoch=7 Train Accuracy=75 Train loss=0.007835089820623397 Validation accuracy=73 Validation loss=0.008243390429019929 Memory used=14222.125 MB


100%|██████████| 313/313 [05:03<00:00,  1.03it/s]


Epoch=8 Train Accuracy=74 Train loss=0.008096014150977134 Validation accuracy=76 Validation loss=0.00776103823184967 Memory used=14222.125 MB


100%|██████████| 313/313 [05:10<00:00,  1.01it/s]


Epoch=9 Train Accuracy=79 Train loss=0.006945362535119057 Validation accuracy=79 Validation loss=0.007098762637376786 Memory used=14222.125 MB


In [4]:
#Checkpointed run: define the model, loss and optimizer and train WITH gradient checkpointing.

# Seed before the layers are created so the weight initialisation is repeatable.
torch.manual_seed(0)

# Learning rate actually used by the optimizer below (the former `lr = 0.003`
# was never read; SGD had 0.001 hard-coded).
lr = 0.001
num_epochs=10

# torchvision VGG16 without its last top-level child (the stock classifier head),
# followed by an explicit Flatten and a fresh 3-layer classifier with 2 outputs.
# The flat nn.Sequential layout matters here: train_with_grad_checkpointing
# splits the top-level sub-modules of `model` into checkpointed segments.
vgg16=models.vgg16()
vgg_layers_list=list(vgg16.children())[:-1] + [
    nn.Flatten(),
    nn.Linear(25088,4096),
    nn.ReLU(),
    nn.Dropout(0.5,inplace=False),
    nn.Linear(4096,4096),
    nn.ReLU(),
    nn.Dropout(0.5,inplace=False),
    nn.Linear(4096,2),
]
model = nn.Sequential(*vgg_layers_list)
model=model.to(device)

#Loss
loss_func = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.SGD(params=model.parameters(), lr=lr, momentum=0.9)

#Fitting the model.
# The model is trained in place, so the return value is deliberately not assigned
# back to `model` (doing so would discard the model whenever the training
# function returns nothing).
train_with_grad_checkpointing(model, loss_func, optimizer,
                              train_dataloader,val_dataloader,num_epochs)



100%|██████████| 313/313 [06:44<00:00,  1.29s/it]


Epoch=0 Train Accuracy=59 Train loss=0.010533255109190941 Validation accuracy=61 Validation loss=0.010588853204250336 Memory used=10550.125 MB


100%|██████████| 313/313 [06:41<00:00,  1.28s/it]


Epoch=1 Train Accuracy=60 Train loss=0.010360798668861389 Validation accuracy=63 Validation loss=0.010376701986789704 Memory used=10550.125 MB


100%|██████████| 313/313 [06:43<00:00,  1.29s/it]


Epoch=2 Train Accuracy=63 Train loss=0.00989786814749241 Validation accuracy=60 Validation loss=0.010341400074958802 Memory used=10550.125 MB


100%|██████████| 313/313 [06:43<00:00,  1.29s/it]


Epoch=3 Train Accuracy=70 Train loss=0.009069903796911239 Validation accuracy=67 Validation loss=0.009372828048467636 Memory used=10550.125 MB


100%|██████████| 313/313 [06:43<00:00,  1.29s/it]


Epoch=4 Train Accuracy=72 Train loss=0.008637868346273898 Validation accuracy=71 Validation loss=0.00886544327735901 Memory used=10550.125 MB


100%|██████████| 313/313 [06:44<00:00,  1.29s/it]


Epoch=5 Train Accuracy=68 Train loss=0.009268309929966927 Validation accuracy=63 Validation loss=0.010140820240974426 Memory used=10550.125 MB


100%|██████████| 313/313 [06:45<00:00,  1.29s/it]


Epoch=6 Train Accuracy=74 Train loss=0.008132873253524303 Validation accuracy=76 Validation loss=0.007947488868236543 Memory used=10550.125 MB


100%|██████████| 313/313 [06:45<00:00,  1.30s/it]


Epoch=7 Train Accuracy=75 Train loss=0.00785558657348156 Validation accuracy=74 Validation loss=0.008125495082139968 Memory used=10550.125 MB


100%|██████████| 313/313 [06:45<00:00,  1.30s/it]


Epoch=8 Train Accuracy=74 Train loss=0.007979773138463497 Validation accuracy=76 Validation loss=0.0076255324840545655 Memory used=10550.125 MB


100%|██████████| 313/313 [06:45<00:00,  1.30s/it]


Epoch=9 Train Accuracy=79 Train loss=0.00688817176669836 Validation accuracy=79 Validation loss=0.00707613160610199 Memory used=10550.125 MB
