# PyTorch Transfer Learning

 1.   Freezing all the layers except the final one
 2.   Freezing the first few layers
 3.   Fine-tuning the entire network.

The following pre-trained models are available on PyTorch

 *   resnet18, resnet34, resnet50, resnet101, resnet152
 *   squeezenet1_0, squeezenet1_1
 *   Alexnet
 *   inception_v3
 *   Densenet121, Densenet169, Densenet201
 *   Vgg11, vgg13, vgg16, vgg19, vgg11_bn. vgg13_bn, vgg16_bn, vgg19_bn

In [1]:
import sys
sys.version

'3.6.7 (default, Oct 22 2018, 11:32:17) \n[GCC 8.2.0]'

In [0]:
# http://pytorch.org/
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision
#import torch

In [3]:
!pip install Pillow
!pip install image



In [0]:
%matplotlib inline

In [0]:
from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy

plt.ion()   # interactive mode

In [6]:
import torch
print(torch.__version__)

0.4.1


In [7]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [8]:
torch.cuda.is_available()

True

### Get Data

In [0]:
DATASET_ZIP_FILE = 'flower_data.zip'

In [0]:
from zipfile import ZipFile
files = os.listdir()
if not DATASET_ZIP_FILE in files:
  !curl --header 'Host: s3.amazonaws.com' --user-agent 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0' --header 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' --header 'Accept-Language: en-US,en;q=0.5' --header 'Upgrade-Insecure-Requests: 1' 'https://s3.amazonaws.com/content.udacity-data.com/courses/nd188/flower_data.zip' --output 'flower_data.zip'  
  '''
  !pip install kaggle
  from google.colab import files
  files.upload()
  !mkdir -p ~/.kaggle
  !cp kaggle.json ~/.kaggle/
  #this permissions change avoids a warnong on Kaggle tool startup
  !chmod 600 ~/.kaggle/kaggle.json  
  !kaggle datasets download -d moltean/fruits
  '''
  !ls
  with ZipFile(DATASET_ZIP_FILE, 'r') as zipF:
    zipF.extractall()
    print('UnZip Done')

In [0]:
data_dir = 'flower_data/'
PATH = data_dir

train_dir = 'train'
val_dir = 'valid'

In [0]:
batch_size = 16

In [13]:
# again, list total number of classes, and list them all
# os.list dir sorting depends on OS dependent file indexing, so leaving it as it is

classes = os.listdir(f'{data_dir}/{train_dir}')
classes.sort()
ClassesNumer = len(classes)
print("Class Total Count: ", ClassesNumer)
#print(classes)


Class Total Count:  102


In [0]:
# Data augmentation and normalization for training
# Just normalization for validation
data_transforms = {
    train_dir: transforms.Compose([
        #transforms.Resize(224),
        transforms.RandomResizedCrop(224),
        #transforms.RandomHorizontalFlip(),
        #transforms.RandomVerticalFlip(),
        #transforms.RandomRotation(degrees=90),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
       # transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ]),
    val_dir: transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        #transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ]),
}

In [0]:

image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),data_transforms[x]) for x in [train_dir, val_dir]}

dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=16, shuffle=True, num_workers=4) for x in [train_dir, val_dir]}

dataset_sizes = {x: len(image_datasets[x]) for x in [train_dir, val_dir]}

class_names = image_datasets[train_dir].classes

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [16]:
print(dataloaders)
print(dataset_sizes)
print(device)

{'train': <torch.utils.data.dataloader.DataLoader object at 0x7f4875174208>, 'valid': <torch.utils.data.dataloader.DataLoader object at 0x7f4870907da0>}
{'train': 6552, 'valid': 818}
cuda:0


In [17]:
print(image_datasets[train_dir])


Dataset ImageFolder
    Number of datapoints: 6552
    Root Location: flower_data/train
    Transforms (if any): Compose(
                             RandomResizedCrop(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=PIL.Image.BILINEAR)
                             ToTensor()
                             Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                         )
    Target Transforms (if any): None


In [18]:
'''
#Let’s visualize a few training images so as to understand the data augmentations.
def imshow(inp, title=None):
    """Imshow for Tensor."""
    inp = inp.numpy().transpose((1, 2, 0))
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    inp = std * inp + mean
    inp = np.clip(inp, 0, 1)
    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    plt.pause(0.001)  # pause a bit so that plots are updated


# Get a batch of training data
inputs, classes = next(iter(dataloaders[train_dir]))

# Make a grid from batch
out = torchvision.utils.make_grid(inputs)

imshow(out, title=[class_names[x] for x in classes])
'''

'\n#Let’s visualize a few training images so as to understand the data augmentations.\ndef imshow(inp, title=None):\n    """Imshow for Tensor."""\n    inp = inp.numpy().transpose((1, 2, 0))\n    mean = np.array([0.485, 0.456, 0.406])\n    std = np.array([0.229, 0.224, 0.225])\n    inp = std * inp + mean\n    inp = np.clip(inp, 0, 1)\n    plt.imshow(inp)\n    if title is not None:\n        plt.title(title)\n    plt.pause(0.001)  # pause a bit so that plots are updated\n\n\n# Get a batch of training data\ninputs, classes = next(iter(dataloaders[train_dir]))\n\n# Make a grid from batch\nout = torchvision.utils.make_grid(inputs)\n\nimshow(out, title=[class_names[x] for x in classes])\n'

###Training the model

Now, let’s write a general function to train a model. Here, we will illustrate:

    Scheduling the learning rate
    Saving the best model

In the following, parameter ''scheduler'' is an LR scheduler object from ''torch.optim.lr_scheduler''.

In [0]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in [train_dir, val_dir]:
            if phase == train_dir:
                scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == train_dir):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == train_dir:
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == val_dir and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [0]:
# Load a pretrained model and reset final fully connected layer
model_ft = models.resnet152(pretrained=True)
num_ftrs = model_ft.fc.in_features
#model_ft.fc = nn.Linear(num_ftrs, 2)
model_ft.fc = nn.Linear(num_ftrs, ClassesNumer)

model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
#optimizer_ft = optim.Adam(model_ft.parameters(),lr=0.001,amsgrad=True)
optimizer_ft = optim.Adagrad(model_ft.parameters(),lr=0.01,lr_decay=0.0001)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=20, gamma=0.1)

In [21]:
#Freeze all layers first
#for param in model_ft.parameters():
#    param.requires_grad = False
    
# Then unfreeze last classification layer only for feature extract
#for param in model_ft.fc.parameters():
#    param.requires_grad = True    

    
# To view which layers are freeze and which layers are not freezed:
for name, child in model_ft.named_children():
  for name_2, params in child.named_parameters():
    print(name_2, params.requires_grad)

weight True
weight True
bias True
0.conv1.weight True
0.bn1.weight True
0.bn1.bias True
0.conv2.weight True
0.bn2.weight True
0.bn2.bias True
0.conv3.weight True
0.bn3.weight True
0.bn3.bias True
0.downsample.0.weight True
0.downsample.1.weight True
0.downsample.1.bias True
1.conv1.weight True
1.bn1.weight True
1.bn1.bias True
1.conv2.weight True
1.bn2.weight True
1.bn2.bias True
1.conv3.weight True
1.bn3.weight True
1.bn3.bias True
2.conv1.weight True
2.bn1.weight True
2.bn1.bias True
2.conv2.weight True
2.bn2.weight True
2.bn2.bias True
2.conv3.weight True
2.bn3.weight True
2.bn3.bias True
0.conv1.weight True
0.bn1.weight True
0.bn1.bias True
0.conv2.weight True
0.bn2.weight True
0.bn2.bias True
0.conv3.weight True
0.bn3.weight True
0.bn3.bias True
0.downsample.0.weight True
0.downsample.1.weight True
0.downsample.1.bias True
1.conv1.weight True
1.bn1.weight True
1.bn1.bias True
1.conv2.weight True
1.bn2.weight True
1.bn2.bias True
1.conv3.weight True
1.bn3.weight True
1.bn3.bias Tru

In [22]:
# Train and evaluate
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=27)

Epoch 0/26
----------
train Loss: 4.1040 Acc: 0.0809
valid Loss: 3.6028 Acc: 0.1174

Epoch 1/26
----------
train Loss: 3.3704 Acc: 0.1725
valid Loss: 3.2219 Acc: 0.2200

Epoch 2/26
----------
train Loss: 2.9138 Acc: 0.2593
valid Loss: 2.6933 Acc: 0.3350

Epoch 3/26
----------
train Loss: 2.6057 Acc: 0.3283
valid Loss: 3.1842 Acc: 0.3802

Epoch 4/26
----------
train Loss: 2.3121 Acc: 0.3898
valid Loss: 2.2296 Acc: 0.4108

Epoch 5/26
----------
train Loss: 2.0517 Acc: 0.4504
valid Loss: 2.0417 Acc: 0.5122

Epoch 6/26
----------
train Loss: 1.8620 Acc: 0.5085
valid Loss: 1.8719 Acc: 0.5587

Epoch 7/26
----------
train Loss: 1.6859 Acc: 0.5504
valid Loss: 1.5880 Acc: 0.6100

Epoch 8/26
----------
train Loss: 1.5707 Acc: 0.5717
valid Loss: 2.3804 Acc: 0.6467

Epoch 9/26
----------
train Loss: 1.4213 Acc: 0.6168
valid Loss: 2.1668 Acc: 0.6369

Epoch 10/26
----------
train Loss: 1.3404 Acc: 0.6377
valid Loss: 1.1928 Acc: 0.7054

Epoch 11/26
----------
train Loss: 1.1803 Acc: 0.6865
valid Loss

In [0]:
# Then unfreeze all layers
#for param in model_ft.parameters():
#    param.requires_grad = True  

In [0]:
# To view which layers are freeze and which layers are not freezed:#
#for name, child in model_ft.named_children():
#  for name_2, params in child.named_parameters():
#    print(name_2, params.requires_grad)

In [0]:
#criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized
#optimizer_ft = optim.Adam(model_ft.parameters(),lr=0.001)
# Decay LR by a factor of 0.1 every 7 epochs
#exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
# Train and evaluate
#model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=25)


In [25]:
"""
# Generic function to display predictions for a few images
def visualize_model(model, num_images=6):
    was_training = model.training
    model.eval()
    images_so_far = 0
    fig = plt.figure()

    with torch.no_grad():
        for i, (inputs, labels) in enumerate(dataloaders[val_dir]):
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)

            for j in range(inputs.size()[0]):
                images_so_far += 1
                ax = plt.subplot(num_images//2, 2, images_so_far)
                ax.axis('off')
                ax.set_title('predicted: {}'.format(class_names[preds[j]]))
                imshow(inputs.cpu().data[j])

                if images_so_far == num_images:
                    model.train(mode=was_training)
                    return
        model.train(mode=was_training)
"""        

"\n# Generic function to display predictions for a few images\ndef visualize_model(model, num_images=6):\n    was_training = model.training\n    model.eval()\n    images_so_far = 0\n    fig = plt.figure()\n\n    with torch.no_grad():\n        for i, (inputs, labels) in enumerate(dataloaders[val_dir]):\n            inputs = inputs.to(device)\n            labels = labels.to(device)\n\n            outputs = model(inputs)\n            _, preds = torch.max(outputs, 1)\n\n            for j in range(inputs.size()[0]):\n                images_so_far += 1\n                ax = plt.subplot(num_images//2, 2, images_so_far)\n                ax.axis('off')\n                ax.set_title('predicted: {}'.format(class_names[preds[j]]))\n                imshow(inputs.cpu().data[j])\n\n                if images_so_far == num_images:\n                    model.train(mode=was_training)\n                    return\n        model.train(mode=was_training)\n"

In [0]:
#visualize_model(model_ft)

ConvNet as fixed feature extractor: Training only the last fully connected layer
----------------------
Here, we need to freeze all the network except the final layer. We need
to set ``requires_grad == False`` to freeze the parameters so that the
gradients are not computed in ``backward()``.

You can read more about this in the documentation
`here <http://pytorch.org/docs/notes/autograd.html#excluding-subgraphs-from-backward>`__.




In [27]:
#Freeze all layers first
for param in model_ft.parameters():
    param.requires_grad = False
    
# Then unfreeze last classification layer only for feature extract
for param in model_ft.fc.parameters():
    param.requires_grad = True    

    
# To view which layers are freeze and which layers are not freezed:
for name, child in model_ft.named_children():
  for name_2, params in child.named_parameters():
    print(name_2, params.requires_grad)

weight False
weight False
bias False
0.conv1.weight False
0.bn1.weight False
0.bn1.bias False
0.conv2.weight False
0.bn2.weight False
0.bn2.bias False
0.conv3.weight False
0.bn3.weight False
0.bn3.bias False
0.downsample.0.weight False
0.downsample.1.weight False
0.downsample.1.bias False
1.conv1.weight False
1.bn1.weight False
1.bn1.bias False
1.conv2.weight False
1.bn2.weight False
1.bn2.bias False
1.conv3.weight False
1.bn3.weight False
1.bn3.bias False
2.conv1.weight False
2.bn1.weight False
2.bn1.bias False
2.conv2.weight False
2.bn2.weight False
2.bn2.bias False
2.conv3.weight False
2.bn3.weight False
2.bn3.bias False
0.conv1.weight False
0.bn1.weight False
0.bn1.bias False
0.conv2.weight False
0.bn2.weight False
0.bn2.bias False
0.conv3.weight False
0.bn3.weight False
0.bn3.bias False
0.downsample.0.weight False
0.downsample.1.weight False
0.downsample.1.bias False
1.conv1.weight False
1.bn1.weight False
1.bn1.bias False
1.conv2.weight False
1.bn2.weight False
1.bn2.bias False
1

In [29]:
criterion = nn.CrossEntropyLoss()
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=3, gamma=0.1)
# Observe that all parameters are being optimized
#optimizer_ft = optim.Adam(model_ft.parameters(),lr=0.001,amsgrad=True)
optimizer_ft = optim.Adagrad(model_ft.parameters(),lr=0.001,lr_decay=0.0001)
#train
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=9)

Epoch 0/8
----------
train Loss: 0.4350 Acc: 0.8777
valid Loss: 0.6525 Acc: 0.8790

Epoch 1/8
----------
train Loss: 0.4259 Acc: 0.8793
valid Loss: 0.7185 Acc: 0.8802

Epoch 2/8
----------
train Loss: 0.4251 Acc: 0.8817
valid Loss: 0.8072 Acc: 0.8863

Epoch 3/8
----------
train Loss: 0.4276 Acc: 0.8837
valid Loss: 0.7578 Acc: 0.8765

Epoch 4/8
----------
train Loss: 0.4335 Acc: 0.8756
valid Loss: 0.6761 Acc: 0.8875

Epoch 5/8
----------
train Loss: 0.4195 Acc: 0.8852
valid Loss: 0.8384 Acc: 0.8619

Epoch 6/8
----------
train Loss: 0.4447 Acc: 0.8797
valid Loss: 0.6950 Acc: 0.8741

Epoch 7/8
----------
train Loss: 0.4483 Acc: 0.8762
valid Loss: 0.5868 Acc: 0.8778

Epoch 8/8
----------
train Loss: 0.4192 Acc: 0.8790
valid Loss: 0.8325 Acc: 0.8704

Training complete in 24m 35s
Best val Acc: 0.887531


In [0]:
#visualize_model(model_conv)

#plt.ioff()
#plt.show()