In [203]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy

In [204]:
import torch
print(torch.__version__)

1.0.0.dev20181207


In [205]:
torch.cuda.is_available()

True

In [206]:
torch.cuda.device_count()

1

In [207]:
torch.cuda.current_device()

0

In [208]:
torch.cuda.get_device_name(torch.cuda.current_device())

'GeForce GTX 1080 Ti'

In [209]:
os.listdir()

['Flowers_pytorch_sgd_densnet201.ipynb',
 'projects',
 'envML',
 'data',
 'Flowers_pytorch_sgd2.ipynb',
 'Flowers_pytorch_sgd_topScore.ipynb',
 '.ipynb_checkpoints',
 'Untitled.ipynb']

In [210]:
!ls

data				      Flowers_pytorch_sgd_topScore.ipynb
envML				      projects
Flowers_pytorch_sgd2.ipynb	      Untitled.ipynb
Flowers_pytorch_sgd_densnet201.ipynb


In [211]:
data_dir = 'data/flower_data/'
PATH = data_dir

train_dir = 'train'
val_dir = 'valid'

In [212]:
# again, list total number of classes, and list them all
# os.list dir sorting depends on OS dependent file indexing, so leaving it as it is

classes = os.listdir(f'{data_dir}/{train_dir}')
classes.sort()
ClassesNumer = len(classes)
print("Class Total Count: ", ClassesNumer)
#print(classes)

Class Total Count:  102


In [213]:
# Data augmentation and normalization for training
# Just normalization for validation
'''
data_transforms = {
    train_dir: transforms.Compose([
        #transforms.Resize(224),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        #transforms.RandomRotation(degrees=90),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
       # transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ]),
    val_dir: transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        #transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ]),
}
'''

'\ndata_transforms = {\n    train_dir: transforms.Compose([\n        #transforms.Resize(224),\n        transforms.RandomResizedCrop(224),\n        transforms.RandomHorizontalFlip(),\n        transforms.RandomVerticalFlip(),\n        #transforms.RandomRotation(degrees=90),\n        transforms.ToTensor(),\n        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n       # transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n    ]),\n    val_dir: transforms.Compose([\n        transforms.Resize(224),\n        transforms.CenterCrop(224),\n        transforms.ToTensor(),\n        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n        #transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n    ]),\n}\n'

In [214]:
data_transforms = {
    # Train uses data augmentation
    train_dir:
        transforms.Compose([
        transforms.RandomResizedCrop(size=256),
        #transforms.Resize(size=224),
        #transforms.RandomRotation(degrees=90),
        #transforms.RandomRotation(degrees=30),
        transforms.RandomRotation(degrees=45),
        #transforms.ColorJitter(0.4, 0.4, 0.4),
        transforms.RandomHorizontalFlip(),
        #transforms.RandomVerticalFlip(),
        transforms.CenterCrop(size=224),  # Image net standards
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])  # Imagenet standards
    ]),
    # Validation does not use augmentation
    val_dir:
        transforms.Compose([
        #transforms.Resize(size=256),
        transforms.Resize(size=256),
        transforms.CenterCrop(size=224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

In [215]:

image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),data_transforms[x]) for x in [train_dir, val_dir]}

dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=32, shuffle=True, num_workers=8) for x in [train_dir, val_dir]}

dataset_sizes = {x: len(image_datasets[x]) for x in [train_dir, val_dir]}

class_names = image_datasets[train_dir].classes

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [216]:
print(dataloaders)
print(dataset_sizes)
print(device)

{'train': <torch.utils.data.dataloader.DataLoader object at 0x7fd2e4eb9c88>, 'valid': <torch.utils.data.dataloader.DataLoader object at 0x7fd2e4eb9b00>}
{'train': 6551, 'valid': 818}
cuda:0


In [217]:
print(image_datasets[train_dir])

Dataset ImageFolder
    Number of datapoints: 6551
    Root Location: data/flower_data/train
    Transforms (if any): Compose(
                             RandomResizedCrop(size=(256, 256), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=PIL.Image.BILINEAR)
                             RandomRotation(degrees=(-45, 45), resample=False, expand=False)
                             RandomHorizontalFlip(p=0.5)
                             CenterCrop(size=(224, 224))
                             ToTensor()
                             Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                         )
    Target Transforms (if any): None


In [218]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('lr{}, Epoch {}/{}'.format(scheduler.get_lr(),epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in [train_dir, val_dir]:
            if phase == train_dir:
                scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == train_dir):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == train_dir:
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            

            # deep copy the model
            if phase == val_dir and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [219]:
# Load a pretrained model and reset final fully connected layer

model_ft = models.densenet201(pretrained=True)
num_ftrs = model_ft.classifier.in_features
model_ft.classifier = nn.Linear(num_ftrs, ClassesNumer)

model_ft = model_ft.to(device)
criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.Adam(model_ft.parameters(),lr=1e-4,amsgrad=True)
#optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.0001, momentum=0.9)

# Decay LR by a factor of 0.1 every ? epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=3, gamma=0.1)

In [220]:
model_ft

DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplac

In [221]:
## First train just last new layer with small lr

#Freeze all layers first
for param in model_ft.parameters():
    param.requires_grad = False
    
# Then unfreeze last classification layer only for feature extract
for param in model_ft.classifier.parameters():
    param.requires_grad = True    

    
# To view which layers are freeze and which layers are not freezed:
for name, child in model_ft.named_children():
    for name_2, params in child.named_parameters():
        print(name_2, params.requires_grad)

conv0.weight False
norm0.weight False
norm0.bias False
denseblock1.denselayer1.norm1.weight False
denseblock1.denselayer1.norm1.bias False
denseblock1.denselayer1.conv1.weight False
denseblock1.denselayer1.norm2.weight False
denseblock1.denselayer1.norm2.bias False
denseblock1.denselayer1.conv2.weight False
denseblock1.denselayer2.norm1.weight False
denseblock1.denselayer2.norm1.bias False
denseblock1.denselayer2.conv1.weight False
denseblock1.denselayer2.norm2.weight False
denseblock1.denselayer2.norm2.bias False
denseblock1.denselayer2.conv2.weight False
denseblock1.denselayer3.norm1.weight False
denseblock1.denselayer3.norm1.bias False
denseblock1.denselayer3.conv1.weight False
denseblock1.denselayer3.norm2.weight False
denseblock1.denselayer3.norm2.bias False
denseblock1.denselayer3.conv2.weight False
denseblock1.denselayer4.norm1.weight False
denseblock1.denselayer4.norm1.bias False
denseblock1.denselayer4.conv1.weight False
denseblock1.denselayer4.norm2.weight False
denseblock1.d

In [222]:
# Train and evaluate
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=6)

lr[0.001], Epoch 0/5
----------
train Loss: 4.2480 Acc: 0.0878
valid Loss: 3.9291 Acc: 0.1724

lr[0.0001], Epoch 1/5
----------
train Loss: 3.6547 Acc: 0.2613
valid Loss: 3.4208 Acc: 0.3802

lr[0.0001], Epoch 2/5
----------
train Loss: 3.1816 Acc: 0.4242
valid Loss: 2.9671 Acc: 0.4780

lr[0.0001], Epoch 3/5
----------
train Loss: 2.9184 Acc: 0.4921
valid Loss: 2.9279 Acc: 0.4902

lr[1e-05], Epoch 4/5
----------
train Loss: 2.8774 Acc: 0.5030
valid Loss: 2.9105 Acc: 0.4927

lr[1e-05], Epoch 5/5
----------
train Loss: 2.8379 Acc: 0.5131
valid Loss: 2.8409 Acc: 0.5024

Training complete in 2m 14s
Best val Acc: 0.502445


In [223]:
# On second round, train whole network, but only weights not bias
'''
# Freeze Bias but not Weights
for name, child in model_ft.named_children():
  for name_2, params in child.named_parameters():
    if(str(name_2).find('bias') == -1): # weight
        params.requires_grad = True
        #print(name_2, params.requires_grad)
    else:
        params.requires_grad = False
        
#last bias, requires_grad=True
# Then unfreeze last classification layer only for feature extract
for param in model_ft.classifier.parameters():
    param.requires_grad = True
'''    

#UnFreeze all layers first
for param in model_ft.parameters():
    param.requires_grad = True 
    
# To view which layers are freeze and which layers are not freezed:
for name, child in model_ft.named_children():
  for name_2, params in child.named_parameters():
    print(name_2, params.requires_grad)
    

conv0.weight True
norm0.weight True
norm0.bias True
denseblock1.denselayer1.norm1.weight True
denseblock1.denselayer1.norm1.bias True
denseblock1.denselayer1.conv1.weight True
denseblock1.denselayer1.norm2.weight True
denseblock1.denselayer1.norm2.bias True
denseblock1.denselayer1.conv2.weight True
denseblock1.denselayer2.norm1.weight True
denseblock1.denselayer2.norm1.bias True
denseblock1.denselayer2.conv1.weight True
denseblock1.denselayer2.norm2.weight True
denseblock1.denselayer2.norm2.bias True
denseblock1.denselayer2.conv2.weight True
denseblock1.denselayer3.norm1.weight True
denseblock1.denselayer3.norm1.bias True
denseblock1.denselayer3.conv1.weight True
denseblock1.denselayer3.norm2.weight True
denseblock1.denselayer3.norm2.bias True
denseblock1.denselayer3.conv2.weight True
denseblock1.denselayer4.norm1.weight True
denseblock1.denselayer4.norm1.bias True
denseblock1.denselayer4.conv1.weight True
denseblock1.denselayer4.norm2.weight True
denseblock1.denselayer4.norm2.bias Tru

In [224]:
# Find total parameters and trainable parameters
total_params = sum(p.numel() for p in model_ft.parameters())
print(f'{total_params:,} total parameters.')
total_trainable_params = sum(
    p.numel() for p in model_ft.parameters() if p.requires_grad)
print(f'{total_trainable_params:,} training parameters.')

18,288,870 total parameters.
18,288,870 training parameters.


In [225]:
#optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.8)
#optimizer_ft = optim.Adam(model_ft.parameters(),lr=1e-4,amsgrad=True)
optimizer_ft = optim.Adam(model_ft.parameters())

# Decay LR by a factor of 0.1 every ? epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=5, gamma=0.1)

In [226]:
# Train and evaluate
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=15)

lr[0.01], Epoch 0/14
----------
train Loss: 1.7092 Acc: 0.5688
valid Loss: 0.8446 Acc: 0.7812

lr[0.001], Epoch 1/14
----------
train Loss: 0.9428 Acc: 0.7471
valid Loss: 0.6185 Acc: 0.8386

lr[0.001], Epoch 2/14
----------
train Loss: 0.7467 Acc: 0.7941
valid Loss: 0.4864 Acc: 0.8619

lr[0.001], Epoch 3/14
----------
train Loss: 0.6374 Acc: 0.8235
valid Loss: 0.4094 Acc: 0.9034

lr[0.001], Epoch 4/14
----------
train Loss: 0.5626 Acc: 0.8446
valid Loss: 0.3166 Acc: 0.9193

lr[0.001], Epoch 5/14
----------
train Loss: 0.3193 Acc: 0.9150
valid Loss: 0.1614 Acc: 0.9645

lr[0.0001], Epoch 6/14
----------
train Loss: 0.2505 Acc: 0.9331
valid Loss: 0.1536 Acc: 0.9743

lr[0.0001], Epoch 7/14
----------
train Loss: 0.2205 Acc: 0.9447
valid Loss: 0.1449 Acc: 0.9694

lr[0.0001], Epoch 8/14
----------
train Loss: 0.2166 Acc: 0.9415
valid Loss: 0.1333 Acc: 0.9743

lr[0.0001], Epoch 9/14
----------
train Loss: 0.1834 Acc: 0.9527
valid Loss: 0.1308 Acc: 0.9645

lr[0.0001], Epoch 10/14
----------
tr

In [227]:
#Freeze all layers first
for param in model_ft.parameters():
    param.requires_grad = False
    
# Then unfreeze last classification layer only for feature extract
for param in model_ft.classifier.parameters():
    param.requires_grad = True    

    
# To view which layers are freeze and which layers are not freezed:
for name, child in model_ft.named_children():
  for name_2, params in child.named_parameters():
    print(name_2, params.requires_grad)


conv0.weight False
norm0.weight False
norm0.bias False
denseblock1.denselayer1.norm1.weight False
denseblock1.denselayer1.norm1.bias False
denseblock1.denselayer1.conv1.weight False
denseblock1.denselayer1.norm2.weight False
denseblock1.denselayer1.norm2.bias False
denseblock1.denselayer1.conv2.weight False
denseblock1.denselayer2.norm1.weight False
denseblock1.denselayer2.norm1.bias False
denseblock1.denselayer2.conv1.weight False
denseblock1.denselayer2.norm2.weight False
denseblock1.denselayer2.norm2.bias False
denseblock1.denselayer2.conv2.weight False
denseblock1.denselayer3.norm1.weight False
denseblock1.denselayer3.norm1.bias False
denseblock1.denselayer3.conv1.weight False
denseblock1.denselayer3.norm2.weight False
denseblock1.denselayer3.norm2.bias False
denseblock1.denselayer3.conv2.weight False
denseblock1.denselayer4.norm1.weight False
denseblock1.denselayer4.norm1.bias False
denseblock1.denselayer4.conv1.weight False
denseblock1.denselayer4.norm2.weight False
denseblock1.d

In [228]:
# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=1e-5, momentum=0.4)
# Decay LR by a factor of 0.1 every ? epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=20, gamma=0.1)
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=100)

lr[0.0001], Epoch 0/99
----------
train Loss: 0.2195 Acc: 0.9437
valid Loss: 0.1502 Acc: 0.9743

lr[1e-05], Epoch 1/99
----------
train Loss: 0.2207 Acc: 0.9435
valid Loss: 0.1460 Acc: 0.9743

lr[1e-05], Epoch 2/99
----------
train Loss: 0.2390 Acc: 0.9391
valid Loss: 0.1470 Acc: 0.9743

lr[1e-05], Epoch 3/99
----------
train Loss: 0.2065 Acc: 0.9457
valid Loss: 0.1514 Acc: 0.9694

lr[1e-05], Epoch 4/99
----------
train Loss: 0.2270 Acc: 0.9402
valid Loss: 0.1482 Acc: 0.9707

lr[1e-05], Epoch 5/99
----------
train Loss: 0.2234 Acc: 0.9415
valid Loss: 0.1530 Acc: 0.9731

lr[1e-05], Epoch 6/99
----------
train Loss: 0.2296 Acc: 0.9379
valid Loss: 0.1499 Acc: 0.9694

lr[1e-05], Epoch 7/99
----------
train Loss: 0.2195 Acc: 0.9420
valid Loss: 0.1497 Acc: 0.9707

lr[1e-05], Epoch 8/99
----------
train Loss: 0.2147 Acc: 0.9444
valid Loss: 0.1503 Acc: 0.9719

lr[1e-05], Epoch 9/99
----------
train Loss: 0.2201 Acc: 0.9417
valid Loss: 0.1491 Acc: 0.9743

lr[1e-05], Epoch 10/99
----------
train

train Loss: 0.2363 Acc: 0.9399
valid Loss: 0.1480 Acc: 0.9756

lr[1.0000000000000004e-08], Epoch 76/99
----------
train Loss: 0.2257 Acc: 0.9425
valid Loss: 0.1471 Acc: 0.9719

lr[1.0000000000000004e-08], Epoch 77/99
----------
train Loss: 0.2161 Acc: 0.9423
valid Loss: 0.1535 Acc: 0.9682

lr[1.0000000000000004e-08], Epoch 78/99
----------
train Loss: 0.2121 Acc: 0.9461
valid Loss: 0.1479 Acc: 0.9694

lr[1.0000000000000004e-08], Epoch 79/99
----------
train Loss: 0.2027 Acc: 0.9470
valid Loss: 0.1527 Acc: 0.9743

lr[1.0000000000000004e-08], Epoch 80/99
----------
train Loss: 0.2084 Acc: 0.9481
valid Loss: 0.1443 Acc: 0.9768

lr[1.0000000000000003e-09], Epoch 81/99
----------
train Loss: 0.2183 Acc: 0.9426
valid Loss: 0.1483 Acc: 0.9731

lr[1.0000000000000003e-09], Epoch 82/99
----------
train Loss: 0.2240 Acc: 0.9399
valid Loss: 0.1484 Acc: 0.9694

lr[1.0000000000000003e-09], Epoch 83/99
----------
train Loss: 0.2171 Acc: 0.9428
valid Loss: 0.1536 Acc: 0.9731

lr[1.0000000000000003e-09

In [229]:
#UnFreeze all layers first
#for param in model_ft.parameters():
#    param.requires_grad = True 