# Load data

In [1]:
import PIL.Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms, models
import time
import copy

In [2]:
# Display the installed PyTorch version so the run can be reproduced later.
torch.__version__

'1.6.0'

In [3]:
# Check whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [18]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [19]:
# torch.set_default_tensor_type(torch.cuda.FloatTensor)
# Enable cuDNN benchmark mode. Per the PyTorch docs this lets cuDNN benchmark
# several convolution algorithms and keep the fastest one; the transforms below
# crop every image to 224x224, so input shapes stay constant.
torch.backends.cudnn.benchmark=True

## Get CSVs

In [4]:
# Load the training annotations (file name plus the three label levels).
# NOTE(review): the `Unnamed: 0` / `Unnamed: 0.1` columns in the preview look
# like saved index columns — confirm whether they can be dropped.
train_df = pd.read_csv('../data/csvs/train.csv')
# test_df = pd.read_csv('../data/csvs/test.csv')

In [5]:
# Preview the first rows to check the column layout.
train_df.head(5)

Unnamed: 0.1,Unnamed: 0,file,lvl_three,lvl_one,lvl_two
0,0,1_220_F_83683073_O4yJOnarzTjKXuUBAgkAifmiC8d0I...,1,0,3
1,1,20_220_F_5292725_818KTy3xv82nEkNolcs2m37MOV86s...,20,1,5
2,2,20_220_F_47187567_lwYwc9UQtBK5Be6v4P7HNsCc4Hhr...,20,1,5
3,3,1_220_F_38932828_Osns7NBWCq8AhJonYpQArrToDLLhT...,1,0,3
4,4,1_220_F_97168737_y0VWy7kLMby9BO6lHDfpyfNpW9o0S...,1,0,3


# Split data

Scikit-learn makes stratified splitting easy. A helper function splits the data into training and validation sets so that each one can be passed directly to a PyTorch dataset class.

In [6]:
def train_val_split(files, target, test_size, stratify=True, random_state=None):
    """Split file names and labels into training and validation DataFrames.

    Args:
        files (pd.Series): Image file names.
        target (pd.Series): Labels aligned with ``files``.
        test_size (float or int): Size of the validation split, forwarded to
            ``train_test_split``.
        stratify (bool): If True, stratify the split on ``target``.
            Default: True
        random_state (int, optional): Seed forwarded to ``train_test_split``
            so the split can be reproduced. Default: None (unseeded, as before).

    Returns:
        tuple: ``(train_split, val_split)``; each is a two-column DataFrame
        with the ``files`` column first and the ``target`` column second.
        ``ImgDataset`` reads these columns by position, so the order matters.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        files,
        target,
        test_size=test_size,
        # stratify=None is train_test_split's own "no stratification" default.
        stratify=target if stratify else None,
        random_state=random_state,
    )
    train_split = pd.concat([X_train, y_train], axis=1)
    val_split = pd.concat([X_val, y_val], axis=1)
    return train_split, val_split

In [7]:
# Hold out 10% of the rows for validation, stratified on the `lvl_one` label
# (stratify defaults to True).
# NOTE(review): no seed is passed, so the split changes on every run.
train_split, val_split = train_val_split(train_df['file'], train_df['lvl_one'], test_size=0.1)

# Utils

In [8]:
# import numpy as np
# import torch

class EarlyStopping:
    """Signals that training should stop once validation loss stops improving.

    Developed by: https://github.com/Bjarten/early-stopping-pytorch

    This class only keeps the patience counter and sets ``early_stop``; it does
    not write a checkpoint itself. ``verbose``, ``path`` and ``val_loss_min``
    are stored on the instance but are not read by any method here.
    """
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): Number of consecutive non-improving calls after
                            which ``early_stop`` is set.
                            Default: 7
            verbose (bool): Stored only; not used by this class.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Stored only; this class does not save checkpoints.
                            Default: 'checkpoint.pt'
            trace_func (function): Called with the counter message on every
                            non-improving call.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.Inf was removed in NumPy 2.0; np.inf works on every NumPy version.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record one validation loss and update the stopping state.

        Args:
            val_loss (float): Validation loss for the epoch just finished.
            model: Accepted for interface compatibility; not used.
        """
        # Work with a score where higher is better.
        score = -val_loss

        if self.best_score is None:
            # First call: there is nothing to compare against yet.
            self.best_score = score
        elif score < self.best_score + self.delta:
            # Not an improvement of at least `delta`.
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.counter = 0

In [9]:
def train_model(model, dataloader, criterion, optimizer, save_path, num_epochs=25, patience=7):
    """Train ``model`` with a train/val phase per epoch, checkpointing and early stopping.

    After each validation phase the model is checkpointed to ``save_path`` if
    the validation loss is the lowest seen so far. Training stops early once
    ``EarlyStopping`` reports ``patience`` consecutive non-improving epochs.
    When at least one checkpoint was written, its weights are loaded back into
    ``model`` before returning.

    Relies on the notebook-level ``device``, ``save_model`` and ``EarlyStopping``.

    Args:
        model (nn.Module): Model to train; expected to already be on ``device``.
        dataloader (dict): Maps ``'train'`` and ``'val'`` to DataLoaders that
            yield ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over the parameters of ``model``.
        save_path (str): File the best-loss checkpoint is written to.
        num_epochs (int): Maximum number of epochs. Default: 25
        patience (int): Early-stopping patience in epochs. Default: 7

    Returns:
        tuple: ``(model, train_loss_history, train_acc_history,
        val_loss_history, val_acc_history)``. Loss entries are floats and
        accuracy entries are 0-dim tensors, one per completed epoch.
    """
    since = time.time()

    train_loss_history = []
    train_acc_history = []
    val_loss_history = []
    val_acc_history = []

    best_loss = np.inf
    # Tracks whether this call wrote a checkpoint, so a stale or missing file
    # at save_path is never loaded at the end.
    checkpoint_saved = False

    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=patience, verbose=True)

    for epoch in range(num_epochs):

        epoch_time = time.time()

        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloader[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics: weight the batch-mean loss by the batch size so
                # a smaller final batch is averaged correctly.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels)

            # Elapsed time is measured from the start of the epoch, so the
            # figure printed after 'val' includes the training phase as well.
            epoch_time_elapsed = time.time() - epoch_time
            print('Training complete in {:.0f}m {:.0f}s'.format(epoch_time_elapsed // 60, epoch_time_elapsed % 60))

            epoch_loss = running_loss / len(dataloader[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloader[phase].dataset)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            if phase == 'train':
                train_acc_history.append(epoch_acc)
                train_loss_history.append(epoch_loss)
            else:
                # Checkpoint on the lowest validation loss seen so far.
                if epoch_loss < best_loss:
                    best_loss = epoch_loss
                    print('saving model')
                    save_model(save_path, epoch, model, optimizer, epoch_loss, epoch_acc)
                    checkpoint_saved = True
                val_acc_history.append(epoch_acc)
                val_loss_history.append(epoch_loss)
                early_stopping(epoch_loss, model)

        # Leave the epoch loop here, so no header is printed for an epoch
        # that never runs.
        if early_stopping.early_stop:
            print("Early stopping")
            break

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

    # load best model weights
    if checkpoint_saved:
        checkpoint = torch.load(save_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        # These are the metrics of the lowest-validation-loss epoch; its
        # accuracy is not necessarily the highest accuracy observed.
        print('Best val Acc: {:4f}'.format(checkpoint['acc']))
        print('Best val Loss: {:4f}'.format(checkpoint['loss']))
    else:
        # Happens when num_epochs is 0 or the validation loss never compared
        # below infinity (e.g. NaN).
        print('No checkpoint was saved; returning the model with its current weights.')

    return model, train_loss_history, train_acc_history, val_loss_history, val_acc_history

In [10]:
def save_model(path, epoch, model, optimizer, loss, acc):
    """Write a training checkpoint to ``path`` using ``torch.save``.

    The checkpoint is a dict with the keys ``'epoch'``, ``'model_state_dict'``,
    ``'optimizer_state_dict'``, ``'loss'`` and ``'acc'``.

    Args:
        path (str): Destination file.
        epoch (int): Epoch index the checkpoint belongs to.
        model: Object whose ``state_dict()`` is stored.
        optimizer: Object whose ``state_dict()`` is stored.
        loss: Loss value to record alongside the weights.
        acc: Accuracy value to record alongside the weights.
    """
    checkpoint = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        loss=loss,
        acc=acc,
    )
    torch.save(checkpoint, path)

# Dataset & Dataloaders

In [11]:
class ImgDataset(Dataset):
    """Image-classification dataset backed by a DataFrame of file names and labels."""

    def __init__(self, df, root_dir, percent_sample=None, transform=None):
        """
        Args:
            df (pd.DataFrame): Annotations, read by position: column 0 is the
                image file name and column 1 is the label.
            root_dir (string): Directory with all the images.
            percent_sample (float, optional): Fraction in (0, 1] of ``df`` to
                expose through ``__len__``. ``None`` (or any falsy value)
                exposes every row.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.img_df = df
        self.root_dir = root_dir
        self.transform = transform
        self.percent_sample = percent_sample

    def __len__(self):
        """Return the number of samples exposed, honouring ``percent_sample``."""
        if self.percent_sample:
            assert self.percent_sample > 0.0, 'Percentage to sample must be > 0 and <= 1.'
            assert self.percent_sample <= 1.0, 'Percentage to sample must be > 0 and <= 1.'
            return int(np.floor(len(self.img_df) * self.percent_sample))
        return len(self.img_df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``, with ``transform`` applied to the image."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.root_dir,
                                self.img_df.iloc[idx, 0])
        # Close the file handle explicitly instead of leaving it to garbage
        # collection; convert() returns a new in-memory image, so the result
        # stays valid after the file is closed.
        with PIL.Image.open(img_name) as img:
            # Some images are greyscale; converting guarantees 3 channels,
            # which the transforms expect.
            X = img.convert('RGB')
        y = self.img_df.iloc[idx, 1]

        if self.transform:
            X = self.transform(X)

        return X, y

In [12]:
# ImageNet channel means/stds, shared by both pipelines.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Training pipeline: a random resized crop provides the augmentation.
train_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Validation pipeline: a centre crop instead of a random crop.
val_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

In [13]:
# Both splits read their images from the same folder; only the rows and the
# transform pipeline differ.
image_root = '../data/images/train/train'

train_dataset = ImgDataset(train_split, image_root,
                           percent_sample=1, transform=train_transforms)
val_dataset = ImgDataset(val_split, image_root,
                         percent_sample=1, transform=val_transforms)

In [14]:
# Settings common to both loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=128, pin_memory=True, num_workers=4)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)

In [15]:
# train_model looks loaders up by phase name ('train' / 'val').
loaders_dict = dict(train=train_loader, val=val_loader)

# Training

In [16]:
# Size of the model's output layer.
# NOTE(review): assumes `lvl_one` has exactly two classes (only 0 and 1 appear
# in the preview) — confirm against the full CSV.
num_classes = 2
# Upper bound on training epochs; early stopping may end the run sooner.
num_epochs = 10

In [29]:
# Start from a pretrained ResNet-50 and swap its final fully connected layer
# for a new one with `num_classes` outputs.
model_ft = models.resnet50(pretrained=True)
num_ftrs = model_ft.fc.in_features
# The new head is wrapped in nn.Sequential, so its weights are saved under
# `fc.0.*` in the state dict; keep the wrapper to stay compatible with saved checkpoints.
model_ft.fc = nn.Sequential(nn.Linear(num_ftrs, num_classes))
model_ft = model_ft.to(device)
criterion = nn.CrossEntropyLoss().to(device)
# Earlier configuration (plain momentum SGD), kept for comparison:
# optimizer_ft = torch.optim.SGD(model_ft.parameters(), 
#                                lr=0.01, 
#                                momentum=0.9, 
#                                weight_decay=0.0001)
# Every parameter is handed to the optimizer, so the whole network is
# fine-tuned rather than only the new head. This run uses Nesterov momentum.
optimizer_ft = torch.optim.SGD(model_ft.parameters(), 
                               lr=0.01, 
                               momentum=0.9, 
                               weight_decay=0.0001,
                               nesterov=True)

In [30]:
# Release cached GPU memory that PyTorch is holding but no longer using
# (e.g. left over from an earlier run in this kernel) before training starts.
torch.cuda.empty_cache()

In [23]:
# load_model = torch.load()
# model_new = 

In [31]:
# Fine-tune model_ft. train_model writes its best-validation-loss checkpoint to
# 'nesterov_sgd.tar' and returns the model followed by the per-epoch
# train-loss, train-accuracy, val-loss and val-accuracy histories.
# NOTE(review): `train_val` receives the *training loss* history; the name is misleading.
model_ft, \
train_val, train_acc, \
val_loss, val_acc = train_model(model_ft, 
                                loaders_dict, 
                                criterion, 
                                optimizer_ft,
                                'nesterov_sgd.tar',
                                num_epochs)

Epoch 0/9
----------
Training complete in 9m 12s
train Loss: 0.6033 Acc: 0.6768
Training complete in 9m 33s
val Loss: 0.5379 Acc: 0.7199
saving model

Epoch 1/9
----------
Training complete in 9m 12s
train Loss: 0.5495 Acc: 0.7138
Training complete in 9m 32s
val Loss: 0.5268 Acc: 0.7322
saving model

Epoch 2/9
----------
Training complete in 9m 12s
train Loss: 0.5318 Acc: 0.7276
Training complete in 9m 33s
val Loss: 0.4995 Acc: 0.7548
saving model

Epoch 3/9
----------
Training complete in 9m 12s
train Loss: 0.5214 Acc: 0.7361
Training complete in 9m 33s
val Loss: 0.4995 Acc: 0.7532
saving model

Epoch 4/9
----------
Training complete in 9m 12s
train Loss: 0.5105 Acc: 0.7432
Training complete in 9m 33s
val Loss: 0.4945 Acc: 0.7599
saving model

Epoch 5/9
----------
Training complete in 9m 13s
train Loss: 0.5027 Acc: 0.7500
Training complete in 9m 34s
val Loss: 0.4941 Acc: 0.7616
saving model

Epoch 6/9
----------
Training complete in 9m 12s
train Loss: 0.4957 Acc: 0.7537
Training compl

**No flip 128 batch**
```
Epoch 0/29
----------
train Loss: 0.6071 Acc: 0.6711
val Loss: 0.5384 Acc: 0.7260
saving model

Epoch 1/29
----------
train Loss: 0.5551 Acc: 0.7111
val Loss: 0.5088 Acc: 0.7471
saving model

Epoch 2/29
----------
train Loss: 0.5387 Acc: 0.7228
val Loss: 0.5343 Acc: 0.7382
EarlyStopping counter: 1 out of 7

Epoch 3/29
----------
train Loss: 0.5245 Acc: 0.7334
val Loss: 0.5078 Acc: 0.7543
saving model

Epoch 4/29
----------
train Loss: 0.5143 Acc: 0.7396
val Loss: 0.4914 Acc: 0.7600
saving model

Epoch 5/29
----------
train Loss: 0.5056 Acc: 0.7453
val Loss: 0.4941 Acc: 0.7624
EarlyStopping counter: 1 out of 7

Epoch 6/29
----------
train Loss: 0.4989 Acc: 0.7514
val Loss: 0.4850 Acc: 0.7676
saving model

Epoch 7/29
----------
train Loss: 0.4924 Acc: 0.7536
val Loss: 0.4955 Acc: 0.7626
EarlyStopping counter: 1 out of 7

Epoch 8/29
----------
train Loss: 0.4870 Acc: 0.7576
val Loss: 0.4872 Acc: 0.7701
EarlyStopping counter: 2 out of 7

Epoch 9/29
----------
train Loss: 0.4795 Acc: 0.7633
val Loss: 0.5159 Acc: 0.7578
EarlyStopping counter: 3 out of 7

Epoch 10/29
----------
train Loss: 0.4750 Acc: 0.7648
val Loss: 0.4992 Acc: 0.7687
EarlyStopping counter: 4 out of 7

Epoch 11/29
----------
train Loss: 0.4711 Acc: 0.7684
val Loss: 0.5064 Acc: 0.7567
EarlyStopping counter: 5 out of 7

Epoch 12/29
----------
train Loss: 0.4665 Acc: 0.7719
val Loss: 0.4952 Acc: 0.7672
EarlyStopping counter: 6 out of 7

Epoch 13/29
----------
train Loss: 0.4616 Acc: 0.7760
val Loss: 0.4994 Acc: 0.7657
EarlyStopping counter: 7 out of 7
```

**Second round**
```
Epoch 0/29
----------
train Loss: 0.6065 Acc: 0.6713
val Loss: 0.5427 Acc: 0.7226

Epoch 1/29
----------
train Loss: 0.5548 Acc: 0.7116
val Loss: 0.5775 Acc: 0.7160
EarlyStopping counter: 1 out of 7

Epoch 2/29
----------
train Loss: 0.5371 Acc: 0.7252
val Loss: 0.4903 Acc: 0.7586

Epoch 3/29
----------
train Loss: 0.5233 Acc: 0.7335
val Loss: 0.5118 Acc: 0.7479
EarlyStopping counter: 1 out of 7

Epoch 4/29
----------
train Loss: 0.5140 Acc: 0.7407
val Loss: 0.4950 Acc: 0.7586
EarlyStopping counter: 2 out of 7

Epoch 5/29
----------
train Loss: 0.5055 Acc: 0.7456
val Loss: 0.5105 Acc: 0.7555
EarlyStopping counter: 3 out of 7

Epoch 6/29
----------
train Loss: 0.4973 Acc: 0.7521
val Loss: 0.5002 Acc: 0.7593
EarlyStopping counter: 4 out of 7

Epoch 7/29
----------
train Loss: 0.4919 Acc: 0.7550
val Loss: 0.4928 Acc: 0.7637
EarlyStopping counter: 5 out of 7

Epoch 8/29
----------
train Loss: 0.4846 Acc: 0.7603
val Loss: 0.4908 Acc: 0.7631
EarlyStopping counter: 6 out of 7

Epoch 9/29
----------
train Loss: 0.4799 Acc: 0.7636
val Loss: 0.4953 Acc: 0.7636
EarlyStopping counter: 7 out of 7
Early stopping

Epoch 10/29
----------
Early stopping
Training complete in 97m 3s

```

In [None]:
# checkpoint = torch.load('best_model.tar')
# model_ft.load_state_dict(checkpoint['model_state_dict'])
# # optimizer_ft.load_state_dict(checkpoint['optimizer_state_dict'])
# epoch = checkpoint['epoch']
# loss = checkpoint['loss']