In [2]:
from pdb import set_trace as st

In [3]:
import os
from pathlib import Path

import cv2

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import pretrainedmodels as pm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms

from albumentations import Compose, JpegCompression, CLAHE, RandomRotate90, Transpose, ShiftScaleRotate, \
        Blur, OpticalDistortion, GridDistortion, HueSaturationValue, Flip, VerticalFlip

from kekas import Keker, DataOwner, DataKek
from kekas.transformations import Transformer, to_torch, normalize
from kekas.metrics import accuracy
from kekas.modules import Flatten, AdaptiveConcatPool2d
from kekas.callbacks import Callback, Callbacks, DebuggerCallback

In [4]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Dataset creation

## Downloading

Let's see how to build a classification pipeline with Kekas.
We will finetune a convolutional neural network on Cats and Dogs dataset.

Firstly, download Cats and Dogs dataset from https://www.microsoft.com/en-us/download/details.aspx?id=54765 and unpack it wherever you want.

## Dataframe creation and train/val split

In [5]:
# Let's create a pandas DataFrame to help us with data handling
root_dir = Path("PetImages/")  # path to Cats and Dogs dataset root directory

fpaths = []
labels = []
for d in root_dir.iterdir():
    for f in d.iterdir():
        img = cv2.imread(str(f))  # some files there are corrupted, so add only good ones
        if img is not None:
            labels.append(d.name)
            fpaths.append(str(f))

df = pd.DataFrame(data={"fpath": fpaths, "label": labels})
df.head()

Unnamed: 0,fpath,label
0,PetImages/Dog/3208.jpg,Dog
1,PetImages/Dog/8355.jpg,Dog
2,PetImages/Dog/7032.jpg,Dog
3,PetImages/Dog/872.jpg,Dog
4,PetImages/Dog/1438.jpg,Dog


In [6]:
# split dataset to train and val parts
train_df, val_df = train_test_split(df, test_size=2000)
train_df.shape, val_df.shape

((22946, 2), (2000, 2))

## Augmentations

In [13]:
# create train and val datasets using DataKek class - a pytorch Dataset that uses pandas DataFrame as data source

# at first we need to create a reader function that will define how image will be opened
def reader_fn(i, row):
    # it always gets i and row as parameters
    # where i is an index of dataframe and row is a dataframes row
    image = cv2.imread(row["fpath"])[:,:,::-1]  # BGR -> RGB
    if row["label"] == "Dog":
        label = 0
    else:
        label = 1
    return {"image": image, "label": label}


# Then we should create transformations/augmentations
# We will use awesome https://github.com/albu/albumentations library
def augs(p=0.5):
    return Compose([
        CLAHE(),
        RandomRotate90(),
        Transpose(),
        ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.50, rotate_limit=45, p=.75),
        Blur(blur_limit=3),
        OpticalDistortion(),
        GridDistortion(),
        HueSaturationValue()
    ], p=p)

def get_transforms(dataset_key, size, p):
    # we need to use a Transformer class to apply transformations to DataKeks elements
    # dataset_key is an image key in dict returned by reader_fn
    
    PRE_TFMS = Transformer(dataset_key, lambda x: cv2.resize(x, (size, size)))

    AUGS = Transformer(dataset_key, lambda x: augs()(image=x)["image"])

    NRM_TFMS = transforms.Compose([
        Transformer(dataset_key, to_torch()),
        Transformer(dataset_key, normalize())
    ])
    
    train_tfms = transforms.Compose([PRE_TFMS, AUGS, NRM_TFMS])
    val_tfms = transforms.Compose([PRE_TFMS, NRM_TFMS])  # because we don't want to augment val set yet
    
    return train_tfms, val_tfms

## DataKeks creation

In [22]:
# now let's create DataKeks
train_tfms, val_tfms = get_transforms("image", 224, 0.5)

train_dk = DataKek(df=train_df, reader_fn=reader_fn, transforms=train_tfms)
val_dk = DataKek(df=val_df, reader_fn=reader_fn, transforms=val_tfms)

## DataLoaders

In [23]:
# and DataLoaders
batch_size = 32
workers = 8

train_dl = DataLoader(train_dk, batch_size=batch_size, num_workers=workers, shuffle=True, drop_last=True)
val_dl = DataLoader(val_dk, batch_size=batch_size, num_workers=workers, shuffle=False)

# Model

In [24]:
# create a simple neural network using pretrainedmodels library
# https://github.com/Cadene/pretrained-models.pytorch

class Net(nn.Module):
    def __init__(
            self,
            num_classes: int,
            p: float = 0.5,
            pooling_size: int = 2,
            last_conv_size: int = 2048,
            arch: str = "se_resnext50_32x4d",
            pretrained: str = "imagenet") -> None:
        """A simple model to finetune.
        
        Args:
            num_classes: the number of target classes, the size of the last layer's output
            p: dropout probability
            pooling_size: the size of the result feature map after adaptive pooling layer
            last_conv_size: size of the flatten last backbone conv layer
            arch: the name of the architecture form pretrainedmodels
            pretrained: the mode for pretrained model from pretrainedmodels
        """
        super().__init__()
        net = pm.__dict__[arch](pretrained=pretrained)
        modules = list(net.children())[:-2]  # delete last layers: pooling and linear
        
        # add custom head
        modules += [nn.Sequential(
            # AdaptiveConcatPool2d is a concat of AdaptiveMaxPooling and AdaptiveAveragePooling 
            AdaptiveConcatPool2d(size=pooling_size),
            Flatten(),
            nn.BatchNorm1d(2 * pooling_size * pooling_size * last_conv_size),
            nn.Dropout(p),
            nn.Linear(2 * pooling_size * pooling_size * last_conv_size, num_classes)
        )]
        self.net = nn.Sequential(*modules)

    def forward(self, x):
        logits = self.net(x)
        return logits

# Keker

## Initialization

In [25]:
# the three whales of your pipelane are: the data, the model and the loss (hi, Jeremy)

# the data is represented in Kekas by DataOwner. It is a namedtuple with three fields:
# 'train_dl', 'val_dl', 'test_dl'
# For training process we will need at least two of them, and we can skip 'test_dl' for now
# so we will initialize it with `None` value.
dataowner = DataOwner(train_dl, val_dl, None)

# model is just a pytorch nn.Module, that we created vefore
model = Net(num_classes=2)

# loss or criterion is also a pytorch nn.Module. For multiloss scenarios it can be a list of nn.Modules
# for our simple example let's use the standart cross entopy criterion
criterion = nn.CrossEntropyLoss()

In [26]:
# Also we need to specify, what model will do with each batch of data on each iteration
# We should define a `step_fn` function
# The code below repeats a `keker.default_step_fn` code to provide you with a concept of step function

def step_fn(model: torch.nn.Module,
            batch: torch.Tensor) -> torch.Tensor:
    """Determine what your model will do with your data.

    Args:
        model: the pytorch module to pass input in
        batch: the batch of data from the DataLoader

    Returns:
        The models forward pass results
    """
    
    # you could define here whatever logic you want
    inp = batch["image"]  # here we get an "image" from our dataset
    return model(inp)

In [27]:
# previous preparations was mostly out of scope of Kekas library (except DataKeks creation)
# Now let's dive into kekas a little bit

# firstly, we create a Keker - the core Kekas class, that provides all the keks for your pipeline
keker = Keker(model=model,
              dataowner=dataowner,
              criterion=criterion,
              step_fn=step_fn,                    # previosly defined step function
              target_key="label",                 # remember, we defined it in the reader_fn for DataKek?
              metrics={"acc": accuracy},          # optional, you can not specify any metrics at all
              opt=torch.optim.Adam,               # optimizer class. if note specifiyng, 
                                                  # an SGD is using by default
              opt_params={"weight_decay": 1e-5})  # optimizer kwargs in dict format (optional too)

# Actually, there are a lot of params for kekers, but this out of scope of this example
# you can read about them in Keker's docstring (but who really reads the docs, huh?)

In [28]:
# before the start of the finetuning procedure let's freeeze all the layers except the last one - the head
# the `freeze` method is mostly inspired (or stolen) from fastai
# but you should define a model's attribute to deal with
# for example, our model is actually model.net, so we need to specify the 'net' attr
# also this method does not freezes batchnorm layers by default. To change this set `freeze_bn=True`
keker.unfreeze(model_attr="net")

## Learning Rate Find

In [29]:
# let's find an 'optimal' learning rate with learning rate find procedure
# for details please see the fastai course and this articles:
# https://arxiv.org/abs/1803.09820
# https://sgugger.github.io/how-do-you-find-a-good-learning-rate.html

# NOTE: this is an optional step and you can skip it and use your favorite learning rate

# you MUST specify the logdir to see graphics
# keker will write a tensorboard logs into this folder
# to see them start a tensorboard with `--logdir /path/to/logdir`
# OR you can use keker.plot_kek_lr method (see cell below)
keker.kek_lr(final_lr=0.1, logdir="/path/to/logdir")

Epoch 1/1: 100% 717/717 [02:41<00:00,  4.44it/s, loss=101.3424]
End of LRFinder



## Plot Learning Rate find results

In [None]:
# Zoom in plot to see on which step the loss was still decreasing
# and choose LR from this step
keker.plot_kek_lr(logdir="/path/to/logdir")

## Simple Kek

In [19]:
# Ok, now let's start training!
# It's as simple as:
keker.kek(lr=1e-5, epochs=3)  

Epoch 1/3: 100% 717/717 [02:17<00:00,  6.00it/s, loss=0.1613, val_loss=0.0292, acc=0.9881]
Epoch 2/3: 100% 717/717 [02:16<00:00,  5.35it/s, loss=0.1650, val_loss=0.0292, acc=0.9891]
Epoch 3/3: 100% 717/717 [02:16<00:00,  5.21it/s, loss=0.1229, val_loss=0.0259, acc=0.9906]


## Kek with different optimizer

In [17]:
# SomeKekasUser: Wait, and what if I want to train with the different optimizer?
#
# Me:
keker.kek(lr=1e-5, 
          epochs=1,
          opt=torch.optim.RMSprop,            # optimizer class
          opt_params={"weight_decay": 1e-5})  # optimizer kwargs in dict format (if you want)

# by default, the optimizer specified on Keker initialization is used

Epoch 1/1: 100% 717/717 [02:15<00:00,  5.42it/s, loss=0.1829, val_loss=0.0387, acc=0.9916]


## Kek with scheduler

In [20]:
# SomeKekasUser: OK, and what if I want to use a pytorch scheduler?
#
# Me:
keker.kek(lr=1e-5,
          epochs=2,
          sched=torch.optim.lr_scheduler.StepLR,       # pytorch lr scheduler class
          sched_params={"step_size":1, "gamma": 0.9})  # schedulres kwargas in dict format

# by default, no scheduler is using

Epoch 1/2: 100% 717/717 [02:16<00:00,  6.00it/s, loss=0.1569, val_loss=0.0298, acc=0.9876]
Epoch 2/2: 100% 717/717 [02:17<00:00,  5.98it/s, loss=0.1267, val_loss=0.0272, acc=0.9881]


## Log your keks

In [33]:
# SomeKekasUser: How about logging?
#
# Me:
keker.kek(lr=1e-5,
          epochs=1,
          logdir="/mnt/hdd3_4/belskikh/keks/forplot")

# It will create a `train` and `val` subfolders in logdir, and will write tensorboard logs into them
# to see them start a tensorboard with `--logdir /path/to/logdir`
# OR you can use keker.plot_kek method! (see cell below)

Epoch 1/1: 100% 717/717 [02:47<00:00,  4.43it/s, loss=0.0520, val_loss=0.0258, acc=0.9911]


## Plot your keks

In [None]:
# kekas uses plotly lib and tensorboard logs to plot inside NB
keker.plot_kek(logdir="/path/to/logdir",  # path to logdir with logs to plot
               step="batch",              # (optional) default is "step". another option is "epoch"
                                          # It determines discreteness of ploting
               metrics=["loss",           # (optional) list of metrics names
                        "acc",            # by default ["loss", "lr"] is using
                        "lr"],            # the order of the names determines the order of the plot
                                          # NOTE: names of metrics must match names in metrics dict
                                          # which was specified on Keker init step
               height=1200,               # (optional) height of the total plot 
               width=800)                 # (optional) width of the total plot

## Checkpoints saving

In [20]:
# SomeKekasUser: Also I want to save best checkpoints to later use them for SWA or ensembling!
#                And I want to measure them by custom metric, control their number, specify their name prefix,
#                and control what I need - minimize or maximize metric!
# Me: Here it is:
keker.kek(lr=1e-5,
          epochs=1,
          cp_saver_params={
              "savedir": "/path/to/save/dir",  # a directory for checkpoints
              "metric": "acc",  # (optional) from `metrics` dict on Keker init. 
                                # default is validation loss
              "n_best": 3,      # (optional) default is 3
              "prefix": "kek",  # (optional) default prefix is `checkpoint`
              "mode": "max"     # (optional) default is 'min'
          })   

# It will create a `savedir` directory, and will save best checkpoints there
# with naming `{prefix}.{epoch_num}.h5`. The best checkpoint will be dublicated with `{prefix}.best.h5` name
# look at the report down here

Epoch 1/1: 100% 717/717 [02:16<00:00,  5.75it/s, loss=0.1404, val_loss=0.0356, acc=0.9901]

Checkpoint	acc
/path/to/save/dir/kek.1.h5	0.990079


## Early stopping

In [19]:
# SomeKekasUser: Allright, and I don't want to train model, if validation loss doesn't improve for several epochs.
# 
# Me: You mean, early stopping? Here:
keker.kek(lr=1e-5,
          epochs=1, 
          early_stop_params={
              "patience": 3,   # number of bad epochs to wait before stopping
              "metric": "acc", # (optional) metric name from 'metric' dict. default is val loss
              "mode": "min",   # (optional) what you want from you metric, max or min? default is 'min'
              "min_delta": 0   # (optional) a minimum delta to count an epoch as 'bad'
          })

Epoch 1/1: 100% 717/717 [02:17<00:00,  5.78it/s, loss=0.1577, val_loss=0.0341, acc=0.9901]


## Just do it

In [23]:
# SomeAdvancedKekasUser: I WANT IT ALL!
# 
# Me: Well, okay then...
keker.kek(lr=1e-5,
          epochs=5,
          opt=torch.optim.RMSprop,
          opt_params={"weight_decay": 1e-5},
          sched=torch.optim.lr_scheduler.StepLR,
          sched_params={"step_size":1, "gamma": 0.9},
          logdir="/path/to/logdir",
          cp_saver_params={
              "savedir": "/path/to/save/dir",  
              "metric": "acc",  
              "n_best": 3,      
              "prefix": "kek",  
              "mode": "max"},     
          early_stop_params={
              "patience": 3,   
              "metric": "acc", 
              "mode": "min",   
              "min_delta": 0
          })

Epoch 1/5: 100% 717/717 [02:16<00:00,  5.96it/s, loss=0.1697, val_loss=0.0276, acc=0.9926]
Epoch 2/5: 100% 717/717 [02:16<00:00,  5.11it/s, loss=0.1241, val_loss=0.0321, acc=0.9901]
Epoch 3/5: 100% 717/717 [02:16<00:00,  5.97it/s, loss=0.1576, val_loss=0.0338, acc=0.9901]
Epoch 4/5: 100% 717/717 [02:16<00:00,  5.99it/s, loss=0.1733, val_loss=0.0300, acc=0.9906]
Epoch 5/5: 100% 717/717 [02:17<00:00,  5.94it/s, loss=0.1397, val_loss=0.0310, acc=0.9916]

Checkpoint	acc
/path/to/save/dir/kek.1.h5	0.992560
/path/to/save/dir/kek.5.h5	0.991567
/path/to/save/dir/kek.4.h5	0.990575


## One Cycle Kek!

In [24]:
# SomeFastaiFan: Did you stole something else from fastai?
#
# Me: Yes! One Cycle Policy!
keker.kek_one_cycle(max_lr=1e-5,                  # the maximum learning rate
                    cycle_len=5,                  # number of epochs, actually, but not exactly
                    momentum_range=(0.95, 0.85),  # range of momentum changes
                    div_factor=25,                # max_lr / min_lr
                    increase_fraction=0.3)        # the part of cycle when learning rate increases

# If you don't understand these parameters, read this - https://sgugger.github.io/the-1cycle-policy.html
# NOTE: you cannot use schedulers and early stopping with one cycle!
# another options are the same as for `kek` method

Epoch 1/5: 100% 717/717 [02:16<00:00,  5.58it/s, loss=0.1120, val_loss=0.0294, acc=0.9906]
Epoch 2/5: 100% 717/717 [02:17<00:00,  5.94it/s, loss=0.1517, val_loss=0.0305, acc=0.9906]
Epoch 3/5: 100% 717/717 [02:16<00:00,  5.75it/s, loss=0.1507, val_loss=0.0307, acc=0.9901]
Epoch 4/5: 100% 717/717 [02:16<00:00,  5.02it/s, loss=0.1187, val_loss=0.0336, acc=0.9906]
Epoch 5/5: 100% 717/717 [02:16<00:00,  5.03it/s, loss=0.0963, val_loss=0.0270, acc=0.9911]


## Other Keker features

### Freezing / unfreezing

In [26]:
# We've already talk about freezing. But what if I want to unfreeze?
# It has the same interface:
keker.unfreeze(model_attr="net")

# If you want to freeze till some layer:
layer_num = -2
keker.freeze_to(layer_num, model_attr="net")

### Saving / Loading

In [None]:
# saving
keker.save("/path/to/file")

# loading
keker.load("/path/to/file")

### Device and DataParallel

In [29]:
# Keker is using all avialable GPUs by default
# To limit it, use 'CUDA_VISIBLE_DEVICES' environment variable (available in os.environ dict)

# if you want to specify cuda device for your model, specify `device` parameter on Keker initialization

### Inference

In [None]:
# there are 4 (yes, four) ways to get a predictions with keker

# 1st
keker.predict(savepath="/path/to/save/dir")
# it will makes predicts on your 'test_dl' dataloader (remember, we initialized it with 'None'), if it specified,
# and saves models output in numpy.ndarray format to 'savepath'

# 2nd
loader = val_dl
keker.predict_loader(loader=loader, savepath="/path/to/save/dir")
# it will do the same as `predict()` but on any custom loader you want

# 3rd
tensor = torch.zeros(4, 224, 224, 3)
preds = keker.predict_tensor(tensor=tensor, to_numpy=False)
# it will return a predictions of the model in numpy format if `'to_numpy==True', else - torch.Tensor

# 4th
array = np.zeros((4, 224, 224, 3))
preds = keker.predict_array(array=array, to_numpy=False)
# it will do the same as `predict_tensor()` but with np.ndarra as input

### TTA

In [12]:
# I am sure that it is not very convinient way for test time augmentations,
# but here is how you can do it with Kekas

# first, specify several augmentations for TTA
flip_ = Flip(always_apply=True)
vertical_flip_ = VerticalFlip(always_apply=True)
transpose_ = Transpose(always_apply=True)

# second, create the whole augmentations with theese ones inside
def insert_aug(aug, dataset_key="image", size=224):    
    PRE_TFMS = Transformer(dataset_key, lambda x: cv2.resize(x, (size, size)))
    
    AUGS = Transformer(dataset_key, lambda x: aug(image=x)["image"])
    
    NRM_TFMS = transforms.Compose([
        Transformer(dataset_key, to_torch()),
        Transformer(dataset_key, normalize())
    ])
    
    tfm = transforms.Compose([PRE_TFMS, AUGS, NRM_TFMS])
    return tfm


flip = insert_aug(flip_)
vertical_flip = insert_aug(vertical_flip_)
transpose = insert_aug(transpose_)

tta_tfms = {"flip": flip, "v_flip": vertical_flip, "transpose": transpose}

# third, run TTA
keker.TTA(loader=val_dl,                # loader to predict on 
          tfms=tta_tfms,                # list or dict of always applying transforms
          savedir="/path/to/save/dir",  # savedir
          prefix="preds")               # (optional) name prefix. default is 'preds'

# it will saves predicts for each augmentation to savedir with name
#  - {prefix}_{name_from_dict}.npy if tfms is a dict
#  - {prefix}_{index}.npy          if tfms is a list

Predict: 100% 63/63 [00:04<00:00, 15.09it/s]
Predict: 100% 63/63 [00:05<00:00, 14.98it/s]
Predict: 100% 63/63 [00:05<00:00, 15.01it/s]


# Callbacks

## Adding callbacks

In [None]:
# Callbacks is the way in which Kekas customizes its pipeline
# each callback implements six methods, which names tell when it applies
# on_train_begin()
#     on_epoch_begin()
#         on_batch_begin()
#             >>>... step here ...<<<
#         on_batch_end()
#     on_epoch_end()
# on_train_end()

# Callbacks are widely using under the hood of Kekas
# For example - loss, opimizer, progressbar, lr scheduling, checkpoint saving, early stopping etc
# are realized as callbacks

# Callback has access to `state` attr of a keker. Here is a docs from Keker about state:

        # The state is an object that stores many variables and represents
        # the state of your train-val-repdict pipeline. _state passed to every
        # callback call.
        # You can use it as a container for your custom variables, but
        # DO NOT USE the following ones:
        #
        # loss, batch, model, dataowner, criterion, opt, parallel, checkpoint,
        # stop_iter, stop_epoch, stop_train, out, sched, mode, loader, pbar,
        # metrics, epoch_metrics

# You can write your own callback, or use something useful from kekas.callbacks

# Callbacks should be passes as a list at the Keker initiation
# For example, let's use a DebuggerCallback, that just insert a pdb.set_trace() call in pipeline
# For more info, please see a DebuggerCallback docs and source code
debugger = DebuggerCallback(when=["on_epoch_begin"], modes["train"])

keker = Keker(model=model, dataowner=dataowner, criterion=criterion, callbacks=[debugger])

# also there is a method to add a callbacks to existing Keker

keker.add_callbacks([debugger])

## Custom loss and opimizer callbacks

In [None]:
# As was said, loss and optimezer behavior is realiesed as Callbacks.
# If you use some tricky loss or optimizer logic, you can create your own Callback
# and specify it during Keker initialization

# here are the callbacks, that are using by default
class LossCallback(Callback):
    def __init__(self, target_key: str, preds_key: str) -> None:
        # target_key and preds_key are the parameters of Keker
        self.target_key = target_key
        self.preds_key = preds_key

    def on_batch_end(self, i: int, state: DotDict) -> None:
        target = state.batch[self.target_key]
        preds = state.out[self.preds_key]

        state.loss = state.criterion(preds, target)

class OptimizerCallback(Callback):
    def on_batch_end(self, i: int, state: DotDict) -> None:
        if state.mode == "train":
            state.opt.zero_grad()
            state.loss.backward()
            state.opt.step()
            
# and here is how you should specify them during Keker initialization
keker = Keker(model=model, 
              dataowner=dataowner,
              criterion=criterion,
              loss_cb=LossCallback,
              opt_cb=OptimizerCallback)

# Notes

I hope you now got an idea how to use Kekas.

I will be happy to get feedback about my library and this tutorial.

You can find me in [OpenDataScience](http://ods.ai) community by @belskikh nikname or create an issue on GitHub.

Have a good keks!