# Model training
This use case covers model training.  
After going through it you will know how to use Cascade for metadata tracking, hyperparameter tuning and model selection.  
  
The first part builds the data pipeline and is given without comments.  
For a more detailed description of it, see the Pipeline building example.

In [1]:
import cascade.data as cdd
import cascade.models as cdm
import cascade.utils as cdu
import cascade.meta as cde

from tqdm import tqdm
import torch
import torchvision
from torchvision.transforms import functional as F
from torch import nn
from sklearn.metrics import accuracy_score

In [2]:
# Show the version of the installed Cascade package
import cascade
cascade.__version__

'0.5.2'

## Defining data pipeline
This part will be without comments 

In [3]:
MNIST_ROOT = 'data'  # directory passed as `root` to the torchvision MNIST datasets
INPUT_SIZE = 784  # length every image is flattened to before it enters the network
BATCH_SIZE = 10  # batch size of both the train and the test DataLoader

In [4]:
class NoiseModifier(cdd.Modifier):
    """Modifier that adds uniform random noise to every image it serves.

    Items of the wrapped dataset are unpacked as ``(img, label)`` pairs.
    Assumes ``img`` is a floating point tensor - TODO confirm for datasets
    other than the one used in this notebook.
    """

    def __getitem__(self, index):
        """Return ``(img, label)`` at ``index`` with noise added to ``img``.

        The noise is ``torch.rand_like(img) * 0.1`` and the result is clipped
        to the range from 0 to 255.
        """
        img, label = self._dataset[index]
        # Add the noise out-of-place: an in-place `+=` would also modify the
        # tensor object handed out by the wrapped dataset, so a dataset that
        # keeps its tensors in memory would accumulate noise on every access.
        img = img + torch.rand_like(img) * 0.1
        # NOTE(review): the upper bound of 255 looks like it targets 8-bit
        # pixel values, while the pipeline in this notebook feeds images
        # through `to_tensor` (presumably scaled to [0, 1]) - confirm the
        # intended bound. Kept as is to preserve the produced data.
        img = torch.clip(img, 0, 255)
        return img, label


# Each split is assembled in a single expression:
# torchvision MNIST -> cdd.Wrapper -> NoiseModifier
train_ds = NoiseModifier(
    cdd.Wrapper(
        torchvision.datasets.MNIST(
            root=MNIST_ROOT,
            train=True,
            transform=F.to_tensor,
            download=True,
        ),
        # Free-form description that shows up in the wrapper's metadata
        meta_prefix={
            'desc': 'This is MNIST dataset of handwritten images, TRAIN PART'
        },
    )
)
test_ds = NoiseModifier(
    cdd.Wrapper(
        torchvision.datasets.MNIST(
            root=MNIST_ROOT,
            train=False,
            transform=F.to_tensor,
        )
    )
)

# Both loaders share the batch size; only the training batches are shuffled
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

In [5]:
# Metadata of the whole pipeline: one entry per stage, outermost stage first
train_ds.get_meta()

[{'name': '__main__.NoiseModifier', 'type': 'dataset', 'len': 60000},
 {'name': 'cascade.data.dataset.Wrapper',
  'desc': 'This is MNIST dataset of handwritten images, TRAIN PART',
  'type': 'dataset',
  'len': 60000,
  'obj_type': torchvision.datasets.mnist.MNIST}]

## Model definition
Before training we need to define our model. We need a regular `nn.Module` and Cascade's wrapper around it.  
  
The module is defined without any specific changes to the original PyTorch code, except that it now accepts `*args` and `**kwargs` in `__init__`.

In [6]:
class SimpleNN(nn.Module):
    """Fully connected network with one hidden layer and a ReLU activation.

    Any extra positional or keyword arguments given to the constructor are
    accepted and ignored.
    """

    def __init__(self, input_size, hidden_size, num_classes, *args, **kwargs):
        super().__init__()

        # The sizes stay available as plain attributes of the module
        self.input_size, self.hidden_size = input_size, hidden_size

        # input -> hidden -> class scores
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, y):
        """Return the output of the second linear layer for the batch ``y``."""
        hidden = self.relu(self.l1(y))
        return self.l2(hidden)

Next, Cascade's wrapper is defined. Most of the interaction with PyTorch modules is already implemented in `cascade.utils.TorchModel`, so we only need to define how to train and evaluate the model.

In [7]:
class Classifier(cdu.TorchModel):
    """Cascade wrapper that trains and evaluates the wrapped torch module.

    The module is reached through ``self._model`` and has to expose an
    ``input_size`` attribute, which is used to flatten every batch.
    """

    # In train we copy-paste regular pytorch trainloop,
    # but use self._model, where our SimpleNN is placed
    def fit(self, train_dl, num_epochs, lr, *args, **kwargs):
        """Train the wrapped module with Adam and cross-entropy loss.

        Parameters
        ----------
        train_dl:
            Iterable with a length that yields ``(imgs, labels)`` batches.
        num_epochs:
            Number of passes over ``train_dl``.
        lr:
            Learning rate given to the Adam optimizer.
        *args, **kwargs:
            Accepted and ignored, so that a whole dict of hyperparameters
            can be unpacked into the call.
        """
        criterion = nn.CrossEntropyLoss()
        optim = torch.optim.Adam(self._model.parameters(), lr=lr)

        # len() of the loader is the number of batches in one epoch
        num_steps = len(train_dl)
        for epoch in range(num_epochs):
            for step, (imgs, labels) in enumerate(train_dl):
                # Flatten every image into a vector of input_size values
                imgs = imgs.reshape(-1, self._model.input_size)

                out = self._model(imgs)
                loss = criterion(out, labels)

                optim.zero_grad()
                loss.backward()
                optim.step()

                # Report the loss of the current batch every 500 steps
                if step % 500 == 0:
                    print (f'Epochs [{epoch}/{num_epochs}], Step[{step}/{num_steps}], Loss: {loss.item():.4f}')

    # Evaluate function takes the metrics from arguments
    # and populates self.metrics without returning anything
    def evaluate(self, test_dl, metrics_dict, *args, **kwargs):
        """Compute metrics on ``test_dl`` and store them in ``self.metrics``.

        Parameters
        ----------
        test_dl:
            Iterable that yields ``(imgs, labels)`` batches.
        metrics_dict:
            Mapping from metric name to a callable ``f(true, pred) -> value``.
            Each result is written to ``self.metrics`` under its name.
        *args, **kwargs:
            Forwarded to every call of the wrapped module.
        """
        pred = []
        gt = []

        # Inference needs no gradients and should run in eval mode. The mode
        # the module was in before is restored afterwards, so calling
        # evaluate() between training runs does not change later training.
        was_training = self._model.training
        self._model.eval()
        try:
            with torch.no_grad():
                for imgs, labels in tqdm(test_dl):
                    imgs = imgs.reshape(-1, self._model.input_size)
                    # The predicted class is the index of the largest output
                    out = torch.argmax(self._model(imgs, *args, **kwargs), -1)

                    pred.append(out)
                    gt.append(labels)
        finally:
            self._model.train(was_training)

        pred = torch.concat(pred).detach().numpy()
        gt = torch.concat(gt).detach().numpy()

        for metric_name, metric in metrics_dict.items():
            self.metrics[metric_name] = metric(gt, pred)

## Model training
Now we are ready to train our model. We define hyperparameters and pass them to our wrapper. Wrapper accepts pytorch module's class and all the parameters that are needed to initialize it.  
Additionally we pass keyword arguments that are connected to training. It is done to add them to the model's metadata.

In [8]:
NUM_EPOCHS = 2
LR = 1e-3

# Everything is handed to Classifier as keywords so that it gets recorded in
# the model's metadata. SimpleNN uses the first three entries; the training
# settings after them are skipped by SimpleNN and are only recorded.
model_params = {
    'input_size': INPUT_SIZE,
    'hidden_size': 100,
    'num_classes': 10,
    'num_epochs': NUM_EPOCHS,
    'lr': LR,
    'bs': BATCH_SIZE,
}
model = Classifier(SimpleNN, **model_params)
model.fit(train_dl, num_epochs=NUM_EPOCHS, lr=LR)

Epochs [0/2], Step[0/6000], Loss: 2.3076
Epochs [0/2], Step[500/6000], Loss: 0.2654
Epochs [0/2], Step[1000/6000], Loss: 0.1230
Epochs [0/2], Step[1500/6000], Loss: 0.4073
Epochs [0/2], Step[2000/6000], Loss: 0.3144
Epochs [0/2], Step[2500/6000], Loss: 0.0254
Epochs [0/2], Step[3000/6000], Loss: 0.0905
Epochs [0/2], Step[3500/6000], Loss: 0.3965
Epochs [0/2], Step[4000/6000], Loss: 0.0122
Epochs [0/2], Step[4500/6000], Loss: 0.0081
Epochs [0/2], Step[5000/6000], Loss: 0.0256
Epochs [0/2], Step[5500/6000], Loss: 0.0409
Epochs [1/2], Step[0/6000], Loss: 0.0815
Epochs [1/2], Step[500/6000], Loss: 0.0398
Epochs [1/2], Step[1000/6000], Loss: 0.3893
Epochs [1/2], Step[1500/6000], Loss: 0.0126
Epochs [1/2], Step[2000/6000], Loss: 0.6683
Epochs [1/2], Step[2500/6000], Loss: 0.0035
Epochs [1/2], Step[3000/6000], Loss: 0.1270
Epochs [1/2], Step[3500/6000], Loss: 0.0544
Epochs [1/2], Step[4000/6000], Loss: 0.1283
Epochs [1/2], Step[4500/6000], Loss: 0.0062
Epochs [1/2], Step[5000/6000], Loss: 0.0

## Evaluate the model
Now we can evaluate model performance on test dataset. We pass the data and the dictionary with one metric. We can pass as many metrics as we like in the form of functions accepting ground-truth and predictions and returning metric value.  
`f(true, pred) -> metric`

In [9]:
# Computes accuracy on the test data and stores it in model.metrics under 'acc'
model.evaluate(test_dl, {'acc': accuracy_score})

100%|██████████| 1000/1000 [00:14<00:00, 67.24it/s]


## Check the metadata
Let's examine metadata obtained from the model after training.

In [10]:
# Show what the model has recorded about itself so far
model.get_meta()

[{'name': '<__main__.Classifier object at 0x000002BCE393FE20>',
  'created_at': DateTime(2022, 7, 29, 8, 37, 49, 24581, tzinfo=Timezone('UTC')),
  'metrics': {'acc': 0.9662},
  'params': {'input_size': 784,
   'hidden_size': 100,
   'num_classes': 10,
   'num_epochs': 2,
   'lr': 0.001,
   'bs': 10},
  'type': 'model'}]

We can notice several things. The model tracks its time of creation. Its metrics are in place, as expected after evaluation.  
Let's look at the params dict. We can see all the parameters that we passed using keywords in the wrapper. The wrapper recorded them in the metadata for us automatically.

## Saving the model
It's time to save the trained model. We can just use model.save() method, but let's look at another Cascade's tool for model management.  
It is called `ModelRepo`.

In [11]:
# Model repository rooted at the ./repo folder
repo = cdm.ModelRepo('./repo')

This is the repository of models. It manages a series of experiments over sets of models of different architectures, called model lines.

In [12]:
# Register a line named 'linear_nn' for models of class Classifier
repo.add_line('linear_nn', Classifier)

ModelLine of 0 models of <class '__main__.Classifier'>

Model line is the manager of models with similar architecture, but different parameters or different epochs. It manages saving of model and its meta and also loading of model.

Aside from the model's metadata, we would like to know what data the model was trained on. This is exactly what the metadata of our previously written data pipeline is for.  
Let's update model's meta with the field for train data.

In [13]:
# Attach the data pipeline's metadata under the 'train_data' key, then show the result
model.update_meta({'train_data': train_ds.get_meta()})
model.get_meta()

[{'name': '<__main__.Classifier object at 0x000002BCE393FE20>',
  'train_data': [{'name': '__main__.NoiseModifier',
    'type': 'dataset',
    'len': 60000},
   {'name': 'cascade.data.dataset.Wrapper',
    'desc': 'This is MNIST dataset of handwritten images, TRAIN PART',
    'type': 'dataset',
    'len': 60000,
    'obj_type': torchvision.datasets.mnist.MNIST}],
  'created_at': DateTime(2022, 7, 29, 8, 37, 49, 24581, tzinfo=Timezone('UTC')),
  'metrics': {'acc': 0.9662},
  'params': {'input_size': 784,
   'hidden_size': 100,
   'num_classes': 10,
   'num_epochs': 2,
   'lr': 0.001,
   'bs': 10},
  'type': 'model'}]

Saving the model is as easy as this:

In [14]:
# Save the model together with its metadata through the 'linear_nn' line
repo['linear_nn'].save(model)

This will save the model to the path:  
`repo/linear_nn/00000/model`  
And metadata:  
`repo/linear_nn/00000/meta.json`

## Peeking inside repo
To see a model's metrics and parameters we don't need to go to the folders mentioned manually or print large metadata to the console. Cascade has tools for conveniently showing metrics. One of them is `MetricViewer`.

In [15]:
# Build a viewer over the repo and show its table of metrics
mv = cde.MetricViewer(repo)
mv.plot_table()

It accepts the repo object and can show tables of metrics and metadata. However, when the table is too big and we need a more powerful tool, `mv` also has the method `serve`, which opens a fully interactive table of metrics with the ability to sort and filter results.

## More experiments
What if we want to automatically run a number of experiments and then choose the best model?  
The workflow is pretty similar. In the example below we try to find the best option for hidden size of the model.  
We define the set of parameters for our experiments and run them in a loop, saving the results each time.

In [16]:
# One configuration per hidden size under test; the training settings stay fixed
params = [
    {'hidden_size': hidden_size, 'num_epochs': 2, 'lr': 0.001, 'bs': 10}
    for hidden_size in (100, 10, 1000)
]

In [17]:
for hparams in params:
    # The configuration is unpacked first so that its keys lead the recorded
    # parameters; input size and number of classes are the same for every run
    model = Classifier(SimpleNN, **hparams, input_size=INPUT_SIZE, num_classes=10)

    # fit() takes num_epochs and lr from the configuration and ignores the
    # remaining keys. train_dl was built with BATCH_SIZE, so the 'bs' entry
    # is only recorded and does not change the batches used here.
    model.fit(train_dl, **hparams)
    model.evaluate(test_dl, {'acc': accuracy_score})

    # Every trained model is saved through the same line of the repo
    repo['linear_nn'].save(model)

Epochs [0/2], Step[0/6000], Loss: 2.3107
Epochs [0/2], Step[500/6000], Loss: 1.1580
Epochs [0/2], Step[1000/6000], Loss: 0.7968
Epochs [0/2], Step[1500/6000], Loss: 0.0881
Epochs [0/2], Step[2000/6000], Loss: 0.3919
Epochs [0/2], Step[2500/6000], Loss: 0.6627
Epochs [0/2], Step[3000/6000], Loss: 0.0382
Epochs [0/2], Step[3500/6000], Loss: 0.1631
Epochs [0/2], Step[4000/6000], Loss: 0.0262
Epochs [0/2], Step[4500/6000], Loss: 0.2148
Epochs [0/2], Step[5000/6000], Loss: 0.2660
Epochs [0/2], Step[5500/6000], Loss: 0.0791
Epochs [1/2], Step[0/6000], Loss: 0.1174
Epochs [1/2], Step[500/6000], Loss: 0.0548
Epochs [1/2], Step[1000/6000], Loss: 0.0363
Epochs [1/2], Step[1500/6000], Loss: 0.0782
Epochs [1/2], Step[2000/6000], Loss: 0.0050
Epochs [1/2], Step[2500/6000], Loss: 0.0113
Epochs [1/2], Step[3000/6000], Loss: 0.4701
Epochs [1/2], Step[3500/6000], Loss: 0.1373
Epochs [1/2], Step[4000/6000], Loss: 0.0108
Epochs [1/2], Step[4500/6000], Loss: 0.0241
Epochs [1/2], Step[5000/6000], Loss: 0.1

100%|██████████| 1000/1000 [00:19<00:00, 50.99it/s]


Epochs [0/2], Step[0/6000], Loss: 2.1775
Epochs [0/2], Step[500/6000], Loss: 0.3399
Epochs [0/2], Step[1000/6000], Loss: 0.6470
Epochs [0/2], Step[1500/6000], Loss: 0.5015
Epochs [0/2], Step[2000/6000], Loss: 0.9082
Epochs [0/2], Step[2500/6000], Loss: 0.4990
Epochs [0/2], Step[3000/6000], Loss: 0.3705
Epochs [0/2], Step[3500/6000], Loss: 0.3698
Epochs [0/2], Step[4000/6000], Loss: 0.1470
Epochs [0/2], Step[4500/6000], Loss: 1.2374
Epochs [0/2], Step[5000/6000], Loss: 0.4824
Epochs [0/2], Step[5500/6000], Loss: 1.3306
Epochs [1/2], Step[0/6000], Loss: 0.1617
Epochs [1/2], Step[500/6000], Loss: 0.3862
Epochs [1/2], Step[1000/6000], Loss: 0.1317
Epochs [1/2], Step[1500/6000], Loss: 0.2154
Epochs [1/2], Step[2000/6000], Loss: 0.2449
Epochs [1/2], Step[2500/6000], Loss: 0.1037
Epochs [1/2], Step[3000/6000], Loss: 0.4395
Epochs [1/2], Step[3500/6000], Loss: 0.0350
Epochs [1/2], Step[4000/6000], Loss: 0.0545
Epochs [1/2], Step[4500/6000], Loss: 0.5686
Epochs [1/2], Step[5000/6000], Loss: 0.5

100%|██████████| 1000/1000 [00:13<00:00, 71.91it/s]


Epochs [0/2], Step[0/6000], Loss: 2.2944
Epochs [0/2], Step[500/6000], Loss: 0.0815
Epochs [0/2], Step[1000/6000], Loss: 0.8061
Epochs [0/2], Step[1500/6000], Loss: 0.3186
Epochs [0/2], Step[2000/6000], Loss: 0.0326
Epochs [0/2], Step[2500/6000], Loss: 0.1426
Epochs [0/2], Step[3000/6000], Loss: 0.1647
Epochs [0/2], Step[3500/6000], Loss: 0.0043
Epochs [0/2], Step[4000/6000], Loss: 0.2361
Epochs [0/2], Step[4500/6000], Loss: 0.0097
Epochs [0/2], Step[5000/6000], Loss: 0.1270
Epochs [0/2], Step[5500/6000], Loss: 0.0713
Epochs [1/2], Step[0/6000], Loss: 0.0172
Epochs [1/2], Step[500/6000], Loss: 0.0075
Epochs [1/2], Step[1000/6000], Loss: 0.0198
Epochs [1/2], Step[1500/6000], Loss: 0.4891
Epochs [1/2], Step[2000/6000], Loss: 0.0192
Epochs [1/2], Step[2500/6000], Loss: 0.0451
Epochs [1/2], Step[3000/6000], Loss: 0.0234
Epochs [1/2], Step[3500/6000], Loss: 0.0464
Epochs [1/2], Step[4000/6000], Loss: 0.0059
Epochs [1/2], Step[4500/6000], Loss: 0.0574
Epochs [1/2], Step[5000/6000], Loss: 0.0

100%|██████████| 1000/1000 [00:15<00:00, 62.87it/s]


## Results
We can see the results of our experiments - all of them are present in the table and we can choose the best option.

In [18]:
# Build a fresh viewer over the repo and show the table with the saved experiments
mv = cde.MetricViewer(repo)
mv.plot_table()

## See also:
- [Documentation](https://oxid15.github.io/cascade/)
- [Key concepts](https://oxid15.github.io/cascade/concepts.html)
- [Code reference](https://oxid15.github.io/cascade/modules.html)