# Model training
This use case covers model training.  
After going through it you will know how to use Cascade for metadata tracking, hyperparameter tuning and model selection.  
  
The previous part, pipeline building, is reused here without comments.  
For a more detailed description, see the Pipeline building example.

In [1]:
#!pip3 install torchvision

In [2]:
import cascade.data as cdd
import cascade.models as cdm
import cascade.meta as cde
from cascade.utils.torch_model import TorchModel

from tqdm import tqdm
import torch
import torchvision
from torchvision.transforms import functional as F
from torch import nn
from sklearn.metrics import accuracy_score

In [3]:
import cascade
cascade.__version__

'0.10.0'

## Defining data pipeline
This part will be without comments 

In [4]:
# Directory passed as `root` to torchvision.datasets.MNIST below.
MNIST_ROOT = 'data'
# Length of the flattened image vector fed to the network
# (presumably 28 * 28 MNIST pixels -- confirm against the dataset).
INPUT_SIZE = 784
# Batch size shared by the train and test DataLoaders.
BATCH_SIZE = 10

In [5]:
class NoiseModifier(cdd.Modifier):
    """Dataset modifier that adds uniform random noise to every image.

    Items of the wrapped dataset are unpacked as ``(img, label)`` pairs;
    ``img`` is expected to be a floating point tensor (required by
    ``torch.rand_like``).
    """

    def __getitem__(self, index):
        """Return the item at ``index`` with noise added to its image.

        The noise comes from ``torch.rand_like`` scaled by 0.1 and the
        result is clipped to ``[0, 255]``. The label is returned unchanged.
        """
        img, label = self._dataset[index]
        # Build a new tensor instead of using `+=`: an in-place add would
        # modify the tensor owned by the wrapped dataset, so noise could
        # accumulate if that dataset hands out the same tensor again.
        noisy = img + torch.rand_like(img) * 0.1
        noisy = torch.clip(noisy, 0, 255)
        return noisy, label


# Raw MNIST splits; every image is converted to a tensor by F.to_tensor.
# Only the train split requests a download.
train_ds = torchvision.datasets.MNIST(root=MNIST_ROOT,
                                     train=True, 
                                     transform=F.to_tensor,
                                     download=True)
test_ds = torchvision.datasets.MNIST(root=MNIST_ROOT, 
                                    train=False, 
                                    transform=F.to_tensor)

# Wrap the torchvision datasets into Cascade datasets. The train part also
# gets a human-readable description passed through `meta_prefix`.
train_ds = cdd.Wrapper(train_ds, 
    meta_prefix={
        'desc': 'This is MNIST dataset of handwritten images, TRAIN PART'
    })
test_ds = cdd.Wrapper(test_ds)

# Add random noise on top of both splits.
train_ds = NoiseModifier(train_ds)
test_ds = NoiseModifier(test_ds)

# Batch the data; only the training loader is shuffled.
train_dl = torch.utils.data.DataLoader(dataset=train_ds, 
                                       batch_size=BATCH_SIZE,
                                       shuffle=True)
test_dl = torch.utils.data.DataLoader(dataset=test_ds,
                                      batch_size=BATCH_SIZE,
                                      shuffle=False)

In [6]:
train_ds.get_meta()

[{'name': '__main__.NoiseModifier', 'type': 'dataset', 'len': 60000},
 {'name': 'cascade.data.dataset.Wrapper',
  'desc': 'This is MNIST dataset of handwritten images, TRAIN PART',
  'type': 'dataset',
  'len': 60000,
  'obj_type': "<class 'torchvision.datasets.mnist.MNIST'>"}]

## Model definition
Before training we need to define our model. We need a regular `nn.Module` and Cascade's wrapper around it.  
  
The module is defined without any specific changes to the original PyTorch code, except that it now accepts `*args` and `**kwargs` in `__init__`.

In [7]:
class SimpleNN(nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear."""

    def __init__(self, input_size, hidden_size, num_classes, *args, **kwargs):
        """Create the layers.

        Args:
            input_size: Number of features in one input vector.
            hidden_size: Number of units in the hidden layer.
            num_classes: Number of output values per input.
            *args, **kwargs: Accepted and ignored, so that extra
                arguments can be passed without breaking construction.
        """
        super().__init__()

        # Both sizes stay available as plain attributes after construction.
        self.input_size, self.hidden_size = input_size, hidden_size

        # Submodules are registered in this order: l1, l2, relu.
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, y):
        """Apply l1, ReLU and l2 to ``y`` and return the result."""
        hidden = self.relu(self.l1(y))
        return self.l2(hidden)

Next, Cascade's wrapper is defined. Most of the interaction with PyTorch modules is already implemented in `cascade.utils.torch_model.TorchModel`, so we only need to define how to train and evaluate the model.

In [8]:
class Classifier(TorchModel):
    """Cascade wrapper that trains and evaluates the wrapped module.

    The wrapped ``nn.Module`` is reached through ``self._model``. It must
    expose an ``input_size`` attribute, which is used to flatten batches.
    """

    def fit(self, train_dl, num_epochs, lr, *args, **kwargs):
        """Train ``self._model`` with a regular PyTorch training loop.

        Uses Adam and cross-entropy loss and prints the loss of every
        500th step.

        Args:
            train_dl: Iterable of ``(imgs, labels)`` batches with a length.
            num_epochs: Number of passes over ``train_dl``.
            lr: Learning rate given to Adam.
            *args, **kwargs: Accepted and ignored, so that a whole
                parameter dict can be passed with ``**``.
        """
        criterion = nn.CrossEntropyLoss()
        optim = torch.optim.Adam(self._model.parameters(), lr=lr)

        steps_per_epoch = len(train_dl)
        for epoch in range(num_epochs):
            for step, (imgs, labels) in enumerate(train_dl):
                # Flatten every image into a vector of input_size values.
                imgs = imgs.reshape(-1, self._model.input_size)

                out = self._model(imgs)
                loss = criterion(out, labels)

                optim.zero_grad()
                loss.backward()
                optim.step()

                if step % 500 == 0:
                    print(f'Epochs [{epoch}/{num_epochs}], Step[{step}/{steps_per_epoch}], Loss: {loss.item():.4f}')

    def evaluate(self, test_dl, metrics_dict, *args, **kwargs):
        """Compute metrics on ``test_dl`` and store them in ``self.metrics``.

        Nothing is returned. The predicted class is the argmax over the
        last dimension of the model output.

        Args:
            test_dl: Iterable of ``(imgs, labels)`` batches.
            metrics_dict: Maps a metric name to a function called as
                ``f(true, pred)`` that returns the metric value.
            *args, **kwargs: Forwarded to the model call.
        """
        pred = []
        gt = []
        # Inference only: without no_grad every forward pass would build
        # an autograd graph that is never used.
        with torch.no_grad():
            for imgs, labels in tqdm(test_dl):
                imgs = imgs.reshape(-1, self._model.input_size)
                out = torch.argmax(self._model(imgs, *args, **kwargs), -1)

                pred.append(out)
                gt.append(labels)

        pred = torch.concat(pred).detach().numpy()
        gt = torch.concat(gt).detach().numpy()

        for metric_name, metric_fn in metrics_dict.items():
            self.metrics[metric_name] = metric_fn(gt, pred)

## Model training
Now we are ready to train our model. We define hyperparameters and pass them to our wrapper. The wrapper accepts the PyTorch module's class and all the parameters that are needed to initialize it.  
Additionally, we pass keyword arguments related to training so that they are added to the model's metadata.

In [9]:
# Training hyperparameters, used both for fit() and for the metadata record.
NUM_EPOCHS = 2
LR = 1e-3

# Classifier initializes SimpleNN with all the parameters passed here.
# Some of them are not used by SimpleNN and are given only so that
# they are recorded in the model's metadata.
model = Classifier(SimpleNN,
    # These arguments are needed by SimpleNN, 
    # but are passed as keywords so that they are recorded in meta
    input_size=INPUT_SIZE,
    hidden_size=100,
    num_classes=10,
    # These arguments are ignored by SimpleNN (it accepts **kwargs),
    # but will be added to meta
    num_epochs=NUM_EPOCHS,
    lr=LR,
    bs=BATCH_SIZE)
# Train on the noisy MNIST train loader.
model.fit(train_dl, NUM_EPOCHS, LR)

Epochs [0/2], Step[0/6000], Loss: 2.3479
Epochs [0/2], Step[500/6000], Loss: 0.3280
Epochs [0/2], Step[1000/6000], Loss: 0.0672
Epochs [0/2], Step[1500/6000], Loss: 0.2628
Epochs [0/2], Step[2000/6000], Loss: 0.0326
Epochs [0/2], Step[2500/6000], Loss: 0.0659
Epochs [0/2], Step[3000/6000], Loss: 0.4636
Epochs [0/2], Step[3500/6000], Loss: 0.5428
Epochs [0/2], Step[4000/6000], Loss: 0.0201
Epochs [0/2], Step[4500/6000], Loss: 0.6801
Epochs [0/2], Step[5000/6000], Loss: 0.2029
Epochs [0/2], Step[5500/6000], Loss: 0.3559
Epochs [1/2], Step[0/6000], Loss: 0.0673
Epochs [1/2], Step[500/6000], Loss: 0.1033
Epochs [1/2], Step[1000/6000], Loss: 0.0543
Epochs [1/2], Step[1500/6000], Loss: 0.0273
Epochs [1/2], Step[2000/6000], Loss: 0.3100
Epochs [1/2], Step[2500/6000], Loss: 0.1334
Epochs [1/2], Step[3000/6000], Loss: 0.0302
Epochs [1/2], Step[3500/6000], Loss: 0.6858
Epochs [1/2], Step[4000/6000], Loss: 0.0373
Epochs [1/2], Step[4500/6000], Loss: 0.0214
Epochs [1/2], Step[5000/6000], Loss: 0.6

## Evaluate the model
Now we can evaluate the model's performance on the test dataset. We pass the data and a dictionary with one metric. We can pass as many metrics as we like, in the form of functions that accept ground truth and predictions and return the metric value.  
`f(true, pred) -> metric`

In [10]:
model.evaluate(test_dl, {'acc': accuracy_score})

100%|██████████| 1000/1000 [00:01<00:00, 915.46it/s]


## Check the metadata
Let's examine metadata obtained from the model after training.

In [11]:
model.get_meta()

[{'name': '<__main__.Classifier object at 0x7f8ea2b61b10>',
  'created_at': DateTime(2023, 3, 1, 21, 17, 42, 489049, tzinfo=Timezone('UTC')),
  'metrics': {'acc': 0.968},
  'params': {'input_size': 784,
   'hidden_size': 100,
   'num_classes': 10,
   'num_epochs': 2,
   'lr': 0.001,
   'bs': 10},
  'type': 'model',
  'module': 'SimpleNN(\n  (l1): Linear(in_features=784, out_features=100, bias=True)\n  (l2): Linear(in_features=100, out_features=10, bias=True)\n  (relu): ReLU()\n)'}]

We can notice several things. The model tracks its time of creation. Its metrics are in place, as expected after evaluation.  
Let's look at the params dict. We can see all the parameters that we passed using keywords in the wrapper. The wrapper recorded them in the metadata for us automatically.

## Saving the model
It's time to save the trained model. We can just use model.save() method, but let's look at another Cascade's tool for model management.  
It is called `ModelRepo`.

In [12]:
repo = cdm.ModelRepo('./repo')

This is the repository of models. It manages a series of experiments over sets of models of different architectures, called model lines.

In [13]:
repo.add_line('linear_nn', Classifier)

ModelLine of 0 models of <class '__main__.Classifier'>

A model line is the manager of models with a similar architecture but different parameters or different epochs. It manages saving a model together with its meta, as well as loading models.

Aside from the model's metadata, we would like to know what data the model was trained on. This is exactly what the metadata of our previously written data pipeline is for.  
Let's update model's meta with the field for train data.

In [14]:
model.update_meta({'train_data': train_ds.get_meta()})
model.get_meta()

[{'name': '<__main__.Classifier object at 0x7f8ea2b61b10>',
  'train_data': [{'name': '__main__.NoiseModifier',
    'type': 'dataset',
    'len': 60000},
   {'name': 'cascade.data.dataset.Wrapper',
    'desc': 'This is MNIST dataset of handwritten images, TRAIN PART',
    'type': 'dataset',
    'len': 60000,
    'obj_type': "<class 'torchvision.datasets.mnist.MNIST'>"}],
  'created_at': DateTime(2023, 3, 1, 21, 17, 42, 489049, tzinfo=Timezone('UTC')),
  'metrics': {'acc': 0.968},
  'params': {'input_size': 784,
   'hidden_size': 100,
   'num_classes': 10,
   'num_epochs': 2,
   'lr': 0.001,
   'bs': 10},
  'type': 'model',
  'module': 'SimpleNN(\n  (l1): Linear(in_features=784, out_features=100, bias=True)\n  (l2): Linear(in_features=100, out_features=10, bias=True)\n  (relu): ReLU()\n)'}]

Saving the model is as easy as this:

In [15]:
repo['linear_nn'].save(model)

This will save the model to the path:  
`repo/linear_nn/00000/model`  
And metadata:  
`repo/linear_nn/00000/meta.json`

## Peeking inside repo
To see the model's metrics and parameters we don't need to manually go to the folders mentioned or print large metadata to the console. Cascade has tools for conveniently showing metrics. One of them is `MetricViewer`.

In [16]:
mv = cde.MetricViewer(repo)
# We can show the table like this
# mv.plot_table()
# Or we can open web-application like this
# mv.serve()
# but it will not be rendered in the documentation, so...
mv.table

Unnamed: 0,line,num,created_at,saved,acc,input_size,hidden_size,num_classes,num_epochs,lr,bs
0,./repo/linear_nn,0,2023-03-01 21:17:42.489049+00:00,21 seconds after,0.968,784,100,10,2,0.001,10


It accepts the repo object and can show tables of metrics and metadata. However, when the table is too big and we need a more powerful tool, `mv` also has the method `serve`, which opens a fully interactive table of metrics with the ability to sort and filter results.

## More experiments
What if we want to automatically run a number of experiments and then choose the best model?  
The workflow is pretty similar. In the example below we try to find the best option for hidden size of the model.  
We define the set of parameters for our experiments and run them in a loop, saving the results each time.

In [17]:
# One dict per experiment; only `hidden_size` differs between them.
# Each dict is unpacked into both the Classifier constructor and fit().
params = [
    {'hidden_size': 10,  'num_epochs': 2, 'lr': 0.001, 'bs': 10},
    {'hidden_size': 50,  'num_epochs': 2, 'lr': 0.001, 'bs': 10},
    {'hidden_size': 100, 'num_epochs': 2, 'lr': 0.001, 'bs': 10}
]

In [18]:
# Train, evaluate and save one model per parameter set.
for p in params:
    # The same dict configures the module and is passed on to the wrapper.
    model = Classifier(SimpleNN,
        **p,
        input_size=INPUT_SIZE,
        num_classes=10)
    # fit() takes num_epochs and lr from p; the remaining keys
    # (hidden_size, bs) fall into its **kwargs and are ignored there.
    # NOTE: 'bs' does not change the batch size here: train_dl was
    # built with BATCH_SIZE.
    model.fit(train_dl, **p)
    model.evaluate(test_dl, {'acc': accuracy_score})
    # Each save adds one more model to the 'linear_nn' line.
    repo['linear_nn'].save(model)

Epochs [0/2], Step[0/6000], Loss: 2.4030
Epochs [0/2], Step[500/6000], Loss: 0.9184
Epochs [0/2], Step[1000/6000], Loss: 0.8407
Epochs [0/2], Step[1500/6000], Loss: 0.1700
Epochs [0/2], Step[2000/6000], Loss: 1.1738
Epochs [0/2], Step[2500/6000], Loss: 0.3082
Epochs [0/2], Step[3000/6000], Loss: 0.6153
Epochs [0/2], Step[3500/6000], Loss: 0.3778
Epochs [0/2], Step[4000/6000], Loss: 0.3321
Epochs [0/2], Step[4500/6000], Loss: 0.3514
Epochs [0/2], Step[5000/6000], Loss: 0.3847
Epochs [0/2], Step[5500/6000], Loss: 0.2998
Epochs [1/2], Step[0/6000], Loss: 0.3461
Epochs [1/2], Step[500/6000], Loss: 0.6578
Epochs [1/2], Step[1000/6000], Loss: 0.1339
Epochs [1/2], Step[1500/6000], Loss: 0.1148
Epochs [1/2], Step[2000/6000], Loss: 0.5948
Epochs [1/2], Step[2500/6000], Loss: 0.4984
Epochs [1/2], Step[3000/6000], Loss: 0.1020
Epochs [1/2], Step[3500/6000], Loss: 0.2406
Epochs [1/2], Step[4000/6000], Loss: 0.4933
Epochs [1/2], Step[4500/6000], Loss: 0.8582
Epochs [1/2], Step[5000/6000], Loss: 0.0

100%|██████████| 1000/1000 [00:01<00:00, 934.60it/s]


Epochs [0/2], Step[0/6000], Loss: 2.3334
Epochs [0/2], Step[500/6000], Loss: 0.4136
Epochs [0/2], Step[1000/6000], Loss: 0.4991
Epochs [0/2], Step[1500/6000], Loss: 0.7323
Epochs [0/2], Step[2000/6000], Loss: 0.3884
Epochs [0/2], Step[2500/6000], Loss: 0.0964
Epochs [0/2], Step[3000/6000], Loss: 0.3577
Epochs [0/2], Step[3500/6000], Loss: 0.0276
Epochs [0/2], Step[4000/6000], Loss: 0.1680
Epochs [0/2], Step[4500/6000], Loss: 0.0507
Epochs [0/2], Step[5000/6000], Loss: 0.6252
Epochs [0/2], Step[5500/6000], Loss: 0.0607
Epochs [1/2], Step[0/6000], Loss: 0.1702
Epochs [1/2], Step[500/6000], Loss: 0.0736
Epochs [1/2], Step[1000/6000], Loss: 0.2258
Epochs [1/2], Step[1500/6000], Loss: 0.1754
Epochs [1/2], Step[2000/6000], Loss: 0.2483
Epochs [1/2], Step[2500/6000], Loss: 0.3203
Epochs [1/2], Step[3000/6000], Loss: 0.0468
Epochs [1/2], Step[3500/6000], Loss: 0.3267
Epochs [1/2], Step[4000/6000], Loss: 0.1223
Epochs [1/2], Step[4500/6000], Loss: 0.5855
Epochs [1/2], Step[5000/6000], Loss: 0.1

100%|██████████| 1000/1000 [00:01<00:00, 915.94it/s]


Epochs [0/2], Step[0/6000], Loss: 2.2656
Epochs [0/2], Step[500/6000], Loss: 0.4871
Epochs [0/2], Step[1000/6000], Loss: 0.3666
Epochs [0/2], Step[1500/6000], Loss: 0.5389
Epochs [0/2], Step[2000/6000], Loss: 0.2955
Epochs [0/2], Step[2500/6000], Loss: 0.1862
Epochs [0/2], Step[3000/6000], Loss: 0.0579
Epochs [0/2], Step[3500/6000], Loss: 0.3820
Epochs [0/2], Step[4000/6000], Loss: 0.2179
Epochs [0/2], Step[4500/6000], Loss: 0.0945
Epochs [0/2], Step[5000/6000], Loss: 0.0139
Epochs [0/2], Step[5500/6000], Loss: 0.0529
Epochs [1/2], Step[0/6000], Loss: 0.0069
Epochs [1/2], Step[500/6000], Loss: 0.2136
Epochs [1/2], Step[1000/6000], Loss: 0.0488
Epochs [1/2], Step[1500/6000], Loss: 0.2331
Epochs [1/2], Step[2000/6000], Loss: 0.0735
Epochs [1/2], Step[2500/6000], Loss: 0.0037
Epochs [1/2], Step[3000/6000], Loss: 0.0182
Epochs [1/2], Step[3500/6000], Loss: 0.0023
Epochs [1/2], Step[4000/6000], Loss: 0.0255
Epochs [1/2], Step[4500/6000], Loss: 0.0008
Epochs [1/2], Step[5000/6000], Loss: 0.0

100%|██████████| 1000/1000 [00:01<00:00, 885.51it/s]


## Results
We can see the results of our experiments - all of them are present in the table and we can choose the best option.

In [19]:
mv = cde.MetricViewer(repo)
# mv.plot_table()
mv.table

Unnamed: 0,line,num,created_at,saved,acc,input_size,hidden_size,num_classes,num_epochs,lr,bs
0,./repo/linear_nn,0,2023-03-01 21:17:42.489049+00:00,21 seconds after,0.968,784,100,10,2,0.001,10
1,./repo/linear_nn,1,2023-03-01 21:18:03.893347+00:00,18 seconds after,0.9062,784,10,10,2,0.001,10
2,./repo/linear_nn,2,2023-03-01 21:18:22.524721+00:00,20 seconds after,0.9546,784,50,10,2,0.001,10
3,./repo/linear_nn,3,2023-03-01 21:18:42.572824+00:00,20 seconds after,0.9688,784,100,10,2,0.001,10


## See also:
- [Pipeline building](pipeline_building.html) - (Previous example)
- [Documentation](https://oxid15.github.io/cascade/)
- [Key concepts](https://oxid15.github.io/cascade/concepts.html)
- [Code reference](https://oxid15.github.io/cascade/modules.html)