In [1]:
import wandb
import pandas as pd
from fastai.vision.all import *
from fastai.callback.wandb import WandbCallback
import params

Let's now create a train_config that we'll pass to the W&B run to record and control the training hyperparameters.

In [2]:
# Hyperparameters for this training run. The namespace is handed to
# wandb.init(config=...) so every value is stored with the run.
train_config = SimpleNamespace(**{
    "framework": "fastai",
    "img_size": (224, 224),
    "batch_size": 64,
    "augment": True,       # use data augmentation
    "epochs": 5,
    "lr": 2e-3,
    "pretrained": True,    # whether to use a pretrained encoder
    "seed": 42,
})

We set the random seed for reproducibility.

In [3]:
# Apply the seed from train_config before any data loading or model creation.
# NOTE(review): reproducible=True presumably requests deterministic backend
# behaviour — confirm against the fastai set_seed documentation.
set_seed(train_config.seed, reproducible=True)

In [4]:
# Start a W&B run with job type "training" in the project named by params;
# train_config is passed as the run's config.
run = wandb.init(project=params.WANDB_PROJECT, job_type="training", config=train_config)

[34m[1mwandb[0m: Currently logged in as: [33msolab5[0m. Use [1m`wandb login --relogin`[0m to force relogin


As usual, we will use W&B Artifacts to track the lineage of our models.

In [5]:
# Register the ":latest" version of the processed-data artifact as an input of this run.
processed_data_at = run.use_artifact(f'{params.PROCESSED_DATA_AT}:latest')
# download() yields the local location of the artifact files; wrapping it in Path
# allows the `/` joins used in later cells.
processed_dataset_dir = Path(processed_data_at.download())

[34m[1mwandb[0m: Downloading large artifact data_split:latest, 2266.04MB. 36310 files... 
[34m[1mwandb[0m:   36310 of 36310 files downloaded.  
Done. 0:0:6.1


In [6]:
# Load the split table from the downloaded artifact. Later cells read its
# Stage, File_Name and Label columns.
df = pd.read_csv(processed_dataset_dir / 'data_split.csv')

We will not use the hold-out (test) split at this stage. The is_valid column tells the trainer how to split the remaining data between training and validation.

In [7]:
# Drop the rows whose Stage is 'test', renumber the index, and add a boolean
# is_valid column that is True exactly for the rows whose Stage is 'valid'.
df = (
    df.loc[df["Stage"] != 'test']
    .reset_index(drop=True)
    .assign(is_valid=lambda kept: kept["Stage"] == 'valid')
)

In [8]:
def label_func(fname):
    """Return the name of the directory that directly contains ``fname``.

    ``fname`` must expose ``.parent`` (for example a ``pathlib.Path``).
    """
    containing_dir = fname.parent
    return containing_dir.name

We will use fastai's DataBlock API to feed data into model training and validation.

In [9]:
# Collect the image files under the dataset directory, skipping every path
# whose string form contains "media"; the cell displays how many remain.
fnames = [
    img_path
    for img_path in get_image_files(processed_dataset_dir)
    if "media" not in str(img_path)
]
len(fnames)

18160

In [10]:
# Prefix every File_Name entry with the dataset directory so each row holds a
# full path to its image.
df["image_fname"] = [
    processed_dataset_dir / f'{file_name}'
    for file_name in df["File_Name"].values
]

In [11]:
def get_data(df, bs=64, img_size=(224, 224), augment=True):
    """Build fastai DataLoaders from a dataframe of image paths and labels.

    Args:
        df: DataFrame whose "image_fname" column is read as the input and
            whose "Label" column is read as the target. Rows are split by
            ``ColSplitter()`` with its default settings — presumably on the
            boolean "is_valid" column; confirm against the fastai docs.
        bs: Batch size forwarded to ``dataloaders``.
        img_size: Size handed to the per-item ``Resize`` transform.
        augment: When true, ``aug_transforms()`` is used as the batch
            transforms; otherwise no batch transforms are set.

    Returns:
        Whatever ``DataBlock.dataloaders(df, bs=bs)`` returns.
    """
    batch_augs = aug_transforms() if augment else None
    datablock = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        get_x=ColReader("image_fname"),
        get_y=ColReader("Label"),
        splitter=ColSplitter(),
        item_tfms=Resize(img_size),
        batch_tfms=batch_augs,
    )
    return datablock.dataloaders(df, bs=bs)

We are using wandb.config to track our training hyperparameters.

In [12]:
# Read hyperparameters back through wandb.config from here on, rather than
# from train_config directly.
config = wandb.config    

In [13]:
# Build the dataloaders with the batch size, image size and augmentation flag from config.
dls = get_data(df, bs=config.batch_size, img_size=config.img_size, augment=config.augment)

In [14]:
# Metrics reported alongside the losses in the per-epoch training table.
metrics=[accuracy, error_rate]
# resnet18 architecture; config.pretrained is forwarded as the `pretrained` argument.
learn = vision_learner(dls, arch=resnet18, pretrained=config.pretrained, metrics=metrics)



In [15]:
callbacks = [
    # Checkpoint on improvements of valid_loss (the "Better model found" lines in the training output).
    SaveModelCallback(monitor='valid_loss'),
    # Log training to W&B with prediction logging on and model logging off.
    WandbCallback(log_preds=True, log_model=False)
]

Let's train our model!

In [16]:
# Fine-tune with config.epochs and config.lr, attaching the callbacks above.
# The recorded output shows one preliminary epoch followed by a config.epochs-epoch table.
learn.fine_tune(config.epochs, config.lr, cbs=callbacks)

Exception in thread SystemMonitor:
Traceback (most recent call last):
  File "/usr/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/root/.local/lib/python3.9/site-packages/wandb/sdk/internal/system/system_monitor.py", line 118, in _start
    asset.start()
  File "/root/.local/lib/python3.9/site-packages/wandb/sdk/internal/system/assets/cpu.py", line 166, in start
    self.metrics_monitor.start()
  File "/root/.local/lib/python3.9/site-packages/wandb/sdk/internal/system/assets/interfaces.py", line 168, in start
    logger.info(f"Started {self._process.name}")
AttributeError: 'NoneType' object has no attribute 'name'


epoch,train_loss,valid_loss,accuracy,error_rate,time
0,0.562043,0.289595,0.898678,0.101322,00:23


Better model found at epoch 0 with valid_loss value: 0.28959470987319946.


epoch,train_loss,valid_loss,accuracy,error_rate,time
0,0.213966,0.095208,0.96641,0.03359,00:26
1,0.094866,0.087995,0.965859,0.034141,00:26
2,0.05751,0.026764,0.989537,0.010463,00:26
3,0.032075,0.035872,0.985132,0.014868,00:26
4,0.020896,0.020239,0.992841,0.007159,00:26


Better model found at epoch 0 with valid_loss value: 0.0952075719833374.
Better model found at epoch 1 with valid_loss value: 0.08799496293067932.
Better model found at epoch 2 with valid_loss value: 0.026764262467622757.
Better model found at epoch 4 with valid_loss value: 0.02023932710289955.


At the end of training, the model is reloaded from the best checkpoint saved by SaveModelCallback. To make sure the final metrics reflect that checkpoint, we validate the model again and store the final loss and metrics in wandb.summary.

In [17]:
# Re-evaluate the current model. The values come back as the loss followed by
# the learner's metrics, which is the order metric_names below relies on.
scores = learn.validate()
metric_names = ['final_loss', 'Accuracy', 'Error_rate']

# Fail with a clear message if the learner's metrics and the names above ever
# drift apart, instead of raising an opaque IndexError or dropping a value.
if len(scores) != len(metric_names):
    raise ValueError(
        f"validate() returned {len(scores)} values but {len(metric_names)} names are defined"
    )

# Pair each name with its score and record it in the run summary.
final_results = dict(zip(metric_names, scores))
for summary_key, summary_value in final_results.items():
    wandb.summary[summary_key] = summary_value

In [18]:
# Close the W&B run; the run history and run summary tables are printed below this cell.
wandb.finish()

0,1
accuracy,▁▆▆█▇█
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
eps_0,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eps_1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eps_2,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
error_rate,█▃▃▁▂▁
lr_0,▁▂▃▅▆▇█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr_1,▁▂▃▅▆▇█▂▂▂▃▃▃▄▄▄▅▅▅▄▄▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁
lr_2,▁▂▃▅▆▇█▂▂▂▃▃▃▄▄▄▅▅▅▄▄▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁
mom_0,█▇▆▅▃▂▁█▇▇▆▅▄▂▂▁▁▁▁▁▁▂▂▂▃▃▄▄▅▅▆▆▆▇▇▇████

0,1
Accuracy,0.99284
Error_rate,0.00716
accuracy,0.99284
epoch,6.0
eps_0,1e-05
eps_1,1e-05
eps_2,1e-05
error_rate,0.00716
final_loss,0.02024
lr_0,0.0
