<a href="https://colab.research.google.com/github/Xinhong-Deng/Cassava-Classification/blob/main/havenized_cassava_pytorch_lightning_starter_notebook_0_895.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Motivation 

Cassava is a starchy root vegetable that is a staple food in Africa. Almost 80% of farms grow it, but yields are generally poor due to viral diseases. It is crucial to detect these diseases early to curb their spread.

## Goal

We have to identify the correct category for each image out of 5 classes. There are 4 diseased categories and 1 healthy category:

* Cassava Bacterial Blight (CBB),
* Cassava Brown Streak Disease (CBSD),
* Cassava Green Mottle (CGM),
* Cassava Mosaic Disease (CMD),
* Healthy

In this notebook, I am creating a solution in PyTorch and PyTorch Lightning. PyTorch Lightning is a way of organizing your PyTorch code. When the code is organised into simple Lightning modules, we can easily use a lot of advanced features like

- Mixed Precision
- GPU/ Multi GPU/ TPU
- Gradient Accumulation
- Gradient Clipping
- Moving tensors to/from GPU
- Logging 

The entire solution is structured in a way that allows us to run multiple experiments with different models, image sizes, batch sizes and other commonly used hyper parameters easily. 


If you prefer coding in editors like visual studio, you can download entire code from my github repo over [here](https://github.com/svishnu88/Cassava).

## Import the libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import sys
from typing import Tuple
import PIL
from torch.utils.data import Dataset
from pathlib import Path
from PIL import Image
from PIL.Image import Image as PILImage
from torch.utils.data.dataloader import DataLoader
import numpy as np
import pandas as pd
from pytorch_lightning import LightningDataModule
from sklearn.model_selection import train_test_split, StratifiedKFold
import albumentations as A
from albumentations.pytorch.transforms import ToTensor

from torchvision import models
import torch.nn as nn
import torch
import torch.nn.functional as F
import pytorch_lightning as pl
from torch import optim
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

sys.path.append('../input/geneffnet/gen-efficientnet-pytorch-master')

import geffnet

path = Path("/kaggle/input/cassava-leaf-disease-classification/")

## Data Block

- Create a Pytorch Dataset.
- Create a Pytorch Lightning Data Module block which contains all the code for creating data loaders.

In [None]:
class CassavaDataset(Dataset):
    """Map-style dataset that yields ``(image, label)`` pairs.

    Each row of ``df`` is unpacked as ``(image_id, label)``. The image is read
    from ``path / image_id`` and converted to a NumPy array; when a transform is
    supplied it is called as ``transform(image=array)`` and its ``"image"``
    entry is returned instead of the raw array.

    Args:
        path: directory holding the image files (must support ``/``, e.g. a
            ``pathlib.Path``).
        df: DataFrame whose rows unpack to ``(image_id, label)``.
        transform: optional callable following the albumentations-style
            ``transform(image=...) -> {"image": ...}`` protocol.
    """

    def __init__(self, path, df, transform=None) -> None:
        super().__init__()
        self.df = df
        self.path = path
        self.transform = transform
        self.num_workers = 2

    def __getitem__(self, index) -> Tuple[object, int]:
        """Return the (optionally transformed) image and label at ``index``."""
        img_id, label = self.df.iloc[index]
        # Use a context manager so the file handle is released deterministically
        # instead of waiting for garbage collection.
        with Image.open(self.path / img_id) as handle:
            image = np.array(handle)
        if self.transform is not None:
            image = self.transform(image=image)["image"]
        return image, label

    def __len__(self):
        """Number of rows in the backing DataFrame."""
        return self.df.shape[0]

In [None]:
class CassavaDataModule(LightningDataModule):
    """Lightning data module producing train/validation/test loaders.

    ``prepare_data`` splits ``train.csv`` with a 5-fold ``StratifiedKFold`` and
    pickles the selected fold to the working directory; ``setup`` reloads those
    pickles and builds the augmentation pipelines.

    Args:
        path: dataset root containing ``train.csv``, ``sample_submission.csv``,
            ``train_images`` and ``test_images``.
        aug_p: probability forwarded to ``get_augmentations``.
        val_pct: stored on the instance; not used by the fold-based split.
        img_sz: image size forwarded to ``get_augmentations``.
        batch_size: batch size for every loader.
        num_workers: worker count for every loader.
        fold_id: which of the 5 stratified folds is held out for validation.
    """

    def __init__(
        self,
        path: str = None,
        aug_p: float = 0.5,
        val_pct: float = 0.2,
        img_sz: int = 224,
        batch_size: int = 64,
        num_workers: int = 4,
        fold_id: int = 0,
    ):
        super().__init__()
        self.path = Path(path)
        self.aug_p = aug_p
        self.val_pct = val_pct
        self.img_sz = img_sz
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.fold_id = fold_id

    def prepare_data(self):
        """Create the fold split and cache the frames as pickle files."""
        # only called on 1 GPU/TPU in distributed
        df = pd.read_csv(self.path / "train.csv")
        skf = StratifiedKFold(n_splits=5)
        labels = df.label
        folds = list(skf.split(np.zeros(len(labels)), labels))
        train_index, valid_index = folds[self.fold_id]

        # split() yields positional indices, so select rows positionally.
        df.iloc[train_index].to_pickle("train_df.pkl")
        df.iloc[valid_index].to_pickle("valid_df.pkl")

        test_df = pd.read_csv(self.path / "sample_submission.csv")
        test_df.to_pickle("test_df.pkl")

    def setup(self, stage=None):
        """Load the cached frames and build the transforms.

        ``stage`` is accepted (and ignored) because Lightning invokes the hook
        as ``setup(stage)``; calling ``setup()`` with no argument still works.
        """
        # called on every process in DDP
        self.train_transform, self.test_transform = get_augmentations(
            p=self.aug_p, image_size=self.img_sz
        )
        self.train_df = pd.read_pickle("train_df.pkl")
        self.valid_df = pd.read_pickle("valid_df.pkl")
        self.test_df = pd.read_pickle("test_df.pkl")

    def _make_loader(self, image_dir, df, transform, shuffle):
        """Build a DataLoader over ``self.path / image_dir`` for ``df``."""
        dataset = CassavaDataset(self.path / image_dir, df=df, transform=transform)
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=shuffle,
            pin_memory=True,
        )

    def train_dataloader(self):
        """Shuffled loader over the training fold with training augmentations."""
        return self._make_loader(
            "train_images", self.train_df, self.train_transform, shuffle=True
        )

    def val_dataloader(self):
        """Unshuffled loader over the held-out fold with evaluation transforms."""
        return self._make_loader(
            "train_images", self.valid_df, self.test_transform, shuffle=False
        )

    def test_dataloader(self):
        """Unshuffled loader over the test images with evaluation transforms."""
        return self._make_loader(
            "test_images", self.test_df, self.test_transform, shuffle=False
        )

In [None]:
# Quick sanity check: load the training labels and wrap the raw images
# (no transform) in a dataset.
train_csv = path / 'train.csv'
df = pd.read_csv(train_csv)
ds = CassavaDataset(path / 'train_images', df=df)

## Apply Augmentations

We are choosing some transformations from the Albumentations library to apply to the dataset, which will give our neural network more information to learn from and help it become better at predictions.

I have not played much with the augmentation pipeline, so the parameters may not be optimized. Tweaking them may improve the model further.

In [None]:
def get_augmentations(p=0.5, image_size=224):
    """Build the albumentations pipelines for training and evaluation.

    Args:
        p: probability applied to every optional training augmentation and to
            each ``OneOf`` group as a whole.
        image_size: side length of the square crop each pipeline produces.

    Returns:
        ``(train_tfms, valid_tfms)`` — the training pipeline and the
        evaluation pipeline (centre crop + tensor conversion/normalisation).
    """
    norm_stats = {"mean": [0.485, 0.456, 0.406], "std": [0.229, 0.224, 0.225]}

    # Mutually exclusive groups: at most one member of each fires per sample.
    colour_jitter = A.OneOf(
        [
            A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2),
            A.HueSaturationValue(
                hue_shift_limit=20, sat_shift_limit=50, val_shift_limit=50
            ),
        ],
        p=p,
    )
    noise = A.OneOf([A.IAAAdditiveGaussianNoise(), A.GaussNoise()], p=p)
    blur = A.OneOf(
        [
            A.MotionBlur(p=0.2),
            A.MedianBlur(blur_limit=3, p=0.1),
            A.Blur(blur_limit=3, p=0.1),
        ],
        p=p,
    )
    distortion = A.OneOf(
        [
            A.OpticalDistortion(p=0.3),
            A.GridDistortion(p=0.1),
            A.IAAPiecewiseAffine(p=0.3),
        ],
        p=p,
    )

    # Order matters: geometric ops first, then colour/noise/blur/distortion,
    # and tensor conversion last.
    train_steps = [
        A.RandomResizedCrop(image_size, image_size),
        A.ShiftScaleRotate(shift_limit=0.15, scale_limit=0.4, rotate_limit=45, p=p),
        A.Cutout(p=p),
        A.RandomRotate90(p=p),
        A.Flip(p=p),
        colour_jitter,
        noise,
        A.CoarseDropout(max_holes=10, p=p),
        blur,
        distortion,
        ToTensor(normalize=norm_stats),
    ]
    valid_steps = [
        A.CenterCrop(image_size, image_size),
        ToTensor(normalize=norm_stats),
    ]

    return A.Compose(train_steps), A.Compose(valid_steps)

## Create a PyTorch Model
- Any Resnet/Resnext model supported by torch hub
- Efficientnet models from [geffnet](https://github.com/rwightman/gen-efficientnet-pytorch/tree/master/geffnet)

In [None]:
# Model names passed to ``torch.hub.load`` for the
# "facebookresearch/semi-supervised-ImageNet1K-models" repository (see Resnext).
# The training configuration selects an entry by index.
ssl_models = [
    "resnet18_ssl",
    "resnet50_ssl",
    "resnext50_32x4d_ssl",
    "resnext101_32x4d_ssl",
    "resnext101_32x8d_ssl",
    "resnext101_32x16d_ssl",
]

class Resnext(nn.Module):
    """ResNet/ResNeXt backbone with a freshly initialised linear classifier.

    The backbone's last two children are dropped, its feature map is pooled to
    1x1 with ``pool_type`` and fed to a new ``nn.Linear`` head.

    Args:
        model_name: entry point name to load (see ``ssl_models``).
        pool_type: callable invoked as ``pool_type(features, 1)``.
        num_classes: output width of the classifier head.
        kaggle: when true, build the backbone from a constructor named
            ``model_name`` that is already in scope instead of using
            ``torch.hub`` (which needs network access).
    """

    def __init__(
        self,
        model_name="resnet18_ssl",
        pool_type=F.adaptive_avg_pool2d,
        num_classes=1000,
        kaggle=False,
    ):
        super().__init__()
        self.pool_type = pool_type

        if kaggle:
            # NOTE(review): eval() executes ``model_name`` as code and needs a
            # matching constructor in scope; only pass trusted, known names.
            backbone = eval(model_name)()
        else:
            backbone = torch.hub.load(
                "facebookresearch/semi-supervised-ImageNet1K-models", model_name
            )
        # Keep everything except the last two children; the head below replaces
        # the original ``fc`` layer.
        self.backbone = nn.Sequential(*list(backbone.children())[:-2])
        in_features = backbone.fc.in_features
        self.classifier = nn.Linear(in_features, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(x.size(0), num_classes)``."""
        features = self.pool_type(self.backbone(x), 1)
        features = features.view(x.size(0), -1)
        return self.classifier(features)


def get_efficientnet(model_name, pretrained=True, num_classes=5):
    """Create a geffnet model and swap its classifier for a new linear head.

    Args:
        model_name: name forwarded to ``geffnet.create_model``.
        pretrained: forwarded to ``geffnet.create_model``.
        num_classes: output width of the replacement classifier.

    Returns:
        The model with ``classifier`` replaced by ``nn.Linear``.
    """
    net = geffnet.create_model(model_name, pretrained=pretrained)
    head_width = net.classifier.in_features
    net.classifier = nn.Linear(head_width, num_classes)
    return net

## Pytorch Lightning Module
Create a Pytorch Lightning Module where we write the essential parts of our training pipeline like

- training step
- validation step
- choose optimizer
- choose scheduler 
- do any logging as required

In [None]:
class CassavaModel(pl.LightningModule):
    """Lightning module wrapping a ResNet/ResNeXt or EfficientNet classifier.

    Args:
        model_name: a name containing ``"res"`` selects ``Resnext``; otherwise
            a name containing ``"effi"`` selects ``get_efficientnet``.
        num_classes: number of output classes.
        data_path: stored on the instance.
        loss_fn: callable invoked as ``loss_fn(logits, targets)``.
        lr: learning rate for AdamW.
        wd: weight decay for AdamW.

    Raises:
        ValueError: if ``model_name`` is missing or matches neither family.
    """

    def __init__(
        self,
        model_name: str = None,
        num_classes: int = None,
        data_path: Path = None,
        loss_fn=F.cross_entropy,
        lr=1e-4,
        wd=1e-6,
    ):
        super().__init__()

        if model_name is None:
            raise ValueError("model_name is required")
        if "res" in model_name:
            self.model = Resnext(model_name=model_name, num_classes=num_classes)
        elif "effi" in model_name:
            if num_classes is None:
                # Fall back to get_efficientnet's own default head size.
                self.model = get_efficientnet(model_name)
            else:
                self.model = get_efficientnet(model_name, num_classes=num_classes)
        else:
            # Fail here rather than later with a missing ``self.model``.
            raise ValueError("unsupported model_name: %s" % model_name)
        self.data_path = data_path
        self.loss_fn = loss_fn
        self.lr = lr
        self.accuracy = pl.metrics.Accuracy()
        self.wd = wd

    def forward(self, x):
        """Return the wrapped model's logits for ``x``."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        # Batches may be (image, label) or (image, label, image_id).
        x, y, *_ = batch
        y_hat = self(x)
        loss = self.loss_fn(y_hat, y)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss and accuracy for one batch."""
        x, y, *_ = batch
        y_hat = self(x)
        loss = self.loss_fn(y_hat, y)
        self.log("valid_loss", loss, prog_bar=True)
        self.log("val_acc", self.accuracy(y_hat, y), prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Predict one batch; returns ``(image_ids, predicted_labels)``.

        The batch must carry image ids as its third element, because they are
        needed to write the submission file.
        """
        x, y, img_id = batch
        y_hat = self(x)
        self.log("test_acc", self.accuracy(y_hat, y), prog_bar=True)

        _, pred = torch.max(y_hat, 1)
        return img_id, pred

    def test_epoch_end(self, test_step_outputs):
        """Append one ``image_id,label`` row per test sample to submission.csv."""
        rows = []
        for img_ids, preds in test_step_outputs:
            # A lone string is a single id, not a sequence of ids.
            ids = [img_ids] if isinstance(img_ids, str) else list(img_ids)
            labels = preds.reshape(-1).tolist()
            rows.extend("%s,%d\n" % pair for pair in zip(ids, labels))

        path = "submission.csv"
        # The header is only written when the file is created.
        needs_header = not os.path.exists(path)
        with open(path, "a") as f:
            if needs_header:
                f.write("image_id,label\n")
            f.writelines(rows)

    def configure_optimizers(self):
        """AdamW with cosine annealing over ``trainer.max_epochs``."""
        optimizer = optim.AdamW(
            self.model.parameters(), lr=self.lr, weight_decay=self.wd
        )
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, self.trainer.max_epochs, 0
        )

        return [optimizer], [scheduler]

## Train the model

In [None]:
# -- data / cross-validation
fold_id = 0
img_sz = 224
aug_p = 0.5
batch_size = 64
num_workers = 4

# -- model
num_classes = 5
model_name = ssl_models[2]

# -- optimisation / trainer
loss_fn = F.cross_entropy
lr = 1e-4
epochs = 1
gradient_clip_val = 0.1
precision = 16

In [None]:
# Collect the data-related settings, then build the data module and run its
# two preparation hooks by hand.
data_module_kwargs = dict(
    path=path,
    fold_id=fold_id,
    img_sz=img_sz,
    aug_p=aug_p,
    batch_size=batch_size,
    num_workers=num_workers,
)
data_module = CassavaDataModule(**data_module_kwargs)
data_module.prepare_data()
data_module.setup()

In [None]:
# Instantiate the Lightning module from the settings chosen above.
model_kwargs = dict(
    model_name=model_name,
    num_classes=num_classes,
    data_path=path,
    loss_fn=loss_fn,
    lr=lr,
)
model = CassavaModel(**model_kwargs)

Downloading: "https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/archive/master.zip" to /root/.cache/torch/hub/master.zip


URLError: <urlopen error [Errno -3] Temporary failure in name resolution>

In [None]:
# -p: do not fail when the directory already exists (e.g. on a cell re-run)
!mkdir -p /kaggle/working/weights

In [None]:
# Directory where checkpoints are written.
weights_path = Path("/kaggle/working/weights")

# Track the highest "val_acc", also keep the last epoch, and name the
# checkpoint file after the fold id.
checkpoint_callback = ModelCheckpoint(
    dirpath=weights_path,
    filename=str(fold_id),
    monitor="val_acc",
    mode="max",
    save_last=True,
    save_weights_only=True,
)

trainer = pl.Trainer(
    gpus=1,
    precision=precision,
    max_epochs=epochs,
    gradient_clip_val=gradient_clip_val,
    callbacks=[checkpoint_callback],
)

trainer.fit(model=model, datamodule=data_module)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Using native 16bit precision.


NameError: name 'model' is not defined

## Further improvements

You can increase the number of epochs to get further improvement. Also, right now I have only run a single fold; you can run multiple folds to get a better score. You can play with the hyperparameters and augmentations as well.

With the same code I was able to reach 0.895 with a single fold on the leaderboard.
We have completed the training pipeline here. 
I will show inference in another kernel.[Link to be added]

Hope you liked the notebook.

# Havenize

## exp_config

In [None]:
# cross validation
fold_id = 0
aug_p = 0.5

# trainer
gradient_clip_val = 0.1
precision = 16


from haven import haven_utils as hu

# Experiment groups, keyed by group name.
EXP_GROUPS = {}
EXP_GROUPS['cassava'] = hu.cartesian_exp_group(
    {
        'dataset': {'name': 'cassava', 'image_size': 224},
        'batch_size': 64,
        'max_epoch': 5,
        # 'data_size': {},
        'model': {
            'name': 'resnext50_32x4d_ssl',
            'loss': 'cross_entropy',
            'n_classes': 5,
            # data_path is not needed with haven
        },
        'optimizer': {'name': 'adamW', 'wd': 1e-6, 'lr': 1e-4},
    }
)  # was closed with a full-width '）', which is a SyntaxError



## src

### models

In [None]:
def get_model(model_dict, exp_dict=None, train_set=None):
    """Placeholder model factory; not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()

class Model(nn.Module):
    """Experiment wrapper bundling the network, loss and optimizer.

    Args:
        exp_dict: experiment configuration. ``exp_dict['model']`` provides
            ``name``, ``n_classes`` and ``loss``; the optimizer settings
            (``name``, ``lr``, ``wd``) are read from ``exp_dict['optimizer']``,
            falling back to ``exp_dict['opt']``.
        device: stored on the instance as ``self.device``.

    Raises:
        ValueError: for an unsupported loss or optimizer name.
    """

    def __init__(self, exp_dict, device):
        super().__init__()
        self.exp_dict = exp_dict

        # define model
        model_dict = self.exp_dict['model']
        # Resnext's parameter is ``model_name``; passing ``name=`` is a TypeError.
        self.model = Resnext(
            model_name=model_dict['name'], num_classes=model_dict['n_classes']
        )
        if model_dict['loss'] == 'cross_entropy':
            self.loss_fn = torch.nn.CrossEntropyLoss()
        else:
            raise ValueError('loss %s not supported' % model_dict['loss'])

        # define optimizer
        if 'optimizer' in self.exp_dict:
            opt_dict = self.exp_dict['optimizer']
        else:
            opt_dict = self.exp_dict['opt']
        if opt_dict['name'] == 'adamW':
            self.optimizer = torch.optim.AdamW(
                self.model.parameters(), lr=opt_dict['lr'], weight_decay=opt_dict['wd']
            )
        else:
            raise ValueError('optimizer %s not supported' % opt_dict['name'])

        # NOTE(review): the device is only recorded; the network is not moved
        # to it here — confirm whether callers expect that.
        self.device = device

    def forward(self, x):
        """Return the wrapped network's output for ``x``."""
        return self.model(x)

    # todo
    def train_on_loader(self):
        return 0

    # todo
    def val_on_loader(self):
        return 0

    def load_state_dict(self, state_dict):
        """Restore the network and, when an ``'opt'`` entry exists, the optimizer.

        Note that this replaces ``nn.Module.load_state_dict``: it expects the
        dictionary layout produced by ``get_state_dict``.
        """
        self.model.load_state_dict(state_dict['model'])
        if 'opt' in state_dict:
            self.optimizer.load_state_dict(state_dict['opt'])

    def get_state_dict(self):
        """Return ``{'model': ..., 'opt': ...}`` state dictionaries."""
        return {'model': self.model.state_dict(),
                'opt': self.optimizer.state_dict()}


# todo: not complete
class Resnext(nn.Module):
    """Hub-loaded ResNet/ResNeXt backbone with a new linear classifier.

    The loader/state-dict helpers that used to sit on this class referenced
    ``self.model`` and ``self.optimizer``, which only ``Model`` defines, and
    they shadowed ``nn.Module.load_state_dict``; they now live on ``Model``.

    Args:
        model_name: entry point name passed to ``torch.hub.load``.
        pool_type: callable invoked as ``pool_type(features, 1)``.
        num_classes: output width of the classifier head.
    """

    def __init__(
        self,
        model_name="resnet18_ssl",
        pool_type=F.adaptive_avg_pool2d,
        num_classes=1000,
    ):
        super().__init__()
        self.pool_type = pool_type

        backbone = torch.hub.load(
            "facebookresearch/semi-supervised-ImageNet1K-models", model_name
        )
        # Keep everything except the last two children; the head below replaces
        # the original ``fc`` layer.
        self.backbone = nn.Sequential(*list(backbone.children())[:-2])
        in_features = backbone.fc.in_features
        self.classifier = nn.Linear(in_features, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(x.size(0), num_classes)``."""
        features = self.pool_type(self.backbone(x), 1)
        features = features.view(x.size(0), -1)
        return self.classifier(features)
      


### datasets

In [None]:
def get_dataset(dataset_name, split, datadir, num_workers=0):
  """Return the dataset registered under ``dataset_name`` for ``split``.

  Raises:
    ValueError: if ``dataset_name`` is not a known dataset.
  """
  # Guard clause: reject unknown names before building anything.
  if dataset_name != 'cassava':
    raise ValueError('dataset %s not found' % dataset_name)

  return CassavaDataset(datadir, split, num_workers)

class CassavaDataset(Dataset):
    """Map-style dataset that reads one split from a dataset directory.

    Args:
        datadir_base: dataset root holding the CSV files and image folders.
        split: one of ``'train'``, ``'val'`` or ``'test'``.
        num_workers: stored on the instance.
        transform: optional callable following the albumentations-style
            ``transform(image=...) -> {"image": ...}`` protocol.

    Raises:
        ValueError: if ``split`` is not a known split name.
    """

    # split name -> (csv file, image directory), relative to ``datadir_base``
    _SPLIT_FILES = {
        'train': ("train.csv", "train_images"),
        # todo: figure out how to split the train and val set; for now 'val'
        # reads exactly the same rows as 'train'
        'val': ("train.csv", "train_images"),
        'test': ("sample_submission.csv", "test_images"),
    }

    def __init__(self, datadir_base, split, num_workers, transform=None) -> None:
        super().__init__()
        self.datadir_base = datadir_base
        self.transform = transform
        self.num_workers = num_workers

        if split not in self._SPLIT_FILES:
            # Previously an unknown split surfaced as an UnboundLocalError.
            raise ValueError('split %s not found' % split)
        csv_name, image_dir = self._SPLIT_FILES[split]

        self.data_path = os.path.join(datadir_base, image_dir)
        self.df = pd.read_csv(os.path.join(datadir_base, csv_name))

    def __getitem__(self, index) -> Tuple[object, int]:
        """Return the (optionally transformed) image and label at ``index``."""
        img_id, label = self.df.iloc[index]
        img_path = os.path.join(self.data_path, img_id)
        # Use a context manager so the file handle is released deterministically.
        with Image.open(img_path) as handle:
            image = np.array(handle)
        if self.transform is not None:
            image = self.transform(image=image)["image"]
        return image, label

    def __len__(self):
        """Number of rows in the split's DataFrame."""
        return self.df.shape[0]

## train_val

In [None]:
import argparse

from haven import haven_wizard as hw
from haven import haven_utils as hu

# from src import datasets, models


def trainval(exp_dict, savedir, args):
    """Train, validate and write test predictions for one experiment.

    exp_dict: dictionary defining the hyperparameters of the experiment
    savedir: the directory where the experiment will be saved
    args: arguments passed through the command line
    """
    # -- Datasets
    # exp_dict["dataset"] may be a plain name or a dict carrying a 'name' key;
    # get_dataset expects the name.
    dataset_cfg = exp_dict["dataset"]
    if isinstance(dataset_cfg, dict):
        dataset_name = dataset_cfg["name"]
    else:
        dataset_name = dataset_cfg

    train_set = datasets.get_dataset(dataset_name=dataset_name,
                                     split='train',
                                     datadir=args.datadir)

    val_set = datasets.get_dataset(dataset_name=dataset_name,
                                   split='val',
                                   datadir=args.datadir)

    test_set = datasets.get_dataset(dataset_name=dataset_name,
                                    split='test',
                                    datadir=args.datadir)

    # -- Model
    model = models.Model(exp_dict, device=torch.device('cuda'))

    # -- Train & Val Loop
    # Honour the configured epoch count; 50 was the previous hard-coded value.
    n_epochs = exp_dict.get("max_epoch", 50)
    score_list = []
    for e in range(0, n_epochs):
        # Compute metrics
        score_dict = {"epoch": e}
        score_dict["train_loss"] = model.val_on_dataset(
            train_set, metric_name='softmax_loss')
        score_dict["val_acc"] = model.val_on_dataset(
            val_set, metric_name='softmax_acc')
        score_list += [score_dict]

        # Train model for one epoch
        model.train_on_dataset(train_set)

        # Visualize
        # images = model.vis_on_dataset(
        #     val_set, fname=os.path.join(savedir, 'images', 'results.png'))

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail(), "\n")
        hu.save_pkl(os.path.join(savedir, 'score_list.pkl'), score_list)
        hu.torch_save(os.path.join(savedir, 'model.pth'), model.state_dict())
        print("Checkpoint Saved: %s" % savedir)

    print('Experiment completed at epoch %d' % (n_epochs - 1))

    # test and output result
    rows = []
    for index, batch in enumerate(test_set):
        # Items may be (image, label) or (image, label, image_id); without an
        # id, fall back to the first column of the dataset's frame.
        x, y, *extra = batch
        img_id = extra[0] if extra else test_set.df.iloc[index, 0]

        # NOTE(review): torch.max(y_hat, 1) assumes batched 2-D logits, so
        # ``x`` must already be a batched tensor here — confirm with the
        # dataset/transform in use.
        y_hat = model(x)
        _, pred = torch.max(y_hat, 1)

        ids = [img_id] if isinstance(img_id, str) else list(img_id)
        labels = pred.reshape(-1).tolist()
        rows.extend("%s,%d\n" % pair for pair in zip(ids, labels))

    # Write once after the loop; the header is only added to a new file.
    path = "submission.csv"
    needs_header = not os.path.exists(path)
    with open(path, "a") as f:
        if needs_header:
            f.write("image_id,label\n")
        f.writelines(rows)


if __name__ == "__main__":
    # -- Command-line interface, declared as (flags, options) pairs
    cli_arguments = (
        # Exp Arguments
        (('-e', '--exp_group_list'), {'nargs': "+"}),
        (("-ei", "--exp_id"), {'default': None}),
        # Savedir Arguments
        (('-sb', '--savedir_base'), {'required': True}),
        (('-d', '--datadir'), {'required': True}),
        # Others
        (("-r", "--reset"), {'default': 0, 'type': int}),
        (("-j", "--run_jobs"), {'default': 0, 'type': int}),
        (("-v", "--visualize_notebook"), {'type': str, 'default': ''}),
    )
    parser = argparse.ArgumentParser()
    for flags, options in cli_arguments:
        parser.add_argument(*flags, **options)

    # Unrecognised arguments are tolerated and collected in ``others``.
    args, others = parser.parse_known_args()

    # -- Launch Experiments

    # Experiment groups come from the exp_configs module.
    import exp_configs
    exp_groups = exp_configs.EXP_GROUPS
    print('Launching exp_group: %s' % args.exp_group_list)

    # A job configuration is optional: only used when job_configs.py exists.
    job_config = None
    if os.path.exists('job_configs.py'):
        import job_configs
        job_config = job_configs.JOB_CONFIG

    # Run Selected Experiments
    hw.run_wizard(
        func=trainval,
        exp_groups=exp_groups,
        args=args,
        job_config=job_config,
    )