# Plant Pathology - Baseline

Tutorial Link -> https://www.kaggle.com/code/werooring/ch12-baseline

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Fix Seed Value and Set GPU

**Fix Seed Value**

In [1]:
import torch # pytorch
import random
import numpy as np
import os

# fix seed value
seed = 50
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed) 
torch.backends.cudnn.deterministic = True 
torch.backends.cudnn.benchmark = False    
torch.backends.cudnn.enabled = False      

**Set GPU**

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

device(type='cuda')

## 2. Prep Data

In [4]:
import pandas as pd

data_path = '/kaggle/input/plant-pathology-2020-fgvc7/'

train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')
submission = pd.read_csv(data_path + 'sample_submission.csv')

**Divide Train / Validation Data**

In [5]:
from sklearn.model_selection import train_test_split

train, valid = train_test_split(train, 
                               test_size = 0.1, # ratio; train : valid = 9 : 1
                               stratify = train[['healthy', 'multiple_diseases', 'rust', 'scab']], # maintaning ratio in each data set
                               random_state = 50)

**Define Dataset Class**

In [6]:
import cv2
from torch.utils.data import Dataset 
import numpy as np

class ImageDataset(Dataset):
    # constructor
    # if dataset is for test, set is_test = True 
    # else if it's for train or validation, set is_test = False
    def __init__(self, df, img_dir = './', transform = None, is_test = False):
        super().__init__()
        self.df = df # train or validation dataset 
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        img_id = self.df.iloc[idx, 0]
        img_path = self.img_dir + img_id + '.jpg'
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        label = self.df.iloc[idx, 1] # target value
        
        if self.transform is not None:
            # if there's a transformer(변환기)
            # use albumentations module's transformer
            image = self.transform(image=image)['image'] 
        
        # if test data, return only image data / if not, also return target value
        if self.is_test:
            return image
        else:
            # idx of target value with biggest value among 4
            label = np.argmax(self.df.iloc[idx, 1:5])
            return image, label

**Define Image Transformer**

In [7]:
# modules
import albumentations as A
from albumentations.pytorch import ToTensorV2

In [9]:
# transformer for training data
transform_train = A.Compose([
    A.Resize(450, 650),       
    A.RandomBrightnessContrast(brightness_limit=0.2, 
                               contrast_limit=0.2, p=0.3),
    A.VerticalFlip(p=0.2),    
    A.HorizontalFlip(p=0.5),  
    A.ShiftScaleRotate(      
        shift_limit=0.1,
        scale_limit=0.2,
        rotate_limit=30, p=0.3),
    A.OneOf([A.Emboss(p=1),  
             A.Sharpen(p=1),
             A.Blur(p=1)], p=0.3),
    A.PiecewiseAffine(p=0.3), 
    A.Normalize(),             
    ToTensorV2()              
])

In [11]:
# transformer for validating / testing data
transform_test = A.Compose([
    A.Resize(450, 650),
    A.Normalize(),
    ToTensorV2()
])

**Create Dataset and Data Loader**

In [12]:
# create train / validation dataset
img_dir = '/kaggle/input/plant-pathology-2020-fgvc7/images/'

dataset_train = ImageDataset(df = train, img_dir = img_dir, transform = transform_train)
dataset_valid = ImageDataset(df = valid, img_dir = img_dir, transform = transform_test)

- Use Multi-processing
    - Have to fix Data Loader's seed value -> use `seed_worker`

In [13]:
def seed_worker(worder_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)
    
g = torch.Generator()
g.manual_seed(0)

<torch._C.Generator at 0x7b696c341810>

In [14]:
from torch.utils.data import DataLoader

batch_size = 4

loader_train = DataLoader(dataset_train, batch_size= batch_size, 
                         shuffle = True, worker_init_fn = seed_worker,
                         generator = g, num_workers = 2)
loader_valid = DataLoader(dataset_valid, batch_size= batch_size,
                         shuffle = False, worker_init_fn = seed_worker,
                         generator = g, num_workers = 2)

## 3. Create Model

- Use pretrained model
    - Use torchvision.models module
    - Use pretrainedmodels module
    - Use self-constructed / searched module -> EfficientNet
- Perform transfer learning
    - Transfer learning: re-training pretrained model on similar but difference area

**Create EfficientNet Model**

In [15]:
!pip install efficientnet-pytorch==0.7.1

Collecting efficientnet-pytorch==0.7.1
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: efficientnet-pytorch
  Building wheel for efficientnet-pytorch (setup.py) ... [?25ldone
[?25h  Created wheel for efficientnet-pytorch: filename=efficientnet_pytorch-0.7.1-py3-none-any.whl size=16427 sha256=17fca8224587550d69ae28702e83470384e075bcbb738b92845570fce207109e
  Stored in directory: /root/.cache/pip/wheels/03/3f/e9/911b1bc46869644912bda90a56bcf7b960f20b5187feea3baf
Successfully built efficientnet-pytorch
Installing collected packages: efficientnet-pytorch
Successfully installed efficientnet-pytorch-0.7.1


In [17]:
# import EfficientNet model
from efficientnet_pytorch import EfficientNet

In [18]:
# call pre-trained efficientnet-b7 model
model = EfficientNet.from_pretrained('efficientnet-b7', num_classes = 4) # num_classes: final output count

model = model.to(device)

Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth" to /root/.cache/torch/hub/checkpoints/efficientnet-b7-dcc49843.pth
100%|██████████| 254M/254M [00:01<00:00, 221MB/s]  


Loaded pretrained weights for efficientnet-b7


## 4. Train Model and Validation

**Set Loss Function and Optimizer**

In [19]:
# loss function: classification prob -> use CrossEntropyLoss()
import torch.nn as nn

criterion = nn.CrossEntropyLoss()

In [20]:
# optimizer: AdamW (Adam + weight decay (prevent over-fitting))
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00006, weight_decay=0.0001)

**Train and Performance Validation**

- Performance Validation
    - Instead of validating after perform training all epochs, validate after **every** epoch
    - It takes longer time, but it can check there's no overfitting while training 

In [22]:
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm # process bar
epochs = 5

for epoch in range(epochs):
    # train
    model.train()
    epoch_train_loss = 0
    
    for images, labels in tqdm(loader_train):
        images = images.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        epoch_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        
    print(f'epoch [{epoch+1}/{epochs}] - loss val from training data: {epoch_train_loss/len(loader_train):.4f}')
    
    # validation
    model.eval()
    epoch_valid_loss = 0
    preds_list = []
    true_onehot_list = []
    
    with torch.no_grad():
        for images, labels in loader_valid:
            images = images.to(device)
            labels = labels.to(device)
            
            outputs = model(images)
            loss = criterion(outputs, labels)
            epoch_valid_loss += loss.item()
            
            preds = torch.softmax(outputs.cpu(), dim=1).numpy()
#             true_onehot = torch.eye(4)[labels].cpu().numpy()
            true_onehot = torch.eye(4).to(labels.device)[labels].cpu().numpy()
            preds_list.extend(preds)
            true_onehot_list.extend(true_onehot)
            
    print(f'epoch [{epoch+1}/{epochs}] - loss val from validation data: {epoch_valid_loss/len(loader_valid):.4f} / Validation Data ROC AUC: {roc_auc_score(true_onehot_list, preds_list):.4f}')

  0%|          | 0/410 [00:00<?, ?it/s]

epoch [1/5] - loss val from training data: 0.3413


RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)

## 5. Predication and Submit Result

In [None]:
dataset_test = ImageDataset(test, img_dir=img_dir, 
                            transform=transform_test, is_test=True)
loader_test = DataLoader(dataset_test, batch_size=batch_size, 
                         shuffle=False, worker_init_fn=seed_worker,
                         generator=g, num_workers=2)

**Predication**

In [None]:
model.eval()

preds = np.zeros((len(test), 4))

with torch.no_grad():
    for i, images in enumerate(loader_test):
        images = images.to(device)
        outputs = model(images)
        
        preds_part = torch.softmax(outputs.cpu(), dim=1).squeeze().numpy()
        preds[i*batch_size:(i+1)*batch_size] += preds_part

**Submit Result**

In [None]:
submission[['healthy', 'multiple_diseases', 'rust', 'scab']] = preds
submission.to_csv('submission.csv', index=False)