# Plant Pathology 2020 - FGVC7
### Identify the category of foliar diseases in apple trees
- [Competition Link](https://www.kaggle.com/c/plant-pathology-2020-fgvc7)
- [Modeling Reference Link](https://www.kaggle.com/akasharidas/plant-pathology-2020-in-pytorch)

### This is the modeling code that placed in the top 1.9% (rank 25). If you find it helpful, please upvote 👀
## Upvote Is FREE !

# 1) EDA

## Look around Data

In [None]:
import pandas as pd

# Root directory of the competition data (keeps its trailing slash)
data_path = '/kaggle/input/plant-pathology-2020-fgvc7/'

# Read the training table, the test table and the sample submission in one pass
train, test, submission = (
    pd.read_csv(f'{data_path}{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

In [None]:
train.shape, test.shape

In [None]:
train.head()

In [None]:
test.head()

In [None]:
submission.head()

## Data Visualization

### Target Value Distribution

In [None]:
# One sub-frame per class: keep the rows whose indicator column equals 1
healthy = train[train['healthy'].eq(1)]
multiple_diseases = train[train['multiple_diseases'].eq(1)]
rust = train[train['rust'].eq(1)]
scab = train[train['scab'].eq(1)]

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

# Use a larger default font for every plot and open a square figure
mpl.rc('font', size=15)
plt.figure(figsize=(7, 7))

label = ['healthy', 'multiple diseases', 'rust', 'scab'] # Wedge labels, in the same order as the counts below
# Pie chart of the class distribution: one wedge per class, sized by its
# number of training rows and annotated with its percentage share
plt.pie([len(healthy), len(multiple_diseases), len(rust), len(scab)], 
        labels=label, 
        autopct='%1.1f%%');

### Print Image

In [None]:
import matplotlib.gridspec as gridspec
import cv2 # OpenCV Library

def show_image(img_ids, rows=4, cols=3):
    """Display the given images on a ``rows`` x ``cols`` grid.

    Args:
        img_ids: Sized iterable of image ids (file names without the
            ``.jpg`` extension) located under ``<data_path>/images``.
        rows: Number of grid rows.
        cols: Number of grid columns.

    Raises:
        ValueError: If more images are given than the grid has cells.
        FileNotFoundError: If an image file cannot be read.
    """
    import os  # local import: this cell runs before the notebook imports os

    # Explicit check instead of `assert`, which is stripped under `python -O`
    if len(img_ids) > rows * cols:
        raise ValueError(
            f'{len(img_ids)} images do not fit into a {rows}x{cols} grid')

    plt.figure(figsize=(15, 15))  # Overall figure size
    grid = gridspec.GridSpec(rows, cols)

    # Draw each image in its own grid cell
    for idx, img_id in enumerate(img_ids):
        # os.path.join avoids the doubled slash when data_path already ends in '/'
        img_path = os.path.join(data_path, 'images', f'{img_id}.jpg')
        image = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising
        if image is None:
            raise FileNotFoundError(f'Could not read image: {img_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR; matplotlib expects RGB
        ax = plt.subplot(grid[idx])
        ax.imshow(image)

In [None]:
# The final 12 image ids of every class sub-frame
last_healthy_img_ids = healthy['image_id'].tail(12)
last_multiple_diseases_img_ids = multiple_diseases['image_id'].tail(12)
last_rust_img_ids = rust['image_id'].tail(12)
last_scab_img_ids = scab['image_id'].tail(12)

In [None]:
show_image(last_healthy_img_ids) # Healthy Leaf Output

In [None]:
show_image(last_multiple_diseases_img_ids) # Leaf output with various diseases

In [None]:
show_image(last_rust_img_ids) # Leaf Output with Rust Disease

In [None]:
show_image(last_scab_img_ids) # Leaves infected with Scab disease

# 2) Modeling

## Fix seed values and device settings

In [None]:
import torch # Pytorch
import random
import numpy as np
import os

# Fix every random seed to the same value so runs are repeatable
seed = 10

# Seed Python hashing, the `random` module, NumPy and PyTorch (CPU and all CUDA devices)
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Ask cuDNN for deterministic behaviour and switch off its benchmark mode
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# NOTE(review): this turns cuDNN off altogether, which presumably slows GPU
# training considerably — confirm it is intended and not just a leftover.
torch.backends.cudnn.enabled = False

In [None]:
# Set Device

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Prepare Data

### Split train data and valid data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training table as validation data, stratified on the
# four target columns so the class proportions are preserved.
# NOTE(review): the 90% training part of the split is discarded (`_`), and
# `dataset_train` is later built from the full `train` frame. The validation
# rows are therefore also trained on, so the validation loss / ROC AUC are
# optimistic — confirm this is intentional.
_, valid = train_test_split(train, 
                            test_size=0.1,
                            stratify=train[['healthy', 'multiple_diseases', 'rust', 'scab']],
                            random_state=10)

### Define DataSet

In [None]:
import cv2
from torch.utils.data import Dataset # Class for data generation
import numpy as np

class ImageDataset(Dataset):
    """Dataset that loads images from disk and, unless in test mode, their labels.

    Column 0 of ``df`` must hold the image id (file name without ``.jpg``).
    When ``is_test`` is false, columns 1-4 must hold the four target
    indicator values; the label is the position of the largest one.
    """

    def __init__(self, df, img_dir='./', transform=None, is_test=False):
        """Store the data frame and loading options.

        Args:
            df: Table with the image id in column 0 (and targets in columns 1-4).
            img_dir: Prefix prepended to ``<image id>.jpg``; it is concatenated
                as-is, so a directory must end with a path separator.
            transform: Optional callable invoked as ``transform(image=...)``
                that returns a mapping with an ``'image'`` entry.
            is_test: If true, ``__getitem__`` returns the image only.
        """
        super().__init__()
        self.df = df
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        """Return the number of rows (samples) in the data frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        Returns:
            The (optionally transformed) RGB image when ``is_test`` is true,
            otherwise the tuple ``(image, label)``.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        img_id = self.df.iloc[idx, 0]
        img_path = self.img_dir + img_id + '.jpg'
        image = cv2.imread(img_path)
        # cv2.imread returns None on failure; fail with a clear message here
        # rather than with an opaque error inside cvtColor.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {img_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR
        if self.transform is not None:
            image = self.transform(image=image)['image']
        if self.is_test:
            return image
        # Take the argmax on the raw NumPy values so the result is always a
        # position, independent of how pandas implements Series.argmax.
        label = np.argmax(self.df.iloc[idx, 1:5].to_numpy())
        return image, label

### Define Image Transformations for Data Augmentation

In [None]:
# Module for Image Transformations
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Augmentation pipeline for training data (also reused for test-time augmentation)
transform_train = A.Compose([
    A.Resize(400, 600), # Bring every image to the same fixed size
    # Random brightness and contrast adjustment
    A.RandomBrightnessContrast(brightness_limit=0.1, 
                               contrast_limit=0.1, p=0.5),
    A.VerticalFlip(p=0.5), # Random vertical flip
    A.HorizontalFlip(p=0.5), # Random horizontal flip
    # Random shift, scale and rotation
    A.ShiftScaleRotate(
        shift_limit=0.1,
        scale_limit=0.2,
        rotate_limit=25, p=0.7),
    # With probability 0.5 apply exactly one of: emboss, sharpen, blur
    A.OneOf([A.Emboss(p=1),
             A.Sharpen(p=1),
             A.Blur(p=1)], p=0.5),
    A.PiecewiseAffine(p=0.5), # Random piecewise affine distortion
    A.Normalize(), # Normalize pixel values with the library defaults
    ToTensorV2() # Convert the image to a PyTorch tensor
])

# Deterministic pipeline for validation and test data: resize, normalize and
# convert to tensor only, with no random augmentation
transform_test = A.Compose([
    A.Resize(400, 600),
    A.Normalize(),
    ToTensorV2()
])

### Create Datasets and Data Loaders

In [None]:
img_dir = '/kaggle/input/plant-pathology-2020-fgvc7/images/'

dataset_train = ImageDataset(train, img_dir=img_dir, transform=transform_train)
dataset_valid = ImageDataset(valid, img_dir=img_dir, transform=transform_test)

In [None]:
def seed_worker(worker_id):
    """Re-seed NumPy and ``random`` from the current PyTorch seed.

    Intended as a ``DataLoader`` ``worker_init_fn``; ``worker_id`` is accepted
    for that signature but not used.
    """
    derived_seed = torch.initial_seed() % (1 << 32)  # fold into 32 bits
    for reseed in (np.random.seed, random.seed):
        reseed(derived_seed)


# Generator with a fixed seed, handed to the data loaders
g = torch.Generator()
g.manual_seed(0)

In [None]:
from torch.utils.data import DataLoader # Class for creating data loaders

batch_size = 4

# Training loader: reshuffled every epoch, driven by the seeded generator
loader_train = DataLoader(dataset=dataset_train,
                          batch_size=batch_size,
                          shuffle=True,
                          generator=g,
                          worker_init_fn=seed_worker)
# Validation loader: same settings, but keeps the original sample order
loader_valid = DataLoader(dataset=dataset_valid,
                          batch_size=batch_size,
                          shuffle=False,
                          generator=g,
                          worker_init_fn=seed_worker)

## Create and Train Model, Model Performance Validation

### Create Model

In [None]:
!pip install efficientnet-pytorch==0.7.1

In [None]:
from efficientnet_pytorch import EfficientNet # EfficientNet Model

# Build the pretrained EfficientNet-B7 with a 4-class output and move it
# to the selected device in a single expression
model = EfficientNet.from_pretrained('efficientnet-b7', num_classes=4).to(device)

### Loss Function, Optimizer, and Scheduler

In [None]:
import torch.nn as nn # Neural Network module

# Loss function
criterion = nn.CrossEntropyLoss()

In [None]:
# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00007, weight_decay=0.0001)

In [None]:
from transformers import get_cosine_schedule_with_warmup

epochs = 38  # Total number of training epochs

# Cosine schedule stepped once per batch: warm up over the first 5 epochs'
# worth of batches, then decay over the rest of the run
scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=5 * len(loader_train),
    num_training_steps=epochs * len(loader_train),
)

### Train and Validate a Model

In [None]:
from sklearn.metrics import roc_auc_score # ROC AUC Score Calculation Function
from tqdm.notebook import tqdm # Progress Bar

# Train for `epochs` epochs; after each one, evaluate on the validation loader.
for epoch in range(epochs):
    model.train()  # Switch the model to training mode
    epoch_train_loss = 0  # Running sum of the mini-batch losses of this epoch
    for images, labels in tqdm(loader_train):
        # Move the mini-batch to the device the model lives on
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()  # Clear gradients left over from the previous step
        outputs = model(images)  # Forward pass: one row of class scores per image
        loss = criterion(outputs, labels)
        loss.backward()  # Backpropagation
        optimizer.step()  # Update the weights
        scheduler.step()  # The learning-rate schedule advances once per batch
        epoch_train_loss += loss.item()
    # Mean training loss over all mini-batches of the epoch
    print(f'Epoch [{epoch+1}/{epochs}] - Train data loss : {epoch_train_loss/len(loader_train):.4f}')

    model.eval()  # Switch the model to evaluation mode
    epoch_valid_loss = 0  # Running sum of the validation mini-batch losses
    preds_list = []  # Predicted class probabilities, one row per sample
    true_onehot_list = []  # One-hot encoded true labels, one row per sample

    with torch.no_grad():  # No gradients are needed for evaluation
        for images, labels in loader_valid:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            epoch_valid_loss += loss.item()

            preds = torch.softmax(outputs.cpu(), dim=1).numpy()  # Class probabilities
            # Move the labels back to the CPU before indexing: `torch.eye(4)`
            # lives on the CPU, and indexing a CPU tensor with CUDA indices
            # can raise a RuntimeError on recent PyTorch versions.
            true_onehot = torch.eye(4)[labels.cpu()].numpy()
            preds_list.extend(preds)
            true_onehot_list.extend(true_onehot)
        # Mean validation loss and ROC AUC over the whole validation set
        print(f'Epochs [{epoch+1}/{epochs}] - Valid data loss : {epoch_valid_loss/len(loader_valid):.4f} / Valid data ROC AUC : {roc_auc_score(true_onehot_list, preds_list):.4f}')  

## Prediction and Submission

In [None]:
# Plain test data: deterministic transform, images only, original order
dataset_test = ImageDataset(test, img_dir=img_dir,
                            is_test=True, transform=transform_test)
loader_test = DataLoader(dataset=dataset_test,
                         batch_size=batch_size,
                         shuffle=False,
                         generator=g,
                         worker_init_fn=seed_worker)

# TTA data: the same test images, passed through the random training transform
dataset_TTA = ImageDataset(test, img_dir=img_dir,
                           is_test=True, transform=transform_train)
loader_TTA = DataLoader(dataset=dataset_TTA,
                        batch_size=batch_size,
                        shuffle=False,
                        generator=g,
                        worker_init_fn=seed_worker)

### Prediction

In [None]:
model.eval()  # Evaluation mode for inference

# One row of four class probabilities per test image
preds_test = np.zeros((len(test), 4))

with torch.no_grad():
    for batch_idx, batch in enumerate(loader_test):
        logits = model(batch.to(device))
        # Softmax over the class dimension gives per-class probabilities
        probs = torch.softmax(logits.cpu(), dim=1).squeeze().numpy()
        # The loader is not shuffled, so batch k fills rows k*batch_size onward
        start = batch_idx * batch_size
        preds_test[start:start + batch_size] += probs

### Submission

In [None]:
submission_test = submission.copy() # Copy Submission Sample

submission_test[['healthy', 'multiple_diseases', 'rust', 'scab']] = preds_test

## TTA and Label Smoothing for Performance Improvements

In [None]:
num_TTA = 5 # Number of TTA passes

preds_tta = np.zeros((len(test), 4)) # Accumulator for the predicted probabilities (TTA)

# Run the TTA loader `num_TTA` times and sum the predicted probabilities;
# the sum is divided by `num_TTA` afterwards to obtain the average.
# The outer counter is unused, so it is named `_` and no longer shadowed
# by the batch index of the inner loop.
for _ in range(num_TTA):
    with torch.no_grad():
        for batch_idx, images in enumerate(loader_TTA):
            images = images.to(device)
            outputs = model(images)
            # Class probabilities for this mini-batch
            preds_part = torch.softmax(outputs.cpu(), dim=1).squeeze().numpy()
            # The loader is not shuffled, so batch k maps to rows k*batch_size onward
            start = batch_idx * batch_size
            preds_tta[start:start + batch_size] += preds_part

In [None]:
preds_tta /= num_TTA 

In [None]:
submission_tta = submission.copy() 

submission_tta[['healthy', 'multiple_diseases', 'rust', 'scab']] = preds_tta

In [None]:
submission_test.to_csv('submission_test.csv', index=False)
submission_tta.to_csv('submission_tta.csv', index=False)

In [None]:
def apply_label_smoothing(df, target, alpha, threshold):
    """Return the target columns with label smoothing applied to confident rows.

    A row is smoothed when at least one of its target values is strictly
    greater than ``threshold``; every value ``p`` of such a row becomes
    ``(1 - alpha) * p + alpha / k`` with ``k = len(target)``. All other rows
    are returned unchanged. ``df`` itself is not modified.

    Args:
        df: DataFrame containing the target columns.
        target: List of target column names.
        alpha: Smoothing strength.
        threshold: Confidence above which a row is smoothed.

    Returns:
        A new float DataFrame with the columns ``target`` and the index of ``df``.
    """
    # astype returns a fresh float frame, so the caller's data stays untouched
    # and smoothed values are never written into integer columns.
    df_target = df[target].astype(float)
    k = len(target)  # Number of target columns

    # Select rows by boolean position rather than by index label: the previous
    # `iloc[idx]` with the label from iterrows() broke on any non-default index.
    confident = (df_target > threshold).any(axis=1).to_numpy()
    if confident.any():
        smoothed = (1 - alpha) * df_target.loc[confident] + alpha / k
        df_target.loc[confident] = smoothed.to_numpy()
    return df_target

In [None]:
alpha = 0.01  # Smoothing strength
threshold = 0.99  # Only rows with a probability above this are smoothed
target = ['healthy', 'multiple_diseases', 'rust', 'scab']  # Target column names

# Plain test predictions: copy, smooth the target columns, export
submission_test_ls = submission_test.copy()
submission_test_ls[target] = apply_label_smoothing(
    submission_test_ls, target, alpha, threshold)
submission_test_ls.to_csv('submission_test_ls.csv', index=False)

# TTA predictions: same treatment
submission_tta_ls = submission_tta.copy()
submission_tta_ls[target] = apply_label_smoothing(
    submission_tta_ls, target, alpha, threshold)
submission_tta_ls.to_csv('submission_tta_ls.csv', index=False)

In [None]:
path = './'  # Directory prefix the checkpoint is written under

# Store the model weights and the optimizer state together in one checkpoint file
torch.save(
    dict(model=model.state_dict(), optimizer=optimizer.state_dict()),
    f'{path}EfficientNet-B7.tar',
)