# Aerial Cactus Identification - Baseline

Tutorial Link -> https://www.kaggle.com/code/werooring/ch11-baseline

**Fix Seed Value**

In [19]:
import torch # pytorch
import random
import numpy as np
import os

# Seed every random number generator used in this notebook with one value
seed = 50
os.environ['PYTHONHASHSEED'] = str(seed)

# Python, NumPy and PyTorch (CPU, current GPU, all GPUs) each get the same seed
for apply_seed in (random.seed,
                   np.random.seed,
                   torch.manual_seed,
                   torch.cuda.manual_seed,
                   torch.cuda.manual_seed_all):
    apply_seed(seed)

# cuDNN switches: deterministic mode on, benchmark mode off, cuDNN itself off
cudnn = torch.backends.cudnn
cudnn.deterministic = True
cudnn.benchmark = False
cudnn.enabled = False

**Set GPU Environment**

In [20]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [21]:
# display which device was selected above
device

device(type='cpu')

**Prep Data**

1. Divide train / validation data
2. Define DataSet class
3. Create DataSet
4. Create DataLoader

- What do the Dataset class and the DataLoader do?
    - Together they provide the data in mini-batches, which is the unit a deep learning model is trained on

In [22]:
import pandas as pd

data_path = '/kaggle/input/aerial-cactus-identification/'

# Build both CSV paths from the shared input directory, then load them
train_csv, submission_csv = (data_path + file_name
                             for file_name in ('train.csv', 'sample_submission.csv'))

labels = pd.read_csv(train_csv)              # train data
submission = pd.read_csv(submission_csv)     # sample submission file

In [23]:
# unzip the zip file 

from zipfile import ZipFile

# Extract the training and then the test image archive into the working directory
for archive_name in ('train.zip', 'test.zip'):
    with ZipFile(data_path + archive_name) as zipper:
        zipper.extractall()

**Divide Train / Validation Data**

- stratify
    - Stratifying means ensuring that each class or category in a dataset is proportionally represented in every part when the data is split
    - ex) `stratify = labels['has_cactus']` in the next cell -> the target value ratio was 1 : 3 -> the same ratio is kept in both the train and the validation data

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows for validation.
# stratify keeps the has_cactus class ratio the same in both parts, and
# random_state reuses the notebook-wide `seed` instead of repeating the literal 50.
train, valid = train_test_split(labels,
                               test_size = 0.1, # ratio; train : valid = 9 : 1
                               stratify = labels['has_cactus'],
                               random_state = seed)

In [25]:
# Sanity check of the split: train rows : validation rows should be 9 : 1
n_train, n_valid = len(train), len(valid)
print('train data count: ', n_train)
print('validation data count: ', n_valid)

train data count:  15750
validation data count:  1750


**Define DataSet Class**

- Use Dataset class provided by pytorch
- Have to override `__len__()` and `__getitem__()`
    - `__len__()`: return the Dataset's size
    - `__getitem__()`: return the data at the corresponding index

In [26]:
import cv2
from torch.utils.data import Dataset 

class ImageDataset(Dataset):
    """Dataset that serves (image, label) pairs listed in a DataFrame.

    Column 0 of ``df`` is read as the image file name and column 1 as the
    target value. Images are loaded with OpenCV from ``img_dir`` and
    converted from BGR to RGB channel order.
    """

    def __init__(self, df, img_dir = './', transform = None):
        """Store the frame, the image directory prefix and the optional transform.

        Args:
            df: DataFrame with the image file name in column 0 and the
                target value in column 1 (train or validation data).
            img_dir: path prefix that is concatenated directly with the file
                name, so it must end with a path separator.
            transform: optional callable applied to each loaded RGB image.
        """
        super().__init__()
        self.df = df
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the sample at position ``idx`` and return ``(image, label)``.

        Raises:
            FileNotFoundError: if OpenCV could not read the image file.
        """
        img_id = self.df.iloc[idx, 0]
        img_path = self.img_dir + img_id
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None;
            # fail here with the path instead of with an opaque cvtColor error
            raise FileNotFoundError(f'could not read image file: {img_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        label = self.df.iloc[idx, 1] # target value

        if self.transform is not None:
            # a transform was supplied -> apply it to the image
            image = self.transform(image)

        return image, label

**Create Dataset**

In [27]:
# transform img data to tensor type
from torchvision import transforms

transform = transforms.ToTensor()
# ToTensor reorders an image array: (height pixel num, width pixel num, channel num) -> (channel num, height pixel num, width pixel num)
# (per the torchvision docs it also rescales uint8 pixel values from [0, 255] to floats in [0.0, 1.0])
# if there's a batch -> (batch size, channel num, height pixel num, width pixel num)

In [28]:
# Both splits read their images from the extracted train/ folder and share one transform
dataset_train, dataset_valid = (
    ImageDataset(df = split, img_dir = 'train/', transform = transform)
    for split in (train, valid)
)

**Create Data Loader**

- Data Loader
    - Fetch data by designated batch size 

In [29]:
from torch.utils.data import DataLoader

# Shuffle only the training batches; the validation data is just scored,
# so it is read in a fixed order.
loader_train = DataLoader(dataset = dataset_train, batch_size = 32, shuffle = True)
loader_valid = DataLoader(dataset = dataset_valid, batch_size = 32, shuffle = False)

## Create Model

- CNN Model

In [30]:
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """Small two-layer CNN that outputs two raw class scores per image.

    Spatial size for a 32 x 32 input:
    32 -> conv1 (k=3, p=2) 34 -> max pool 17 -> conv2 (k=3, p=2) 19
    -> max pool 9 -> avg pool 4, i.e. 64 x 4 x 4 features for the linear layer.
    """

    def __init__(self):
        super().__init__()

        # 1st CNN layer: 3 input channels -> 32 feature maps
        self.conv1 = nn.Conv2d(in_channels = 3, out_channels = 32, kernel_size = 3, padding = 2)

        # 2nd CNN layer: 32 -> 64 feature maps
        self.conv2 = nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = 3, padding = 2)

        # Max Pooling layer (reused after each convolution)
        self.max_pool = nn.MaxPool2d(kernel_size = 2)

        # Mean Pooling layer
        self.avg_pool = nn.AvgPool2d(kernel_size = 2)

        # Fully Connected layer: 64 * 4 * 4 flattened features -> 2 outputs
        self.fc = nn.Linear(in_features = 64 * 4 * 4, out_features = 2)

    def forward(self, x):
        """Run the forward pass on a batch of images.

        Args:
            x: tensor of shape (batch, 3, height, width); the linear layer
               expects the 64 * 4 * 4 features that a 32 x 32 input produces.

        Returns:
            Tensor of shape (batch, 2) holding the raw (un-normalised) outputs.

        Raises:
            RuntimeError: if the input size does not yield 64 * 4 * 4 features
                per sample (raised by the linear layer).
        """
        x = self.max_pool(F.relu(self.conv1(x)))
        x = self.max_pool(F.relu(self.conv2(x)))
        x = self.avg_pool(x)
        # flatten per sample, keeping the batch dimension fixed; a wrongly sized
        # input then fails in self.fc instead of being silently reshaped into
        # a larger batch as view(-1, 64 * 4 * 4) would do
        x = x.flatten(start_dim = 1)
        return self.fc(x)

In [31]:
model = Model().to(device) # move the model to the selected device (GPU when CUDA is available, otherwise CPU)

## Train Model

**Set Loss Function**

In [32]:
# loss function -> use cross entropy (because it's a classification problem)
# it is given the model's raw outputs; the model itself applies no softmax
criterion = nn.CrossEntropyLoss()

**Set Optimizer**

In [33]:
# optimizer -> the algorithm that searches for better weights
# SGD -> the standard optimizer (stochastic gradient descent) over all model parameters
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

**Train Model**

- Process of Deep Learning Model Training for images
    1. Fetch data of batch size from data loader
    2. Allocate given image and label(target) data to device(GPU)
    3. Initialize gradient of Optimizer
    4. Give input data (img) to CNN model -> forward propagation -> get output (predicted value)
    5. Compare predicted value and label (target) value to calculate loss
    6. Perform back propagation based on loss
    7. Update weight using gradient calculated by back prop
    8. Repeat 1~7 * (repeat count)
    9. Repeat 1~8 * (epoch count)

In [34]:
epochs = 10

# make sure the model is in training mode, even if an evaluation cell ran before this one
model.train()

for epoch in range(epochs):
    epoch_loss = 0 # sum of the batch losses of this epoch

    # the batch targets are named `targets` (not `labels`) so that the `labels`
    # DataFrame loaded from train.csv is not overwritten by a tensor
    for images, targets in loader_train: # repeat count = len(loader_train)
        images = images.to(device)
        targets = targets.to(device)

        optimizer.zero_grad() # reset the gradients left from the previous step

        outputs = model(images) # forward propagation

        loss = criterion(outputs, targets)

        epoch_loss += loss.item()
        loss.backward() # back propagation

        optimizer.step() # new weight = original weight - (learning rate * gradient)

    # report the mean batch loss of the epoch
    print(f'epoch [{epoch+1}/{epochs}] - loss: {epoch_loss/len(loader_train):.4f}')

epoch [1/10] - loss: 0.5240
epoch [2/10] - loss: 0.3407
epoch [3/10] - loss: 0.2444
epoch [4/10] - loss: 0.1975
epoch [5/10] - loss: 0.1747
epoch [6/10] - loss: 0.1637
epoch [7/10] - loss: 0.1515
epoch [8/10] - loss: 0.1430
epoch [9/10] - loss: 0.1353
epoch [10/10] - loss: 0.1287


## Performance Validation

In [43]:
from sklearn.metrics import roc_auc_score
import numpy as np

# Two separate, empty accumulators: true target values and predicted probabilities
true_list, preds_list = [], []

In [44]:
model.eval() # evaluation stage -> won't apply dropout

# start from empty lists on every run, so re-executing this cell does not
# append the same predictions a second time
true_list = []
preds_list = []

with torch.no_grad(): # inactivate calculating gradient (no need to calculate gradient in evaluation step)
    # the batch targets are named `targets` (not `labels`) so that the `labels`
    # DataFrame loaded from train.csv is not overwritten by a tensor
    for images, targets in loader_valid:
        images = images.to(device)

        outputs = model(images)
        # why back to cpu? -> roc_auc is sklearn -> it can't perform on GPU
        preds = torch.softmax(outputs.cpu(), dim = 1)[:, 1] # probability of class 1

        # the targets are only needed by sklearn, so they are not moved to the device;
        # convert both tensors to numpy values before collecting them
        preds_list.extend(preds.numpy())
        true_list.extend(targets.numpy())

print(f'validation data ROC AUC: {roc_auc_score(true_list, preds_list):.4f}')

validation data ROC AUC: 0.9900


## Prediction and Submit Result

In [45]:
# Wrap the sample-submission frame as a dataset over the extracted test/ images,
# then batch it without shuffling so predictions stay in row order
dataset_test = ImageDataset(submission, img_dir = 'test/', transform = transform)
loader_test = DataLoader(dataset_test, batch_size = 32, shuffle = False)

In [48]:
model.eval()

# Collect the class-1 probability of every test image, batch by batch
preds = []

with torch.no_grad():
    for batch, _ in loader_test:
        logits = model(batch.to(device))
        class1_probs = torch.softmax(logits.cpu(), dim = 1)[:, 1]
        preds.extend(class1_probs.tolist())

In [49]:
# fill the has_cactus column with the predicted probabilities, then write the submission file
submission['has_cactus'] = preds
submission.to_csv('submission.csv', index = False) # index = False -> the DataFrame index is not written as a column

In [50]:
import shutil

# Remove the two extracted image folders, including everything inside them
for extracted_dir in ('./train', './test'):
    shutil.rmtree(extracted_dir)