In [None]:
import os
import gc
import cv2
import math
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp
from torch.cuda.amp import autocast, GradScaler

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

# For Image Models
import timm

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2


import warnings
warnings.filterwarnings("ignore")
# For descriptive error messages
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
# Central experiment configuration; the rest of the notebook reads these values via CONFIG[...].
CONFIG = {"seed": 42,
          "epochs": 25,
          "img_size": 768,
          "model_name": "tf_efficientnet_b6_ns", # tf_efficientnet_b6_ns, tf_efficientnetv2_l_in21k, eca_nfnet_l2
          "num_classes": 15587,
          "embedding_size": 512,
          "train_batch_size": 32,
          "valid_batch_size": 64,
          "learning_rate": 1e-4,
          "scheduler": 'CosineAnnealingLR',
          "min_lr": 1e-7,
          "T_max": 24, # placeholder, recomputed from "epochs" right after the dict
          # NOTE(review): the value was missing here (syntax error); 1e-6 is an assumed
          # mild AdamW decay -- confirm against the intended experiment setup.
          "weight_decay": 1e-6,
          "n_fold": 5,
          "n_accumulate": 1,
          "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
          "gpu_parallel":True,
          "max_grad_norm":1000,
          "amp":False,
          "image_data":"fullbody",
          "debug":False,
          "num_workers":10,

          # ArcFace Hyperparameters
          "s": 30.0, # arcface scale
          "m": 0.30, # arcface margin
          "ls_eps": 0.0, # arcface label smoothing
          "easy_margin": False, # arcface easy_margin
          }

# Keep the cosine schedule length tied to the number of epochs.
CONFIG["T_max"] = CONFIG["epochs"]-1

# Debug mode swaps in a smaller backbone and image size for quick iterations.
if CONFIG["debug"]:
    CONFIG["img_size"] = 512
    CONFIG["model_name"] = "tf_efficientnet_b0_ns"
    CONFIG["train_batch_size"] = 32
    CONFIG["valid_batch_size"] = 64

In [None]:
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN in deterministic mode.

    Args:
        seed: Integer seed applied to every generator; also exported as the
            ``PYTHONHASHSEED`` environment variable.
    """
    # Environment first, then each library's generator.
    os.environ['PYTHONHASHSEED'] = str(seed)

    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Deterministic kernels; flip benchmark to True to trade determinism for speed.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
# Seed all RNGs with the configured seed before any data or model code runs.
seed_everything(CONFIG['seed'])

In [None]:
ROOT_DIR = '/home/workspace/happy-whale-and-dolphin'

# Start from the default image folders, then override them for the
# cropped variants selected through CONFIG["image_data"].
TRAIN_DIR = f'{ROOT_DIR}/train_images'
TEST_DIR = f'{ROOT_DIR}/test_images'

if CONFIG["image_data"] == "backfins":
    TRAIN_DIR = f'{ROOT_DIR}/train_backfins_images'
    TEST_DIR = f'{ROOT_DIR}/test_backfins_images'
elif CONFIG["image_data"] == "fullbody":
    TRAIN_DIR = f'{ROOT_DIR}/train_fullbody_images'
    TEST_DIR = f'{ROOT_DIR}/test_fullbody_images'

In [None]:
def get_train_file_path(id):
    """Return the path of the training image named ``id`` inside ``TRAIN_DIR``."""
    # NOTE: the parameter shadows the builtin ``id``; the name is kept so
    # keyword callers keep working.
    image_path = f"{TRAIN_DIR}/{id}"
    return image_path

# Load the training metadata and attach the on-disk path of every image.
df = pd.read_csv(f"{ROOT_DIR}/train.csv")
df['file_path'] = df['image'].map(get_train_file_path)
df.head()

In [None]:
# Replace the raw individual ids with integer class indices (in place in `df`).
encoder = LabelEncoder()
encoder.fit(df['individual_id'])
df['individual_id'] = encoder.transform(df['individual_id'])

# Store the fitted encoder next to the data.
with open(f'{ROOT_DIR}/le.pkl', "wb") as fp:
    joblib.dump(encoder, fp)

In [None]:
# Tag every row with the fold in which it belongs to the validation split.
skf = StratifiedKFold(n_splits=CONFIG['n_fold'])
fold_splits = skf.split(X=df, y=df.individual_id)
for fold, (_, val_) in enumerate(fold_splits):
    df.loc[val_, "kfold"] = fold

In [None]:
class HappyWhaleDataset(Dataset):
    '''
    Dataset that reads one image per dataframe row and returns it with its label.

    Args:
        df: DataFrame providing a ``file_path`` column (image location) and an
            ``individual_id`` column (label, converted to ``torch.long``).
        transforms: Optional callable invoked as ``transforms(image=img)`` whose
            result is indexed with ``"image"`` (albumentations-style pipeline).
    '''
    def __init__(self, df, transforms=None):
        self.df = df
        # Cache the columns as arrays so __getitem__ can index them by position.
        self.file_names = df['file_path'].values
        self.labels = df['individual_id'].values
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``{'image': ..., 'label': LongTensor}`` for row ``index``.

        Raises:
            FileNotFoundError: if the image cannot be read from disk.
        """
        img_path = self.file_names[index]
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals a missing/unreadable file by returning None;
            # fail here with the path instead of an opaque cvtColor error.
            raise FileNotFoundError(f"Could not read image: {img_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR
        label = self.labels[index]

        if self.transforms:
            img = self.transforms(image=img)["image"]

        return {
            'image': img,
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
def _normalize_and_tensorize():
    """Build the tail shared by both pipelines: normalisation, then tensor conversion."""
    return [
        A.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0,
        ),
        ToTensorV2(),
    ]


# Random augmentations applied to training images only.
_train_augmentations = [
    A.Affine(rotate=(-15, 15), translate_percent=(0.0, 0.25), shear=(-3, 3), p=0.5),
    A.RandomResizedCrop(CONFIG['img_size'], CONFIG['img_size'], scale=(0.9, 1.0), ratio=(0.75, 1.333)),
    A.ToGray(p=0.1),
    A.GaussianBlur(blur_limit=(3, 7), p=0.07),
    A.GaussNoise(p=0.07),  # Gaussian noise
    A.RandomGridShuffle(grid=(2, 2), p=0.28),
    A.RandomBrightnessContrast(p=0.5),
    A.HorizontalFlip(p=0.1),
]

# Validation images are only resized to the training resolution.
_valid_preprocessing = [
    A.Resize(CONFIG['img_size'], CONFIG['img_size']),
]

data_transforms = {
    "train": A.Compose(_train_augmentations + _normalize_and_tensorize(), p=1.),
    "valid": A.Compose(_valid_preprocessing + _normalize_and_tensorize(), p=1.),
}

In [None]:
from pylab import rcParams

# Preview randomly chosen training samples after augmentation:
# two figures with three images each.
dataset_show = HappyWhaleDataset(df, transforms=data_transforms['train'])
rcParams['figure.figsize'] = (15, 8)

for i in range(2):
    f, axarr = plt.subplots(1, 3)
    for p in range(3):
        idx = np.random.randint(0, len(dataset_show))
        img_data = dataset_show[idx]
        # imshow wants channels last, so move the first axis to the end.
        channels_last = img_data["image"].permute(1, 2, 0)
        axarr[p].imshow(channels_last)
        axarr[p].set_title(str(img_data["label"]))

In [None]:
class GeM(nn.Module):
    """Generalized-mean pooling over the last two dimensions.

    Args:
        p: Initial value of the learnable pooling exponent.
        eps: Lower clamp applied to the input before exponentiation.
    """

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.p = nn.Parameter(torch.ones(1)*p)

    def forward(self, x):
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        """Clamp, raise to ``p``, average over the full spatial window, take the ``1/p`` root."""
        window = (x.size(-2), x.size(-1))
        powered = x.clamp(min=eps).pow(p)
        averaged = F.avg_pool2d(powered, window)
        return averaged.pow(1./p)

    def __repr__(self):
        exponent = self.p.data.tolist()[0]
        return f"{self.__class__.__name__}(p={exponent:.4f}, eps={self.eps})"

In [None]:
# Arcface
class ArcMarginProduct(nn.Module):
    r"""ArcFace head: cosine logits with an additive angular margin on the target class.

    Args:
        in_features: size of each input embedding
        out_features: number of classes (rows of the weight matrix)
        s: scale applied to the final logits
        m: angular margin added to the target-class angle, i.e. cos(theta + m)
        easy_margin: if True, apply the margin only where cos(theta) > 0
        ls_eps: label-smoothing factor applied to the one-hot target mask
    """
    def __init__(self, in_features, out_features, s=30.0,
                 m=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s  # logit scale
        self.m = m  # angular margin
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)
        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.threshold = math.cos(math.pi - m)  # cos(pi - m) = -cos(m)
        self.mm = math.sin(math.pi - m) * m  # sin(pi - m) * m = sin(m) * m

    def forward(self, input, label):
        """Return scaled logits of shape ``(len(input), out_features)``.

        Args:
            input: batch of embeddings, one row per sample.
            label: class index of every sample; reshaped to a column internally.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------
        # cos(theta) between normalised embeddings and normalised class weights.
        # Cast to float32 up front so the margin math is never done in half precision.
        cosine = F.linear(F.normalize(input), F.normalize(self.weight)).float()
        # sin(theta). The clamp keeps rounding (cos^2 marginally above 1) from
        # producing NaN and avoids the infinite sqrt gradient at exactly 0.
        sine = torch.sqrt(torch.clamp(1.0 - torch.pow(cosine, 2), min=1e-12))
        # cos(theta)*cos(m) - sin(theta)*sin(m) = cos(theta + m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            # cos(theta) > cos(pi - m) means theta + m < pi, so phi = cos(theta + m) is used.
            # Otherwise theta + m >= pi and the linear fallback cos(theta) - m*sin(m)
            # is used instead of cos(theta + m).
            phi = torch.where(cosine > self.threshold, phi, cosine - self.mm)
        # https://github.com/ronghuaiyang/arcface-pytorch/issues/48
        # --------------------------- convert label to one-hot ---------------------
        # Allocate the mask on the logits' own device rather than a global setting.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Margin logit for the target class, plain cosine for all other classes.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s

        return output

In [None]:
class HappyWhaleModel(nn.Module):
    """timm backbone followed by GeM pooling, an embedding layer and an ArcFace head.

    Args:
        model_name: timm model name. It must contain ``'efficientnet'`` or
            ``'nfnet'``; only those two head layouts are handled.
        embedding_size: output width of the embedding layer.
        pretrained: forwarded to ``timm.create_model``.

    Raises:
        ValueError: if ``model_name`` belongs to neither supported family.

    The ArcFace head is configured from the module-level ``CONFIG``.
    """
    def __init__(self, model_name, embedding_size, pretrained=True):
        super(HappyWhaleModel, self).__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained)
        # Remove the stock classifier and pooling so the backbone returns feature maps.
        if 'efficientnet' in model_name:
            in_features = self.model.classifier.in_features
            self.model.classifier = nn.Identity()
            self.model.global_pool = nn.Identity()
        elif 'nfnet' in model_name:
            in_features = self.model.head.fc.in_features
            self.model.head.fc = nn.Identity()
            self.model.head.global_pool = nn.Identity()
        else:
            # Previously this fell through and crashed later on an unbound `in_features`.
            raise ValueError(
                f"Unsupported model_name {model_name!r}: expected a name containing "
                "'efficientnet' or 'nfnet'"
            )

        self.pooling = GeM() # GeM Pooling
        self.embedding = nn.Sequential(
                            nn.BatchNorm1d(in_features),
                            nn.Linear(in_features, embedding_size)
                            )
        # arcface
        self.fc = ArcMarginProduct(embedding_size,
                                   CONFIG["num_classes"],
                                   s=CONFIG["s"],
                                   m=CONFIG["m"],
                                   easy_margin=CONFIG["easy_margin"],
                                   ls_eps=CONFIG["ls_eps"])

    def forward(self, images, labels):
        '''
        train/valid: return the ArcFace logits for ``images`` given their ``labels``.
        '''
        embedding = self.extract(images)
        output = self.fc(embedding, labels) # arcface
        return output

    def extract(self, images):
        '''
        test: return the embedding of ``images`` (no ArcFace head, no labels needed).
        '''
        features = self.model(images)
        pooled_features = self.pooling(features).flatten(1)
        embedding = self.embedding(pooled_features) # embedding
        return embedding

# Build the network described by CONFIG.
model = HappyWhaleModel(
    CONFIG['model_name'],
    CONFIG['embedding_size'],
)

# Optionally spread batches over every visible GPU.
if CONFIG['gpu_parallel']:
    num_gpu = torch.cuda.device_count()
    model = nn.DataParallel(model, device_ids=range(num_gpu))

# Assigning to `_` keeps the module repr out of the cell output.
_ = model.to(CONFIG['device'])

In [None]:
def criterion(outputs, labels):
    """Mean cross-entropy between the logits ``outputs`` and class indices ``labels``."""
    loss_fn = nn.CrossEntropyLoss()
    batch_loss = loss_fn(outputs, labels)
    return batch_loss

In [None]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train ``model`` for one pass over ``dataloader`` and return the mean sample loss.

    Args:
        model: network called as ``model(images, labels)``.
        optimizer: optimizer updated every ``CONFIG['n_accumulate']`` batches.
        scheduler: LR scheduler stepped once at the end of the epoch, or ``None``.
        dataloader: yields dicts with ``'image'`` and ``'label'`` entries.
        device: device the batches are moved to.
        epoch: epoch number, only shown in the progress bar.

    Returns:
        Running mean of the per-sample loss over the epoch (``nan`` if the
        dataloader yields no batches).

    Reads ``CONFIG['amp']``, ``CONFIG['n_accumulate']`` and ``CONFIG['max_grad_norm']``.
    """
    model.train()
    use_amp = CONFIG["amp"]
    n_accumulate = CONFIG['n_accumulate']
    # A disabled scaler makes scale()/unscale_()/step()/update() pass-throughs,
    # so a single code path serves both the AMP and the full-precision case.
    scaler = GradScaler(enabled=use_amp)

    dataset_size = 0
    running_loss = 0.0
    pred_correct = 0
    correct_top5 = 0
    epoch_loss = float("nan")  # returned unchanged if there are no batches
    grad_norm = float("nan")   # displayed until the first optimizer step

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        images = data['image'].to(device, dtype=torch.float)
        labels = data['label'].to(device, dtype=torch.long)
        batch_size = images.size(0)

        with autocast(enabled=use_amp):
            outputs = model(images, labels)
            loss = criterion(outputs, labels)

        # Only the backward pass sees the accumulation-scaled loss; the
        # reported loss below stays the true per-batch value.
        scaler.scale(loss / n_accumulate).backward()

        if (step + 1) % n_accumulate == 0:
            # Gradients must be unscaled before clipping, otherwise the
            # threshold is compared against AMP-scaled values.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(
                model.parameters(), CONFIG['max_grad_norm']
            ).item()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        running_loss += loss.item() * batch_size
        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size

        # Accuracy metrics computed on-device; top-k avoids a full sort over all classes.
        logits = outputs.detach()
        pred_correct += (logits.argmax(dim=-1) == labels).sum().item()
        top_k = min(5, logits.size(-1))
        top5_indices = logits.topk(top_k, dim=-1).indices
        correct_top5 += (top5_indices == labels.unsqueeze(1)).any(dim=1).sum().item()

        bar.set_postfix(Epoch=epoch,
                        Train_Loss=epoch_loss,
                        Train_Acc=pred_correct / dataset_size,
                        Train_Top5_Acc=correct_top5 / dataset_size,
                        grad_norm=grad_norm,
                        LR=optimizer.param_groups[0]['lr']
                        )

    # CONFIG["T_max"] is expressed in epochs (epochs - 1), so the schedule
    # advances once per epoch; stepping per batch would make the cosine LR
    # oscillate many times within a single epoch.
    if scheduler is not None:
        scheduler.step()

    gc.collect()
    return epoch_loss

In [None]:
@torch.inference_mode()
def valid_one_epoch(model, dataloader, device, epoch):
    """Evaluate ``model`` on ``dataloader`` and return the mean sample loss.

    Args:
        model: network called as ``model(images, labels)``.
        dataloader: yields dicts with ``'image'`` and ``'label'`` entries.
        device: device the batches are moved to.
        epoch: epoch number, only shown in the progress bar.

    Returns:
        Running mean of the per-sample loss (``nan`` if the dataloader yields
        no batches).
    """
    model.eval()

    dataset_size = 0
    running_loss = 0.0
    pred_correct = 0
    correct_top5 = 0
    epoch_loss = float("nan")  # returned unchanged if there are no batches

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        images = data['image'].to(device, dtype=torch.float)
        labels = data['label'].to(device, dtype=torch.long)
        batch_size = images.size(0)

        outputs = model(images, labels)
        loss = criterion(outputs, labels)

        running_loss += loss.item() * batch_size
        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size

        # Accuracy metrics computed on-device; top-k avoids a full sort over all classes.
        logits = outputs.detach()
        pred_correct += (logits.argmax(dim=-1) == labels).sum().item()
        top_k = min(5, logits.size(-1))
        top5_indices = logits.topk(top_k, dim=-1).indices
        correct_top5 += (top5_indices == labels.unsqueeze(1)).any(dim=1).sum().item()

        # The learning rate is not reported here: this function receives no
        # optimizer, and reading one from notebook globals is hidden state.
        bar.set_postfix(Epoch=epoch,
                        Valid_Loss=epoch_loss,
                        Valid_Acc=pred_correct / dataset_size,
                        Valid_Top5_Acc=correct_top5 / dataset_size)

    gc.collect()
    return epoch_loss

In [None]:
def run_training(model, optimizer, scheduler, device, num_epochs):
    """Run the train/validate loop and return the best model plus the loss history.

    After every epoch the validation loss is compared with the best one so far;
    on an improvement (or tie) the weights are copied in memory and written to
    ``Loss<loss>_epoch<epoch>.bin`` in the current directory.

    Args:
        model: network to train.
        optimizer: optimizer passed to ``train_one_epoch``.
        scheduler: LR scheduler passed to ``train_one_epoch`` (may be ``None``).
        device: device used for both training and validation batches.
        num_epochs: number of epochs to run.

    Returns:
        ``(model, history)`` where ``model`` holds the best weights and
        ``history`` maps ``'Train Loss'`` / ``'Valid Loss'`` to per-epoch lists.

    Note: ``train_loader`` and ``valid_loader`` are read from module scope and
    must be defined before this function is called.
    """
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        # Use the caller-supplied device instead of re-reading it from CONFIG.
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler, dataloader=train_loader, device=device, epoch=epoch)
        val_epoch_loss = valid_one_epoch(model, valid_loader, device=device, epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)

        # Snapshot the weights whenever validation loss does not get worse.
        if val_epoch_loss <= best_epoch_loss:
            print(f"Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = "Loss{:.4f}_epoch{:.0f}.bin".format(best_epoch_loss, epoch)
            torch.save(model.state_dict(), PATH)
            # The checkpoint is written relative to the current working directory.
            print("Model Saved")
        print()

    end = time.time()
    time_elapsed = end - start
    # Report the wall-clock duration of the whole run.
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

In [None]:
def fetch_scheduler(optimizer):
    """Build the learning-rate scheduler named by ``CONFIG['scheduler']``.

    Args:
        optimizer: optimizer whose learning rate the scheduler controls.

    Returns:
        A ``CosineAnnealingLR`` or ``CosineAnnealingWarmRestarts`` instance, or
        ``None`` when ``CONFIG['scheduler']`` is ``None``.

    Raises:
        ValueError: if ``CONFIG['scheduler']`` names an unsupported scheduler
            (previously this surfaced as an unbound-local error).
        KeyError: if ``'CosineAnnealingWarmRestarts'`` is selected but
            ``CONFIG`` has no ``'T_0'`` entry.
    """
    name = CONFIG['scheduler']

    if name is None:
        return None
    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=CONFIG['T_max'], eta_min=CONFIG['min_lr'])
    if name == 'CosineAnnealingWarmRestarts':
        # NOTE(review): 'T_0' must be added to CONFIG before selecting this scheduler.
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=CONFIG['T_0'], eta_min=CONFIG['min_lr'])

    raise ValueError(
        f"Unsupported scheduler {name!r}: expected 'CosineAnnealingLR', "
        "'CosineAnnealingWarmRestarts' or None"
    )

In [None]:
def prepare_loaders(df, fold):
    """Split ``df`` on its ``kfold`` column and wrap both parts in DataLoaders.

    Args:
        df: DataFrame with a ``kfold`` column plus the columns HappyWhaleDataset reads.
        fold: fold value used for validation; all other rows are used for training.

    Returns:
        ``(train_loader, valid_loader)``.
    """
    # Settings common to both loaders.
    shared_kwargs = dict(num_workers=CONFIG["num_workers"], pin_memory=False)

    train_rows = df[df.kfold != fold].reset_index(drop=True)
    valid_rows = df[df.kfold == fold].reset_index(drop=True)

    train_loader = DataLoader(
        HappyWhaleDataset(train_rows, transforms=data_transforms["train"]),
        batch_size=CONFIG['train_batch_size'],
        shuffle=True,
        drop_last=True,  # incomplete final training batch is discarded
        **shared_kwargs,
    )
    valid_loader = DataLoader(
        HappyWhaleDataset(valid_rows, transforms=data_transforms["valid"]),
        batch_size=CONFIG['valid_batch_size'],
        shuffle=False,
        **shared_kwargs,
    )

    return train_loader, valid_loader

<span style="color: #000508; font-family: Segoe UI; font-size: 1.5em; font-weight: 300;">Prepare Dataloaders and Run Training</span>

In [None]:
# Fold 0 is the validation split; every other fold is used for training.
train_loader, valid_loader = prepare_loaders(df, fold=0)

optimizer = optim.AdamW(
    model.parameters(),
    lr=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay'],
)
scheduler = fetch_scheduler(optimizer)  # build the scheduler selected in CONFIG

model, history = run_training(
    model,
    optimizer,
    scheduler,
    device=CONFIG['device'],
    num_epochs=CONFIG['epochs'],
)