In [None]:
# Standard library
import copy
import gc
import glob
import math
import os
import random
import time
from collections import defaultdict

# For data manipulation
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Image processing (used by SmokeData, data_transforms and SmokeModel)
import cv2
import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Pytorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp
import torchvision

# Utils
import joblib
from tqdm import tqdm

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
sr_ = Style.RESET_ALL

# Format and filter potential warnings and errors
import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
! pip install pytorch_metric_learning

In [None]:
# Directory holding the competition data and the names of the two CSV tables in it.
PATH = '/kaggle/input/aaa-ml-2021/avito-auto-moderation/'
TRAIN_FILE, SUB_FILE = 'train_v2.csv', 'sample_submission_v2.csv'

# Read both tables through the same expression so they are loaded the same way.
train, submission = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in (TRAIN_FILE, SUB_FILE)
)

In [None]:
train.head()

In [None]:
train.label.mean()

In [None]:
train

## Class examples

In [None]:
# Display the image referenced by the first row of the training table.
image_file = train["image"][0]
img = plt.imread(os.path.join(PATH, image_file))
plt.imshow(img)

In [None]:
submission

In [None]:
# Collect the submission images whose height or width is at most 400 pixels.
# The loop walks the `image` column itself rather than a hard-coded row count,
# so it covers exactly the rows `submission` has (no KeyError on a shorter
# table, no silently skipped rows on a longer one).
res = []

for image_file in submission.image:
    img = plt.imread(os.path.join(PATH, image_file))
    height, width = img.shape[:2]
    if height <= 400 or width <= 400:
        res.append(image_file)

In [None]:
res

In [None]:
# Display the image referenced by the second row of the training table.
image_file = train["image"][1]
img = plt.imread(os.path.join(PATH, image_file))
plt.imshow(img)

In [None]:
# Central settings dictionary read by the cells below.
CONFIG = {
    # Passed to set_seed().
    "seed": 42,
    # Number of epochs given to run_training(); also enters the T_max formula.
    "epochs": 20,
    # Side length (pixels) used by the Resize step of both transform pipelines.
    "img_size": 256,
    # Architecture name handed to timm.create_model via SmokeModel.
    "model_name": "tf_efficientnet_b0_ns",
    # Local weights file passed to SmokeModel as `checkpoint_path`.
    # NOTE(review): the file name says "b0_aa" while model_name says "b0_ns" — confirm these are meant to go together.
    "checkpoint_path" : "/kaggle/input/tf-efficientnet/pytorch/tf-efficientnet-b0/1/tf_efficientnet_b0_aa-827b6e33.pth",
    # Output size of the linear head in SmokeModel.
    "num_classes": 2,
    # Used in the T_max formula; the training DataLoader built in
    # prepare_loaders uses a batch size of 16 rather than this value.
    "train_batch_size": 32,
    # Batch size of the validation DataLoader.
    "valid_batch_size": 64,
    # Initial learning rate of the Adam optimizer.
    "learning_rate": 0.001,
    # Name that selects the branch taken in fetch_scheduler().
    "scheduler": 'CosineAnnealingLR',
    # eta_min of the cosine schedulers.
    "min_lr": 0.00001,
    # Placeholder; overwritten later in the notebook from the size of `train`.
    "T_max": 500,
    # T_0 of CosineAnnealingWarmRestarts.
    "T_0": 5,
    # weight_decay of the Adam optimizer.
    "weight_decay": 0.00001,
    # Index of the held-out fold passed to prepare_loaders().
    "fold" : 0,
    # Number of splits for StratifiedKFold; also enters the T_max formula.
    "n_fold": 5,
    # Number of batches whose gradients are accumulated before an optimizer step.
    "n_accumulate": 1,
    # First CUDA device when available, otherwise the CPU.
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}

In [None]:
def set_seed(seed=42):
    '''Seed every random number generator the notebook has imported.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),
    switches cuDNN to deterministic mode and records the seed in
    PYTHONHASHSEED.

    Args:
        seed: integer seed shared by all generators.
    '''
    # Python's built-in generator was previously left unseeded, so anything
    # drawing from `random` differed from run to run.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without a GPU; seeds every visible CUDA device.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed (picked up by Python processes
    # started after this point).
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(CONFIG['seed'])

In [None]:
def get_train_file_path(image_id):
    """Return the path of an image inside the data directory ``PATH``.

    Uses ``os.path.join`` (as the image-preview cells do) so the result does
    not depend on ``PATH`` ending with a separator.

    Args:
        image_id: value from the ``image`` column, relative to ``PATH``.

    Returns:
        The joined path as a string.
    """
    return os.path.join(PATH, image_id)

In [None]:
train_images = sorted(glob.glob(f"{PATH}/*.jpg"))

In [None]:
train.size

In [None]:
# Attach the on-disk path of each image, then keep only the rows whose path
# appears in `train_images` (the sorted glob result) and renumber the index.
train['file_path'] = train['image'].map(get_train_file_path)
train = train.loc[train['file_path'].isin(train_images)].reset_index(drop=True)
train

In [None]:
# Step count for the cosine schedule: rows of `train` scaled by the share of
# folds used for training ((n_fold - 1) / n_fold), times the number of epochs,
# divided by CONFIG['train_batch_size'] (integer division throughout).
# NOTE(review): prepare_loaders builds the training loader with batch_size=16
# and a sampler with length_before_new_iter=1000, so the number of scheduler
# steps actually taken may differ from this value — confirm.
CONFIG['T_max'] = train.shape[0] * (CONFIG["n_fold"]-1) * CONFIG['epochs'] // CONFIG['train_batch_size'] // CONFIG["n_fold"]
CONFIG['T_max']

In [None]:
# Split the rows into CONFIG['n_fold'] folds stratified on `label` and record,
# for every row, the index of the fold in which it is a validation sample.
skf = StratifiedKFold(n_splits=CONFIG['n_fold'])

# Create the column up front with an integer value: assigning into a missing
# column would first fill it with NaN and leave the fold ids as floats.
train["kfold"] = -1

for fold, (_, val_) in enumerate(skf.split(X=train, y=train.label)):
    train.loc[val_, "kfold"] = fold

In [None]:
class SmokeData(Dataset):
    """Image dataset backed by a DataFrame.

    Args:
        df: table with a ``file_path`` column (image location) and a
            ``label`` column.
        transforms: optional callable invoked as ``transforms(image=img)``;
            the ``"image"`` entry of its result replaces the loaded image.

    Each item is a dict with the (optionally transformed) image under
    ``'image'`` and the label as a ``torch.long`` tensor under ``'label'``.
    """

    def __init__(self, df, transforms=None):
        self.df = df
        self.file_names = df['file_path'].values
        self.labels = df['label'].values
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Indexing first keeps the IndexError that ends plain iteration.
        img_path = self.file_names[index]
        label = self.labels[index]

        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals an unreadable/missing file by returning None.
            raise FileNotFoundError(f"Could not read image: {img_path}")
        # OpenCV loads channels as BGR; convert to RGB so the channel order
        # matches the per-channel mean/std used by the Normalize transform.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        if self.transforms:
            img = self.transforms(image=img)["image"]

        return {
            'image': img,
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
def _resize_step():
    """Fresh Resize to the square side length configured in CONFIG['img_size']."""
    return A.Resize(CONFIG['img_size'], CONFIG['img_size'])


def _normalize_step():
    """Fresh Normalize with the per-channel mean/std shared by both pipelines."""
    # These values match the standard ImageNet channel statistics.
    return A.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        max_pixel_value=255.0,
        p=1.0,
    )


# Training pipeline: resize, random colour augmentations, normalisation,
# random flips, conversion to a tensor. The order of steps is significant.
_train_steps = [
    _resize_step(),
    A.ToGray(p=0.5),
    # NOTE(review): shift limits of 0.2 look very small for 8-bit images —
    # confirm the intended scale against the albumentations documentation.
    A.HueSaturationValue(
        hue_shift_limit=0.2,
        sat_shift_limit=0.2,
        val_shift_limit=0.2,
        p=0.5,
    ),
    A.RandomBrightnessContrast(
        brightness_limit=(-0.1, 0.1),
        contrast_limit=(-0.1, 0.1),
        p=0.5,
    ),
    _normalize_step(),
    A.HorizontalFlip(p=0.5),
    A.Flip(p=0.5),
    ToTensorV2(),
]

# Validation pipeline: deterministic resize + normalisation only.
_valid_steps = [
    _resize_step(),
    _normalize_step(),
    ToTensorV2(),
]

data_transforms = {
    "train": A.Compose(_train_steps, p=1.),
    "valid": A.Compose(_valid_steps, p=1.),
}

In [None]:
class SmokeModel(nn.Module):
    """Classifier made of a timm backbone, ReLU, dropout and one linear layer.

    Args:
        model_name: architecture name passed to ``timm.create_model``.
        num_classes: number of output units of the linear head.
        pretrained: forwarded to ``timm.create_model`` when no checkpoint
            path is given.
        checkpoint_path: optional path of a local weights file. When given,
            the backbone is created with ``pretrained=False`` and initialised
            from this file, instead of the argument being silently ignored.

    ``forward`` returns ``(embedding, probabilities)``: the output of the
    backbone (whose own classifier is replaced by ``nn.Identity``) and the
    softmax over the head's outputs.
    """

    def __init__(self, model_name, num_classes, pretrained=True, checkpoint_path=None):
        super().__init__()
        if checkpoint_path:
            # An explicit local checkpoint takes precedence over the hub
            # weights selected by `pretrained`.
            self.model = timm.create_model(
                model_name, pretrained=False, checkpoint_path=checkpoint_path)
        else:
            self.model = timm.create_model(model_name, pretrained=pretrained)

        # Drop the backbone's classifier so it emits feature vectors, and
        # attach a new linear head sized for this task.
        inputs = self.model.classifier.in_features
        self.model.classifier = nn.Identity()
        self.linear1 = nn.Linear(inputs, num_classes)
        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()
        self.do1 = nn.Dropout(0.2)

    def forward(self, images):
        # The raw backbone output is returned unchanged as the embedding;
        # ReLU and dropout only feed the classification head.
        embedding = self.model(images)
        hidden = self.do1(self.relu(embedding))
        probabilities = self.softmax(self.linear1(hidden))
        return embedding, probabilities

    
# Build the classifier from CONFIG and move it to the configured device.
model = SmokeModel(
    model_name=CONFIG['model_name'],
    num_classes=CONFIG['num_classes'],
    checkpoint_path=CONFIG['checkpoint_path'],
)
model = model.to(CONFIG['device'])

In [None]:
from pytorch_metric_learning import samplers

In [None]:
def prepare_loaders(df, fold, m_per_class=8, train_batch_size=16,
                    samples_per_epoch=1000, num_workers=2):
    """Build the training and validation DataLoaders for one fold.

    Rows whose ``kfold`` equals ``fold`` form the validation set; all other
    rows form the training set. The defaults reproduce the values that were
    previously hard-coded in the body.

    Args:
        df: table with ``file_path``, ``label`` and ``kfold`` columns.
        fold: index of the fold held out for validation.
        m_per_class: forwarded to ``MPerClassSampler`` as its ``m`` argument.
        train_batch_size: batch size of the training DataLoader.
        samples_per_epoch: forwarded to ``MPerClassSampler`` as
            ``length_before_new_iter``.
        num_workers: worker processes used by both DataLoaders.

    Returns:
        ``(train_loader, valid_loader)``.
    """
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    train_dataset = SmokeData(df_train, transforms=data_transforms["train"])
    valid_dataset = SmokeData(df_valid, transforms=data_transforms["valid"])

    # The sampler decides which training rows are visited; the DataLoader
    # then groups them into batches.
    train_sampler = samplers.MPerClassSampler(
        df_train['label'].values, m_per_class,
        batch_size=None, length_before_new_iter=samples_per_epoch)

    train_loader = DataLoader(train_dataset, batch_size=train_batch_size,
                              num_workers=num_workers, sampler=train_sampler)
    valid_loader = DataLoader(valid_dataset, batch_size=CONFIG['valid_batch_size'],
                              num_workers=num_workers, shuffle=False)

    return train_loader, valid_loader

In [None]:
train_loader, valid_loader = prepare_loaders(train, fold=CONFIG["fold"])

In [None]:
def criterion(outputs, labels):
    """Negative log-likelihood of the true class, averaged over the batch.

    ``SmokeModel.forward`` already applies softmax, so ``outputs`` holds class
    probabilities. ``nn.CrossEntropyLoss`` expects unnormalised scores and
    would apply softmax a second time; taking the log of the probabilities
    and using ``nll_loss`` gives the intended cross-entropy instead.

    Args:
        outputs: tensor of shape (batch, classes) with class probabilities.
        labels: tensor of shape (batch,) with integer class indices.

    Returns:
        Scalar loss tensor.
    """
    # Clamp away exact zeros so the logarithm stays finite.
    log_probs = torch.log(outputs.clamp_min(1e-12))
    return F.nll_loss(log_probs, labels)

In [None]:
from pytorch_metric_learning import losses, miners, regularizers

In [None]:
# Metric-learning objects used by train_epoch on the model's embeddings, both
# created with the library's default arguments: the miner's output for a
# batch is passed to the triplet loss as its third argument.
triplet_loss = losses.TripletMarginLoss()
miner = miners.MultiSimilarityMiner()

In [None]:
def train_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train the model for one pass over ``dataloader``.

    The optimised objective is ``0.9 * criterion + 0.1 * triplet_loss``, where
    the triplet term is computed on the embeddings with pairs chosen by
    ``miner``. Gradients are accumulated over ``CONFIG['n_accumulate']``
    batches; the scheduler (if any) is stepped after every optimizer step.

    Args:
        model: network returning ``(embeddings, outputs)``.
        optimizer: optimizer updating ``model``.
        scheduler: learning-rate scheduler or ``None``.
        dataloader: yields dicts with ``'image'`` and ``'label'`` tensors.
        device: device the batches are moved to.
        epoch: epoch number, shown in the progress bar only.

    Returns:
        ``(epoch_loss, epoch_acc)``: sample-weighted mean classification loss
        and accuracy; both are 0.0 when the loader yields no batches.
    """
    model.train()
    n_accumulate = CONFIG['n_accumulate']

    data_size = 0
    current_loss = 0.0
    current_acc = 0.0
    # Defined up front so an empty loader cannot leave them unbound.
    epoch_loss = 0.0
    epoch_acc = 0.0

    # Start from clean gradients so nothing left over from an earlier,
    # incomplete accumulation window leaks into this epoch.
    optimizer.zero_grad()

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        images = data['image'].to(device, dtype=torch.float)
        labels = data['label'].to(device, dtype=torch.long)

        batch_size = images.size(0)

        embeddings, outputs = model(images)
        miner_out = miner(embeddings, labels)
        cls_loss = criterion(outputs, labels)
        metric_loss = triplet_loss(embeddings, labels, miner_out)

        # Scale the whole objective (not only the classification term) so
        # accumulated gradients average over the window.
        full_loss = (0.9 * cls_loss + 0.1 * metric_loss) / n_accumulate
        full_loss.backward()

        if (step + 1) % n_accumulate == 0:
            optimizer.step()

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        # argmax is unchanged by softmax, so the outputs are used directly.
        predicted = outputs.argmax(dim=1)
        acc = torch.sum(predicted == labels)

        current_loss += cls_loss.item() * batch_size
        current_acc += acc.item()
        data_size += batch_size

        epoch_loss = current_loss / data_size
        epoch_acc = current_acc / data_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss, Train_Acc=epoch_acc,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()

    return epoch_loss, epoch_acc

In [None]:
@torch.inference_mode()
def validation_epoch(model, dataloader, device, epoch, optimizer=None):
    """Evaluate the model on ``dataloader`` without tracking gradients.

    Args:
        model: network returning ``(embeddings, outputs)``.
        dataloader: yields dicts with ``'image'`` and ``'label'`` tensors.
        device: device the batches are moved to.
        epoch: epoch number, shown in the progress bar only.
        optimizer: optional; when given, its current learning rate is shown in
            the progress bar. The function no longer reads a global
            ``optimizer``, so it also works when none exists.

    Returns:
        ``(epoch_loss, epoch_acc)``: sample-weighted mean loss and accuracy;
        both are 0.0 when the loader yields no batches.
    """
    model.eval()

    data_size = 0
    current_loss = 0.0
    current_acc = 0.0
    # Defined up front so an empty loader cannot leave them unbound.
    epoch_loss = 0.0
    epoch_acc = 0.0

    bar = tqdm(dataloader, total=len(dataloader))
    for data in bar:
        images = data['image'].to(device, dtype=torch.float)
        labels = data['label'].to(device, dtype=torch.long)

        batch_size = images.size(0)

        _, outputs = model(images)
        loss = criterion(outputs, labels)

        # argmax is unchanged by softmax, so the outputs are used directly.
        predicted = outputs.argmax(dim=1)
        acc = torch.sum(predicted == labels)

        current_loss += loss.item() * batch_size
        current_acc += acc.item()
        data_size += batch_size

        epoch_loss = current_loss / data_size
        epoch_acc = current_acc / data_size

        postfix = dict(Epoch=epoch, Valid_Loss=epoch_loss, Valid_Acc=epoch_acc)
        if optimizer is not None:
            postfix['LR'] = optimizer.param_groups[0]['lr']
        bar.set_postfix(**postfix)

    gc.collect()

    return epoch_loss, epoch_acc

In [None]:
def run_training(model, optimizer, scheduler, device, num_epochs):
    """Train for ``num_epochs`` epochs and keep the best-validating weights.

    The fold ``CONFIG['fold']`` is held out for validation during the whole
    run. Previously the held-out fold changed every epoch (``epoch % 5``), so
    the model was trained on rows it was later validated on and the "best
    validation accuracy" compared numbers measured on different data.

    Whenever the validation accuracy is at least as high as the best so far,
    the weights are remembered and also written to a ``.bin`` file in the
    current directory.

    Args:
        model: network to train.
        optimizer: optimizer updating ``model``.
        scheduler: learning-rate scheduler or ``None``.
        device: device used for training and validation batches.
        num_epochs: number of epochs to run.

    Returns:
        ``(model, history)``: the model loaded with the best weights and a
        dict of per-epoch lists ('Train Loss', 'Valid Loss', 'Train Accuracy',
        'Valid Accuracy').
    """
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model = copy.deepcopy(model.state_dict())
    best_epoch_acc = -np.inf
    history = defaultdict(list)

    # One fixed split for the whole run; see the docstring for the reason.
    train_loader, valid_loader = prepare_loaders(train, fold=CONFIG['fold'])

    for epoch in range(1, num_epochs + 1):
        gc.collect()

        # Use the `device` argument rather than reaching back into CONFIG.
        train_epoch_loss, train_epoch_acc = train_epoch(model, optimizer, scheduler,
                                                        dataloader=train_loader,
                                                        device=device, epoch=epoch)

        val_epoch_loss, val_epoch_acc = validation_epoch(model, valid_loader,
                                                         device=device, epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)
        history['Train Accuracy'].append(train_epoch_acc)
        history['Valid Accuracy'].append(val_epoch_acc)

        # deep copy the model
        if best_epoch_acc <= val_epoch_acc:
            print(f"{b_}Validation Accuracy Improved ({best_epoch_acc} ---> {val_epoch_acc})")
            best_epoch_acc = val_epoch_acc
            best_model = copy.deepcopy(model.state_dict())
            # Named differently from the notebook-level PATH (the data
            # directory) to avoid confusing the two.
            checkpoint_file = "Acc{:.2f}_Loss{:.4f}_epoch{:.0f}.bin".format(
                best_epoch_acc, val_epoch_loss, epoch)
            torch.save(model.state_dict(), checkpoint_file)
            # Save a model file from the current directory
            print(f"Model Saved{sr_}")

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Accuracy: {:.4f}".format(best_epoch_acc))

    # load best model weights
    model.load_state_dict(best_model)

    return model, history

In [None]:
def fetch_scheduler(optimizer):
    """Create the learning-rate scheduler named by ``CONFIG['scheduler']``.

    Supported names: 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts',
    'ExponentialLR', or ``None`` for no scheduler.

    Args:
        optimizer: optimizer whose learning rate is scheduled.

    Returns:
        The scheduler instance, or ``None`` when no scheduler is configured.

    Raises:
        KeyError: 'ExponentialLR' is selected but ``CONFIG['gamma']`` is unset.
        ValueError: the configured name is not one of the supported ones.
    """
    name = CONFIG['scheduler']

    if name is None:
        return None
    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=CONFIG['T_max'],
                                              eta_min=CONFIG['min_lr'])
    if name == 'CosineAnnealingWarmRestarts':
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=CONFIG['T_0'],
                                                        eta_min=CONFIG['min_lr'])
    if name == 'ExponentialLR':
        # This branch used to build ReduceLROnPlateau, whose step() needs a
        # metric and therefore cannot be driven by train_epoch's bare
        # scheduler.step(). Build the scheduler the name asks for; its decay
        # factor has no sensible default here, so it must be configured.
        if 'gamma' not in CONFIG:
            raise KeyError("CONFIG['gamma'] must be set to use the 'ExponentialLR' scheduler")
        return lr_scheduler.ExponentialLR(optimizer, gamma=CONFIG['gamma'])

    # An unknown name previously surfaced as an UnboundLocalError.
    raise ValueError(f"Unknown scheduler: {name!r}")

In [None]:
# Adam over all model parameters, with the learning rate and weight decay from
# CONFIG; the scheduler is then chosen by fetch_scheduler.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay'],
)
scheduler = fetch_scheduler(optimizer)

In [None]:
model, history = run_training(model, optimizer, scheduler,
                              device=CONFIG['device'],
                              num_epochs=CONFIG['epochs'])

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
history = pd.DataFrame.from_dict(history)
history.to_csv("history.csv", index=False)

In [None]:
# Training and validation loss per epoch (x axis is the 0-based row of `history`).
fig, ax = plt.subplots()
for column in ("Train Loss", "Valid Loss"):
    ax.plot(range(history.shape[0]), history[column].values, label=column)
ax.set_title("Model Loss with Increasing Epochs")
ax.set_xlabel("epochs")
ax.set_ylabel("Loss")
ax.grid()
ax.legend()
plt.show()

In [None]:
# Training and validation accuracy per epoch (x axis is the 0-based row of `history`).
fig, ax = plt.subplots()
for column in ("Train Accuracy", "Valid Accuracy"):
    ax.plot(range(history.shape[0]), history[column].values, label=column)
ax.set_title("Model Accuracy with Increasing Epochs")
ax.set_xlabel("epochs")
ax.set_ylabel("Accuracy")
ax.grid()
ax.legend()
plt.show()

In [None]:
# Build the image paths for the submission table with the same helper used for
# `train`, then keep only rows whose path appears in `train_images`.
# NOTE(review): rows without a matching file are dropped from `submission`,
# so the written file can have fewer rows than the sample — confirm intended.
submission['file_path'] = submission['image'].map(get_train_file_path)
submission = submission.loc[submission['file_path'].isin(train_images)].reset_index(drop=True)
submission

In [None]:
# SmokeData reads a 'label' column from its DataFrame; the submission table is
# given a constant value here so it can be wrapped in that dataset class.
submission['label'] = 1

In [None]:
submission

In [None]:
subm_data = SmokeData(submission, data_transforms['valid'])

In [None]:
res = []

In [None]:
# Score every submission image with the trained model.
# - eval mode is set explicitly instead of relying on what ran before;
# - `res` is rebuilt here so re-running this cell cannot append duplicates;
# - images are batched through a DataLoader, which also removes the
#   hard-coded (1, 3, 256, 256) reshape.
model.eval()
res = []
subm_loader = DataLoader(subm_data, batch_size=CONFIG['valid_batch_size'],
                         shuffle=False, num_workers=2)

with torch.inference_mode():
    for batch in tqdm(subm_loader):
        images = batch['image'].to(CONFIG['device'], dtype=torch.float)
        _, probs = model(images)
        # Column 1 of the model's softmax output is the score that is submitted.
        res.extend(probs[:, 1].tolist())

In [None]:
pd.Series(res)

In [None]:
submission['score'] = pd.Series(res)

In [None]:
submission = submission.drop(columns=['file_path', 'label'])

In [None]:
submission

In [None]:
submission.to_csv('submission.csv', index=False)