In [1]:
import pandas as pd
import numpy as np
from skimage import io
from collections import OrderedDict, namedtuple
from torch.optim import lr_scheduler
from glob import glob  # find all pathnames matching certain patterns
from sklearn.model_selection import GroupKFold
import joblib
import torch
import torch.nn as nn
import os
import random
import matplotlib.pyplot as plt
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2  # for converting image to tensor
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler  # return indices in sequence/random order
import cv2
from sklearn.metrics import *
from tqdm.notebook import tqdm
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule
import warnings
warnings.filterwarnings("ignore")


# Global random seed handed to seed_everything() below.
SEED = 42

# Seed everything for reproducible results
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the already running
    # interpreter is fixed at start-up and is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different kernels from run
    # to run, which breaks reproducibility, so it must stay off here.
    torch.backends.cudnn.benchmark = False

# Seed all generators once, before any shuffling or model creation further down.
seed_everything(SEED)



# EfficientNet

In [2]:
!pip install efficientnet_pytorch
# import efficientnet_pytorch
from efficientnet_pytorch import EfficientNet

Collecting efficientnet_pytorch
  Downloading efficientnet_pytorch-0.6.3.tar.gz (16 kB)
Building wheels for collected packages: efficientnet-pytorch
  Building wheel for efficientnet-pytorch (setup.py) ... [?25l- \ done
[?25h  Created wheel for efficientnet-pytorch: filename=efficientnet_pytorch-0.6.3-py3-none-any.whl size=12419 sha256=dcdf49468b3eff76124df0e1042a3ce0a33918c1597b4489e26357320f978ac4
  Stored in directory: /root/.cache/pip/wheels/90/6b/0c/f0ad36d00310e65390b0d4c9218ae6250ac579c92540c9097a
Successfully built efficientnet-pytorch
Installing collected packages: efficientnet-pytorch
Successfully installed efficientnet-pytorch-0.6.3


# Building the dataframe of images to iterate over

In [3]:
%%time
# Number of images sampled for each of the four classes.
IMAGES_PER_KIND = 2500

# File names are listed once from Cover/ and reused for every kind (the same
# image name is expected to exist in all four folders). Sorting makes the
# selected subset independent of the filesystem's directory order, so the
# seeded shuffle below is reproducible.
cover_paths = sorted(glob("../input/alaska2-image-steganalysis/Cover/*.jpg"))
image_names = [os.path.basename(path) for path in cover_paths[:IMAGES_PER_KIND]]

# One record per (kind, image) pair; the label is the position of the kind.
records = [
    {"kind": kind, "image_name": image_name, "label": label}
    for label, kind in enumerate(["Cover", "JMiPOD", "JUNIWARD", "UERD"])
    for image_name in image_names
]
random.shuffle(records)
dataset = pd.DataFrame(records)
dataset.shape

CPU times: user 1.16 s, sys: 115 ms, total: 1.28 s
Wall time: 1.56 s


(10000, 3)

# Assigning each image to a fold (0, 1, 2, 3, 4)

In [4]:
# Grouped K-fold split.
# The same image name (e.g. 0001.jpg) is present in all four image folders.
# Grouping by image name keeps every variant of an image in one fold, so an
# image used for training never appears (from another folder) in validation.
gkf = GroupKFold(n_splits=5)

# Start with every row in fold 0, then overwrite with the fold in which the
# row is part of the validation split.
dataset["fold"] = 0
for fold_number, (train_index, val_index) in enumerate(
    gkf.split(
        X=dataset.index,
        y=dataset["label"],
        groups=dataset["image_name"],
    )
):
    dataset.loc[dataset.index[val_index], "fold"] = fold_number

# Train and validation augmentations

In [5]:
# Simple Augmentations
def get_train_transforms():
    """Return the training pipeline: random flips, resize to 512x512, to tensor."""
    steps = [
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.Resize(height=512, width=512, p=1.0),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps, p=1.0)

def get_valid_transforms():
    """Return the validation pipeline: resize to 512x512 and convert to tensor."""
    steps = [
        A.Resize(height=512, width=512, p=1.0),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps, p=1.0)


In [6]:
# Dataset root; images are read from f"{DATA_ROOT_PATH}/{kind}/{image_name}".
DATA_ROOT_PATH = "../input/alaska2-image-steganalysis"

def onehot(size, target):
    """Return a float32 vector of length ``size`` with a 1 at index ``target``."""
    encoded = torch.zeros(size, dtype=torch.float32)
    encoded[target] = 1
    return encoded

class DatasetRetriever(Dataset):
    """Dataset yielding ``(image, one-hot target)`` pairs read from disk.

    Args:
        kinds: Sequence of sub-folder names under ``DATA_ROOT_PATH``, one per sample.
        image_names: Sequence of image file names, aligned with ``kinds``.
        labels: Sequence of integer class ids (encoded into 4 classes),
            aligned with ``kinds``.
        transforms: Optional callable invoked as ``transforms(image=...)``
            that returns a mapping with an ``"image"`` entry.
    """

    def __init__(self, kinds, image_names, labels, transforms = None):
        self.kinds = kinds
        self.image_names = image_names
        self.labels = labels
        self.transforms = transforms

    def __getitem__(self, index: int):
        """Load sample ``index`` and return ``(image, target)``.

        The image is read as 3-channel colour, converted BGR -> RGB and scaled
        to float32 in [0, 1] before the optional transforms are applied.

        Raises:
            FileNotFoundError: If the image file is missing or cannot be decoded.
        """
        kind = self.kinds[index]
        image_name = self.image_names[index]
        label = self.labels[index]

        image_path = f"{DATA_ROOT_PATH}/{kind}/{image_name}"
        image = cv2.imread(image_path, flags = cv2.IMREAD_COLOR)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0

        if self.transforms:
            # The pipeline takes keyword inputs and returns a dict; keep the image.
            sample = self.transforms(image=image)
            image = sample["image"]

        target = onehot(4, label)
        return image, target

    def __len__(self):
        # len() works for lists as well as numpy arrays.
        return len(self.kinds)

    def get_labels(self):
        """Return the labels as a plain list."""
        return list(self.labels)

In [7]:
def alaska_weighted_auc(labels, preds, plot = False):
    """Compute a weighted AUC over two true-positive-rate bands.

    The ROC curve is split into the TPR bands [0, 0.4] and [0.4, 1.0]. The
    area inside each band is weighted (2 for the lower band, 1 for the upper
    band), the weighted areas are summed, and the sum is divided by the
    largest attainable value so a perfect classifier scores 1.

    Args:
        labels: Binary ground-truth labels (positive class is 1).
        preds: Predicted scores for the positive class.
        plot: If True, draw the per-band curves with matplotlib.

    Returns:
        The normalised weighted AUC as a float.

    Raises:
        IndexError: If no ROC point falls strictly inside one of the bands.
    """
    tpr_thresholds = [0.0, 0.4, 1.0]
    weights =        [       2,   1]
    color = ["red", "green"]
    label = ["x ∈ [0, 1], y ∈ [0, 0.4]", "x ∈ [0, 1], y ∈ [0.4, 1.0]"]

    fpr, tpr, _ = roc_curve(labels, preds, pos_label=1)

    # Height of each band: [0.4, 0.6].
    area = np.array(tpr_thresholds)[1:] - np.array(tpr_thresholds)[:-1]
    # Largest attainable weighted area, used to normalise the result.
    area_normalized = np.dot(area, np.array(weights))

    fscore = 0
    for index, weight in enumerate(weights):
        ymin = tpr_thresholds[index]
        ymax = tpr_thresholds[index + 1]

        # ROC points strictly inside the band, then a flat segment at the
        # band's ceiling extending from the last such point to fpr = 1.
        mask = (tpr > ymin) & (tpr < ymax)
        x = np.concatenate([fpr[mask], np.linspace(fpr[mask][-1], 1, 100)])
        y = np.concatenate([tpr[mask], [ymax] * 100])

        # Shift so the band's floor is the origin before integrating.
        score = auc(x, y - ymin)
        fscore += score * weight

        if plot:
            plt.title("Separate plots for x ∈ [0, 1], y ∈ [0, 0.4] and x ∈ [0, 1], y ∈ [0.4, 1.0]")
            plt.plot(x, y, color = color[index], label = label[index])
            plt.xlabel("False Positive rate")
            plt.ylabel("True Positive rate")
            plt.legend(loc = 2)

    # Normalise only after every band has been accumulated; returning inside
    # the loop would ignore all but the first band.
    final_score = fscore / area_normalized
    return final_score
    
class RocAucMeter(object):
    """Accumulates binary labels and scores and tracks their weighted AUC."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Restore the initial state: score 0 and one placeholder sample per class."""
        self.score = 0
        self.y_true = np.array([0, 1])
        self.y_pred = np.array([0.5, 0.5])

    def update(self, y_true, y_pred):
        """Append a batch and recompute the score.

        ``y_true`` holds one-hot rows that are collapsed to 0 (class 0) or
        1 (any other class); ``y_pred`` holds logits that are turned into
        ``1 - softmax probability of class 0``.
        """
        binary_truth = y_true.argmax(axis=1).clip(min=0, max=1).astype(int)
        class_probs = nn.functional.softmax(torch.tensor(y_pred), dim=1)
        positive_prob = 1 - class_probs[:, 0]

        self.y_true = np.hstack((self.y_true, binary_truth))
        self.y_pred = np.hstack((self.y_pred, positive_prob))
        self.score = alaska_weighted_auc(self.y_true, self.y_pred)

    @property
    def avg(self):
        """Most recently computed score."""
        return self.score

In [8]:
class LabelSmoothing(nn.Module):
    """Cross-entropy on one-hot targets with label smoothing.

    Args:
        smoothing: Weight given to the uniform-distribution term; the
            remaining ``1 - smoothing`` goes to the target term.
    """

    def __init__(self, smoothing = 0.10):
        # nn.Module must be initialised before attributes are set on it.
        super().__init__()
        self.confidence = 1 - smoothing
        self.smoothing = smoothing

    def forward(self, x, target):
        """Return the mean smoothed loss over the batch.

        Args:
            x: Logits with classes on the last dimension.
            target: One-hot (or soft) targets with the same shape as ``x``.
        """
        x = x.float()
        target = target.float()

        logprobs = torch.nn.functional.log_softmax(x, dim = -1)

        # Negative log-likelihood of the target distribution.
        nll_loss = (-logprobs * target).sum(-1)
        # Cross-entropy against a uniform distribution over the classes.
        smooth_loss = -logprobs.mean(dim = -1)

        loss = self.confidence * nll_loss + self.smoothing * smooth_loss
        return loss.mean()

In [9]:

def get_net(model_name = 'efficientnet-b2', num_classes = 4):
    """Build a pretrained EfficientNet with a fresh classification head.

    Args:
        model_name: EfficientNet variant passed to ``EfficientNet.from_pretrained``.
        num_classes: Number of output units of the replacement head.

    Returns:
        The network with ``_fc`` replaced by a new ``nn.Linear`` layer.
    """
    net = EfficientNet.from_pretrained(model_name)
    # Read the feature width from the existing head instead of hard-coding it,
    # so the head stays consistent with whichever variant was loaded.
    net._fc = nn.Linear(in_features=net._fc.in_features, out_features=num_classes, bias=True)
    return net

# Pretrained network instance that _run() takes as its starting model.
mx = get_net()

Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth" to /root/.cache/torch/checkpoints/efficientnet-b2-8bb594d6.pth


HBox(children=(FloatProgress(value=0.0, max=36804509.0), HTML(value='')))


Loaded pretrained weights for efficientnet-b2


In [10]:
def get_folds_data(TRAIN_BATCH_SIZE, EPOCHS, i):
    """Build the training and validation loaders for fold ``i``.

    Rows of the module-level ``dataset`` frame whose ``fold`` equals ``i``
    form the validation split; all other rows form the training split.

    Args:
        TRAIN_BATCH_SIZE: Batch size of the training loader.
        EPOCHS: Unused; kept so existing callers keep working.
        i: Fold number held out for validation.

    Returns:
        Tuple ``(train_data_loader, valid_data_loader)``.
    """
    fold_number = i

    # Split once and select columns by name, so the result does not depend
    # on the positional order of the frame's columns.
    is_valid = dataset["fold"] == fold_number
    train_frame = dataset[~is_valid]
    valid_frame = dataset[is_valid]

    train_dataset = DatasetRetriever(
        kinds=train_frame["kind"].values,
        image_names=train_frame["image_name"].values,
        labels=train_frame["label"].values,
        transforms=get_train_transforms()
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True,    # new sample order every epoch
        drop_last=True,  # skip the trailing incomplete batch
        num_workers=4
    )

    valid_dataset = DatasetRetriever(
        kinds=valid_frame["kind"].values,
        image_names=valid_frame["image_name"].values,
        labels=valid_frame["label"].values,
        transforms=get_valid_transforms()
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=8,
        drop_last=False,  # evaluate on every validation sample
        num_workers=4
    )

    return train_data_loader, valid_data_loader

In [11]:
def _run():
    """Train and evaluate one model per fold with 5-fold cross-validation.

    For every fold a fresh copy of the module-level network ``mx`` is trained
    for ``EPOCHS`` epochs with AdamW and a linearly decaying learning rate.
    After each epoch the validation split is scored with ``RocAucMeter`` and
    the weights are written to ``model{i}.bin`` whenever the score improves
    on the best score seen so far in that fold.
    """
    import copy  # local import: only needed to clone the network per fold

    # Fall back to the CPU when no GPU is present instead of failing outright.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    TRAIN_BATCH_SIZE = 8
    EPOCHS = 2
    lr = 0.001

    # The criterion holds no state, so one instance serves every batch.
    loss_obj = LabelSmoothing()

    def loss_fn(outputs, targets):
        return loss_obj(outputs, targets)

    def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
        """Run one training epoch, printing the loss every 10 batches."""
        model.train()
        for bi, (inputs, targets) in enumerate(data_loader):
            inputs = inputs.to(device, dtype=torch.float32)
            targets = targets.to(device, dtype=torch.float32)

            optimizer.zero_grad()
            outputs = model(inputs)

            loss = loss_fn(outputs, targets)
            if bi % 10 == 0:
                print(f'bi={bi}, loss={loss.item()}')

            loss.backward()
            optimizer.step()

            # The schedule is defined per optimisation step, not per epoch.
            if scheduler is not None:
                scheduler.step()

    def eval_loop_fn(data_loader, model, device):
        """Return stacked ``(outputs, targets)`` arrays for a whole loader."""
        model.eval()
        fin_targets = []
        fin_outputs = []
        # No gradients are needed for scoring; this avoids building the graph.
        with torch.no_grad():
            for inputs, targets in data_loader:
                inputs = inputs.to(device, dtype=torch.float32)
                targets = targets.to(device, dtype=torch.float32)
                outputs = model(inputs)

                fin_targets.append(targets.cpu().numpy())
                fin_outputs.append(outputs.cpu().numpy())

        return np.vstack(fin_outputs), np.vstack(fin_targets)

    for i in range(5):     # For five folds
        train_data_loader, valid_data_loader = get_folds_data(TRAIN_BATCH_SIZE, EPOCHS, i)

        print("\n")
        print("#"*25)
        print(f"FOLD: {i + 1}")
        print("#"*25)

        # Each fold starts from its own copy of the network. Reusing one
        # instance would carry weights trained on earlier folds, which
        # include this fold's validation images, into this fold.
        model = copy.deepcopy(mx).to(device)

        # One scheduler step per batch, so the schedule length is the number
        # of batches actually seen during training.
        num_train_steps = len(train_data_loader) * EPOCHS
        print(f'num_train_steps = {num_train_steps}')

        optimizer = AdamW(model.parameters(), lr=lr)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=num_train_steps
        )

        # Keep the best-scoring weights of this fold.
        best_score = 0
        for epoch in range(EPOCHS):
            train_loop_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
            o, t = eval_loop_fn(valid_data_loader, model, device)

            score = RocAucMeter()
            score.update(t, o)
            print(f'SCORE = {score.avg}')
            if float(score.avg) > best_score:
                best_score = float(score.avg)
                torch.save(model.state_dict(), f"model{i}.bin")


In [12]:
# Start cross-validated training; prints per-batch losses and a score per epoch.
_run()



#########################
FOLD: 1
#########################
num_train_steps = 2000
bi=0, loss=1.4126255512237549
bi=10, loss=1.509793996810913
bi=20, loss=1.2633910179138184
bi=30, loss=1.4277515411376953
bi=40, loss=1.4665558338165283
bi=50, loss=1.5008107423782349
bi=60, loss=1.4777981042861938
bi=70, loss=1.4080777168273926
bi=80, loss=1.5895171165466309
bi=90, loss=1.3688946962356567
bi=100, loss=1.3446931838989258
bi=110, loss=1.3545405864715576
bi=120, loss=1.3193118572235107
bi=130, loss=1.4035861492156982
bi=140, loss=1.4206548929214478
bi=150, loss=1.4120688438415527
bi=160, loss=1.3810341358184814
bi=170, loss=1.323302149772644
bi=180, loss=1.4580987691879272
bi=190, loss=1.3130818605422974
bi=200, loss=1.2610474824905396
bi=210, loss=1.4416364431381226
bi=220, loss=1.438572883605957
bi=230, loss=1.406258463859558
bi=240, loss=1.4361960887908936
bi=250, loss=1.365513563156128
bi=260, loss=1.3752772808074951
bi=270, loss=1.6131097078323364
bi=280, loss=1.2928742170333862
bi=

In [13]:
# checkpoint = torch.load('./model.bin')
# mx.load_state_dict(checkpoint);
# mx.eval();