In [None]:
# COLAB
!pip install neptune-client
# pip install torch-tensorrt -f https://github.com/NVIDIA/Torch-TensorRT/releases
!unzip data.zip
!mkdir artifacts/

In [14]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..')

import warnings
warnings.simplefilter('ignore')

import pandas as pd 
import numpy as np

from tqdm import tqdm

from sklearn.metrics import accuracy_score, average_precision_score
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.profiler import profile, ProfilerActivity

from torchvision import transforms as T
from torchvision.io import read_image

import itertools

import neptune.new as neptune


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
run = neptune.init(
    project="victorcallejas/Belluga",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJlNDRlNTJiNC00OTQwLTQxYjgtYWZiNS02OWQ0MDcwZmU5N2YifQ=="
)

https://app.neptune.ai/victorcallejas/Belluga/e/BEL-192
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


In [16]:
device = torch.device("cuda")
print(torch.cuda.get_device_name(0), torch.cuda.get_device_properties(device))

fp16 = False
input_dtype = torch.float32

NVIDIA GeForce GTX 1070 with Max-Q Design _CudaDeviceProperties(name='NVIDIA GeForce GTX 1070 with Max-Q Design', major=6, minor=1, total_memory=8191MB, multi_processor_count=16)


In [17]:
# SCORING
PREDICTION_LIMIT = 20
QUERY_ID_COL = "query_id"
DATABASE_ID_COL = "database_image_id"
SCORE_COL = "score"

SCORE_THRESHOLD = 0.5

class MeanAveragePrecision:
    @classmethod
    def score(cls, predicted: pd.DataFrame, actual: pd.DataFrame, prediction_limit: int):
        """Calculates mean average precision for a ranking task.
        :param predicted: The predicted values as a dataframe with specified column names
        :param actual: The ground truth values as a dataframe with specified column names
        """
        if not predicted[SCORE_COL].between(0.0, 1.0).all():
            raise ValueError("Scores must be in range [0, 1].")
        if predicted.index.name != QUERY_ID_COL:
            raise ValueError(
                f"First column of submission must be named '{QUERY_ID_COL}', "
                f"got {predicted.index.name}."
            )
        if predicted.columns.to_list() != [DATABASE_ID_COL, SCORE_COL]:
            raise ValueError(
                f"Columns of submission must be named '{[DATABASE_ID_COL, SCORE_COL]}', "
                f"got {predicted.columns.to_list()}."
            )

        unadjusted_aps, predicted_n_pos, actual_n_pos = cls._score_per_query(
            predicted, actual, prediction_limit
        )
        adjusted_aps = unadjusted_aps.multiply(predicted_n_pos).divide(actual_n_pos)
        return adjusted_aps.mean()

    @classmethod
    def _score_per_query(
        cls, predicted: pd.DataFrame, actual: pd.DataFrame, prediction_limit: int
    ):
        """Calculates per-query mean average precision for a ranking task."""
        merged = predicted.merge(
            right=actual.assign(actual=1.0),
            how="left",
            on=[QUERY_ID_COL, DATABASE_ID_COL],
        ).fillna({"actual": 0.0})
        # Per-query raw average precisions based on predictions
        unadjusted_aps = merged.groupby(QUERY_ID_COL).apply(
            lambda df: average_precision_score(df["actual"].values, df[SCORE_COL].values)
            if df["actual"].sum()
            else 0.0
        )
        # Total ground truth positive counts for rescaling
        predicted_n_pos = merged["actual"].groupby(QUERY_ID_COL).sum().astype("int64").rename()
        actual_n_pos = actual.groupby(QUERY_ID_COL).size().clip(upper=prediction_limit)
        return unadjusted_aps, predicted_n_pos, actual_n_pos
    
    
def map_score(dataloader, model, threshold=SCORE_THRESHOLD):
    
    model.eval()
    
    sub = []
    
    sigmoid = torch.nn.Sigmoid()
    
    with torch.no_grad():        
    
        for query, reference, query_id, reference_id in tqdm(dataloader):
            
            query = query.to(device, non_blocking=True, dtype=input_dtype)
            reference = reference.to(device, non_blocking=True, dtype=input_dtype)

            with torch.cuda.amp.autocast(enabled = fp16):
                logits = sigmoid(model(query=query, reference=reference)).cpu().squeeze().tolist()
                
            sub.extend(zip(query_id, reference_id, logits))
            
    sub = pd.DataFrame(sub, columns=['query_id', 'database_image_id', 'score'])
    #sub = sub[sub.score > threshold]
    sub = sub.set_index(['database_image_id']).groupby('query_id')['score'].nlargest(20).reset_index()
    sub = sub.set_index('query_id')
    
    mean_avg_prec = MeanAveragePrecision.score(
        predicted=sub, actual=dataloader.dataset.gt, prediction_limit=PREDICTION_LIMIT
    )
    
    print('MaP: ',mean_avg_prec)
    return mean_avg_prec

In [18]:
# DATA

IMG_SIZE = 256
ROOT_DIR = '../data/'
NORM_TRANSFORMS = torch.nn.Sequential(
    T.Resize([IMG_SIZE, IMG_SIZE]),
    T.ConvertImageDtype(input_dtype),
    T.Normalize(mean = (0.4234, 0.4272, 0.4641),
                std  = (0.2037, 0.2027, 0.2142)),
)

VAL_SPLIT = 0.05

METADATA = pd.read_csv('../data/metadata.csv')[:50]

TRAIN, VAL = train_test_split(METADATA, test_size=0.05, random_state=42)
TRAIN, VAL = TRAIN.reset_index(), VAL.reset_index()
#TRAIN, VAL = METADATA, METADATA
#TRAIN = METADATA

def getImages(metadata):
    IMAGES = {}
    for image_id, path in tqdm(zip(metadata.image_id, metadata.path), total=metadata.shape[0]):
        IMAGES[image_id] = NORM_TRANSFORMS(read_image(ROOT_DIR + path))
    return IMAGES

IMAGES = getImages(METADATA)

class PreTrain_BellugaDataset(torch.utils.data.Dataset):
    
    def __init__(self, metadata):
        self.metadata = metadata

    def __len__(self):
        return self.metadata.shape[0]
    
    def __getitem__(self, idx):
        return IMAGES[self.metadata.image_id[idx]]

class Eval_BellugaDataset(torch.utils.data.Dataset):
    
    def __init__(self, metadata):
        self.metadata = metadata
    
        # GROUND TRUTH
        gt = []
        for wid in self.metadata.whale_id: # query
            tmp = self.metadata[self.metadata.whale_id == wid].image_id.tolist() # get all images id
            gt.extend(list(itertools.permutations(tmp, 2)))
        self.gt = pd.DataFrame(gt,columns=['query_id','database_image_id'])
        self.gt = self.gt.set_index('query_id')
        
        # ALL QUERIES
        self.query_reference = list(itertools.permutations(self.metadata.image_id, 2))
            
    def getimage(self, image_id):
        return IMAGES[image_id]

    def __len__(self):
        return len(self.query_reference)
    
    def __getitem__(self, idx):
        query_id = self.query_reference[idx][0]
        reference_id = self.query_reference[idx][1]
        
        query = self.getimage(query_id)
        reference = self.getimage(reference_id)
        
        return query, reference, query_id, reference_id
    
    
class Train_BellugaDataset(torch.utils.data.Dataset):
    
    def __init__(self, metadata):
        self.metadata = metadata
        self.aug = T.RandomErasing(p=0.4, scale=(0.12, 0.33), ratio=(0.3, 3.3), value=0, inplace=False)
            
    def getimage(self, image_id):
        return IMAGES[image_id]

    def __len__(self):
        return self.metadata.shape[0]
    
    def __getitem__(self, idx):
        
        anchor = self.aug(self.getimage(self.metadata.image_id[idx]))
        label = self.metadata.whale_id[idx]
        
        pos = self.aug(self.getimage(self.metadata[self.metadata.whale_id == label].sample()['image_id'].values[0]))
        neg = self.aug(self.getimage(self.metadata[self.metadata.whale_id != label].sample()['image_id'].values[0]))

        return anchor, pos, neg
    

100%|██████████| 50/50 [00:01<00:00, 43.53it/s]


In [19]:
# DATALOADERS
PRETRAIN_BS = 32
TRAIN_BS = 32
INFER_BS = TRAIN_BS

NUM_WORKERS = 0

pretrain_dataset = PreTrain_BellugaDataset(METADATA)
train_train_dataset = Train_BellugaDataset(TRAIN)
#train_eval_dataset = Eval_BellugaDataset(TRAIN)
valid_eval_dataset = Eval_BellugaDataset(VAL)

pretrain_dataloader = torch.utils.data.DataLoader(
                        pretrain_dataset, 
                        batch_size=PRETRAIN_BS,
                        shuffle=True, 
                        num_workers=NUM_WORKERS,
                        pin_memory=True
                    )

train_train_dataloader = torch.utils.data.DataLoader(
                        train_train_dataset, 
                        batch_size=TRAIN_BS,
                        shuffle=True, 
                        num_workers=NUM_WORKERS,
                        pin_memory=True
                    )
'''
train_eval_dataloader = torch.utils.data.DataLoader(
                        train_eval_dataset, 
                        batch_size=INFER_BS,
                        shuffle=True, 
                        num_workers=NUM_WORKERS,
                        pin_memory=True
                    )
'''
valid_eval_dataloader = torch.utils.data.DataLoader(
                        valid_eval_dataset, 
                        batch_size=INFER_BS,
                        shuffle=False, 
                        num_workers=NUM_WORKERS,
                        pin_memory=True
                    )   


In [20]:
import torch
from torch import nn
import torch.nn.functional as F
from einops import repeat

class SimMIM(nn.Module):
    def __init__(
        self,
        *,
        encoder,
        masking_ratio = 0.5
    ):
        super().__init__()
        assert masking_ratio > 0 and masking_ratio < 1, 'masking ratio must be kept between 0 and 1'
        self.masking_ratio = masking_ratio

        # extract some hyperparameters and functions from encoder (vision transformer to be trained)

        self.encoder = encoder
        num_patches, encoder_dim = encoder.pos_embedding.shape[-2:]
        self.to_patch, self.patch_to_emb = encoder.to_patch_embedding[:2]
        pixel_values_per_patch = self.patch_to_emb.weight.shape[-1]

        # simple linear head

        self.mask_token = nn.Parameter(torch.randn(encoder_dim))
        self.to_pixels = nn.Linear(encoder_dim, pixel_values_per_patch)

    def forward(self, img):
        device = img.device

        # get patches

        patches = self.to_patch(img)
        batch, num_patches, *_ = patches.shape

        # for indexing purposes

        batch_range = torch.arange(batch, device = device)[:, None]

        # get positions

        pos_emb = self.encoder.pos_embedding[:,:num_patches]

        # patch to encoder tokens and add positions

        tokens = self.patch_to_emb(patches)
        tokens = tokens + pos_emb

        # prepare mask tokens

        mask_tokens = repeat(self.mask_token, 'd -> b n d', b = batch, n = num_patches)
        mask_tokens = mask_tokens + pos_emb

        # calculate of patches needed to be masked, and get positions (indices) to be masked

        num_masked = int(self.masking_ratio * num_patches)
        masked_indices = torch.rand(batch, num_patches, device = device).topk(k = num_masked, dim = -1).indices
        masked_bool_mask = torch.zeros((batch, num_patches), device = device).scatter_(-1, masked_indices, 1).bool()

        # mask tokens

        tokens = torch.where(masked_bool_mask[..., None], mask_tokens, tokens)

        # attend with vision transformer

        encoded = self.encoder.patch_transformer(tokens)

        # get the masked tokens

        encoded_mask_tokens = encoded[batch_range, masked_indices]

        # small linear projection for predicted pixel values

        pred_pixel_values = self.to_pixels(encoded_mask_tokens)

        # get the masked patches for the final reconstruction loss

        masked_patches = patches[batch_range, masked_indices]

        # calculate reconstruction loss

        recon_loss = F.l1_loss(pred_pixel_values, masked_patches) / num_masked
        return recon_loss

In [32]:
from random import randrange
import torch
from torch import nn, einsum
import torch.nn.functional as F

from einops import rearrange, repeat
from einops.layers.torch import Rearrange

# helpers

def exists(val):
    return val is not None

def dropout_layers(layers, dropout):
    if dropout == 0:
        return layers

    num_layers = len(layers)
    to_drop = torch.zeros(num_layers).uniform_(0., 1.) < dropout

    # make sure at least one layer makes it
    if all(to_drop):
        rand_index = randrange(num_layers)
        to_drop[rand_index] = False

    layers = [layer for (layer, drop) in zip(layers, to_drop) if not drop]
    return layers

# classes

class LayerScale(nn.Module):
    def __init__(self, dim, fn, depth):
        super().__init__()
        if depth <= 18:  # epsilon detailed in section 2 of paper
            init_eps = 0.1
        elif depth > 18 and depth <= 24:
            init_eps = 1e-5
        else:
            init_eps = 1e-6

        scale = torch.zeros(1, 1, dim).fill_(init_eps)
        self.scale = nn.Parameter(scale)
        self.fn = fn
    def forward(self, x, **kwargs):
        return self.fn(x, **kwargs) * self.scale

class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn
    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)

class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)

class Attention(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()
        inner_dim = dim_head *  heads
        self.heads = heads
        self.scale = dim_head ** -0.5

        self.to_q = nn.Linear(dim, inner_dim, bias = False)
        self.to_kv = nn.Linear(dim, inner_dim * 2, bias = False)

        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

        self.mix_heads_pre_attn = nn.Parameter(torch.randn(heads, heads))
        self.mix_heads_post_attn = nn.Parameter(torch.randn(heads, heads))

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, context = None):
        
        b, n, _, h = *x.shape, self.heads

        context = x if not exists(context) else torch.cat((x, context), dim = 1)

        qkv = (self.to_q(x), *self.to_kv(context).chunk(2, dim = -1))
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)

        dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale

        dots = einsum('b h i j, h g -> b g i j', dots, self.mix_heads_pre_attn)    # talking heads, pre-softmax

        attn = self.attend(dots)
        attn = self.dropout(attn)

        attn = einsum('b h i j, h g -> b g i j', attn, self.mix_heads_post_attn)   # talking heads, post-softmax

        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)

class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0., layer_dropout = 0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        self.layer_dropout = layer_dropout

        for ind in range(depth):
            self.layers.append(nn.ModuleList([
                LayerScale(dim, PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)), depth = ind + 1),
                LayerScale(dim, PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout)), depth = ind + 1)
            ]))
    def forward(self, x, context = None):
        layers = dropout_layers(self.layers, dropout = self.layer_dropout)

        for attn, ff in layers:
            x = attn(x, context = context) + x
            x = ff(x) + x
        return x

class CaiT(nn.Module):
    def __init__(
        self,
        *,
        image_size,
        patch_size,
        num_classes,
        dim,
        depth,
        cross_depth,
        cls_depth,
        heads,
        mlp_dim,
        dim_head = 64,
        dropout = 0.,
        emb_dropout = 0.,
        layer_dropout = 0.
    ):
        super().__init__()
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
        num_patches = (image_size // patch_size) ** 2
        patch_dim = 3 * patch_size ** 2

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
            nn.Linear(patch_dim, dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches, dim))
        self.pos_embedding_q, self.pos_embedding_r = nn.Parameter(torch.randn(1, 1, dim)), nn.Parameter(torch.randn(1, 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))

        self.patch_transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout, layer_dropout)
        #self.cross_transformer = Transformer(dim, cross_depth, heads, dim_head, mlp_dim, dropout, layer_dropout)
        self.cls_transformer = Transformer(dim, cls_depth, heads, dim_head, mlp_dim, dropout, layer_dropout)

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, query, reference):
        
        xq, xr = self.to_patch_embedding(query), self.to_patch_embedding(reference)
        xq, xr = xq + self.pos_embedding + self.pos_embedding_q, xr + self.pos_embedding + self.pos_embedding_r

        xq, xr = self.patch_transformer(xq), self.patch_transformer(xr)
        
        x = torch.cat([xq, xr], dim = 1)
        #x = self.cross_transformer(x)
        
        b, n, _ = x.shape

        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
        x = self.cls_transformer(cls_tokens, context = x)

        return self.mlp_head(x[:, 0])
    
def vit_concat():
    return CaiT(image_size=256, patch_size=32, num_classes=1, dim=512, depth=4, cross_depth=3, cls_depth=4, heads=16, mlp_dim=1024, dim_head = 64, dropout = 0., emb_dropout = 0.)

In [33]:
model = vit_concat().to(device)

#ckpt = torch.load('/kaggle/input/ckptttt/net (3).pt')
#model.load_state_dict(ckpt['model_state_dict'], strict=False)

optimizer = torch.optim.AdamW(model.parameters(), lr = 5e-5)
#optimizer.load_state_dict(ckpt['optimizer_state_dict'], )
#opt = torch.optim.SGD(model.parameters(), lr = .05)


In [34]:
model

CaiT(
  (to_patch_embedding): Sequential(
    (0): Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=32, p2=32)
    (1): Linear(in_features=3072, out_features=512, bias=True)
  )
  (patch_transformer): Transformer(
    (layers): ModuleList(
      (0): ModuleList(
        (0): LayerScale(
          (fn): PreNorm(
            (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (fn): Attention(
              (to_q): Linear(in_features=512, out_features=1024, bias=False)
              (to_kv): Linear(in_features=512, out_features=2048, bias=False)
              (attend): Softmax(dim=-1)
              (dropout): Dropout(p=0.0, inplace=False)
              (to_out): Sequential(
                (0): Linear(in_features=1024, out_features=512, bias=True)
                (1): Dropout(p=0.0, inplace=False)
              )
            )
          )
        )
        (1): LayerScale(
          (fn): PreNorm(
            (norm): LayerNorm((512,), eps=1e-05, elementwise_aff

In [35]:
mim = SimMIM(
    encoder = model,
    masking_ratio = 0.5  # they found 50% to yield the best results
).to(device)

epochs = 5000
for epoch_i in range(0, epochs):
    model.train()
    for images in tqdm(pretrain_dataloader):
        images = images.to(device, non_blocking=True, dtype=input_dtype)
        loss = mim(images)
        optimizer.zero_grad(True)
        loss.backward()
        optimizer.step()
        run['pretrain/running/loss'].log(loss)
        
        

100%|██████████| 2/2 [00:00<00:00,  7.72it/s]
100%|██████████| 2/2 [00:00<00:00,  7.63it/s]
100%|██████████| 2/2 [00:00<00:00,  8.33it/s]
100%|██████████| 2/2 [00:00<00:00,  8.31it/s]
100%|██████████| 2/2 [00:00<00:00,  8.83it/s]
100%|██████████| 2/2 [00:00<00:00,  8.64it/s]
100%|██████████| 2/2 [00:00<00:00,  8.68it/s]
100%|██████████| 2/2 [00:00<00:00,  9.12it/s]
100%|██████████| 2/2 [00:00<00:00,  8.89it/s]
100%|██████████| 2/2 [00:00<00:00,  8.93it/s]
100%|██████████| 2/2 [00:00<00:00,  8.97it/s]
100%|██████████| 2/2 [00:00<00:00,  8.85it/s]
100%|██████████| 2/2 [00:00<00:00,  8.91it/s]
100%|██████████| 2/2 [00:00<00:00,  8.89it/s]
100%|██████████| 2/2 [00:00<00:00,  8.89it/s]
100%|██████████| 2/2 [00:00<00:00,  8.80it/s]
100%|██████████| 2/2 [00:00<00:00,  8.61it/s]
100%|██████████| 2/2 [00:00<00:00,  9.05it/s]
 50%|█████     | 1/2 [00:00<00:00,  4.18it/s]


KeyboardInterrupt: 

In [None]:
torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            }, f'../submission/net.pt')

In [36]:
#optimizer = torch.optim.AdamW(model.parameters(), lr = 5e-5)
#optimizer.load_state_dict(ckpt['optimizer_state_dict'], )

optimizer = torch.optim.SGD([{'params':model.patch_transformer.parameters(), 'lr':5e-4},
                            {'params':model.cls_transformer.parameters(), 'lr':5e-3},
                            {'params':model.mlp_head.parameters(), 'lr':.05}]
                            , lr = 5e-4, momentum=0.9)
loss_fn = torch.nn.BCEWithLogitsLoss()

In [37]:
epochs = 5000

for epoch_i in range(0, epochs):
    
    epoch_loss, epoch_acc = 0, 0
    
    model.train()
    
    for anchor, pos, neg in tqdm(train_train_dataloader):

        optimizer.zero_grad(True)
        
        anchor = anchor.to(device, non_blocking=True, dtype=input_dtype)
        pos = pos.to(device, non_blocking=True, dtype=input_dtype)
        neg = neg.to(device, non_blocking=True, dtype=input_dtype)
        
        query = torch.cat([anchor, anchor], dim=0)
        reference = torch.cat([pos, neg], dim=0)
        labels = torch.cat([torch.ones(pos.shape[0],1), torch.zeros(neg.shape[0],1)], dim=0).to(device)

        logits = model(query=query, reference=reference)
        loss = loss_fn(logits, labels)
        
        loss.backward()
        optimizer.step()
        
        run['running/loss'].log(loss)
        
        # accuracy
        preds = torch.nn.Sigmoid()(logits).round().detach().cpu().numpy()
        acc = accuracy_score(labels.detach().cpu().numpy(), preds)
        run['running/acc'].log(acc)
        
        epoch_loss += loss
        epoch_acc += acc
        
    run['epoch/train/loss'].log(epoch_loss / len(train_train_dataloader))
    run['epoch/train/acc'].log(epoch_acc / len(train_train_dataloader))
    
    if epoch_i % 20 == 0:
        m_ap = map_score(valid_eval_dataloader, model)
        run['epoch5/valid/map'].log(m_ap)
        
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, f'../artifacts/net.pt')



100%|██████████| 2/2 [00:00<00:00,  2.01it/s]
100%|██████████| 1/1 [00:00<00:00, 17.91it/s]


MaP:  nan


  0%|          | 0/2 [00:00<?, ?it/s]


TypeError: 'float' object is not callable

Error occurred during asynchronous operation processing: Cannot log infinite or NaN value to attribute epoch5/valid/map
Info (NVML): The operating system has blocked the request.. GPU usage metrics may not be reported. For more information, see https://docs-legacy.neptune.ai/logging-and-managing-experiment-results/logging-experiment-data.html#hardware-consumption 
Exception in thread NeptuneReporting:
Traceback (most recent call last):
  File "C:\Users\vcall\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1009, in _bootstrap_inner
    self.run()
  File "c:\repos\belugas\env\lib\site-packages\neptune\new\internal\threading\daemon.py", line 53, in run
    self.work()
  File "c:\repos\belugas\env\lib\site-packages\neptune\new\internal\hardware\hardware_metric_reporting_job.py", line 119, in work
    metric_reports = self._metric_reporter.report(time.time())
  File "c:\repos\belugas\env\lib\site-packages\neptune\internal\hardware\metrics\reports\metric_reporter.py", line 32

In [44]:
torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            }, f'../submission/net.pt')


In [None]:
# Optimal batch size for inference
"""
5vcpus
52 ram
12 ram gpu

model to eval, optim bactch size calculate with tqdm
"""
model.eval()

TEST_INFER_BS_INIT = 10
TEST_INFER_BS_ML = 10

NUM_WORKERS = 0

class DummyTest(torch.utils.data.Dataset):
  def __init__(self):
    super().__init__()

  def __len__(self):
    return 7000000

  def __getitem__(self, idx):
    return torch.zeros((3,224,224)), torch.zeros((3,224,224))

BS = TEST_INFER_BS_INIT = 5
while True:
  i = 0
  BS = BS + TEST_INFER_BS_ML
  dataloader = torch.utils.data.DataLoader(
                          DummyTest(), 
                          batch_size=BS,
                          shuffle=False, 
                          num_workers=0,
                          pin_memory=True
                      )   

  for batch in tqdm(dataloader, total=len(dataloader)):
    query = batch[0].to(device, non_blocking=True, dtype=torch.float32)
    reference = batch[1].to(device, non_blocking=True, dtype=torch.float32)
    with torch.no_grad():
      logits, attn, q_cls, r_cls = model(query=query, reference=reference)
      i+=1
    
    if i == 50:
      print(BS)
      print(torch.cuda.mem_get_info(device=0))
      break

In [6]:
fp16 = True 
input_dtype = torch.float16 if fp16 else torch.float32

scaler =  torch.cuda.amp.GradScaler(enabled=fp16)

model = crossvit_base_224().to(device)
input = torch.zeros((2,3,224,224), dtype=input_dtype, device=device)
loss_fn = torch.nn.BCEWithLogitsLoss()
labels = torch.ones((2,1), dtype=input_dtype, device=device)
optimizer = torch.optim.Adam(model.parameters(), lr = 5e-4)

warmup, reps = 30, 10

for i in range(0, warmup):
    
        with torch.cuda.amp.autocast(enabled = fp16):
            logits, attn, q_cls, r_cls = model(query=input, reference=input)
            loss = loss_fn(logits, labels)
            
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    profile_memory=True,
) as prof:
    print(torch.cuda.mem_get_info(0))
    for i in range(0, reps):
        
        with torch.cuda.amp.autocast(enabled = fp16):
            
            logits, attn, q_cls, r_cls = model(query=input, reference=input)
            loss = loss_fn(logits, labels)
            
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        

(6465060864, 8589737984)


In [7]:
print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total', row_limit=25))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          aten::reshape         5.00%     566.319ms        11.90%        1.346s     153.356us     544.024ms         4.84%        1.311s     149.324us           0 b           0 b     542.46 Mb           0 

In [16]:
# layers x bs x (q,ref) x cross_attn_depth x n_heads x 1 x tokens(inc cls)
attn[0][0][0].shape

torch.Size([2, 12, 1, 197])

In [20]:
print(len(attn))
print(len(attn[0]))
print(len(attn[0][0]))
attn[0][0][0].shape

3
2
2


torch.Size([2, 12, 1, 197])