In [6]:
# --- Environment detection and project imports -------------------------------
# The flag is True when the directory /kaggle/input exists, which this notebook
# uses as the signal that it is running inside a Kaggle kernel.
import os
ON_KAGGLE_KERNEL = os.path.isdir("/kaggle/input")
# Remember the working directory so it can be restored after the imports below.
start_dir = os.getcwd()

# The project helper modules are imported by temporarily changing into the
# directory that contains them.
# NOTE(review): if PYTHONPATH is unset, os.environ.get returns None and the
# path below becomes "None/src/utils" - confirm PYTHONPATH is always set locally.
if ON_KAGGLE_KERNEL:
    os.chdir("/kaggle/input/utilities/")
else:
    os.chdir(f"{os.environ.get('PYTHONPATH')}/src/utils")

from common import load_structure, save_structure, load_train_file, INPUT_DATA_DIR, OUTPUT_DATA_DIR, SUB_DIR, set_seed, create_dir_if_not_exists
from data_proc import create_idvs_with_one_img_in_train_split, create_train_val_loaders, split_into_train_val
from data_structures import WhaleDataset, TorchConfig
# Restore the original working directory now that the imports succeeded.
os.chdir(start_dir)

print(INPUT_DATA_DIR)

# Local-only conveniences: auto-reload edited modules and display the value of
# every top-level expression in a cell (not only the last one).
if not ON_KAGGLE_KERNEL:
    %reload_ext autoreload
    %autoreload 2
    from IPython.core.interactiveshell import InteractiveShell
    InteractiveShell.ast_node_interactivity = 'all'

# --- Third-party imports ------------------------------------------------------
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
import gc
import time
import copy
from collections import defaultdict
# timm and wandb are installed on demand when the import fails.
# NOTE(review): the installs are unpinned (`--upgrade`), so results may differ
# between runs as new releases appear.
try:
    import timm
except Exception:
    !pip install --upgrade timm
    import timm
try:
    import wandb
except Exception:
    !pip install --upgrade wandb
    import wandb

# Make sure the output and submission directories are available before any
# later cell writes to them.
create_dir_if_not_exists(OUTPUT_DATA_DIR)
create_dir_if_not_exists(SUB_DIR)


/home/paul/projects/Happywhale_competition/data


In [7]:
# Log in to Weights & Biases on Kaggle with the API key stored in the Kaggle
# user secret named "wandb_api". The login is best-effort: on failure the
# notebook keeps running and `wandb_active` records that the login did not work.
# NOTE(review): `wandb_active` is only assigned on Kaggle; outside Kaggle the
# name stays undefined - confirm no later cell reads it unconditionally.
if ON_KAGGLE_KERNEL:
    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        api_key = user_secrets.get_secret("wandb_api")
        wandb.login(key=api_key)
        wandb_active = True
    except Exception as err:
        # `Exception` (not a bare `except:`) so that KeyboardInterrupt and
        # SystemExit still propagate; the reason is reported instead of being
        # silently discarded.
        print(f"[WARN] wandb login failed: {err!r}")
        wandb_active = False

In [8]:
# Build the default training configuration and display it as a dict.
# The dict is shown *before* the local batch-size override below, so the
# displayed `train_batch_size` is the default value, not the overridden one.
conf = TorchConfig.default()
conf.dict()
# Seed with the configured seed.
set_seed(conf.seed)

# Use a smaller training batch when running outside Kaggle.
# NOTE(review): presumably to fit the local GPU's memory - confirm.
if not ON_KAGGLE_KERNEL:
    conf.train_batch_size = 8

{'seed': 319,
 'epochs': 10,
 'img_size': 448,
 'augm_args': {'hor_flip': {'p': 0.5},
  'ver_flip': {'p': 0.5},
  'rot': {'p': 0.5, 'limit': 30}},
 'model_name': 'tf_efficientnet_b0',
 'num_classes': 15587,
 'train_batch_size': 32,
 'valid_batch_size': 64,
 'optim': torch.optim.adam.Adam,
 'optim_args': {'lr': 0.0001, 'weight_decay': 1e-06},
 'scheduler': torch.optim.lr_scheduler.CosineAnnealingLR,
 'scheduler_args': {'T_max': 500, 'eta_min': 1e-06},
 'n_fold': 5,
 'init_optim_': None,
 'init_sched_': None}

In [9]:
# Build the data frame used for the train/validation split.
# NOTE(review): judging by the helper's name, individuals with a single image
# are kept in the training part - confirm in data_proc.
df = create_idvs_with_one_img_in_train_split()

Encoding labels of df...
Not saving /home/paul/projects/Happywhale_competition/data/id_encoding.pickle!
/home/paul/projects/Happywhale_competition/data/splits/idvs_with_one_img_in_train.pickle saved.




In [10]:
# Split into training and validation frames and show their sizes.
df_train, df_val = split_into_train_val(df)
len(df_train), len(df_val)

(40589, 10444)

In [11]:
# Image transforms from the config; later cells index this with the keys
# "train" and "valid".
transforms = conf.make_transforms()

In [12]:
# Wrap each frame in a dataset, pairing it with the matching transform.
train_set = WhaleDataset(df_train, transforms["train"])
val_set = WhaleDataset(df_val, transforms["valid"])

In [13]:
# Data loaders for both datasets; `run_training` reads these two names as
# notebook-level globals.
# NOTE(review): batch sizes are not passed here - presumably the helper reads
# them from its own defaults/config; confirm the override of
# `conf.train_batch_size` actually reaches it.
train_loader, val_loader = create_train_val_loaders(train_set, val_set)

In [14]:
# Use the first CUDA GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [15]:
# Create the timm model named in the config with pretrained weights and a
# classification head of `conf.num_classes` outputs, then move it to `device`.
model = timm.create_model(conf.model_name, pretrained=True, num_classes=conf.num_classes)
model.to(device);
# Flag flipped to True by the training cell once training has completed.
MODEL_TRAINED = False

EfficientNet(
  (conv_stem): Conv2dSame(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
  (bn1): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  (act1): SiLU(inplace=True)
  (blocks): Sequential(
    (0): Sequential(
      (0): DepthwiseSeparableConv(
        (conv_dw): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
        (bn1): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
        (act1): SiLU(inplace=True)
        (se): SqueezeExcite(
          (conv_reduce): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
          (act1): SiLU(inplace=True)
          (conv_expand): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
          (gate): Sigmoid()
        )
        (conv_pw): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn2): BatchNorm2d(16, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
        (act2): Identity()
      )
    )
 

In [16]:
def criterion(outputs, labels):
    """Return the cross-entropy loss between ``outputs`` and ``labels``.

    Uses the default cross-entropy settings, so the result is a scalar tensor
    averaged over the batch.
    """
    loss = nn.functional.cross_entropy(outputs, labels)
    return loss

In [17]:
# Optimizer built by the config for this model.
optim = conf.get_optim(model)

In [18]:
# Learning-rate scheduler built by the config.
# NOTE(review): no optimizer is passed, so this presumably relies on state
# stored by `conf.get_optim(model)` - confirm it must run after that cell.
scheduler = conf.get_scheduler()

In [19]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train ``model`` for one pass over ``dataloader`` and return the mean loss.

    The loss is computed with the notebook-level ``criterion`` function.

    Args:
        model: Network to optimise; it is switched to training mode here.
        optimizer: Optimizer stepped (and then zeroed) after every batch.
        scheduler: Learning-rate scheduler stepped after every batch, or
            ``None`` to keep the learning rate untouched.
        dataloader: Sized iterable of batches; each batch is indexed with the
            keys ``"image"`` and ``"label"``.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, only used for the progress-bar postfix.

    Returns:
        The running mean loss over all samples seen (each batch loss weighted
        by its batch size), or ``nan`` when ``dataloader`` yields no batches.
    """
    model.train()

    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader yields NaN instead of raising
    # UnboundLocalError at the return statement.
    epoch_loss = float("nan")

    bar = tqdm(dataloader, total=len(dataloader))
    for data in bar:
        images = data['image'].to(device, dtype=torch.float)
        labels = data['label'].to(device, dtype=torch.long)

        batch_size = images.size(0)

        outputs = model(images)
        loss = criterion(outputs, labels)

        # No gradient accumulation: every batch triggers an optimizer step.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # The scheduler advances once per batch, not once per epoch.
        if scheduler is not None:
            scheduler.step()

        # Weight the batch loss by its size so a smaller final batch does not
        # skew the epoch average.
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()

    return epoch_loss

In [20]:
@torch.inference_mode()
def valid_one_epoch(model, dataloader, device, epoch):
    """Evaluate ``model`` on ``dataloader`` and return the mean loss.

    Runs under ``torch.inference_mode`` with the model in eval mode; the loss
    is computed with the notebook-level ``criterion`` function. The model is
    left in eval mode afterwards.

    Args:
        model: Network to evaluate.
        dataloader: Sized iterable of batches; each batch is indexed with the
            keys ``"image"`` and ``"label"``.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, only used for the progress-bar postfix.

    Returns:
        The running mean loss over all samples seen (each batch loss weighted
        by its batch size), or ``nan`` when ``dataloader`` yields no batches.
    """
    model.eval()

    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader yields NaN instead of raising
    # UnboundLocalError at the return statement.
    epoch_loss = float("nan")

    bar = tqdm(dataloader, total=len(dataloader))
    for data in bar:
        images = data['image'].to(device, dtype=torch.float)
        labels = data['label'].to(device, dtype=torch.long)

        batch_size = images.size(0)

        outputs = model(images)
        loss = criterion(outputs, labels)

        # Weight the batch loss by its size so a smaller final batch does not
        # skew the epoch average.
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss)

    gc.collect()

    return epoch_loss

In [21]:
def run_training(model, optimizer, scheduler, num_epochs):
    """Train ``model`` for ``num_epochs`` epochs and keep the best weights.

    Relies on the notebook-level globals ``train_loader``, ``val_loader`` and
    ``device``, and on an active wandb run (``wandb.init`` must have been
    called before this function).

    After every epoch the train and validation losses are appended to the
    history and logged to wandb. Whenever the validation loss is less than or
    equal to the best one so far, the weights are remembered, written to
    ``Loss<loss>_epoch<epoch>.bin`` in the current directory, and the wandb
    summary entry ``"Best Loss"`` is updated.

    Args:
        model: Network to train.
        optimizer: Optimizer passed to ``train_one_epoch``.
        scheduler: Learning-rate scheduler passed to ``train_one_epoch``.
        num_epochs: Number of epochs to run (epochs are numbered from 1).

    Returns:
        ``(model, history)`` where ``model`` has the best weights loaded and
        ``history`` maps ``"Train Loss"`` / ``"Valid Loss"`` to per-epoch lists.
    """
    # To automatically log gradients
    wandb.watch(model, log_freq=100)

    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        train_epoch_loss = train_one_epoch(
            model,
            optimizer,
            scheduler,
            dataloader=train_loader,
            device=device,
            epoch=epoch,
        )

        val_epoch_loss = valid_one_epoch(model, val_loader, device=device, epoch=epoch)

        history["Train Loss"].append(train_epoch_loss)
        history["Valid Loss"].append(val_epoch_loss)

        # Log both metrics in one call: every wandb.log call advances the wandb
        # step, so separate calls would put the two losses of the same epoch
        # on different steps.
        wandb.log({"Train Loss": train_epoch_loss, "Valid Loss": val_epoch_loss})

        # deep copy the model
        if val_epoch_loss <= best_epoch_loss:
            print(f"Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            # Use the active run tracked by wandb instead of a notebook global
            # named `run`, so the function does not depend on how the caller
            # named its run object.
            if wandb.run is not None:
                wandb.run.summary["Best Loss"] = best_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = "Loss{:.4f}_epoch{:.0f}.bin".format(best_epoch_loss, epoch)
            # Save a model file in the current directory
            torch.save(model.state_dict(), PATH)
            print("Model Saved.")

        print()

    # Truncate to whole seconds before splitting, so the seconds field can
    # never be rounded up to "60s".
    time_elapsed = int(time.time() - start)
    hours, remainder = divmod(time_elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print("Training complete in {:.0f}h {:.0f}m {:.0f}s".format(hours, minutes, seconds))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history


In [22]:
# Start a wandb run and train inside it. Using the run as a context manager
# guarantees it is finished even when training raises (e.g. a DataLoader
# worker error), instead of leaving the run open until the kernel is restarted.
with wandb.init(
    project="HappyWhale",
    config=conf.dict(),
    job_type="Train",
    tags=["Standard classifier", "efficientnet_b0", "448"],
    anonymous="must",
) as run:
    model, history = run_training(model, optim, scheduler, num_epochs=conf.epochs)
    # Only reached when training completed without an exception.
    MODEL_TRAINED = True


[34m[1mwandb[0m: Currently logged in as: [33mpauldanielml[0m (use `wandb login --relogin` to force relogin)


[INFO] Using GPU: NVIDIA GeForce GTX 1650



  0%|          | 0/1268 [00:01<?, ?it/s]


TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/paul/projects/Happywhale_competition/env/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/paul/projects/Happywhale_competition/env/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    return self.collate_fn(data)
  File "/home/paul/projects/Happywhale_competition/env/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 86, in default_collate
    raise TypeError(default_collate_err_msg_format.format(elem_type))
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>


In [None]:
from torch.utils.data import DataLoader
import torch.nn.functional as F
import json

@torch.inference_mode()
def generate_predictions_pt(model: nn.Module) -> list:
    """Predict the five most likely individuals for every test image.

    Reads ``sample_submission.csv`` from ``INPUT_DATA_DIR`` for the list of
    test images, runs ``model`` over them with the ``"valid"`` transforms and
    decodes the top-5 class indices with the stored ``"id_encoding"`` encoder.
    Relies on the notebook-level globals ``transforms`` and ``device``.

    Args:
        model: Trained classifier. It is evaluated in eval mode and its
            previous train/eval mode is restored before returning.

    Returns:
        A list with one dict per test image, in file order:
        ``{"image": <image name>, "top5": <JSON string mapping individual id
        to softmax score>}``. The JSON keys are ordered by decreasing score.
    """
    df = pd.read_csv(INPUT_DATA_DIR / "sample_submission.csv").drop(columns="predictions")
    test_dataset = WhaleDataset(df, transforms["valid"], labels=False)
    test_loader = DataLoader(dataset=test_dataset, num_workers=4, batch_size=32)

    enc = load_structure("id_encoding")

    # Inference must not depend on whichever mode the caller left the model in.
    was_training = model.training
    model.eval()

    data = []
    try:
        for batch in test_loader:
            # Same dtype conversion as in the train/validation loops.
            imgs = batch["image"].to(device, dtype=torch.float)
            # Explicit class dimension; this is what the deprecated implicit
            # `dim` resolved to for 2-D (batch, classes) logits.
            res = F.softmax(model(imgs), dim=1)

            top = res.cpu().topk(5)
            for idx, val in zip(top.indices.numpy().tolist(), top.values.numpy().tolist()):
                ids = enc.inverse_transform(idx).tolist()
                data.append(dict(zip(ids, val)))
    finally:
        model.train(was_training)

    # Pair image names with predictions column-wise rather than with a
    # per-row `df.iloc` lookup.
    return_vals = [{"image": image, "top5": json.dumps(d)} for image, d in zip(df["image"], data)]
    return return_vals

In [None]:
def predict_new_ind(df: pd.DataFrame, softmax_thr: float):
    """Turn the ``top5`` scores into a submission string per row.

    Each ``top5`` cell is a JSON object mapping individual id to score. Ids
    are taken in their stored order while their score is above
    ``softmax_thr``. At the first id whose score is not above the threshold,
    ``"new_individual"`` is inserted in its place, the remaining ids follow,
    and the last id is dropped so the number of entries stays the same.

    Args:
        df: Frame with a ``"top5"`` column of JSON strings; it is not modified.
        softmax_thr: Score an id must exceed to be ranked ahead of
            ``"new_individual"``.

    Returns:
        A deep copy of ``df`` with a space-separated ``"predictions"`` column
        added and the ``"top5"`` column removed.
    """

    def _rank_candidates(encoded: str) -> str:
        scores = json.loads(encoded)
        candidates = list(scores)

        # Position of the first candidate that fails the threshold, if any.
        cutoff = next(
            (pos for pos, name in enumerate(candidates) if not scores[name] > softmax_thr),
            None,
        )
        if cutoff is None:
            ranked = candidates
        else:
            ranked = candidates[:cutoff] + ["new_individual"] + candidates[cutoff:-1]

        return " ".join(ranked)

    result = df.copy(deep=True)
    result["predictions"] = result["top5"].apply(_rank_candidates)
    return result.drop(columns="top5")

In [None]:
# Run inference on the test images and collect the per-image records
# ("image", "top5") into a data frame.
pred = pd.DataFrame(generate_predictions_pt(model))

In [None]:
# Insert "new_individual" using a softmax threshold of 0.04 and index the
# result by image name. The same threshold is spelled out in the submission
# file name written by the next cell - keep the two in sync.
df_thr = predict_new_ind(pred, 0.04).set_index("image")

In [None]:
# Write the submission file into SUB_DIR; the "0,04" in the name refers to the
# threshold used when building `df_thr`.
df_thr.to_csv(SUB_DIR / "pt_class_single_img_idv_in_train_thr_0,04.csv")