### Regression notebook for Wadhwani AI competition

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import os
import gc
import random
from glob import glob
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import warnings
import pickle
import json
import re
import time
import sys
from requests import get
import multiprocessing
import joblib
import torch
from torch.utils.data import Dataset, DataLoader
import transformers
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import timm
from sklearn.preprocessing import minmax_scale
import matplotlib.pyplot as plt
import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2,torchvision
from ipyexperiments.ipyexperiments import IPyExperimentsPytorch
from timm.optim.optim_factory import create_optimizer_v2
from timm import utils
from fastprogress.fastprogress import format_time
from fastai.vision.all import *
from torchvision.models.detection import FasterRCNN
import yolov5
class CFG:
    """Experiment-wide constants read by the rest of the notebook."""

    # Reproducibility and cross-validation.
    seed = 46
    n_splits = 5
    debug = False

    # Square side length passed to A.Resize for the train/valid pipelines.
    SZ = 1280

    # Embedded in KERNEL_TYPE as "bs"/"ep" — presumably batch size and epoch
    # count; TODO confirm against the training cell.
    BS = 8
    EP = 10

    # timm backbone name, plus the values embedded in KERNEL_TYPE as "lr"/"wd".
    MODEL = 'tf_efficientnet_b0_ns'
    LR = 5e-3
    WD = 1e-8

# Seed the stdlib and NumPy global RNGs from the shared config seed.
random.seed(CFG.seed)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here likely only affects child processes — confirm.
os.environ["PYTHONHASHSEED"] = str(CFG.seed)
np.random.seed(CFG.seed)
# Notebook-wide display settings: larger plot font, all warnings silenced.
plt.rcParams["font.size"] = 13
warnings.filterwarnings('ignore')

In [32]:
# `set_seed` is not defined in this file; presumably the fastai helper pulled
# in by `from fastai.vision.all import *` — TODO confirm.
set_seed(CFG.seed)

In [33]:
# Root of the competition data and the image folder inside it.
DIR = '///mnt/c/Personal/Competitions/Zindi/Wadhwani AI/data/'
IMG_PATH = '///mnt/c/Personal/Competitions/Zindi/Wadhwani AI/data/images'
# Competition tables: `train` is reshaped by make_train_dataset below and
# `test_df` feeds the test-time loaders.
submit = pd.read_csv(os.path.join(DIR,'SampleSubmission.csv'))
train = pd.read_csv(os.path.join(DIR,'Train.csv'))
test_df = pd.read_csv(os.path.join(DIR,'Test.csv'))

# Experiment tag; it is also the name of this run's output folder.
VERSION = "NB_EXP_V0_009_Regression"
MODEL_FOLDER = Path(f"///mnt/c/Personal/Competitions/Zindi/Wadhwani AI/runs/Regression/{VERSION}/")
os.makedirs(MODEL_FOLDER,exist_ok=True)
# Run identifier assembled from the hyper-parameters. Every '-' is stripped
# from LR and WD, so a weight decay of 1e-08 is rendered as "1e08".
KERNEL_TYPE = f"{CFG.MODEL}_{CFG.SZ}_bs{CFG.BS}_ep{CFG.EP}_lr{str(CFG.LR).replace('-','')}_wd{str(CFG.WD).replace('-','')}"

print(MODEL_FOLDER)
print(KERNEL_TYPE)

/mnt/c/Personal/Competitions/Zindi/Wadhwani AI/runs/Regression/NB_EXP_V0_009_Regression
tf_efficientnet_b0_ns_1280_bs8_ep10_lr0.005_wd1e08


In [34]:
def _counts_for_worm_type(source, worm_type):
    """Return one row per unique image with its count for ``worm_type``.

    Images that have no row of that worm type in ``source`` get a count of 0.
    """
    image_ids = pd.DataFrame({'image_id_worm': source['image_id_worm'].unique()})
    of_type = source[source['worm_type'] == worm_type].reset_index(drop=True)
    counts = pd.merge(image_ids, of_type, on='image_id_worm', how='left')
    counts['worm_type'] = worm_type
    # Only the count column survives the pivot, so only it needs filling.
    counts['number_of_worms'] = counts['number_of_worms'].fillna(0)
    return counts


def make_train_dataset(df=None):
    """Reshape the long annotation table into one row per image.

    Args:
        df: Long-format frame with ``image_id_worm``, ``worm_type`` and
            ``number_of_worms`` columns. Defaults to the module-level
            ``train`` frame, which keeps existing zero-argument calls working.

    Returns:
        A frame with one row per image and the columns ``image_id_worm``,
        ``abw``, ``pbw`` (integer counts, 0 when the image has no row of that
        type), ``abw_bins``, ``pbw_bins`` (each count cut into 10 equal-width
        bins labelled '0'..'9') and ``consol_bins`` (``"<abw_bin>_<pbw_bin>"``).

    Raises:
        AssertionError: If an image has more than one row for a worm type.
    """
    source = train if df is None else df
    worm_types = ('pbw', 'abw')

    # `axis` and the pivot arguments are passed by keyword: pandas 2.x made
    # them keyword-only, so the positional forms raise TypeError there.
    train_out = pd.concat(
        [_counts_for_worm_type(source, worm_type) for worm_type in worm_types],
        axis=0,
    ).reset_index(drop=True)

    expected_rows = source['image_id_worm'].nunique() * len(worm_types)
    assert len(train_out) == expected_rows, (
        "expected exactly one row per image and worm type"
    )

    train_out = train_out.pivot(
        index='image_id_worm',
        columns='worm_type',
        values='number_of_worms',
    ).reset_index()
    train_out[['abw', 'pbw']] = train_out[['abw', 'pbw']].astype(int)

    labels = [str(i) for i in range(10)]
    train_out['abw_bins'] = pd.cut(train_out['abw'], 10, labels=labels)
    train_out['pbw_bins'] = pd.cut(train_out['pbw'], 10, labels=labels)
    train_out['consol_bins'] = (
        train_out['abw_bins'].astype(str) + '_' + train_out['pbw_bins'].astype(str)
    )

    return train_out

# Wide per-image table built from the module-level `train` frame.
train_new = make_train_dataset()

In [35]:
# Peek at the first row of the reshaped training table.
train_new.head(1)

worm_type,image_id_worm,abw,pbw,abw_bins,pbw_bins,consol_bins
0,id_0002ea6f15c7fa6f4c221783.jpg,0,51,0,0,0_0


In [36]:
# Row/column count of the reshaped training table.
train_new.shape

(9737, 6)

### Get k-folds

In [37]:
# Two index-aligned flat lists: an image file name and the fold whose split
# file listed it.
train_files = []
val_files = []  # declared but never filled in the visible cells
fold = []

for folds in range(5):
    # Each split file holds one path per line; keep only the part after the last '/'.
    files = [
        entry.split("/")[-1]
        for entry in pd.read_csv(f'///mnt/c/Personal/Competitions/Zindi/Wadhwani AI/data/splits/fold{folds}.txt',header=None)[0]
    ]
    train_files.extend(files)
    fold.extend([folds] * len(files))

# file name -> fold id; images missing from every split file map to NaN.
fold_dict = dict(zip(train_files, fold))
train_new['fold'] = train_new['image_id_worm'].map(fold_dict)


In [38]:
# Keep only the images that received a fold from the split files.
has_fold = train_new['fold'].notna()
train_new = train_new[has_fold].reset_index(drop=True)
train_new.shape

(7079, 7)

In [39]:
# mskf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=42)
# fold_ids = []
# train_new['fold'] = 0

# for train_index, test_index in mskf.split(train_new, train_new['consol_bins']):
#     fold_ids.append(test_index)
    
# for fld in range(CFG.n_splits):
#     valIx = fold_ids[fld]
#     train_new.loc[valIx,'fold']=fld 

In [40]:
# Sanity check: number of images per fold.
train_new['fold'].value_counts()

0.0    1418
4.0    1417
3.0    1417
1.0    1416
2.0    1411
Name: fold, dtype: int64

In [41]:
# Sanity check: no row should be left without a fold after the filter above.
train_new['fold'].isna().sum()

0

In [42]:
# Peek at the first row of the test table.
test_df.head(1)

Unnamed: 0,image_id_worm
0,id_00332970f80fa9a47a39516d.jpg


#### Data loader

In [43]:
class WadhwaniDataset(Dataset):
    """Image dataset for the worm-count regression task.

    Args:
        df: Frame with an ``image_id_worm`` column and, unless
            ``mode == 'test'``, ``abw`` and ``pbw`` count columns.
        mode: ``'test'`` makes ``__getitem__`` return the image only; any
            other value returns ``(image, label)``.
        augs: Optional callable invoked as ``augs(image=ndarray)['image']``.
    """

    def __init__(self,
                 df=train_new,
                 mode='train',
                 augs=None):
        self.augs = augs
        self.df = df
        self.mode = mode

    def __len__(self):
        return len(self.df)

    def __getitem__(self, ix):
        """Return sample ``ix``, where ``ix`` is a position in ``self.df``."""
        # Samplers hand out positions 0..len-1, so index by position rather
        # than by label; this also works for frames whose index was not reset.
        row = self.df.iloc[ix]
        img_path = f"{IMG_PATH}/{row['image_id_worm']}"

        # Close the file handle as soon as the pixels are decoded.
        with Image.open(img_path) as raw:
            img = np.array(raw.convert("RGB"))

        if self.augs is not None:
            img = self.augs(image=img)['image']

        if self.mode == 'test':
            return img

        # float32 target ordered as (abw, pbw).
        label = torch.tensor([float(row['abw']), float(row['pbw'])],
                             dtype=torch.float32)
        return img, label

In [44]:
class YOLOv5TorchBackend(nn.Module):
    # YOLOv5 MultiBackend class for python inference on various backends
    # (this variant only handles PyTorch ``*.pt`` checkpoints).
    def __init__(self, weights='yolov5s.pt', device=torch.device('cpu'), data=None, fp16=False, fuse=True):
        """Load a YOLOv5 checkpoint and keep it as ``self.model``.

        Args:
            weights: Checkpoint path, or a list of paths (the first entry is
                the one passed to ``attempt_download``).
            device: Device handed to ``attempt_load``; also the target of
                ``from_numpy``.
            data: Not used by this constructor beyond being stored on ``self``.
            fp16: If true the model is converted with ``.half()``, otherwise
                ``.float()``.
            fuse: Forwarded to ``attempt_load``.
        """
        # Usage:
        #   PyTorch:              weights = *.pt
        
        # NOTE(review): assigned before nn.Module.__init__ has run. This looks
        # fine for a plain bool, but consider moving it below super().__init__().
        self.pt = True
        
        # Imported lazily, so yolov5 internals are only needed on construction.
        from yolov5.models.experimental import (attempt_download, attempt_load)

        super().__init__()
        w = str(weights[0] if isinstance(weights, list) else weights)
        w = attempt_download(w)  # download if not local
        fp16 &= True  # no-op: AND with True leaves fp16 unchanged
        stride = 32  # default stride
        cuda = torch.cuda.is_available() and device.type != 'cpu'  # use CUDA

        model = attempt_load(weights if isinstance(weights, list) else w, device=device, inplace=True, fuse=fuse)
        stride = max(int(model.stride.max()), 32)  # model stride
        names = model.module.names if hasattr(model, 'module') else model.names  # get class names
        model.half() if fp16 else model.float()
        self.model = model

        # Copies every local (weights, device, data, fp16, fuse, w, stride,
        # cuda, model, names, the imported helpers and `self`) into the
        # instance dict; forward() and from_numpy() read self.fp16 and
        # self.device, which are set here.
        self.__dict__.update(locals())  # assign all variables to self

    def forward(self, im, augment=False, visualize=False):
        """Run the wrapped model on a 4-D batch ``im`` and return its output.

        A list/tuple result with a single element is unwrapped; a longer one
        is returned as a list; anything else is returned as is.
        """
        # YOLOv5 MultiBackend inference
        # The unpacked names are unused; the unpacking itself fails unless
        # ``im`` has exactly four dimensions.
        b, ch, h, w = im.shape  # batch, channel, height, width
        if self.fp16 and im.dtype != torch.float16:
            im = im.half()  # to FP16

        # augment/visualize are forwarded only when one of them is set.
        y = self.model(im, augment=augment, visualize=visualize) if augment or visualize else self.model(im)

        if isinstance(y, (list, tuple)):
            return self.from_numpy(y[0]) if len(y) == 1 else [self.from_numpy(x) for x in y]
        else:
            return self.from_numpy(y)

    def from_numpy(self, x):
        """Convert a NumPy array to a tensor on ``self.device``; return other inputs unchanged."""
        return torch.from_numpy(x).to(self.device) if isinstance(x, np.ndarray) else x

In [45]:
class FeatureMapsAverager(nn.Module):
    """Collapse every feature map to its spatial mean.

    The input is pooled to 1x1 and flattened from dimension 1 onward, so an
    ``(N, C, H, W)`` input yields an ``(N, C)`` output.
    """

    def __init__(self):
        super().__init__()
        # Read by the host model when this layer replaces its last module —
        # presumably YOLOv5's "from" index and layer index; TODO confirm.
        self.f = -1
        self.i = True
        self.pool = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        """Return the per-channel spatial average of ``x``."""
        return self.pool(x).flatten(1)
    


In [46]:
class WadhwaniYoloFeatDataset(Dataset):
    """Image dataset that also returns features from a YOLO model.

    Args:
        df: Frame with an ``image_id_worm`` column and, unless
            ``mode == 'test'``, ``abw`` and ``pbw`` count columns.
        mode: ``'test'`` returns ``(image, features)``; any other value
            returns ``(image, features, label)``.
        yolo: Callable applied to the augmented image with a leading batch
            dimension added.
        augs: Callable invoked as ``augs(image=ndarray)['image']``. It must
            return a tensor, because ``unsqueeze`` is called on its result.
    """

    def __init__(self,
                 df=train_new,
                 mode='train',
                 yolo=None,
                 augs=None):
        self.augs = augs
        self.df = df
        self.mode = mode
        self.yolo = yolo

    def __len__(self):
        return len(self.df)

    def __getitem__(self, ix):
        """Return sample ``ix``, where ``ix`` is a position in ``self.df``."""
        # Samplers hand out positions 0..len-1, so index by position rather
        # than by label; this also works for frames whose index was not reset.
        row = self.df.iloc[ix]
        img_path = f"{IMG_PATH}/{row['image_id_worm']}"

        # Close the file handle as soon as the pixels are decoded.
        with Image.open(img_path) as raw:
            img = np.array(raw.convert("RGB"))

        if self.augs is not None:
            img = self.augs(image=img)['image']

        # The detector only provides fixed features here, so no autograd
        # graph should be built or shipped back from loader workers.
        with torch.no_grad():
            yFeats = self.yolo(img.unsqueeze(0))

        if self.mode == 'test':
            return img, yFeats

        # float32 target ordered as (abw, pbw).
        label = torch.tensor([float(row['abw']), float(row['pbw'])],
                             dtype=torch.float32)
        return img, yFeats, label

In [47]:
def worker_init_fn(worker_id):
    """
    Handles PyTorch x Numpy seeding issues.

    Re-seeds NumPy's global RNG with the first word of its current state plus
    ``worker_id``, so each worker id yields a different seed.

    Args:
        worker_id (int): Id of the worker.
    """
    base_seed = int(np.random.get_state()[1][0])
    # np.random.seed only accepts values in [0, 2**32 - 1]; wrap so that a
    # large base seed plus the worker id cannot fall outside that range.
    np.random.seed((base_seed + worker_id) % 2**32)

### Augmentations

In [48]:

# Training pipeline: random flips, shift/scale/rotate, brightness/contrast
# jitter and blur (each applied with p=0.5), then a resize to CFG.SZ x CFG.SZ,
# A.Normalize() with its library defaults, and conversion to a tensor.
TRAIN_AUG = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.ShiftScaleRotate(rotate_limit=45, border_mode=0, p=0.5),
    A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1,p=0.5),
    A.Blur(p=0.5),
#     A.CLAHE(p=0.5),
    A.Resize(CFG.SZ,CFG.SZ,p=1),
    A.Normalize(),
    ToTensorV2(),
])

# Validation pipeline: same resize/normalize/tensor tail, no random transforms.
VALID_AUG = A.Compose([
    A.Resize(CFG.SZ,CFG.SZ,p=1),
    A.Normalize(),
    ToTensorV2(),
])


# Input pipeline for the YOLO feature extractor at 3072 x 3072.
YOLO_AUG = A.Compose([
    A.Resize(3072,3072,p=1),
    A.Normalize(),
    ToTensorV2(),
])


# Same as YOLO_AUG at 3584 x 3584; not referenced by any active cell in the
# visible notebook.
YOLO_AUG1 = A.Compose([
    A.Resize(3584,3584,p=1),
    A.Normalize(),
    ToTensorV2(),
])

In [49]:
# yolo_detection_model = YOLOv5TorchBackend(weights='/mnt/c/Personal/Competitions/Zindi/Wadhwani AI/runs/yolov5l6_exp003/fold 0/fold0/weights/best_fold0.pt', device=torch.device("cpu"))
# model = yolo_detection_model.model
# model.model[-1] = FeatureMapsAverager()
# model.eval()
# a,b,c = next(iter(WadhwaniYoloFeatDataset(augs=TRAIN_AUG,yolo=model)))
# a.shape,b.shape,c.shape

### Visualization

In [50]:
# dataset_show = WadhwaniDataset(train_new, augs=TRAIN_AUG, mode='train')
# loader_show = torch.utils.data.DataLoader(dataset_show, batch_size=6)
# img,target = next(iter(loader_show))

# grid = torchvision.utils.make_grid(img, normalize=True, padding=2)
# grid = grid.permute(1, 2, 0)
# show_image(grid, figsize=(15,8))#, title=[labels_class_map_rev[x] for x in target.numpy()]);

### Model

In [51]:
# def get_wadhwani_regression_model(model_name, pretrained=True, **kwargs):
#     model = timm.create_model(model_name, pretrained=pretrained, **kwargs)
#     model = nn.Sequential(model, nn.Dropout(0.15), nn.Linear(model.num_classes, 2),nn.ReLU())
#     return model

In [52]:
# m = timm.create_model(CFG.MODEL, pretrained=True,num_classes=0)
# m(torch.randn(1,3,124,124)).shape

In [53]:
class get_wadhwani_regression_model(nn.Module):
    """Regress two non-negative values from image and YOLO features.

    A timm backbone (``CFG.MODEL``, created with ``num_classes=0``) embeds the
    image. Its output is concatenated with the externally supplied YOLO
    features and passed through ``Linear(1280 + 1024, 2)`` followed by ReLU.
    Presumably 1280 is the backbone width and 1024 the YOLO feature width —
    TODO confirm.
    """

    def __init__(self):
        super().__init__()
        backbone = timm.create_model(
            CFG.MODEL,
            pretrained=True,
            drop_path_rate=0.15,
            num_classes=0,
        )
        head = nn.Sequential(nn.Linear(1280 + 1024, 2), nn.ReLU())
        self.model = backbone
        self.linear = head

    def forward(self, img, yf):
        """Return the head output for image batch ``img`` and YOLO features ``yf``."""
        image_feats = self.model(img)
        # Dimension 1 of ``yf`` is squeezed before concatenating along dim 1.
        fused = torch.cat([image_feats, yf.squeeze(1)], 1)
        return self.linear(fused)

In [54]:
# model = timm.create_model(CFG.MODEL, pretrained=True)
# model.num_classes
# nn.Sequential(model, nn.Dropout(0.15), nn.Linear(model.num_classes, 2),nn.ReLU())

In [55]:
# dl = DataLoader(WadhwaniYoloFeatDataset(train_new, augs=TRAIN_AUG, mode='train',yolo=model),
#                           batch_size=2,
#                           shuffle=True,
#                           num_workers=8,
#                           drop_last=True,
#                         worker_init_fn=worker_init_fn)

# a,b,c = next(iter(dl))
# a.shape,b.shape,c.shape

In [56]:
# m = get_wadhwani_regression_model()
# out = m(a,b)
# print(out, out.shape)

In [57]:
# nn.L1Loss()(out,c).item()

### Train & Validation Function

In [58]:
def train_one_epoch(
    model: nn.Module,
    loader: Iterable,
    loss_fn: Callable,
    optimizer: torch.optim.Optimizer,
    lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None,
    mixup_fn: Callable = None,
    grad_scaler: torch.cuda.amp.GradScaler = None,
    mbar: master_bar = None,
):
    """Train ``model`` for one pass over ``loader`` on the GPU.

    Args:
        model: Called as ``model(images, yolo_features)``.
        loader: Yields ``(images, yolo_features, target)`` batches.
        loss_fn: Called as ``loss_fn(output, target)``.
        optimizer: Stepped once per batch.
        lr_scheduler: If given, stepped once per batch (not once per epoch).
        mixup_fn: Accepted for interface compatibility; currently unused.
        grad_scaler: If given, the forward pass runs under autocast and the
            backward/step go through the scaler. If ``None``, the batch is
            trained in full precision with a plain backward/step.
        mbar: Parent bar for the per-batch progress bar.

    Returns:
        ``OrderedDict`` with the sample-weighted mean training loss under
        ``"loss"``.
    """
    model.train()

    # Autocast and loss scaling are switched on and off as a pair. Before this
    # change a missing scaler crashed on the first batch.
    use_amp = grad_scaler is not None
    losses_m = utils.AverageMeter()

    pbar = progress_bar(loader, parent=mbar, leave=False)
    pbar.update(0)

    for batch_idx, (images, yf, target) in enumerate(loader):
        images, yf, target = images.cuda(), yf.cuda(), target.cuda()

        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=use_amp):
            output = model(images, yf)
            loss = loss_fn(output, target)

        losses_m.update(loss.item(), images.size(0))

        if use_amp:
            grad_scaler.scale(loss).backward()
            grad_scaler.step(optimizer)
            grad_scaler.update()
        else:
            loss.backward()
            optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        pbar.update(batch_idx + 1)
        pbar.comment = f"{losses_m.avg:.4f}"

    pbar.on_iter_end()
    return OrderedDict([("loss", losses_m.avg)])


@torch.inference_mode()
def validate(model: nn.Module, loader: Iterable, loss_fn: Callable, mbar: master_bar):
    """Evaluate ``model`` on ``loader`` using rounded predictions.

    Predictions are rounded to the nearest integer before both metrics are
    computed.

    Returns:
        ``OrderedDict`` with the sample-weighted means of ``loss_fn`` under
        ``"loss"`` and of the L1 loss under ``"l1_loss"``.
    """
    model.eval()

    losses_m = utils.AverageMeter()
    l1_loss_m = utils.AverageMeter()
    l1_criterion = nn.L1Loss()

    pbar = progress_bar(loader, parent=mbar, leave=False)
    pbar.update(0)

    for step, (images, yolo_feats, target) in enumerate(loader, start=1):
        images = images.cuda()
        yolo_feats = yolo_feats.cuda()
        target = target.cuda()

        output = torch.round(model(images, yolo_feats))

        losses_m.update(loss_fn(output, target).item(), images.size(0))
        l1_loss_m.update(l1_criterion(output, target).item(), output.size(0))

        pbar.update(step)

    pbar.on_iter_end()
    return OrderedDict(loss=losses_m.avg, l1_loss=l1_loss_m.avg)


### Save YOLO features to disk

In [59]:
class WadhwaniDatasetSimple(Dataset):
    """Dataset yielding ``(image_id, image)`` pairs, without labels.

    Args:
        df: Frame with an ``image_id_worm`` column.
        mode: Stored for parity with the sibling datasets; it does not change
            what ``__getitem__`` returns.
        augs: Optional callable invoked as ``augs(image=ndarray)['image']``.
    """

    def __init__(self,
                 df=train_new,
                 mode='train',
                 augs=None):
        self.augs = augs
        self.df = df
        self.mode = mode

    def __len__(self):
        return len(self.df)

    def __getitem__(self, ix):
        """Return ``(image_id, image)`` for position ``ix`` in ``self.df``."""
        # Samplers hand out positions 0..len-1, so index by position rather
        # than by label; this also works for frames whose index was not reset.
        image_id = self.df['image_id_worm'].iloc[ix]
        img_path = f'{IMG_PATH}/{image_id}'

        # Close the file handle as soon as the pixels are decoded.
        with Image.open(img_path) as raw:
            img = np.array(raw.convert("RGB"))

        if self.augs is not None:
            img = self.augs(image=img)['image']

        return image_id, img
        

In [60]:
# Display the test table.
test_df

Unnamed: 0,image_id_worm
0,id_00332970f80fa9a47a39516d.jpg
1,id_0035981bc3ae42eb5b57a317.jpg
2,id_005102f664b820f778291dee.jpg
3,id_0066456f5fb2cd858c69ab39.jpg
4,id_007159c1fa015ba6f394deeb.jpg
...,...
2798,id_ffad8f3773a4222f8fe5ba1a.jpg
2799,id_ffb65e6de900c49d8f2ef95a.jpg
2800,id_ffbcb27fa549278f47505515.jpg
2801,id_ffc0e41e10b0c964d4a02811.jpg


In [61]:
# Extract pooled YOLO features (yolov5s6_exp001 checkpoint) for every test image.
for fold in range(1):
    # Key: the image id exactly as the DataLoader collates it (batch size 1),
    # value: the model output for that image, left on the GPU.
    # NOTE(review): confirm consumers of the pickle expect this key format.
    yTestFeats = dict()

    yolo_detection_model = YOLOv5TorchBackend(weights=f'/mnt/c/Personal/Competitions/Zindi/Wadhwani AI/runs/yolov5s6_exp001/fold {fold}/weights/best.pt', device=torch.device("cpu"))
    yolomodel = yolo_detection_model.model
    # Replace the last layer so the model returns averaged feature maps.
    yolomodel.model[-1] = FeatureMapsAverager()
    yolomodel.cuda()
    yolomodel.eval()

    torch.backends.cudnn.benchmark = True
    dataset_test = WadhwaniDatasetSimple(test_df.reset_index(drop=True), augs=YOLO_AUG, mode="valid")

    loader_test = torch.utils.data.DataLoader(dataset_test, 1, num_workers=8, shuffle=False)

    # Inference only, so one no_grad scope covers the whole pass.
    with torch.no_grad():
        for image_id, img in tqdm(loader_test, total=len(loader_test)):
            yTestFeats[image_id] = yolomodel(img.cuda())

# Drop the model references; only a missing name is tolerated here, so
# interrupts and real errors are no longer swallowed.
try:
    del yolo_detection_model, yolomodel
except NameError:
    pass

yolo.py:141 -                 fuse() | Fusing layers... 
Fusing layers... 
torch_utils.py:293 -           model_info() | Model summary: 281 layers, 12326164 parameters, 0 gradients, 16.3 GFLOPs
Model summary: 281 layers, 12326164 parameters, 0 gradients, 16.3 GFLOPs


  0%|          | 0/2803 [00:00<?, ?it/s]

In [62]:
# Extract pooled YOLO features (yolov5s6_exp002 checkpoint) for every test image.
for fold in range(1):
    # Key: the image id exactly as the DataLoader collates it (batch size 1),
    # value: the model output for that image, left on the GPU.
    # NOTE(review): confirm consumers of the pickle expect this key format.
    yTestFeats1 = dict()

    yolo_detection_model = YOLOv5TorchBackend(weights=f'/mnt/c/Personal/Competitions/Zindi/Wadhwani AI/runs/yolov5s6_exp002/fold {fold}/weights/best.pt', device=torch.device("cpu"))
    yolomodel = yolo_detection_model.model
    # Replace the last layer so the model returns averaged feature maps.
    yolomodel.model[-1] = FeatureMapsAverager()
    yolomodel.cuda()
    yolomodel.eval()
    torch.backends.cudnn.benchmark = True
    # NOTE(review): this reuses YOLO_AUG (3072 px) while YOLO_AUG1 (3584 px)
    # is defined but never used — confirm which size exp002 expects.
    dataset_test = WadhwaniDatasetSimple(test_df.reset_index(drop=True), augs=YOLO_AUG, mode="valid")
    loader_test = torch.utils.data.DataLoader(dataset_test, 1, num_workers=8, shuffle=False)

    # Inference only, so one no_grad scope covers the whole pass.
    with torch.no_grad():
        for image_id, img in tqdm(loader_test, total=len(loader_test)):
            yTestFeats1[image_id] = yolomodel(img.cuda())

# Drop the model references; only a missing name is tolerated here, so
# interrupts and real errors are no longer swallowed.
try:
    del yolo_detection_model, yolomodel
except NameError:
    pass

yolo.py:141 -                 fuse() | Fusing layers... 
Fusing layers... 
torch_utils.py:293 -           model_info() | Model summary: 281 layers, 12326164 parameters, 0 gradients, 16.3 GFLOPs
Model summary: 281 layers, 12326164 parameters, 0 gradients, 16.3 GFLOPs


  0%|          | 0/2803 [00:00<?, ?it/s]

In [63]:
import pickle


def _dump_pickle(obj, path):
    """Serialise ``obj`` to ``path`` with the highest pickle protocol."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)


# Persist the two test-feature dictionaries, one file per YOLO experiment.
_dump_pickle(yTestFeats, '/mnt/c/Personal/Competitions/Zindi/Wadhwani AI/runs/YOLO_Features/yolov5s6_exp001_test.pickle')
_dump_pickle(yTestFeats1, '/mnt/c/Personal/Competitions/Zindi/Wadhwani AI/runs/YOLO_Features/yolov5s6_exp002_test.pickle')

### Fin 