### Regression notebook for Wadhwani AI competition

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import os
import gc
import random
from glob import glob
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import warnings
import pickle
import json
import re
import time
import sys
from requests import get
import multiprocessing
import joblib
import torch
from torch.utils.data import Dataset, DataLoader
import transformers
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import timm
from sklearn.preprocessing import minmax_scale
import matplotlib.pyplot as plt
import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2,torchvision
from ipyexperiments.ipyexperiments import IPyExperimentsPytorch
from timm.optim.optim_factory import create_optimizer_v2
from timm import utils
from fastprogress.fastprogress import format_time
from fastai.vision.all import *
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
class CFG:
    """Notebook-wide configuration constants."""

    # Reproducibility: fed to random / numpy / set_seed in the setup cells.
    seed = 46
    n_splits = 5
    debug = False

    # Backbone name and the square side the validation transform resizes to.
    MODEL = 'tf_efficientnet_b0_ns'
    SZ = 1280

    # Training hyper-parameters; in this notebook they only feed KERNEL_TYPE.
    BS = 16
    EP = 10
    LR = 5e-3
    WD = 1e-8

# Seed the Python hash seed plus the stdlib and NumPy RNGs from the shared config.
os.environ["PYTHONHASHSEED"] = str(CFG.seed)
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(CFG.seed)

# Notebook cosmetics: silence every warning and enlarge the default plot font.
warnings.filterwarnings('ignore')
plt.rcParams["font.size"] = 13

In [None]:
# NOTE(review): set_seed is not defined in this file; presumably it comes from
# the `from fastai.vision.all import *` star import — confirm what it seeds.
set_seed(CFG.seed)

In [None]:
# --- Input locations -------------------------------------------------------
DIR = '///mnt/c/Personal/Competitions/Zindi/Wadhwani AI/data/'
IMG_PATH = '///mnt/c/Personal/Competitions/Zindi/Wadhwani AI/data/images'
# Directory holding one "<image>.txt" detection file per image (read further below).
labels_dir = '///mnt/c/Personal/Competitions/Zindi/Wadhwani AI/runs/Mixed/fold0_infer_1280_CONFTHRESH_45/labels'

# --- Competition tables ----------------------------------------------------
train, test_df, submit = (
    pd.read_csv(os.path.join(DIR, csv_name))
    for csv_name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

# --- Output folder and run tag ---------------------------------------------
VERSION = "NB_EXP_V0_001_Mixed"
MODEL_FOLDER = Path(f"///mnt/c/Personal/Competitions/Zindi/Wadhwani AI/runs/Mixed/{VERSION}/")
os.makedirs(MODEL_FOLDER, exist_ok=True)

# The learning rate / weight decay are embedded with any '-' stripped.
_lr_tag = str(CFG.LR).replace('-', '')
_wd_tag = str(CFG.WD).replace('-', '')
KERNEL_TYPE = f"{CFG.MODEL}_{CFG.SZ}_bs{CFG.BS}_ep{CFG.EP}_lr{_lr_tag}_wd{_wd_tag}"

for _shown in (MODEL_FOLDER, KERNEL_TYPE):
    print(_shown)

In [None]:
def make_train_dataset(df=None):
    """Reshape the long per-worm-type table into one row per image.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Long-format table with the columns ``image_id_worm``, ``worm_type``
        and ``number_of_worms``. Defaults to the module-level ``train`` frame.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``image_id_worm`` with integer ``abw`` / ``pbw``
        counts (0 when the image has no row for that type), the 10-way
        equal-width bin label of each count (``abw_bins`` / ``pbw_bins``) and
        their concatenation ``consol_bins`` ("<abw bin>_<pbw bin>").

    Raises
    ------
    AssertionError
        If the per-type expansion does not yield exactly two rows per image
        (e.g. an image has several rows for the same worm type).
    """
    if df is None:
        df = train

    image_ids = df['image_id_worm'].unique()

    # Build a complete (image, worm type) grid so the pivot below always has
    # both columns and images without a given worm type get a count of 0.
    per_type = []
    for worm_type in ('pbw', 'abw'):
        frame = pd.DataFrame({'image_id_worm': image_ids})
        frame = pd.merge(
            frame,
            df[df['worm_type'] == worm_type].reset_index(drop=True),
            on='image_id_worm',
            how='left',
        )
        frame['worm_type'] = worm_type
        # Only the count column feeds the pivot; missing rows mean "no worms".
        frame['number_of_worms'] = frame['number_of_worms'].fillna(0)
        per_type.append(frame)

    # Keyword arguments: positional `axis` / pivot arguments are rejected by pandas 2.x.
    train_out = pd.concat(per_type, axis=0).reset_index(drop=True)

    assert len(train_out) == df['image_id_worm'].nunique() * 2
    train_out = train_out.pivot(
        index='image_id_worm',
        columns='worm_type',
        values='number_of_worms',
    ).reset_index()
    train_out[['abw', 'pbw']] = train_out[['abw', 'pbw']].astype(int)

    bin_labels = [str(i) for i in range(10)]
    for worm_type in ('abw', 'pbw'):
        train_out[f'{worm_type}_bins'] = pd.cut(train_out[worm_type], 10, labels=bin_labels)
    train_out['consol_bins'] = (
        train_out['abw_bins'].astype(str) + '_' + train_out['pbw_bins'].astype(str)
    )

    return train_out

# One row per image with its abw / pbw counts and bin labels.
train_new = make_train_dataset()

In [None]:
train_new.head(1)

### Get kfolds

In [None]:
def _fold_file_names(fold_id):
    """Return the last path component of every entry listed in one split file."""
    listing = pd.read_csv(f'///mnt/c/Personal/Competitions/Zindi/Wadhwani AI/data/splits/fold{fold_id}.txt',header=None)
    return [entry.split("/")[-1] for entry in listing[0]]


# Split files 1-4 are pooled into the training list; split file 0 is the validation list.
train_files = [name for fold_id in (1, 2, 3, 4) for name in _fold_file_names(fold_id)]
val_files = [name for fold_id in (0,) for name in _fold_file_names(fold_id)]

In [None]:
# `fold` is a two-valued flag here: 0 = training rows (also the default for
# images listed in neither file), 1 = validation rows. Validation is written
# last, so it wins if an image appears in both lists.
_in_train = train_new['image_id_worm'].isin(train_files)
_in_val = train_new['image_id_worm'].isin(val_files)

train_new['fold'] = 0
train_new.loc[_in_train, 'fold'] = 0
train_new.loc[_in_val, 'fold'] = 1

In [None]:
train_new.head()

In [None]:
test_df.head(1)

### Get Yolo outputs

In [None]:
# Count detections per class for every training image.
# Keys are "<image stem>_abw" / "<image stem>_pbw"; class id 0 is counted as
# abw and class id 1 as pbw. Images without a usable label file keep 0 / 0.
out_train = dict()
for img_id in tqdm(list(train_new['image_id_worm'])):
    img = img_id.split('.')[0]
    out_train[f'{img}_abw'] = 0
    out_train[f'{img}_pbw'] = 0

    # Only a missing or empty label file means "no detections"; any other
    # failure (bad path, permissions, malformed file) should surface instead
    # of being silently turned into zero counts.
    try:
        df = pd.read_csv(f"{labels_dir}/{img}.txt", sep=" ",header=None)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        continue

    # Column 0 of each row is the class id.
    class_counts = df[0].value_counts()
    out_train[f'{img}_abw'] += int(class_counts.get(0, 0))
    out_train[f'{img}_pbw'] += int(class_counts.get(1, 0))

In [None]:
# Count detections per class for every test image.
# Keys are "<image stem>_abw" / "<image stem>_pbw"; class id 0 is counted as
# abw and class id 1 as pbw. Images without a usable label file keep 0 / 0.
out_test = dict()
for img_id in tqdm(list(test_df['image_id_worm'])):
    img = img_id.split('.')[0]
    out_test[f'{img}_abw'] = 0
    out_test[f'{img}_pbw'] = 0

    # Only a missing or empty label file means "no detections"; any other
    # failure (bad path, permissions, malformed file) should surface instead
    # of being silently turned into zero counts.
    try:
        df = pd.read_csv(f"{labels_dir}/{img}.txt", sep=" ",header=None)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        continue

    # Column 0 of each row is the class id.
    class_counts = df[0].value_counts()
    out_test[f'{img}_abw'] += int(class_counts.get(0, 0))
    out_test[f'{img}_pbw'] += int(class_counts.get(1, 0))

### Get Regression Outputs

In [None]:
def get_wadhwani_regression_model(model_name, pretrained=True, **kwargs):
    """Wrap a timm model with a small two-output regression head.

    The timm network created from ``model_name`` (extra ``kwargs`` are passed
    through to ``timm.create_model``) is followed by dropout (p=0.15), a linear
    layer mapping its ``num_classes`` outputs to 2 values, and a ReLU.
    The module order inside the returned ``nn.Sequential`` is fixed, since the
    saved checkpoints address parameters by position.
    """
    backbone = timm.create_model(model_name, pretrained=pretrained, **kwargs)
    head = [
        nn.Dropout(0.15),
        nn.Linear(backbone.num_classes, 2),
        nn.ReLU(),
    ]
    return nn.Sequential(backbone, *head)

In [None]:
# Deterministic inference-time pipeline: square resize, default normalisation,
# then conversion to a tensor.
_eval_transforms = [
    A.Resize(CFG.SZ, CFG.SZ, p=1),
    A.Normalize(),
    ToTensorV2(),
]
VALID_AUG = A.Compose(_eval_transforms)

class WadhwaniRegDataset(Dataset):
    """Image dataset yielding ``image`` or ``(image, [abw, pbw])`` samples.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with an ``image_id_worm`` column (file name under ``IMG_PATH``)
        and, unless ``mode == 'test'``, ``abw`` and ``pbw`` count columns.
        Defaults to the module-level ``train_new`` frame, looked up when the
        dataset is constructed.
    mode : str
        ``'test'`` makes ``__getitem__`` return only the image; any other
        value also returns the label tensor.
    augs : callable, optional
        Albumentations-style transform called as ``augs(image=img)['image']``.
    """

    def __init__(self, df=None, mode='train', augs=None):
        # Resolve the default lazily so it tracks the current `train_new`
        # instead of whatever was bound when the class was defined.
        self.df = train_new if df is None else df
        self.mode = mode
        self.augs = augs

    def __len__(self):
        return len(self.df)

    def __getitem__(self, ix):
        # DataLoader samplers hand out positions 0..len-1, so index by
        # position; label-based `.loc` only works on a pristine RangeIndex.
        row = self.df.iloc[ix]
        img_path = f"{IMG_PATH}/{row['image_id_worm']}"

        # NOTE(review): `Image` is not imported explicitly in this file;
        # presumably PIL.Image via the fastai star import — confirm.
        img = np.array(Image.open(img_path).convert("RGB"))

        if self.augs is not None:
            img = self.augs(image=img)['image']

        if self.mode == 'test':
            return img

        label = torch.tensor([row['abw'], row['pbw']], dtype=torch.float32)
        return img, label

In [None]:
# Both datasets share the deterministic transform; the test one returns images only.
train_dataset = WadhwaniRegDataset(df=train_new, mode='train', augs=VALID_AUG)
test_dataset = WadhwaniRegDataset(df=test_df, mode='test', augs=VALID_AUG)

In [None]:
train_dl = torch.utils.data.DataLoader(train_dataset, 16, num_workers=8, shuffle=False, pin_memory=True)
print('Train size: {}'.format(len(train_dl.dataset)))

# Build the network and load the checkpoint once, not once per batch: the
# weights are identical for every batch, so re-reading them only costs time.
model = get_wadhwani_regression_model('tf_efficientnet_b0_ns', pretrained=False)
model.load_state_dict(torch.load('///mnt/c/Personal/Competitions/Zindi/Wadhwani AI/runs/Regression/NB_EXP_V0_006_Regression/tf_efficientnet_b0_ns_1280_bs16_ep10_lr0.005_wd1e08_CV_0.pth'))
model.eval()
model.cuda()

preds_train = []
imageids_train = []

with torch.no_grad():
    # The loader is not shuffled, so predictions stay aligned with train_new rows.
    for images, _ in tqdm(train_dl, dynamic_ncols=True, desc="Predicting"):
        with torch.cuda.amp.autocast():
            batch_pred = model(images.cuda())
        preds_train.append(batch_pred.cpu().numpy())

torch.cuda.empty_cache()
gc.collect()

# Counts are integers, so round the regression outputs.
preds_train = np.round(np.concatenate(preds_train, axis=0))

In [None]:
test_dl = torch.utils.data.DataLoader(test_dataset, 16, num_workers=8, shuffle=False, pin_memory=True)
print('Test size: {}'.format(len(test_dl.dataset)))

# Build the network and load the checkpoint once, not once per batch: the
# weights are identical for every batch, so re-reading them only costs time.
model = get_wadhwani_regression_model('tf_efficientnet_b0_ns', pretrained=False)
model.load_state_dict(torch.load('///mnt/c/Personal/Competitions/Zindi/Wadhwani AI/runs/Regression/NB_EXP_V0_006_Regression/tf_efficientnet_b0_ns_1280_bs16_ep10_lr0.005_wd1e08_CV_0.pth'))
model.eval()
model.cuda()

preds_test = []
imageids_test = []

with torch.no_grad():
    # The loader is not shuffled, so predictions stay aligned with test_df rows.
    for images in tqdm(test_dl, dynamic_ncols=True, desc="Predicting"):
        with torch.cuda.amp.autocast():
            batch_pred = model(images.cuda())
        preds_test.append(batch_pred.cpu().numpy())

torch.cuda.empty_cache()
gc.collect()

# Counts are integers, so round the regression outputs.
preds_test = np.round(np.concatenate(preds_test, axis=0))

#### Reg Preds

In [None]:
# Attach the regression outputs column-wise; rows align because the loader was
# not shuffled. `axis` must be passed by keyword (positional form fails on pandas 2.x).
preds_train_df_reg = pd.concat([train_new, pd.DataFrame(preds_train)], axis=1)
# Drop the file extension so the id matches the keys used for the YOLO counts.
preds_train_df_reg['image_id_worm'] = preds_train_df_reg['image_id_worm'].str.split('.').str[0]
preds_train_df_reg = preds_train_df_reg.rename(columns={0: 'abw_pred_reg', 1: 'pbw_pred_reg'})
preds_train_df_reg.head()

In [None]:
# Attach the regression outputs column-wise; rows align because the loader was
# not shuffled. `axis` must be passed by keyword (positional form fails on pandas 2.x).
preds_test_df_reg = pd.concat([test_df, pd.DataFrame(preds_test)], axis=1)
# Drop the file extension so the id matches the keys used for the YOLO counts.
preds_test_df_reg['image_id_worm'] = preds_test_df_reg['image_id_worm'].str.split('.').str[0]
preds_test_df_reg = preds_test_df_reg.rename(columns={0: 'abw_pred_reg', 1: 'pbw_pred_reg'})
preds_test_df_reg.head()

#### Yolo preds

In [None]:
# Long table: one row per "<image stem>_<worm type>" key with its detection count.
preds_train_df_yolo = (
    pd.DataFrame.from_dict(out_train, orient='index')
    .reset_index()
    .rename(columns={'index': 'image_id_worm', 0: 'number_of_worms'})
)
preds_train_df_yolo.head()

In [None]:
# Keys were built as "<image stem>_<abw|pbw>", so split on the LAST underscore
# only; this recovers the stem correctly however many underscores it contains.
_stem_and_type = preds_train_df_yolo['image_id_worm'].str.rsplit('_', n=1, expand=True)
preds_train_df_yolo['category'] = _stem_and_type[1]
preds_train_df_yolo['image_id_worm'] = _stem_and_type[0]
# Wide table: one row per image, one count column per worm type.
preds_train_df_yolo = preds_train_df_yolo.pivot(index='image_id_worm', columns='category', values='number_of_worms').reset_index()
preds_train_df_yolo = preds_train_df_yolo.rename(columns={'abw': 'abw_pred_yolo', 'pbw': 'pbw_pred_yolo'})
preds_train_df_yolo.head()

In [None]:
# Long table: one row per "<image stem>_<worm type>" key with its detection count.
preds_test_df_yolo = pd.DataFrame.from_dict(out_test,orient='index').reset_index()
preds_test_df_yolo = preds_test_df_yolo.rename(columns={'index': 'image_id_worm', 0: 'number_of_worms'})
# Keys were built as "<image stem>_<abw|pbw>", so split on the LAST underscore
# only; this recovers the stem correctly however many underscores it contains.
_stem_and_type = preds_test_df_yolo['image_id_worm'].str.rsplit('_', n=1, expand=True)
preds_test_df_yolo['category'] = _stem_and_type[1]
preds_test_df_yolo['image_id_worm'] = _stem_and_type[0]
# Wide table: one row per image, one count column per worm type.
preds_test_df_yolo = preds_test_df_yolo.pivot(index='image_id_worm', columns='category', values='number_of_worms').reset_index()
preds_test_df_yolo = preds_test_df_yolo.rename(columns={'abw': 'abw_pred_yolo', 'pbw': 'pbw_pred_yolo'})
preds_test_df_yolo.head()

### Merge

In [None]:
# Sanity check before merging: both training frames should have one row per image.
# NOTE(review): the original compared preds_train_df_yolo with itself; the
# regression frame is assumed to be the intended first operand — confirm.
preds_train_df_reg.shape,preds_train_df_yolo.shape

In [None]:
# Left-join the YOLO counts onto the regression frames, keyed by the image stem.
preds_train_df_consol = preds_train_df_reg.merge(preds_train_df_yolo, how='left', on='image_id_worm')
preds_test_df_consol = preds_test_df_reg.merge(preds_test_df_yolo, how='left', on='image_id_worm')

In [None]:
preds_train_df_consol.head()

In [None]:
preds_test_df_consol.head()

In [None]:
preds_train_df_consol['fold'].value_counts()

### Run!

In [None]:
def training_loop(fold):
    """Fit one LightGBM regressor per worm type on the stacked features.

    Features are the regression-model and YOLO count predictions; targets are
    the true ``abw`` / ``pbw`` counts in ``preds_train_df_consol``.

    Parameters
    ----------
    fold : int
        Value of the ``fold`` column held out for validation. Every other row
        is used for fitting.

    Returns
    -------
    tuple
        ``(model, out)``: ``model`` is the regressor fitted last (the ``pbw``
        one) and ``out`` is a two-column DataFrame of rounded test predictions
        clipped to [0, 600], ``abw`` first and ``pbw`` second (both columns are
        labelled ``0``; the caller renames them).
    """
    feature_cols = ['abw_pred_reg', 'pbw_pred_reg', 'abw_pred_yolo', 'pbw_pred_yolo']
    target_cols = ['abw', 'pbw']

    # Validate on the held-out rows; the original selected fold==0 for both
    # splits, so the printed MAE was measured on the training data itself.
    is_val = preds_train_df_consol['fold'] == fold
    tr = preds_train_df_consol[~is_val].reset_index(drop=True)
    val = preds_train_df_consol[is_val].reset_index(drop=True)

    X_tr, Y_tr = tr[feature_cols], tr[target_cols]
    X_val, Y_val = val[feature_cols], val[target_cols]
    X_test = preds_test_df_consol[feature_cols]

    val_preds = []
    test_preds = []
    for target in target_cols:
        model = LGBMRegressor(verbose=1,random_state=42,num_leaves=31,metric='mae')
        model.fit(X_tr, Y_tr[target], eval_set=(X_val, Y_val[target]))
        # Counts are non-negative integers; 600 is the hard upper clip used here.
        val_preds.append(pd.DataFrame(np.round(model.predict(X_val).clip(0,600))))
        test_preds.append(pd.DataFrame(np.round(model.predict(X_test).clip(0,600))))

    # `axis` must be passed by keyword (positional form fails on pandas 2.x).
    out = pd.concat(test_preds, axis=1)
    pred_ = pd.concat(val_preds, axis=1)
    pred_.columns = target_cols

    print(f'mean_absolute_error:{mean_absolute_error(Y_val.values.reshape(-1,1),pred_.values.reshape(-1,1))}')
    return model,out

In [None]:
# `pred` holds the rounded, clipped test predictions (two unnamed columns).
model,pred = training_loop(1)

In [None]:
pred.min(),pred.max()

In [None]:
pred.shape

In [None]:
# training_loop fits 'abw' first and 'pbw' second, so the columns are in this order.
pred.columns = ['abw','pbw']
pred.head()


In [None]:
submit.head()

In [None]:
# NOTE(review): assumes the sample submission alternates abw / pbw rows per
# image, in the same image order as test_df — confirm against the file.
_abw_counts = pred['abw'].to_numpy()
_pbw_counts = pred['pbw'].to_numpy()
submit.loc[0::2, 'number_of_worms'] = _abw_counts
submit.loc[1::2, 'number_of_worms'] = _pbw_counts

In [None]:
submit.head(10)

In [None]:
# Write the filled-in submission table (without the index column).
submit.to_csv('///mnt/c/Personal/Competitions/Zindi/Wadhwani AI/runs/Mixed/lgbm_reg_yolo.csv',index=False)

### Fin 