In [1]:
from fastai.vision.all import *
from timm import create_model
import pandas as pd
import os
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
# Settings and defination

####################################
SEED = 1033
N_FOLDS = 3
BATCH_SIZE = 16
IMGSZ = 384
EPOCHS = 3
INIT_LR = 0.3
NUM_WORKER = 8
PATIENCE = 3
MODEL_BASE = 'convnext_base.fb_in22k'
####################################

DATASET_DIR = '/home/derrick/DataProjects/zindi-cgiar'
set_seed(SEED, reproducible=True)

In [2]:

def prepare_train_data(data, kfold, image_dir):
    """
    Helper function to get the data ready
    """
    df = data.copy()
    print(df['filename'])
    df['image_id'] = df['filename'].apply(lambda x: x.split('.')[0])
    df = df.drop_duplicates(subset='image_id', keep='first')

    df['target'] = df['damage']

    df['fold'] = -1
    for i, (train_idx, val_idx) in enumerate(kfold.split(df, df['target'])):
        df.loc[val_idx, 'fold'] = i

    print(df.groupby(['fold', 'target']).size())

    df['path'] = df['filename'].apply(lambda x: f'{image_dir}/{x}')
    df['fold'] = df['fold'].astype('int')

    return df

In [3]:
def cross_entropy(predictions, targets):
        predictions = predictions.sigmoid()
        return torch.where(targets==1, 1-predictions, predictions).mean()

def train_model(data):
    df = data.copy()

    for fold in range(N_FOLDS):
        df['is_valid'] = (df['fold'] == fold)
        print(f'Training fold: {fold}')
        dls = ImageDataLoaders.from_df(
            df, #pass in train DataFrame
            valid_col='is_valid',
            seed=SEED, #seed
            fn_col='path', #filename/path is in the second column of the DataFrame
            label_col='target', #label is in the first column of the DataFrame
            label_delim=' ',
            y_block=MultiCategoryBlock, #The type of target
            bs=BATCH_SIZE, #pass in batch size
            num_workers=NUM_WORKER,
            item_tfms=Resize(IMGSZ), #pass in item_tfms
            batch_tfms=setup_aug_tfms([Brightness(), Contrast(), Flip(), Rotate()]))

        model = create_model(f'{MODEL_BASE}', pretrained=True, num_classes=dls.c)
        learn = Learner(dls, model, loss_func=BCEWithLogitsLossFlat(), metrics=AccumMetric(cross_entropy)).to_fp16()
        learn.fit_one_cycle(EPOCHS, INIT_LR, cbs=[SaveModelCallback(), EarlyStoppingCallback(monitor='cross_entropy', comp=np.less, patience=PATIENCE), CSVLogger(append=True)])

        learn = learn.to_fp32()
        learn.save(f'{MODEL_BASE}_fold{fold}', with_opt=False)


In [4]:
train = pd.read_csv(f'{DATASET_DIR}/Train_1000.csv')
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

In [5]:
train_data = prepare_train_data(train, skf, 'images')
train_model(train_data)

0      d036341be8d6cd59851cb80bcc9a70cc9fbdba30.jpg
1      9ea16180c50d0cd539897eefbfe585314e50a56b.jpg
2      b6b564844041bc68774a553eaf43d61654657dd9.jpg
3      62262859f0cd411aaf484082ed3ef0b625bc452a.jpg
4      03b6e2a8ace7cb611eccba289f3c83d9bd497584.jpg
                           ...                     
994    e884ab8090df019d10914b59feeb78a17e91c645.jpg
995    dcde9dbed8d8c225fe3d482490194721ebecd1f2.jpg
996    10951a96101e3c49e3a3060db507fd8df07493ff.jpg
997    783225d73a1018169901f31fae11d1bfa7499484.jpg
998    fd9c6de07ecb0f7a2c910f11255f38f66d81934d.jpg
Name: filename, Length: 999, dtype: object
fold  target
0     DR         60
      G         143
      ND          4
      WD        119
      other       7
1     DR         60
      G         142
      ND          5
      WD        119
      other       7
2     DR         60
      G         142
      ND          4
      WD        119
      other       8
dtype: int64
Training fold: 0


  return getattr(torch, 'has_mps', False)


epoch,train_loss,valid_loss,cross_entropy,time
0,2.482796,8.49679,0.228404,30:16
1,4.497619,3.697568,0.200028,33:59
2,3.087187,1.375424,0.250541,33:58


Better model found at epoch 0 with valid_loss value: 8.496789932250977.
Better model found at epoch 1 with valid_loss value: 3.697567939758301.
Better model found at epoch 2 with valid_loss value: 1.3754241466522217.
Training fold: 1


  return getattr(torch, 'has_mps', False)


epoch,train_loss,valid_loss,cross_entropy,time
0,4.1504,10.338453,0.229429,32:17
1,5.849542,2.393939,0.254978,32:42
2,2.827431,0.577004,0.238418,35:01


Better model found at epoch 0 with valid_loss value: 10.33845329284668.
Better model found at epoch 1 with valid_loss value: 2.39393949508667.
Better model found at epoch 2 with valid_loss value: 0.5770037770271301.
Training fold: 2


  return getattr(torch, 'has_mps', False)


epoch,train_loss,valid_loss,cross_entropy,time
0,4.162902,5.006231,0.229429,35:20
1,4.305408,4.151606,0.332,34:44
2,2.779977,0.917902,0.247795,34:45


Better model found at epoch 0 with valid_loss value: 5.006231307983398.
Better model found at epoch 1 with valid_loss value: 4.15160608291626.
Better model found at epoch 2 with valid_loss value: 0.9179016947746277.


In [10]:
####################################
TTA = 5
SAVE_NAME = 'convnext_base.fb_in22k'
####################################

os.makedirs('submission', exist_ok=True)

test_df = pd.read_csv(f'{DATASET_DIR}/Test.csv')
test_df['path'] = test_df['filename'].map(lambda x: f'images/{x}')

ensemble = []
for fold in range(N_FOLDS):
    dls = ImageDataLoaders.from_df(
        train_data, #pass in train DataFrame
        valid_pct=0.2, #80-20 train-validation random split
        seed=SEED, #seed
        fn_col='path', #filename/path is in the second column of the DataFrame
        label_col='target', #label is in the first column of the DataFrame
        label_delim=' ',
        y_block=MultiCategoryBlock, #The type of target
        bs=BATCH_SIZE, #pass in batch size
        num_workers=NUM_WORKER,
        item_tfms=Resize(IMGSZ), #pass in item_tfms
        batch_tfms=setup_aug_tfms([Brightness(), Contrast(), Flip(), Rotate()]))
    model = create_model(f'{MODEL_BASE}', pretrained=False, num_classes=dls.c)
    learn = Learner(dls, model, loss_func=CrossEntropyLossFlat(), metrics=AccumMetric(cross_entropy)).to_fp16()

    model = learn.load(f'{MODEL_BASE}_fold{fold}')
    test_df['target'] = [1]*len(test_df)

    test_dl = dls.test_dl(test_df)
    preds, _ = learn.tta(dl=test_dl, n=TTA, beta=0)
    ensemble.append(preds.numpy())

test_df = test_df.join(pd.DataFrame(np.mean(ensemble, axis=0), columns=dls.vocab))

sample_submission_df = pd.read_csv(f"{DATASET_DIR}/SampleSubmission.csv")
sample_submission_df = sample_submission_df['ID']
sample_submission_df = pd.merge(sample_submission_df, test_df, on='ID')
sample_submission_df = sample_submission_df[['ID']+dls.vocab]
sample_submission_df.to_csv(f"submission/{MODEL_BASE}_tta_{TTA}.csv", index=False)

  return getattr(torch, 'has_mps', False)
  elif with_opt: warn("Saved file doesn't contain an optimizer state.")
  return getattr(torch, 'has_mps', False)


  return getattr(torch, 'has_mps', False)
  elif with_opt: warn("Saved file doesn't contain an optimizer state.")


  return getattr(torch, 'has_mps', False)
  elif with_opt: warn("Saved file doesn't contain an optimizer state.")


TypeError: can only concatenate list (not "CategoryMap") to list

In [11]:
sample_submission_df

Unnamed: 0,ID,filename,path,target,DR,G,ND,WD,other
0,ID_SJBCSZ,b7f981642f9049d77a18834409eb01e7fa185bf1.jpg,images/b7f981642f9049d77a18834409eb01e7fa185bf1.jpg,1,0.083291,0.429475,0.000003,0.487231,7.225878e-10
1,ID_4UMPJL,6851d1406fb9f528d3cfebd02e0c75e026d6f4a9.jpg,images/6851d1406fb9f528d3cfebd02e0c75e026d6f4a9.jpg,1,0.083291,0.429475,0.000003,0.487231,7.225878e-10
2,ID_AB4DZF,d263fbacd291e8c2cb2d0eabc34dee5e832713bd.jpg,images/d263fbacd291e8c2cb2d0eabc34dee5e832713bd.jpg,1,0.083291,0.429475,0.000003,0.487231,7.225875e-10
3,ID_H257JS,8e8b1ecca639febef8ab5ef1631cba056affdb2e.jpg,images/8e8b1ecca639febef8ab5ef1631cba056affdb2e.jpg,1,0.083291,0.429475,0.000003,0.487231,7.225878e-10
4,ID_JKST46,439a2bd12979fc7f55ec479292117fdfe47f3a70.jpg,images/439a2bd12979fc7f55ec479292117fdfe47f3a70.jpg,1,0.083291,0.429475,0.000003,0.487231,7.225877e-10
...,...,...,...,...,...,...,...,...,...
8658,ID_IZB98I,50de5e54e99546ef14892af0a399f1e62619bbd9.jpg,images/50de5e54e99546ef14892af0a399f1e62619bbd9.jpg,1,0.083291,0.429475,0.000003,0.487231,7.225878e-10
8659,ID_ARS74L,5436f5023e2335053497c37668c785ea99f89f7a.jpg,images/5436f5023e2335053497c37668c785ea99f89f7a.jpg,1,0.083291,0.429475,0.000003,0.487231,7.225878e-10
8660,ID_XEEX0K,54a0c060c5b1db1725763a828477f537f923cb48.jpg,images/54a0c060c5b1db1725763a828477f537f923cb48.jpg,1,0.083291,0.429475,0.000003,0.487231,7.225878e-10
8661,ID_2FWYPY,2f8d0776cf2e485cd197076a35b72a1a4fbff24c.jpg,images/2f8d0776cf2e485cd197076a35b72a1a4fbff24c.jpg,1,0.083291,0.429475,0.000003,0.487231,7.225878e-10


In [12]:
submission_df = sample_submission_df[["ID", "DR","G","ND","WD","other"]]

In [13]:
submission_df

Unnamed: 0,ID,DR,G,ND,WD,other
0,ID_SJBCSZ,0.083291,0.429475,0.000003,0.487231,7.225878e-10
1,ID_4UMPJL,0.083291,0.429475,0.000003,0.487231,7.225878e-10
2,ID_AB4DZF,0.083291,0.429475,0.000003,0.487231,7.225875e-10
3,ID_H257JS,0.083291,0.429475,0.000003,0.487231,7.225878e-10
4,ID_JKST46,0.083291,0.429475,0.000003,0.487231,7.225877e-10
...,...,...,...,...,...,...
8658,ID_IZB98I,0.083291,0.429475,0.000003,0.487231,7.225878e-10
8659,ID_ARS74L,0.083291,0.429475,0.000003,0.487231,7.225878e-10
8660,ID_XEEX0K,0.083291,0.429475,0.000003,0.487231,7.225878e-10
8661,ID_2FWYPY,0.083291,0.429475,0.000003,0.487231,7.225878e-10


In [14]:
submission_df.to_csv('submission.csv', index=False)