# Analysing Effects of Pretraining On Similar Dataset

In this notebook I will try to do the following:
* Pretrain the model on a similar dataset from a previous competition
* Fine-tune the pretrained model on the current competition dataset
* Generate pseudo labels for the previous dataset with the fine-tuned model
* Retrain the model on the generated pseudo labels and observe the model performance on the current competition dataset.

In [None]:
!nvidia-smi

In [None]:
!pip install efficientnet -q

In [None]:
import os
import cv2
import glob

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

import tensorflow as tf
import efficientnet.tfkeras as efn

from sklearn.model_selection import GroupKFold, train_test_split

# Configuration


In [None]:
class Config:
    """Settings for the pretraining stage (paths, input pipeline and optimiser values)."""

    # Folder holding the pretraining images; paths are built as f'{IMAGES}/{image_id}.png'.
    IMAGES = '../input/vinbigdata-chest-xray-resized-png-1024x1024/train'
    # CSV with the pretraining annotations; must provide `image_id` and `class_id` columns.
    DATA = '../input/vinbigdata-chest-xray-abnormalities-detection/train.csv'
    
    # Passed to tf.data map/prefetch so TensorFlow chooses the parallelism itself.
    AUTOTUNE = tf.data.experimental.AUTOTUNE
    
    PRETRAINING_IMAGE_SIZE = 256 # I am using smaller image size for faster training
    PRETRAINING_BATCH_SIZE = 8
    # Width of the softmax head of the pretraining classifier.
    PRETRAINING_NUM_CLASSES = 15
    # Adam learning rate for the head-only stage.
    PRETRAINING_LR = 0.001

In [None]:
config = Config()

# Loading the pretraining data

In [None]:
df = pd.read_csv(config.DATA)
df.head(2)

In [None]:
# Keep only what the input pipeline needs: where the image lives and its label.
df['image_path'] = [f'{config.IMAGES}/{image_id}.png' for image_id in df['image_id']]
df = df.loc[:, ['image_path', 'class_id']]
df.head(2)

In [None]:
df.class_id.nunique()

# Creating the pretraining dataset

In [None]:
# Split on unique images rather than on rows. If the annotation file lists an
# image more than once (for example one row per finding), a row-level split
# would put the same picture in both train and validation and inflate the
# validation score. With one row per image this is an ordinary 90/10 split.
unique_paths = df['image_path'].unique()
train_paths, valid_paths = train_test_split(unique_paths, test_size=0.1, random_state=1234)

# Rows follow their image; the training rows are shuffled once so that
# consecutive rows of the same image do not end up in the same batch.
df_train = df[df['image_path'].isin(train_paths)].sample(frac=1, random_state=1234)
df_valid = df[df['image_path'].isin(valid_paths)]

In [None]:
def aug_func(image_path, label):
    """Load one PNG from disk and turn it into a model-ready tensor.

    The file is decoded to 3 channels, resized to a square of
    config.PRETRAINING_IMAGE_SIZE pixels and divided by 255. Despite the
    name, no augmentation is applied; `label` is passed through untouched.
    """
    target_size = [config.PRETRAINING_IMAGE_SIZE, config.PRETRAINING_IMAGE_SIZE]
    decoded = tf.image.decode_png(tf.io.read_file(image_path), channels=3)
    resized = tf.image.resize(decoded, target_size)
    return resized/255., label

In [None]:
# Training pipeline: (path, label) pairs -> shuffle -> decode/resize -> endless stream of batches.
train_dataset = tf.data.Dataset.from_tensor_slices((df_train['image_path'].values, df_train['class_id'].values))
# Shuffle before decoding (the buffer only holds path strings, so a full-size
# buffer is cheap) and before repeat(), so every pass over the data uses a new order.
train_dataset = train_dataset.shuffle(len(df_train), reshuffle_each_iteration=True)
train_dataset = train_dataset.map(aug_func, num_parallel_calls=config.AUTOTUNE)
# Infinite stream; the epoch length is set with steps_per_epoch in model.fit.
train_dataset = train_dataset.repeat()
train_dataset = train_dataset.batch(config.PRETRAINING_BATCH_SIZE)
train_dataset = train_dataset.prefetch(config.AUTOTUNE)

In [None]:
# Validation pipeline: same preprocessing as training, single finite pass.
valid_dataset = (
    tf.data.Dataset.from_tensor_slices((df_valid['image_path'].values, df_valid['class_id'].values))
    .map(aug_func, num_parallel_calls=config.AUTOTUNE)
    .batch(config.PRETRAINING_BATCH_SIZE)
    .prefetch(config.AUTOTUNE)
)

In [None]:
# Sanity check: pull one batch from each pipeline, print the tensor shapes and
# show the first image of the training and validation batch side by side.
for train_batch, valid_batch in zip(train_dataset.take(1), valid_dataset.take(1)):
    print(train_batch[0].shape, train_batch[1].shape, valid_batch[0].shape, valid_batch[1].shape)
    plt.figure(figsize=(20,10))
    for position, batch in enumerate((train_batch, valid_batch), start=1):
        plt.subplot(1, 2, position)
        plt.imshow(batch[0][0])

# Creating the model

In [None]:
# Classifier: ImageNet-initialised EfficientNetB2 backbone -> global average
# pooling -> softmax over the pretraining classes.
model = tf.keras.Sequential()
model.add(efn.EfficientNetB2(
    weights='imagenet',
    include_top=False,
    input_shape=(config.PRETRAINING_IMAGE_SIZE, config.PRETRAINING_IMAGE_SIZE, 3)))
model.add(tf.keras.layers.GlobalAveragePooling2D())
model.add(tf.keras.layers.Dense(config.PRETRAINING_NUM_CLASSES, activation='softmax'))

In [None]:
# Training only the classifier layer for 1 epoch
# layers[0] of this Sequential model is the EfficientNet backbone; freezing it
# leaves only the pooling + Dense head trainable.
model.layers[0].trainable = False

# Integer class ids are used as labels, hence the sparse loss and metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=config.PRETRAINING_LR),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=tf.keras.metrics.SparseCategoricalAccuracy())

model.summary()

In [None]:
# Batches per epoch; required because the training dataset repeats forever.
STEPS = len(df_train) // config.PRETRAINING_BATCH_SIZE

# Keeps only the best model seen so far at this path; it is reloaded later in the notebook.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint('./efficientNet_Pretraining', save_best_only=True)
# NOTE(review): both callbacks below use patience=1, so early stopping can fire
# in the same epoch in which the learning rate would first be reduced — confirm
# this is intended.
lr_schedular = tf.keras.callbacks.ReduceLROnPlateau(patience=1, min_delta=0.01)
early_stopping = tf.keras.callbacks.EarlyStopping(min_delta=0.001, patience=1)

# Pretraining the model

In [None]:
# Only last layer
# One epoch of STEPS batches with the backbone frozen (assuming the compile
# cell that freezes layers[0] was run first); validated on valid_dataset.

model.fit(x = train_dataset,
         epochs = 1,
         steps_per_epoch = STEPS,
         validation_data = valid_dataset,
         callbacks = [model_checkpoint, lr_schedular, early_stopping])

In [None]:
# Training the whole model with a small learning rate
# (the number of epochs is set by the fit call that follows).

# Unfreeze the backbone; the model must be compiled again for this to take effect.
model.layers[0].trainable = True

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate = 0.0001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=tf.keras.metrics.SparseCategoricalAccuracy())

model.summary()

In [None]:
# Whole model, up to 3 epochs of STEPS batches each.
# NOTE(review): the earlier wording of this comment mentioned "half steps" to
# limit weight distortion, but steps_per_epoch is the full STEPS — confirm
# which one is intended.

model.fit(x = train_dataset,
         epochs = 3,
         steps_per_epoch = STEPS,
         validation_data = valid_dataset,
         callbacks = [model_checkpoint, lr_schedular, early_stopping])

I tried a slightly larger learning rate, but it did not seem to work well, so I went back to the smaller one.

In [None]:
# Reloading the best checkpoint written by ModelCheckpoint.
# (This cell only loads; nothing is saved here.)
model = tf.keras.models.load_model('./efficientNet_Pretraining')

In [None]:
# Cross checking the model performance
model.evaluate(valid_dataset)

In [None]:
# Training whole model
# Compile the reloaded model again with a fresh Adam optimiser at the small learning rate.

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate = 0.0001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=tf.keras.metrics.SparseCategoricalAccuracy())

In [None]:
# Whole model
# Up to 3 more epochs on the reloaded model; the same callback objects are
# reused, so checkpoints keep going to './efficientNet_Pretraining'.

model.fit(x = train_dataset,
         epochs = 3,
         steps_per_epoch = STEPS,
         validation_data = valid_dataset,
         callbacks = [model_checkpoint, lr_schedular, early_stopping])

This much pre-training is enough, so let's use this model as a feature extractor for our current competition data.

-----------------------------------------------

# Starting work on current Dataset

___________________________________________________

# Configuration Current

In [None]:
class CurrentConfig:
    """Settings for the current-competition (study-level) classification stage."""

    # CSV with one row per study: an id column followed by the label columns.
    DATA = '../input/siim-covid19-detection/train_study_level.csv'
    # Folder that is globbed for the study images.
    IMAGE_FOLDER = '../input/siimfisabiorsna-covid19-image-size-1024/study'
    
    # Passed to tf.data map/prefetch so TensorFlow chooses the parallelism itself.
    AUTOTUNE = tf.data.experimental.AUTOTUNE
    
    IMAGE_SIZE = 256 # I am using smaller image size for faster training
    BATCH_SIZE = 8
    # Four labels (N, T, I, A) and four output heads are used in this stage;
    # the previous value of 15 was carried over from the pretraining config.
    NUM_CLASSES = 4
    LR = 0.001

In [None]:
current_config = CurrentConfig()

# Loading the dataset

In [None]:
# Study-level labels for the current competition.
df = pd.read_csv(current_config.DATA)

# Lookup table from study id to image file: the id is the file name up to its first dot.
image_files = glob.glob(f'{current_config.IMAGE_FOLDER}/*')
temp_df = pd.DataFrame()
temp_df['paths'] = image_files
temp_df['id'] = [path.split('/')[-1].split('.')[0] for path in image_files]

In [None]:
df = pd.merge(df, temp_df, on='id')
df.head(2)

In [None]:
# Number of missing values per column. DataFrame.sum() is used directly because
# np.sum(DataFrame) forwards axis=None, a call whose result pandas is changing
# from a per-column Series to a single scalar.
df.isna().sum()

In [None]:
# Sanity check: every merged path must exist on disk. all() stops at the first
# missing file, so the message is printed at most once.
if not all(os.path.exists(path) for path in tqdm(df.paths)):
    print('image not found')

# Splitting the dataset
For our purpose we will split the dataset into 3 parts: train, valid and test. I will not be doing k-fold cross-validation as it takes too much time.

In [None]:
def mapper(row):
    """Return the 1-based position of the first of row[0]..row[3] that equals 1.

    Returns None when none of the four entries equals 1.
    """
    for position in range(4):
        if row[position] == 1:
            return position + 1
    return None

# Collapse the label columns (everything between the first and the last
# column) into one integer class per row, then give the columns short names.
label_columns = df.columns[1:-1]
labels = [mapper(row) for row in df[label_columns].values]

df['class'] = labels
df.columns = ['id', 'N', 'T', 'I', 'A', 'paths', 'class']

df.head(2)

In [None]:
# 60 % train; the remaining 40 % is halved into validation and test (20 % each).
# Both splits are stratified on the integer class.
df_train, df_temp = train_test_split(
    df, stratify=df['class'], test_size=0.4, random_state=1234)

df_valid, df_test = train_test_split(
    df_temp, stratify=df_temp['class'], test_size=0.5, random_state=1234)

# Clearing GPU Memory

In [None]:
# Clearing GPU memory
# NOTE(review): this resets the CUDA device through numba while TensorFlow is
# still loaded in the same process; presumably this is related to the session
# crash mentioned right after this cell — confirm before relying on it.
from numba import cuda
cuda.get_current_device().reset()

# Creating the dataset

For some reason my session crashed so I have restarted from this point.

In [None]:
def aug_func(image_path, label):
    """Load one PNG from disk and turn it into a model-ready tensor.

    The file is decoded to 3 channels, resized to a square of
    current_config.IMAGE_SIZE pixels and divided by 255. Despite the name,
    no augmentation is applied; `label` is passed through untouched.
    """
    target_size = [current_config.IMAGE_SIZE, current_config.IMAGE_SIZE]
    decoded = tf.image.decode_png(tf.io.read_file(image_path), channels=3)
    resized = tf.image.resize(decoded, target_size)
    return resized/255., label

In [None]:
# Training pipeline for the current data. Each element is
# (path, (N, T, I, A)): one label per output head of the model.
train_dataset = tf.data.Dataset.from_tensor_slices((df_train['paths'].values, (df_train['N'].values, 
                                                                               df_train['T'].values,
                                                                               df_train['I'].values,
                                                                               df_train['A'].values)))

# Shuffle before decoding (the buffer only holds paths and labels, so a
# full-size buffer is cheap) and before repeat(), so every pass over the data
# uses a new order.
train_dataset = train_dataset.shuffle(len(df_train), reshuffle_each_iteration=True)
train_dataset = train_dataset.map(aug_func, num_parallel_calls=current_config.AUTOTUNE)
# Infinite stream; the epoch length is set with steps_per_epoch in model.fit.
train_dataset = train_dataset.repeat()
train_dataset = train_dataset.batch(current_config.BATCH_SIZE)
train_dataset = train_dataset.prefetch(current_config.AUTOTUNE)

In [None]:
# Validation pipeline: (path, (N, T, I, A)) elements, single finite pass.
valid_targets = (df_valid['N'].values, df_valid['T'].values, df_valid['I'].values, df_valid['A'].values)

valid_dataset = (
    tf.data.Dataset.from_tensor_slices((df_valid['paths'].values, valid_targets))
    .map(aug_func, num_parallel_calls=current_config.AUTOTUNE)
    .batch(current_config.BATCH_SIZE)
    .prefetch(current_config.AUTOTUNE)
)

In [None]:
# Held-out test pipeline: (path, (N, T, I, A)) elements, single finite pass.
test_targets = (df_test['N'].values, df_test['T'].values, df_test['I'].values, df_test['A'].values)

test_dataset = (
    tf.data.Dataset.from_tensor_slices((df_test['paths'].values, test_targets))
    .map(aug_func, num_parallel_calls=current_config.AUTOTUNE)
    .batch(current_config.BATCH_SIZE)
    .prefetch(current_config.AUTOTUNE)
)

In [None]:
# Sanity check: pull one batch from each pipeline, print the training tensor
# shapes (images plus the four label tensors) and show the first image of the
# train / valid / test batch side by side.
first_batches = zip(train_dataset.take(1), valid_dataset.take(1), test_dataset.take(1))
for train_batch, valid_batch, test_batch in first_batches:
    images, targets = train_batch
    print(images.shape, targets[0].shape, targets[1].shape, targets[2].shape, targets[3].shape)
    plt.figure(figsize=(20,10))
    for position, batch in enumerate((train_batch, valid_batch, test_batch), start=1):
        plt.subplot(1, 3, position)
        plt.imshow(batch[0][0])

# How would a not-so pretrained model perform

In [None]:
# Baseline feature extractor: ImageNet-initialised EfficientNetB2 followed by
# global average pooling, with no pretraining on the similar dataset.
base_model = tf.keras.Sequential()
base_model.add(efn.EfficientNetB2(
    weights='imagenet',
    include_top=False,
    input_shape=(current_config.IMAGE_SIZE, current_config.IMAGE_SIZE, 3)))
base_model.add(tf.keras.layers.GlobalAveragePooling2D())

# The original cell bound the same object to both names.
model = base_model

In [None]:
def get_model(base_model):
    """Wrap `base_model` with four independent binary classification heads.

    Args:
        base_model: Keras model that maps an image batch of shape
            (IMAGE_SIZE, IMAGE_SIZE, 3) to one feature vector per image.

    Returns:
        A tf.keras Model with a single image input and four outputs named
        'out1' .. 'out4', each a single sigmoid unit.
    """
    inputs = tf.keras.layers.Input(shape=(current_config.IMAGE_SIZE, current_config.IMAGE_SIZE, 3))

    features = base_model(inputs)

    # One sigmoid unit per label. Softmax normalises across the units of a
    # layer, so with a single unit it always outputs 1.0 and the head could
    # never learn anything; sigmoid is the matching activation for the binary
    # cross-entropy loss used on each head.
    outputs = [
        tf.keras.layers.Dense(1, activation='sigmoid', name=f'out{head}')(features)
        for head in range(1, 5)
    ]

    return tf.keras.models.Model(inputs=[inputs], outputs=outputs)

In [None]:
# Freeze the backbone so that only the four heads are trained.
# In the functional model returned by get_model, layers[0] is the InputLayer,
# so setting model.layers[0].trainable = False left the backbone trainable.
# For a like-for-like comparison, the model built on the pretrained backbone
# later in the notebook has to be frozen the same way.
base_model.trainable = False
model = get_model(base_model)
model.summary()

In [None]:
tf.keras.utils.plot_model(model)

In [None]:
# Same loss and metric for each of the four heads: binary cross-entropy and
# the area under the precision-recall curve.
head_names = ['out1', 'out2', 'out3', 'out4']

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=current_config.LR),
    loss=[tf.keras.losses.BinaryCrossentropy() for _ in head_names],
    metrics={head: tf.keras.metrics.AUC(name='auc_precision_recall', curve='PR')
             for head in head_names})

# Training the Model
I will not fine-tune the models and their hyperparameters, as that would be very tedious and the notebook is already long enough. My main objective is to see which pretrained weights make the model perform better.

In [None]:
# Fresh callbacks for the baseline run; the best model is kept in its own folder.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint('./efficientNet_before_pretraining', save_best_only=True)
# NOTE(review): both callbacks below use patience=1, so early stopping can fire
# in the same epoch in which the learning rate would first be reduced.
lr_schedular = tf.keras.callbacks.ReduceLROnPlateau(patience=1, min_delta=0.01)
early_stopping = tf.keras.callbacks.EarlyStopping(min_delta=0.001, patience=1)

In [None]:
# Training the last layers
# NOTE(review): only the heads are trained if the backbone was actually frozen
# before model.compile — verify with model.summary() (trainable params).

# Batches per epoch; required because the training dataset repeats forever.
STEPS = len(df_train)//current_config.BATCH_SIZE

# Before using Pretrained weights
# The returned History object is kept for later comparison.
BP_history = model.fit(x = train_dataset, 
                  validation_data = valid_dataset,
                  steps_per_epoch = STEPS,
                  epochs = 2,
                  callbacks=[model_checkpoint, lr_schedular, early_stopping]
                 )

# Creating the model with the pretrained base_model

In [None]:
# Load the best checkpoint of the pretraining stage (backbone + pooling + softmax head).
base_model = tf.keras.models.load_model('./efficientNet_Pretraining')
base_model.summary()

In [None]:
# Removing the top layer
# pop() drops the last layer of the Sequential model (the pretraining softmax
# head), leaving the backbone followed by global average pooling.
base_model.pop()
base_model.summary()

In [None]:
# Use the pretrained network as a fixed feature extractor: freeze it so that
# only the four new heads are trained. Without this line the whole backbone
# stays trainable. For a like-for-like comparison, the baseline model built
# earlier in the notebook has to be frozen the same way.
base_model.trainable = False
model = get_model(base_model)
model.summary()

In [None]:
tf.keras.utils.plot_model(model)

In [None]:
# Same loss and metric for each of the four heads: binary cross-entropy and
# the area under the precision-recall curve.
head_names = ['out1', 'out2', 'out3', 'out4']

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=current_config.LR),
    loss=[tf.keras.losses.BinaryCrossentropy() for _ in head_names],
    metrics={head: tf.keras.metrics.AUC(name='auc_precision_recall', curve='PR')
             for head in head_names})

# Training the model

In [None]:
# Fresh callbacks for the run on the pretrained backbone; the best model is kept in its own folder.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint('./efficientNet_after_pretraining', save_best_only=True)
# NOTE(review): both callbacks below use patience=1, so early stopping can fire
# in the same epoch in which the learning rate would first be reduced.
lr_schedular = tf.keras.callbacks.ReduceLROnPlateau(patience=1, min_delta=0.01)
early_stopping = tf.keras.callbacks.EarlyStopping(min_delta=0.001, patience=1)

In [None]:
# Training the last layers
# NOTE(review): only the heads are trained if the backbone was actually frozen
# before model.compile — verify with model.summary() (trainable params).

# Batches per epoch; required because the training dataset repeats forever.
STEPS = len(df_train)//current_config.BATCH_SIZE

# After using pretrained weights; the History object is kept for comparison
# with the baseline run.
AP_history = model.fit(x = train_dataset, 
                  validation_data = valid_dataset,
                  steps_per_epoch = STEPS,
                  epochs = 2,
                  callbacks=[model_checkpoint, lr_schedular, early_stopping]
                 )

# Conclusion (Work in Progress)
For now, as you can see, I am getting this error (if anyone knows why this error occurs, please tell me in the comments). One more thing to note is that even in the first epochs there is a considerable difference. Now you have to decide whether you want to pretrain your models on a similar dataset or not. In my opinion, the difference will only increase with bigger models and larger image sizes.

# Kindly Upvote 😊