## **Libraries**

In [None]:
!pip install ensemble_boxes

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ensemble_boxes
  Downloading ensemble_boxes-1.0.9-py3-none-any.whl (23 kB)
Installing collected packages: ensemble_boxes
Successfully installed ensemble_boxes-1.0.9


In [None]:
# Libraries
import os
import zipfile
import shutil
from os import listdir
from glob import glob
from os.path import isfile, join
import yaml


import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from sklearn.model_selection import GroupKFold

from PIL import Image
import cv2
from ensemble_boxes import weighted_boxes_fusion

## **Pre-Processing**

### **Unzip**

In [None]:
# Create the working folders (labels, extracted images, extracted submission images)
for folder in ('/content/labels', '/content/images', '/content/submission'):
    os.makedirs(folder, exist_ok=True)

In [None]:
# Unzip the JPG images for 'training'
zip_training_path = '/content/drive/MyDrive/AI_CRANFIELD/images_jpg/dataset-images-jpg.zip'
with zipfile.ZipFile(zip_training_path, 'r') as zip_ref:
    zip_ref.extractall('/content/images')

In [None]:
# Unzip the JPG images for 'submission'
zip_submission_path = '/content/drive/MyDrive/AI_CRANFIELD/submission_images_jpg/dataset-submission-images-jpg.zip'
with zipfile.ZipFile(zip_submission_path, 'r') as zip_ref:
    zip_ref.extractall('/content/submission')

### **Dataframes**

In [None]:
# Import data as dataframe
data = pd.read_csv("/content/drive/MyDrive/AI_CRANFIELD/train_with_sizes.csv")
submission_data = pd.read_csv("/content/drive/MyDrive/AI_CRANFIELD/sample_submission_with_sizes.csv")

In [None]:
# Display 'data' dataframe
data

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,width,height
0,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0,2080,2336
1,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0,2304,2880
2,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,627.0,357.0,947.0,433.0,2540,3072
3,0c7a38f293d5f5e4846aa4ca6db4daf1,ILD,5,R17,1347.0,245.0,2188.0,2169.0,2285,2555
4,47ed17dcb2cbeec15182ed335a8b5a9e,Nodule/Mass,8,R9,557.0,2352.0,675.0,2484.0,2568,3353
...,...,...,...,...,...,...,...,...,...,...
36091,b53d1dd80e99ca6bcef9d592f65d3321,Pleural effusion,10,R9,240.0,1550.0,562.0,2001.0,2304,2880
36092,26d1d5a0ef2e692c6340e74859ffdc53,Pulmonary fibrosis,13,R10,1163.0,787.0,1338.0,941.0,3072,3072
36093,22672ab82c290c20b86863291e25ef6c,ILD,5,R9,299.0,664.0,794.0,1508.0,2048,2500
36094,db169d0be36123bd55b866d6aa73983b,Other lesion,9,R8,6.0,670.0,272.0,1736.0,2304,2880


In [None]:
# Display 'submission_data' dataframe without its 'PredictionString' column
# The column is dropped by re-assignment (not in place) and a missing column is ignored,
# so this cell can be re-run without raising a KeyError
submission_data = submission_data.drop(columns='PredictionString', errors='ignore')
submission_data

Unnamed: 0,image_id,width,height
0,002a34c58c5b758217ed1f584ccbcfe9,2345,2584
1,004f33259ee4aef671c2b95d54e4be68,2517,3028
2,008bdde2af2462e86fd373a445d0f4cd,2304,2880
3,009bc039326338823ca3aa84381f17f1,1994,2430
4,00a2145de1886cb9eb88869c85d74080,2136,2408
...,...,...,...
2995,ff91fb82429a27521bbec8569b041f02,3072,3072
2996,ff9fcc4087ed5e941209aa3fa948e364,2304,2880
2997,ffaa288c8abca300974f043b57d81521,2684,2958
2998,ffc441e0c8b7153844047483a577e7c3,1944,2328


### **Images**

In [None]:
# Function that pre-processes the images from a folder and saves them into another folder
def pre_process_images(data, in_folder, out_folder):
    """Resize each image listed in `data` to 640x640, rescale it to 0-255 and save it as PNG.

    INPUT:
        data       -- dataframe with an 'image_id' column (each distinct id is processed once)
        in_folder  -- folder containing the '<image_id>.jpg' source images
        out_folder -- folder receiving the '<image_id>.png' results (created if missing)
    OUTPUT: None
    """

    # Create the output folder
    os.makedirs(out_folder, exist_ok=True)

    # Go through each distinct image of the dataframe
    for image_id in tqdm(data['image_id'].unique()):

        # Open this image with PIL and convert it to a float array;
        # the context manager releases the file handle as soon as the pixels are loaded
        with Image.open(os.path.join(in_folder, image_id + '.jpg')) as img:
            img_array = np.array(img).astype(np.float64)

        # Resize the image to (640,640)
        img_array = cv2.resize(img_array, (640, 640), interpolation=cv2.INTER_CUBIC)

        # Min-max rescale the image to [0, 1]
        img_array = img_array - np.min(img_array)
        peak = np.max(img_array)
        if peak > 0:
            # A constant image has peak == 0; skipping the division keeps it all-zero
            # instead of producing NaN values
            img_array = img_array / peak
        img_array = (img_array * 255).astype(np.uint8)

        # Save this image as .png into the output folder
        cv2.imwrite(os.path.join(out_folder, '{}.png'.format(image_id)), img_array)

In [None]:
# Apply the function to create a new folder with pre-processed images for training
pre_process_images(data, '/content/images/', '/content/images-640/')

  0%|          | 0/4394 [00:00<?, ?it/s]

In [None]:
# Apply the function to create a new folder with pre-processed images for submission
pre_process_images(submission_data, '/content/submission/', '/content/submission-640/')

  0%|          | 0/3000 [00:00<?, ?it/s]

### **WBF**

In [None]:
# Function applying the WBF method on all the images for the training and creating a new csv from it
def apply_wbf(data, iou_thr=0.4, output_dir="/content/"):
    """Fuse the bounding boxes of each image with Weighted Boxes Fusion and write them to a csv.

    INPUT:
        data       -- dataframe with the columns 'image_id', 'class_id', 'x_min', 'y_min',
                      'x_max', 'y_max', 'width' and 'height'
        iou_thr    -- IoU threshold passed to 'weighted_boxes_fusion' (default 0.4)
        output_dir -- folder receiving the csv (default '/content/')
    OUTPUT: None

    The csv is named 'boxes_description_iou_<iou_thr>_div_4.csv' and has the columns
    image_id,x1,y1,x2,y2,class,score where the box coordinates are divided by the image size.
    """

    # NOTE(review): the trailing '4' in the file name is a fixed literal kept from the original
    # naming; its meaning is not visible in this notebook
    out_path = os.path.join(output_dir, 'boxes_description_iou_{}_div_{}.csv'.format(iou_thr, 4))

    # The context manager closes the csv even if the fusion raises for one of the images
    with open(out_path, 'w') as out:
        out.write('image_id,x1,y1,x2,y2,class,score\n')

        # Go through the boxes of each image
        for image_id, group in tqdm(data.groupby('image_id')):

            # Divide x values by the image width and y values by the image height
            sizes = group[['width', 'height']].to_numpy(dtype=float)
            corners = group[['x_min', 'y_min', 'x_max', 'y_max']].to_numpy(dtype=float)
            boxes = (corners / np.tile(sizes, 2)).tolist()

            labels = group['class_id'].tolist()

            # Every annotation gets the same confidence
            scores = [1.0] * len(boxes)

            # Use the 'weighted_boxes_fusion' method
            boxes, scores, labels = weighted_boxes_fusion([boxes], [scores], [labels], iou_thr=iou_thr, weights=None, allows_overflow=True)

            # Change the type of labels to int
            labels = labels.astype(np.int32)

            # Write the fused boxes; the score is capped at 3 and written without decimals
            for box, label, score in zip(boxes, labels, scores):
                out.write("{},{},{},{},{},{},{:.0f}\n".format(
                    image_id,
                    box[0],
                    box[1],
                    box[2],
                    box[3],
                    label,
                    min(score, 3)
                ))

In [None]:
# Apply function to use the "WBF" method
apply_wbf(data)

  0%|          | 0/4394 [00:00<?, ?it/s]

In [None]:
# Function that builds a dictionary mapping each class id to its class name
def recover_class_names(data):
    """Return a {class_id: class_name} dictionary built from `data`.

    INPUT:  dataframe with the columns 'class_name' and 'class_id'
    OUTPUT: dictionary whose entries are inserted by increasing class id
    """

    # Distinct (name, id) pairs, ordered by class id
    pairs = data[['class_name', 'class_id']].drop_duplicates()
    pairs = pairs.sort_values(by='class_id').reset_index(drop=True)

    # Walk both columns side by side and fill the dictionary
    ids = pairs['class_id'].to_numpy()
    names = pairs['class_name'].to_numpy()
    return {cls_id: cls_name for cls_id, cls_name in zip(ids, names)}

In [None]:
# Apply function to save class id/names in a dictionnary
cls_names = recover_class_names(data)

In [None]:
# Function that reads the csv file written by the 'apply_wbf' function and pre-processes bounding box values
def pre_process_wbf_data(file_path):
    """Load the fused boxes and derive centre / size columns, per box and per image.

    INPUT:  path of a csv with the columns 'image_id', 'x1', 'y1', 'x2', 'y2', 'class'
    OUTPUT: 2 dataframes
        df         -- one row per box, with the added columns 'x_mid', 'y_mid', 'w', 'h', 'area'
        data_final -- one row per image, with the columns 'image_id', 'class', 'x_mid', 'y_mid',
                      'w', 'h' where every column except 'image_id' holds a list with one
                      entry per box of that image
    """

    # Read the csv file
    df = pd.read_csv(file_path)

    # Calculate the middle, width, height and area of each bounding box
    df['x_mid'] = (df['x1'] + df['x2']) / 2
    df['y_mid'] = (df['y1'] + df['y2']) / 2
    df['w'] = df['x2'] - df['x1']
    df['h'] = df['y2'] - df['y1']
    df['area'] = df['w'] * df['h']

    # Collect the boxes of a same image into lists with a single groupby pass
    # (instead of one groupby per column followed by a chain of merges)
    list_columns = ['class', 'x_mid', 'y_mid', 'w', 'h']
    data_final = df.groupby('image_id')[list_columns].agg(list).reset_index()

    return df, data_final

In [None]:
# Specify the path where the csv file from the 'apply_wbf' function has been stored
file_path = '/content/boxes_description_iou_0.4_div_4.csv'

# Apply function to create 2 dataframes
df, data_final = pre_process_wbf_data(file_path)

In [None]:
# Display the dataframe for training
df

Unnamed: 0,image_id,x1,y1,x2,y2,class,score,x_mid,y_mid,w,h,area
0,0005e8e3701dfb1dd93d53e2ff537b6e,0.293783,0.190430,0.391927,0.289388,7,2,0.342855,0.239909,0.098145,0.098958,0.009712
1,0005e8e3701dfb1dd93d53e2ff537b6e,0.303385,0.184570,0.389648,0.291667,4,1,0.346517,0.238118,0.086263,0.107096,0.009238
2,0005e8e3701dfb1dd93d53e2ff537b6e,0.292969,0.191081,0.392253,0.289062,6,1,0.342611,0.240072,0.099284,0.097982,0.009728
3,0005e8e3701dfb1dd93d53e2ff537b6e,0.303385,0.184570,0.389648,0.291667,8,1,0.346517,0.238118,0.086263,0.107096,0.009238
4,0007d316f756b3fa0baea2ff514ce945,0.544705,0.357639,0.646701,0.437847,0,2,0.595703,0.397743,0.101997,0.080208,0.008181
...,...,...,...,...,...,...,...,...,...,...,...,...
22714,ffceb71a80efba3b83c88e11f4b9694b,0.677572,0.491468,0.908013,0.546075,13,1,0.792793,0.518771,0.230441,0.054608,0.012584
22715,ffe6f9fe648a7ec29a50feb92d6c15a4,0.426319,0.641232,0.857728,0.825925,3,3,0.642024,0.733578,0.431409,0.184693,0.079678
22716,ffe6f9fe648a7ec29a50feb92d6c15a4,0.209683,0.401820,0.220857,0.414771,9,1,0.215270,0.408295,0.011173,0.012951,0.000145
22717,ffe6f9fe648a7ec29a50feb92d6c15a4,0.538175,0.363318,0.654004,0.469723,0,1,0.596089,0.416521,0.115829,0.106405,0.012325


### **Labels**

In [None]:
# Display the dataframe created for labels
data_final

Unnamed: 0,image_id,class,x_mid,y_mid,w,h
0,0005e8e3701dfb1dd93d53e2ff537b6e,"[7, 4, 6, 8]","[0.3428548127412796, 0.34651692708333337, 0.34...","[0.2399088442325592, 0.23811848958333334, 0.24...","[0.09814456105232239, 0.08626302083333331, 0.0...","[0.09895831346511841, 0.10709635416666669, 0.0..."
1,0007d316f756b3fa0baea2ff514ce945,"[0, 5, 5, 3, 11, 11, 11, 13, 13, 13]","[0.595703125, 0.8550347222222222, 0.318359375,...","[0.3977430611848831, 0.6085069444444444, 0.652...","[0.1019965410232544, 0.10677083333333326, 0.17...","[0.08020833134651179, 0.23854166666666665, 0.0..."
2,000d68e42b71d3eac10ccc077aba07c1,"[9, 7, 7, 0, 13, 11, 11, 9, 9, 9, 9, 9, 9, 9, 9]","[0.9461805522441864, 0.4782986111111111, 0.601...","[0.5188367962837219, 0.34027777777777773, 0.09...","[0.0763888955116272, 0.8090277777777778, 0.174...","[0.22413194179534918, 0.3486111111111111, 0.02..."
3,00150343289f317a0ad5629d5b7d9ef9,"[10, 0, 3, 7, 7, 9, 10, 10, 13, 13, 0, 11, 11,...","[0.8367326855659485, 0.5900000035762787, 0.540...","[0.4862440377473831, 0.26365629583597183, 0.47...","[0.11702966690063477, 0.15544551610946655, 0.4...","[0.08931419253349304, 0.15570174157619482, 0.1..."
4,001d127bad87592efe45a5c7678f8b8d,"[13, 2, 13, 13, 13]","[0.3516438752412796, 0.38525390625, 0.69449869...","[0.2374674454331398, 0.19596354166666666, 0.26...","[0.1424153745174408, 0.07454427083333337, 0.05...","[0.2275390475988388, 0.14908854166666669, 0.12..."
...,...,...,...,...,...,...
4389,ff924bcbd38f123aec723aa7040d7e43,"[10, 11, 13, 13, 10, 11, 7, 1, 1, 4, 4, 13, 13...","[0.8931159377098083, 0.8931159377098083, 0.313...","[0.8720858097076416, 0.8720858097076416, 0.367...","[0.08001208305358887, 0.08001208305358887, 0.3...","[0.07849705219268799, 0.07849693298339844, 0.3..."
4390,ffb5d0b005261ed350f7a08c06613a34,"[3, 0]","[0.5203993171453476, 0.5201099663972855]","[0.5194444507360458, 0.28680554777383804]","[0.44241896271705633, 0.11111113429069525]","[0.10763886570930487, 0.1032407134771347]"
4391,ffceb71a80efba3b83c88e11f4b9694b,"[3, 13, 13]","[0.6028133183717728, 0.8418681839734472, 0.792...","[0.6183873862028122, 0.509598976109215, 0.5187...","[0.6571834981441497, 0.10668563300142242, 0.23...","[0.3764220774173737, 0.036262798634812354, 0.0..."
4392,ffe6f9fe648a7ec29a50feb92d6c15a4,"[3, 9, 0]","[0.6420235484838486, 0.21527001862197387, 0.59...","[0.7335783541202545, 0.4082954147707385, 0.416...","[0.4314090311527253, 0.011173184357541915, 0.1...","[0.18469256162643433, 0.012950647532376625, 0...."


In [None]:
# Function that creates one label file per image from the dataframe listing all its bounding boxes
def create_labels(data_final, label_dir='/content/labels'):
    """Write a '<image_id>.txt' label file for every row of `data_final`.

    INPUT:
        data_final -- dataframe with one row per image and the columns 'image_id', 'class',
                      'x_mid', 'y_mid', 'w', 'h'; every column except 'image_id' holds a
                      list with one entry per bounding box
        label_dir  -- folder receiving the label files (default '/content/labels')
    OUTPUT: None

    Each line of a label file is 'class x_mid y_mid w h', separated by single spaces.
    """

    # Create the folder just in case
    os.makedirs(label_dir, exist_ok=True)

    # Iterate over the columns directly: this does not depend on the dataframe index
    # and avoids rebuilding the whole row for every value that is written
    rows = zip(
        data_final['image_id'],
        data_final['class'],
        data_final['x_mid'],
        data_final['y_mid'],
        data_final['w'],
        data_final['h'],
    )

    for image_id, classes, x_mids, y_mids, widths, heights in tqdm(rows, total=len(data_final)):

        # Write the label file, one line per bounding box
        with open(os.path.join(label_dir, image_id + '.txt'), 'w') as f:
            for box in zip(classes, x_mids, y_mids, widths, heights):
                f.write(' '.join(str(value) for value in box) + '\n')

In [None]:
# Apply function to create labels
create_labels(data_final)

  0%|          | 0/4394 [00:00<?, ?it/s]

### **Train and validation**

In [None]:
# Function that splits the dataframe into 5 folds with the 'GroupKFold' method for the training and validation part
# Then, copies images and labels into the right folders
def setup_training(df, fold):
    """Assign every image to one of 5 folds and copy its files to the train / val folders.

    INPUT:
        df   -- dataframe with an 'image_id' column; it is modified in place: the columns
                'image_path', 'label_path' and 'fold' are added
        fold -- number (0 to 4) of the fold used as validation set; the other folds are
                used for training
    OUTPUT: None

    Images are read from '/content/images-640/' and labels from '/content/labels/'; they are
    copied below '/content/vinbigdata/images/' and '/content/vinbigdata/labels/'.
    """

    # Create columns for images and labels path
    df['image_path'] = '/content/images-640/' + df['image_id'] + '.png'
    df['label_path'] = '/content/labels/' + df['image_id'] + '.txt'

    # Allocate a value between 0 to 4 for each image (all the boxes of an image share a fold)
    gkf = GroupKFold(n_splits=5)
    df['fold'] = -1
    fold_column = df.columns.get_loc('fold')

    # The loop variable must not be called 'fold': that would overwrite the requested
    # validation fold with the last fold number.
    # 'split' yields positional indices, hence 'iloc' rather than 'loc'.
    for fold_id, (_, val_idx) in enumerate(gkf.split(df, groups=df.image_id.tolist())):
        df.iloc[val_idx, fold_column] = fold_id

    # Store the path of images from "training" and "validation"
    val_files = list(df[df.fold == fold].image_path.unique())
    train_files = list(df[df.fold != fold].image_path.unique())

    label_dir = '/content/labels'

    # Create the respective directories and copy images and labels into them
    # NOTE(review): files copied by a previous call with another fold are not removed
    for split_name, files in (('train', train_files), ('val', val_files)):
        image_dst = '/content/vinbigdata/images/' + split_name
        label_dst = '/content/vinbigdata/labels/' + split_name
        os.makedirs(label_dst, exist_ok=True)
        os.makedirs(image_dst, exist_ok=True)

        for file in tqdm(files):
            shutil.copy(file, image_dst)
            filename = os.path.splitext(os.path.basename(file))[0]
            shutil.copy(os.path.join(label_dir, filename + '.txt'), label_dst)

In [None]:
# Specify which fold will be the validation part
fold = 4

# Apply function to split data between training and validation 
setup_training(df, fold)

  0%|          | 0/3516 [00:00<?, ?it/s]

  0%|          | 0/878 [00:00<?, ?it/s]

### **YAML File**

In [28]:
# Function that creates the YAML file necessary for the YOLOv5x training
def create_yaml(cls_names):
    """Write 'train.txt', 'val.txt' and 'vinbigdata.yaml' into '/content/' and print the YAML.

    INPUT:  dictionary mapping the class ids 0 to 13 to their class names
    OUTPUT: None

    'train.txt' and 'val.txt' list the paths found in '/content/vinbigdata/images/train'
    and '/content/vinbigdata/images/val'.
    """

    # Create a list with all class names, ordered by class id
    num_classes = 14
    classes = [cls_names[i] for i in range(num_classes)]

    # Specify the common path
    cwd = '/content/'

    # Write one txt file per split with all the paths from its images folder
    for split_name in ('train', 'val'):
        with open(join(cwd, split_name + '.txt'), 'w') as f:
            for path in glob('/content/vinbigdata/images/' + split_name + '/*'):
                f.write(path + '\n')

    # Create a dictionary with all the information needed
    data = dict(
        train = join(cwd, 'train.txt'),
        val   = join(cwd, 'val.txt'),
        nc    = num_classes,
        names = classes
        )

    # Write the yaml file based on that previous dictionary
    yaml_path = join(cwd, 'vinbigdata.yaml')
    with open(yaml_path, 'w') as outfile:
        yaml.dump(data, outfile, default_flow_style=False)

    # Display it; the context manager closes the file once it has been read
    with open(yaml_path, 'r') as f:
        print('\nyaml:')
        print(f.read())

In [29]:
# Apply function to create the yaml file
create_yaml(cls_names)


yaml:
names:
- Aortic enlargement
- Atelectasis
- Calcification
- Cardiomegaly
- Consolidation
- ILD
- Infiltration
- Lung Opacity
- Nodule/Mass
- Other lesion
- Pleural effusion
- Pleural thickening
- Pneumothorax
- Pulmonary fibrosis
nc: 14
train: /content/train.txt
val: /content/val.txt



## **Training**

In [30]:
# Clone the github with the YOLOv5 model and needed files
!git clone https://github.com/ultralytics/yolov5
%cd yolov5
# Install extra requirements
%pip install -qr requirements.txt

import torch
import utils
# Check everything
display = utils.notebook_init()

YOLOv5 🚀 v7.0-120-g3e55763 Python-3.9.16 torch-1.13.1+cu116 CPU


Setup complete ✅ (2 CPUs, 12.7 GB RAM, 34.2/107.7 GB disk)


In [32]:
# Specify as the root directory
os.chdir('/content/yolov5')

In [None]:
# Train YOLOv5x on VinBigData for 60 epochs with images size of 640 and batch size of 32
!python train.py --img 640 --batch 32 --epochs 60 --data /content/vinbigdata.yaml --weights yolov5x.pt --cache

In [None]:
# Run YOLOv5x inference on the VinBigData submission images (image size 640, confidence threshold 0.15, NMS IoU threshold 0.45)
!python detect.py --weights 'runs/train/exp/weights/best.pt' --img 640 --conf 0.15 --iou 0.45 --source /content/submission-640/ --name yolov5x_results --save-txt --save-conf --exist-ok

## **Post-processing**

In [None]:
# Function that converts boxes given as (x_mid, y_mid, w, h) fractions of the image into pixel corner boxes
def yolo2voc(image_height, image_width, bboxes):
    """Convert (x_mid, y_mid, w, h) boxes into (x_min, y_min, x_max, y_max) boxes.

    INPUT:
        image_height, image_width -- size of the image
        bboxes -- array whose last axis holds x_mid, y_mid, w, h, with x values expressed
                  as a fraction of the width and y values as a fraction of the height
    OUTPUT: new float array of the same shape holding x_min, y_min, x_max, y_max;
            the input array is not modified
    """

    # astype() returns a new float array, so the caller's array stays untouched
    voc = bboxes.astype(float)

    # Scale the x columns by the image width and the y columns by the image height
    for columns, size in (([0, 2], image_width), ([1, 3], image_height)):
        voc[..., columns] = voc[..., columns] * size

    # Move from the centre to the top-left corner, then add the size to reach the opposite corner
    top_left = voc[..., [0, 1]] - voc[..., [2, 3]] / 2
    voc[..., [0, 1]] = top_left
    voc[..., [2, 3]] = top_left + voc[..., [2, 3]]

    return voc

In [None]:
# Function that saves the bounding boxes and classes predicted by the trained YOLOv5x on submission data
def submission(submission_data):
    """Build the submission csv from the prediction files written by 'detect.py'.

    INPUT:  submission dataframe with the columns 'image_id', 'width' and 'height'
    OUTPUT: dataframe with the columns 'image_id' and 'PredictionString', also saved as
            '/content/submission.csv'

    Prediction files are read from 'runs/detect/yolov5x_results/labels/'. Each of their lines
    holds 6 values; they are read here as class, x_mid, y_mid, w, h, confidence.
    Images without prediction get the string "14 1 0 0 1 1".
    """

    # Initialise lists
    image_ids = []
    PredictionStrings = []

    # Recover each prediction stored as txt file in the 'labels' folder
    for file_path in tqdm(glob('runs/detect/yolov5x_results/labels/*txt')):

        # Recover image id
        image_id = os.path.splitext(os.path.basename(file_path))[0]

        # From the image id, retrieve the width and height of the image
        w, h = submission_data.loc[submission_data.image_id == image_id, ['width', 'height']].values[0]

        # Read the txt file; the context manager closes it right after reading
        with open(file_path, 'r') as f:
            tokens = f.read().split()

        # An empty prediction file holds no box: the image is filled in by the merge below
        if not tokens:
            continue

        # Convert predictions into an array with one row of 6 values per box
        data = np.array(tokens).astype(np.float32).reshape(-1, 6)

        # Reorder the columns to class, confidence, x_mid, y_mid, w, h
        data = data[:, [0, 5, 1, 2, 3, 4]]

        # List all the bounding box values for that specific image
        bboxes = list(np.round(np.concatenate((data[:, :2], np.round(yolo2voc(h, w, data[:, 2:]))), axis=1).reshape(-1), 1).astype(str))

        # Write every value as an integer, except the confidence (second value of each box)
        for idx in range(len(bboxes)):
            bboxes[idx] = str(int(float(bboxes[idx]))) if idx % 6 != 1 else bboxes[idx]

        # NOTE(review): images whose first predicted class is "15" are left out here and are
        # therefore filled in by the merge below; the reason for this special case is not
        # visible in this notebook
        if str(bboxes[0]) != "15":

            # Store image id and its prediction
            image_ids.append(image_id)
            if str(bboxes[0]) != "14":
                PredictionStrings.append(' '.join(bboxes))
            else:
                PredictionStrings.append("14 1 0 0 1 1")

    # Create a dataframe with these two lists
    pred_df = pd.DataFrame({'image_id': image_ids, 'PredictionString': PredictionStrings})

    # Merge this previous dataframe with the submission dataframe on 'image_id'
    # Fill missing values with the "14 1 0 0 1 1" string
    sub_df = pd.merge(submission_data, pred_df, on='image_id', how='left').fillna("14 1 0 0 1 1")

    # Keep only the columns 'image_id' and 'PredictionString'
    sub_df = sub_df[['image_id', 'PredictionString']]

    # Save as csv
    sub_df.to_csv('/content/submission.csv', index=False)

    # Return the submission dataframe so that the caller can display it (e.g. with '.tail()')
    return sub_df