## Install


## Imports


In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import json
import random
from sklearn.model_selection import train_test_split
import shutil
import textwrap
from tqdm import tqdm
from PIL import Image
import cv2
import pickle
import torch
from ultralytics.data.dataset import YOLODataset
import glob
from pathlib import Path
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from ultralytics import YOLO
import tqdm
import os

## Computing class weights


In [3]:
# Defining the file paths
current_dir = os.getcwd()
current_dir = os.path.dirname(current_dir)  # Get the current working directory

CSV = f"{current_dir}/data/train.csv"  # CSV metadata file
IMAGES = f"{current_dir}/train"  # Folder containing training images
SPLITDIR = "splits"  # Directory to store train/val/test splits
os.makedirs(SPLITDIR, exist_ok=True)

# Loading the raw dataset
raw = pd.read_csv(CSV)

# Light cleaning: we remove useless whitespaces and empty labels
raw["Finding Label"] = raw["Finding Label"].str.strip()
raw = raw[raw["Finding Label"].notna()]

# Counting samples per class
cls_counts = raw["Finding Label"].value_counts().sort_values(ascending=False)
display(cls_counts)

# We compute here simple class weights (inverse frequency scaling)
tot = cls_counts.sum()
class_weights = (tot / (len(cls_counts) * cls_counts)).round(2).tolist()

print(
    "Class weights:", dict(zip(cls_counts.index, class_weights, strict=True))
)

Finding Label
Atelectasis     153
Effusion        130
Cardiomegaly    124
Infiltrate      105
Pneumonia       102
Pneumothorax     83
Mass             72
Nodule           67
Name: count, dtype: int64

Class weights: {'Atelectasis': 0.68, 'Effusion': 0.8, 'Cardiomegaly': 0.84, 'Infiltrate': 1.0, 'Pneumonia': 1.02, 'Pneumothorax': 1.26, 'Mass': 1.45, 'Nodule': 1.56}


## Creating training/validation stratified splits


In [4]:
# random seeds for reproducibility
random.seed(42)
np.random.seed(42)


# Group annotations (bounding boxes + labels) per image
grouped = (
    raw.groupby("Image Index")
    .apply(
        lambda g: {
            "labels": g["Finding Label"].tolist(),
            "bboxes": g[["Bbox [x", "y", "w", "h]"]].values.tolist(),
        }
    )
    .reset_index()
    .rename(columns={0: "annotation"})
)

# Create a column for stratified splitting (use the first label as stratum)
grouped["strat"] = grouped["annotation"].apply(lambda d: d["labels"][0])

# Stratified train/val split (80/20)
train_df, val_df = train_test_split(
    grouped, test_size=0.2, random_state=42, stratify=grouped["strat"]
)

# Saving splits
train_df.to_pickle(f"{SPLITDIR}/train_split.pkl")
val_df.to_pickle(f"{SPLITDIR}/val_split.pkl")

print(len(train_df), "training samples –", len(val_df), "validation samples")


608 training samples – 152 validation samples


  .apply(


## Converting our dataset to YOLO format


In [5]:
# Setting-up YOLO directory structure
YOLO_ROOT = "yolo_cxr"
IMG_TRAIN = os.path.join(YOLO_ROOT, "images", "train")
IMG_VAL = f"{YOLO_ROOT}/images/val"
LAB_TRAIN = f"{YOLO_ROOT}/labels/train"
LAB_VAL = f"{YOLO_ROOT}/labels/val"


for path in [IMG_TRAIN, IMG_VAL, LAB_TRAIN, LAB_VAL]:
    os.makedirs(path, exist_ok=True)


# Extracting all unique classes and build class-to-index mapping
CLASSES = sorted(
    {label for ann in grouped["annotation"] for label in ann["labels"]}
)
cls2idx = {class_name: idx for idx, class_name in enumerate(CLASSES)}


# Conversion function: from our basic style to YOLOv5 format
def _convert(row, split):
    image_name = row["Image Index"]
    src_path = os.path.join(IMAGES, image_name)

    # Choose target image and label path based on split
    dst_img_path = os.path.join(
        IMG_TRAIN if split == "train" else IMG_VAL, image_name
    )
    dst_label_path = os.path.join(
        LAB_TRAIN if split == "train" else LAB_VAL,
        image_name.replace(".png", ".txt"),
    )

    # Copy image if not already copied
    if not os.path.exists(dst_img_path):
        shutil.copy2(src_path, dst_img_path)

    # We create here the YOLO-format
    w, h = Image.open(src_path).size
    with open(dst_label_path, "w") as f:
        for bbox, label in zip(
            row["annotation"]["bboxes"],
            row["annotation"]["labels"],
            strict=True,
        ):
            x, y, w_box, h_box = bbox
            x_center = (x + w_box / 2) / w
            y_center = (y + h_box / 2) / h
            norm_w = w_box / w
            norm_h = h_box / h
            f.write(
                f"{cls2idx[label]} {x_center:.6f} {y_center:.6f} {norm_w:.6f} {norm_h:.6f}\n"
            )


# We then convert all images and annotations in the right format for YOLO
for _, row in tqdm.tqdm(train_df.iterrows(), total=len(train_df)):
    _convert(row, split="train")

for _, row in tqdm.tqdm(val_df.iterrows(), total=len(val_df)):
    _convert(row, split="val")

print("Conversion complete.")


# Here we write YOLO dataset config YAML file (required by YOLO)
yaml_content = textwrap.dedent(f"""
    path: {os.path.abspath(YOLO_ROOT)}
    train: images/train
    val: images/val
    names: {CLASSES}
""")

with open(f"{SPLITDIR}/chest.yaml", "w") as f:
    f.write(yaml_content)

print(yaml_content)


100%|██████████| 608/608 [00:00<00:00, 6558.86it/s]
100%|██████████| 152/152 [00:00<00:00, 8202.54it/s]

Conversion complete.

path: /home/brice/Documents/PERSONAL_PROJECT/Chest_X-ray_Detection_Challenge/src/yolo_cxr
train: images/train
val: images/val
names: ['Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltrate', 'Mass', 'Nodule', 'Pneumonia', 'Pneumothorax']






## Dealing with rare classes


In [None]:
# Defining rare classes to extract patches from (based on EDA)
RARE_CLASSES = {"Pneumothorax", "Mass", "Nodule"}
PATCH_BANK = "splits/rare_patches.pkl"
MARGIN = 4  # pixels of padding around each box for context

# We build here patch bank only if it doesn't already exist
if not os.path.exists(PATCH_BANK):
    patches = []

    # Iterating over all annotated samples
    for _, row in tqdm.tqdm(raw.iterrows(), total=len(raw)):
        label = row["Finding Label"]
        if label not in RARE_CLASSES:
            continue  # Skip non-rare classes

        # We load the image and extract the bounding box coordinates with margin
        img_path = os.path.join(IMAGES, row["Image Index"])
        img = cv2.imread(img_path)

        x, y, w_box, h_box = row[["Bbox [x", "y", "w", "h]"]]
        x1 = max(int(x - MARGIN), 0)
        y1 = max(int(y - MARGIN), 0)
        x2 = min(int(x + w_box + MARGIN), img.shape[1])
        y2 = min(int(y + h_box + MARGIN), img.shape[0])

        # We crop the patch and store it with its label
        patch = img[y1:y2, x1:x2]
        patches.append({"patch": patch, "label": label})

    with open(PATCH_BANK, "wb") as f:
        pickle.dump(patches, f)

    print(f" Patch bank created with {len(patches)} rare-class samples.")
else:
    print(" Patch bank already exists.")

100%|██████████| 836/836 [00:03<00:00, 266.76it/s]


 Patch bank created with 375 rare-class samples.


In [9]:
# We load the rare class patch bank (previously saved with pickle)
with open("splits/rare_patches.pkl", "rb") as f:
    _rare_bank = pickle.load(f)

# We get the class indices of rare classes
_rare_cls_idx = {cls2idx[c] for c in RARE_CLASSES}


# Custom dataset class that applies Copy-Paste augmentation with rare class patches
class RareCopyPasteDataset(YOLODataset):
    def __init__(self, *args, p_paste=0.8, max_patches=3, **kwargs):
        super().__init__(*args, **kwargs)
        self.p_paste = p_paste  # Probability of applying copy-paste
        self.max_patches = (
            max_patches  # Max number of patches to paste per image
        )

    def _paste(self, img, patch):
        # Paste a patch at a random location within the image (with some margin)
        H, W = img.shape[:2]
        ph, pw = patch.shape[:2]
        x = random.randint(int(0.15 * W), max(int(0.85 * W - pw), 1))
        y = random.randint(int(0.15 * H), max(int(0.85 * H - ph), 1))
        img[y : y + ph, x : x + pw] = patch
        return x, y, pw, ph

    def __getitem__(self, idx):
        img, target, _, _ = super().__getitem__(idx)

        # We skip copy-paste if the image already contains a rare class
        if any(int(c) in _rare_cls_idx for c in target["cls"]):
            return img, target, img.copy(), img.copy()

        # With probability p_paste, paste up to max_patches rare class objects
        if random.random() < self.p_paste:
            for _ in range(random.randint(1, self.max_patches)):
                entry = random.choice(_rare_bank)
                x, y, w, h = self._paste(img, entry["patch"])
                target["bboxes"].append([x, y, w, h])
                target["cls"].append(torch.tensor(cls2idx[entry["label"]]))

        return img, target, img.copy(), img.copy()


# Monkey-patch the Ultralytics dataset builder with our custom dataset
from ultralytics.data import build

build.YOLODataset = RareCopyPasteDataset

print("RareCopyPasteDataset enabled")


RareCopyPasteDataset enabled


## Training of the model


In [34]:
!yolo detect train \
    data="splits/chest.yaml" \
    model=yolov8s.pt \
    imgsz=1024 \
    batch=4 \
    epochs=30 \
    freeze=10 \
    patience=70 \
    close_mosaic=50 \
    mosaic=0.25 \
    scale=0.3 \
    cls=1.5 \
    box=10.0 \
    dfl=1.5 \
    hsv_h=0 \
    hsv_s=0 \
    hsv_v=0 \
    lr0=0.002 \
    optimizer=SGD \
    seed=42 \
    amp=True \
    project="runs/detect"


Ultralytics 8.3.161 🚀 Python-3.11.10 torch-2.7.0+cu126 CPU (AMD Ryzen 7 PRO 7840U w/ Radeon 780M Graphics)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=4, bgr=0.0, box=10.0, cache=False, cfg=None, classes=None, close_mosaic=50, cls=1.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=splits/chest.yaml, degrees=0.0, deterministic=True, device=cpu, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=30, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=10, half=False, hsv_h=0, hsv_s=0, hsv_v=0, imgsz=1024, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.002, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolov8s.pt, momentum=0.937, mosaic=0.25, multi_scale=False, name=train, nbs=64, nms=False, opset=None, optimize=False, optimizer=SGD, overlap_mask=True, patience=70, perspective=0.0, plots=True, pose=

In [35]:
# Second part of the training, without freezing any layers
!yolo detect train \
    model=runs/detect/yolo8x_cxr_v2_res1024_f1/weights/last.pt \
    data=splits/chest.yaml \
    epochs=70 \
    freeze=0 \
    lr0=0.0005 \
    patience=70 \
    imgsz=1024 \
    batch=4 \
    project=runs/detect \
    name=yolo8x_cxr_v2_res1024_f2


Traceback (most recent call last):
  File "/home/brice/Documents/PERSONAL_PROJECT/Chest_X-ray_Detection_Challenge/.venv/bin/yolo", line 8, in <module>
    sys.exit(entrypoint())
             ^^^^^^^^^^^^
  File "/home/brice/Documents/PERSONAL_PROJECT/Chest_X-ray_Detection_Challenge/.venv/lib/python3.11/site-packages/ultralytics/cfg/__init__.py", line 956, in entrypoint
    model = YOLO(model, task=task)
            ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/brice/Documents/PERSONAL_PROJECT/Chest_X-ray_Detection_Challenge/.venv/lib/python3.11/site-packages/ultralytics/models/yolo/model.py", line 79, in __init__
    super().__init__(model=model, task=task, verbose=verbose)
  File "/home/brice/Documents/PERSONAL_PROJECT/Chest_X-ray_Detection_Challenge/.venv/lib/python3.11/site-packages/ultralytics/engine/model.py", line 151, in __init__
    self._load(model, task=task)
  File "/home/brice/Documents/PERSONAL_PROJECT/Chest_X-ray_Detection_Challenge/.venv/lib/python3.11/site-packages/ultralytics/e

## Validation of the model


In [None]:
# Quick validation on the validation set using best model weights
!yolo val \
    model=runs/detect/yolo8x_cxr_v2_res1024_f2/weights/best.pt \
    data=splits/chest.yaml \
    imgsz=1024


## Prediction of the model and converting to COCO format


In [36]:
# In this section, we run inference on validation images using the best model checkpoint.
# This generates YOLO-format predictions in .txt files (1 per image).
!yolo predict \
    model=runs/detect/yolo8x_cxr_v2_res1024_f2/weights/best.pt \
    source=yolo_cxr/images/val \
    save_txt=True \
    save_conf=True \
    project=runs/detect \
    name=yolo8x_pred


Traceback (most recent call last):
  File "/home/brice/Documents/PERSONAL_PROJECT/Chest_X-ray_Detection_Challenge/.venv/bin/yolo", line 8, in <module>
    sys.exit(entrypoint())
             ^^^^^^^^^^^^
  File "/home/brice/Documents/PERSONAL_PROJECT/Chest_X-ray_Detection_Challenge/.venv/lib/python3.11/site-packages/ultralytics/cfg/__init__.py", line 956, in entrypoint
    model = YOLO(model, task=task)
            ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/brice/Documents/PERSONAL_PROJECT/Chest_X-ray_Detection_Challenge/.venv/lib/python3.11/site-packages/ultralytics/models/yolo/model.py", line 79, in __init__
    super().__init__(model=model, task=task, verbose=verbose)
  File "/home/brice/Documents/PERSONAL_PROJECT/Chest_X-ray_Detection_Challenge/.venv/lib/python3.11/site-packages/ultralytics/engine/model.py", line 151, in __init__
    self._load(model, task=task)
  File "/home/brice/Documents/PERSONAL_PROJECT/Chest_X-ray_Detection_Challenge/.venv/lib/python3.11/site-packages/ultralytics/e

In [39]:
# Function to convert YOLO .txt predictions → COCO-style JSON like we had at the beginning
def yolo_to_coco_pred(txt_dir, output_json):
    preds = []
    pred_id = 1  # Unique ID for each predicted instance

    for img_id, (_, row) in enumerate(val_df.iterrows(), 1):
        img_name = row["Image Index"]
        txt_path = Path(txt_dir) / "labels" / img_name.replace(".png", ".txt")

        if not txt_path.exists():
            continue

        w, h = Image.open(Path(IMAGES) / img_name).size

        with open(txt_path) as f:
            for line in f:
                cls, xc, yc, bw, bh, *conf = map(float, line.split())
                conf = (
                    conf[0] if conf else 0.5
                )  # Default confidence if missing
                x = (xc - bw / 2) * w
                y = (yc - bh / 2) * h

                preds.append(
                    {
                        "id": pred_id,
                        "image_id": img_id,
                        "category_id": int(cls) + 1,  # COCO IDs start at 1
                        "bbox": [
                            x,
                            y,
                            bw * w,
                            bh * h,
                        ],  # Format: [x_min, y_min, width, height]
                        "score": conf,
                    }
                )
                pred_id += 1

    # Write all predictions to a single COCO-style JSON file
    with open(output_json, "w") as f:
        json.dump(preds, f)


# We convert the YOLO .txt predictions to COCO-style JSON
PRED_NATIVE = "runs/detect/yolo8x_cxr_v2_res1024_f2/predictions_native.json"
yolo_to_coco_pred("runs/detect/yolo8x_pred", PRED_NATIVE)

print(f"COCO-format predictions written to → {PRED_NATIVE}")


FileNotFoundError: [Errno 2] No such file or directory: 'runs/detect/yolo8x_cxr_v2_res1024_f2/predictions_native.json'

## Creating the submission.csv file


In [None]:
# Here we build submission.csv from test predictions

# Loading the fine-tuned model
MODEL_PATH = "runs/detect/yolo8x_cxr_v2_res1024_f2/weights/best.pt"
model = YOLO(MODEL_PATH)

# Here, we read the ID ↔ image mapping
mapping = pd.read_csv(
    os.path.join(current_dir, "data", "ID_to_Image_Mapping.csv")
)


TEST_DIR = os.path.join(current_dir, "test")
image_files = {f for f in os.listdir(TEST_DIR) if f.endswith(".png")}
mapping = mapping[mapping["image_id"].isin(image_files)].copy()


# We add a rank column (0, 1, 2, …) to distinguish multiple boxes per image
mapping["rank"] = mapping.groupby("image_id").cumcount()

# And run inference on the test set
results = model.predict(
    source=TEST_DIR,
    imgsz=1024,
    conf=0.0,  # we keep all boxes we’ll filter later
    save=False,
    verbose=False,
)

# In this below section,
# we re-format predictions and assign a rank based on confidence
pred_records = []

for res in results:
    img_name = os.path.basename(res.path)

    confs = res.boxes.conf.cpu().numpy()
    order = confs.argsort()[::-1]  # we order them by descending confidence

    boxes = res.boxes.xyxy.cpu().numpy()
    clss = res.boxes.cls.cpu().numpy()

    for rank, idx in enumerate(order):
        x_min, y_min, x_max, y_max = boxes[idx]
        pred_records.append(
            {
                "image_id": img_name,
                "rank": rank,
                "x_min": float(x_min),
                "y_min": float(y_min),
                "x_max": float(x_max),
                "y_max": float(y_max),
                "confidence": float(confs[idx]),
                "label": model.names[int(clss[idx])],
            }
        )

preds_df = pd.DataFrame(pred_records)

#  Merge mapping and predictions on (image_id, rank)
submission_df = mapping.merge(
    preds_df,
    on=["image_id", "rank"],
    how="left",
    validate="one_to_one",
    # every mapping row must find exactly one box
    # (submission.csv format requirements)
)

assert not submission_df["id"].isna().any(), "Missing IDs in submission"
assert not submission_df["confidence"].isna().any(), (
    "Missing boxes for some IDs"
)

# Select final columns and export to CSV
submission_df = submission_df[
    [
        "id",
        "image_id",
        "x_min",
        "y_min",
        "x_max",
        "y_max",
        "confidence",
        "label",
    ]
]

OUT_PATH = os.path.join(current_dir, "/outputs/submission.csv")
submission_df.to_csv(OUT_PATH, index=False)

print(f"submission.csv saved – {len(submission_df)} rows (all IDs unique).")

FileNotFoundError: [Errno 2] No such file or directory: 'runs/detect/yolo8x_cxr_v2_res1024_f2/weights/best.pt'