# Project: CliniScan - Lung Abnormality Detection
# Dataset: VinDr-CXR (Chest X-rays)
# Model: YOLOv8
# Created by: Prarthana


# Preprocessing

In [None]:
import os

BASE = "/content/drive/MyDrive/CliniScan/3_Preprocessing"

# Sub-folders (relative to BASE) that this cell creates.
folders = [
    "train_jpg",
    "test_jpg",
    "images/train",
    "images/val",
    "images/test"
]

# exist_ok=True keeps the cell safe to re-run when the folders already exist.
for subdir in folders:
    target = os.path.join(BASE, subdir)
    os.makedirs(target, exist_ok=True)

print("Subfolders created inside Preprocessing")


Subfolders created inside Preprocessing


In [None]:
from PIL import Image
import os
from tqdm import tqdm

OLD_TRAIN = "/content/drive/MyDrive/CliniScan/1_Dataset/vinbigdata-chest-xray-abnormalities-detection-512x512-jp2/train"
NEW_TRAIN = "/content/drive/MyDrive/CliniScan/3_Preprocessing/train_jpg"

os.makedirs(NEW_TRAIN, exist_ok=True)

train_files = [f for f in os.listdir(OLD_TRAIN) if f.endswith(".jp2")]

# Convert every .jp2 in OLD_TRAIN to an RGB JPEG in NEW_TRAIN.
# Files whose JPEG already exists are skipped, so the cell can be re-run
# to resume an interrupted conversion.
for f in tqdm(train_files, desc="Converting TRAIN"):
    # splitext swaps only the extension; str.replace would also rewrite a
    # ".jp2" that happened to appear earlier in the file name.
    new_name = os.path.splitext(f)[0] + ".jpg"
    dst = os.path.join(NEW_TRAIN, new_name)

    if os.path.exists(dst):
        continue

    # Write to a temporary name and rename afterwards: if the run is
    # interrupted mid-save, no truncated file is left at `dst` for the
    # exists-check above to skip on the next run.
    tmp = dst + ".part"
    with Image.open(os.path.join(OLD_TRAIN, f)) as src:
        src.convert("RGB").save(tmp, "JPEG", quality=95)
    os.replace(tmp, dst)

print("TRAIN conversion done")


Converting TRAIN: 100%|██████████| 15000/15000 [18:24<00:00, 13.57it/s]

TRAIN conversion done





In [None]:
OLD_TEST = "/content/drive/MyDrive/CliniScan/1_Dataset/vinbigdata-chest-xray-abnormalities-detection-512x512-jp2/test"
NEW_TEST = "/content/drive/MyDrive/CliniScan/3_Preprocessing/test_jpg"

os.makedirs(NEW_TEST, exist_ok=True)

test_files = [f for f in os.listdir(OLD_TEST) if f.endswith(".jp2")]

# Convert every .jp2 in OLD_TEST to an RGB JPEG in NEW_TEST.
# Files whose JPEG already exists are skipped, so the cell can be re-run
# to resume an interrupted conversion.
for f in tqdm(test_files, desc="Converting TEST"):
    # splitext swaps only the extension; str.replace would also rewrite a
    # ".jp2" that happened to appear earlier in the file name.
    new_name = os.path.splitext(f)[0] + ".jpg"
    dst = os.path.join(NEW_TEST, new_name)

    if os.path.exists(dst):
        continue

    # Write to a temporary name and rename afterwards: if the run is
    # interrupted mid-save, no truncated file is left at `dst` for the
    # exists-check above to skip on the next run.
    tmp = dst + ".part"
    with Image.open(os.path.join(OLD_TEST, f)) as src:
        src.convert("RGB").save(tmp, "JPEG", quality=95)
    os.replace(tmp, dst)

print("TEST conversion done")

Converting TEST: 100%|██████████| 3000/3000 [03:28<00:00, 14.39it/s]

TEST conversion done





In [None]:
# Sanity check: count the entries in each converted-image folder
# (shell commands, run through the notebook's `!` escape).
!ls /content/drive/MyDrive/CliniScan/3_Preprocessing/train_jpg | wc -l
!ls /content/drive/MyDrive/CliniScan/3_Preprocessing/test_jpg | wc -l

15000
3000


In [None]:
import os, random, shutil
from tqdm import tqdm

SOURCE = "/content/drive/MyDrive/CliniScan/3_Preprocessing/train_jpg"
TRAIN_OUT = "/content/drive/MyDrive/CliniScan/3_Preprocessing/images/train"
VAL_OUT   = "/content/drive/MyDrive/CliniScan/3_Preprocessing/images/val"

# Fixed seed and ratio so that re-running the cell reproduces the same split.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.9

os.makedirs(TRAIN_OUT, exist_ok=True)
os.makedirs(VAL_OUT, exist_ok=True)

# os.listdir returns entries in arbitrary order, so sort before shuffling;
# otherwise even a seeded shuffle could differ between runs.
files = sorted(f for f in os.listdir(SOURCE) if f.endswith(".jpg"))
# A private Random instance leaves the global random state untouched.
random.Random(SPLIT_SEED).shuffle(files)

split = int(TRAIN_FRACTION * len(files))
train_files = files[:split]
val_files   = files[split:]

# Files left over from an earlier, different split would end up in both
# train and val after the copy below (data leakage). Refuse to continue
# rather than silently mixing two splits.
stale = (set(os.listdir(TRAIN_OUT)) - set(train_files)) | (
    set(os.listdir(VAL_OUT)) - set(val_files)
)
if stale:
    raise RuntimeError(
        f"{len(stale)} file(s) in the output folders do not belong to this "
        "split; clear images/train and images/val (and any labels generated "
        "from them) before re-running the split."
    )

for f in tqdm(train_files, desc="Train split"):
    shutil.copy(os.path.join(SOURCE, f), os.path.join(TRAIN_OUT, f))

for f in tqdm(val_files, desc="Val split"):
    shutil.copy(os.path.join(SOURCE, f), os.path.join(VAL_OUT, f))

print("Split Done")
print("Train:", len(os.listdir(TRAIN_OUT)))
print("Val:", len(os.listdir(VAL_OUT)))

Train split: 100%|██████████| 13500/13500 [04:50<00:00, 46.51it/s]
Val split: 100%|██████████| 1500/1500 [00:18<00:00, 81.00it/s]


Split Done
Train: 13500
Val: 1500


In [None]:
import pandas as pd
from PIL import Image
import os
from tqdm import tqdm

# Annotation CSV, the split image folders, and the label output folders
CSV_PATH = "/content/drive/MyDrive/CliniScan/1_Dataset/vinbigdata-chest-xray-abnormalities-detection-512x512-jp2/train.csv"

IMG_TRAIN = "/content/drive/MyDrive/CliniScan/3_Preprocessing/images/train"
IMG_VAL   = "/content/drive/MyDrive/CliniScan/3_Preprocessing/images/val"

LABEL_TRAIN = "/content/drive/MyDrive/CliniScan/3_Preprocessing/labels/train"
LABEL_VAL   = "/content/drive/MyDrive/CliniScan/3_Preprocessing/labels/val"

for label_dir in (LABEL_TRAIN, LABEL_VAL):
    os.makedirs(label_dir, exist_ok=True)

df = pd.read_csv(CSV_PATH)

# Class ids follow the alphabetical order of the class names in the CSV.
classes = sorted(df["class_name"].unique())
cls2id = dict(zip(classes, range(len(classes))))

print("Class mapping:")
print(cls2id)

# Function to convert bbox to YOLO format
def write_yolo(grp, img_path, out_path):
    """Write the YOLO label file for a single image.

    Args:
        grp: DataFrame rows for one image, with columns ``x_min``,
            ``y_min``, ``x_max``, ``y_max`` and ``class_name``.
        img_path: Path of the image; opened only to read its width/height.
        out_path: Path of the label ``.txt`` file (overwritten).

    Each written line is ``<class_id> <x_center> <y_center> <width> <height>``
    with the box values divided by the image width/height. Class ids come
    from the module-level ``cls2id`` mapping.

    Rows with a missing box coordinate are skipped (presumably the
    "No finding" rows - confirm against the CSV), so an image without any
    box gets an empty label file instead of ``nan`` entries.
    """
    # NOTE(review): assumes the CSV coordinates are in the pixel space of the
    # image at img_path (not of a differently sized original) - confirm.
    with Image.open(img_path) as img:
        w, h = img.size

    # Drop rows that carry no bounding box; formatting them would emit
    # "nan nan nan nan", which is not a valid label line.
    boxes = grp.dropna(subset=["x_min", "y_min", "x_max", "y_max"])

    lines = []
    for _, row in boxes.iterrows():
        x_min, y_min = row["x_min"], row["y_min"]
        x_max, y_max = row["x_max"], row["y_max"]

        # Box centre and size, normalised by the image dimensions.
        xc = (x_min + x_max) / 2 / w
        yc = (y_min + y_max) / 2 / h
        bw = (x_max - x_min) / w
        bh = (y_max - y_min) / h

        cls_id = cls2id[row["class_name"]]
        lines.append(f"{cls_id} {xc:.6f} {yc:.6f} {bw:.6f} {bh:.6f}")

    with open(out_path, "w") as f:
        f.write("\n".join(lines))

# File names present in each split folder, as sets for O(1) membership tests.
train_set = set(os.listdir(IMG_TRAIN))
val_set   = set(os.listdir(IMG_VAL))
image_ids = df["image_id"].unique()

print("\nStarting label generation...\n")

# Group the annotations once instead of filtering the whole DataFrame for
# every image. sort=False keeps the groups in order of first appearance,
# i.e. the same order as image_ids.
for img_id, grp in tqdm(df.groupby("image_id", sort=False), total=len(image_ids)):
    fname = img_id + ".jpg"

    # Route the label file to the split that actually contains the image.
    if fname in train_set:
        img_dir, label_dir = IMG_TRAIN, LABEL_TRAIN
    elif fname in val_set:
        img_dir, label_dir = IMG_VAL, LABEL_VAL
    else:
        # Image is in neither split folder: nothing to label.
        continue

    img_path = os.path.join(img_dir, fname)
    out_path = os.path.join(label_dir, img_id + ".txt")
    write_yolo(grp, img_path, out_path)

print("\nLabels created successfully")
print("Train labels:", len(os.listdir(LABEL_TRAIN)))
print("Val labels:", len(os.listdir(LABEL_VAL)))

Class mapping:
{'Aortic enlargement': 0, 'Atelectasis': 1, 'Calcification': 2, 'Cardiomegaly': 3, 'Consolidation': 4, 'ILD': 5, 'Infiltration': 6, 'Lung Opacity': 7, 'No finding': 8, 'Nodule/Mass': 9, 'Other lesion': 10, 'Pleural effusion': 11, 'Pleural thickening': 12, 'Pneumothorax': 13, 'Pulmonary fibrosis': 14}

Starting label generation...



100%|██████████| 15000/15000 [06:53<00:00, 36.25it/s]



Labels created successfully
Train labels: 13500
Val labels: 1500
