# This is only a rough draft for our image preprocessing and captioning pipeline.
# We will refactor this into its own script later.

Setup

In [1]:
import pandas as pd
from pathlib import Path

# Use shared project paths
PROJECT_ROOT = Path().resolve().parent  # if running from notebooks/
RAW_DATA_DIR = PROJECT_ROOT / "data" / "raw"
CSV_PATH = RAW_DATA_DIR / "Chest_xray_Corona_Metadata.csv"  # adjust if different
IMG_DIR = RAW_DATA_DIR / "images"  # adjust if needed

Load the Metadata

In [2]:
df = pd.read_csv(CSV_PATH)
print("Original shape:", df.shape)
df.head()

Original shape: (5910, 6)


Unnamed: 0.1,Unnamed: 0,X_ray_image_name,Label,Dataset_type,Label_2_Virus_category,Label_1_Virus_category
0,0,IM-0128-0001.jpeg,Normal,TRAIN,,
1,1,IM-0127-0001.jpeg,Normal,TRAIN,,
2,2,IM-0125-0001.jpeg,Normal,TRAIN,,
3,3,IM-0122-0001.jpeg,Normal,TRAIN,,
4,4,IM-0119-0001.jpeg,Normal,TRAIN,,


Clean the data

In [3]:
# Drop rows with missing image names or labels
df = df.dropna(subset=["X_ray_image_name", "Label"])
df = df.drop_duplicates(subset=["X_ray_image_name"])
print("Cleaned shape:", df.shape)

# Quick check
df["Label"].value_counts()

Cleaned shape: (5910, 6)


Label
Pnemonia    4334
Normal      1576
Name: count, dtype: int64

Define rule based caption function

In [4]:
def generate_caption(row):
    label = row["Label"]
    virus_1 = row.get("Label_1_Virus_category")
    virus_2 = row.get("Label_2_Virus_category")

    if label == "Normal":
        return "No signs of pneumonia."

    elif label == "Pnemonia":
        if virus_2 == "COVID-19":
            return "Lung opacity consistent with COVID-19."
        elif virus_2 in {"SARS", "ARDS", "Streptococcus"}:
            return "Signs of pneumonia, likely viral origin."
        elif virus_1 == "bacteria":
            return "Pneumonia likely due to bacterial infection."
        elif virus_1 == "Virus":
            return "Pneumonia likely due to viral infection."
        else:
            return "Pneumonia detected."

    return "Unspecified condition."

Apply captions

In [5]:
df["caption"] = df.apply(generate_caption, axis=1)
df[["X_ray_image_name", "Label", "Label_1_Virus_category", "Label_2_Virus_category", "caption"]].head(10)

Unnamed: 0,X_ray_image_name,Label,Label_1_Virus_category,Label_2_Virus_category,caption
0,IM-0128-0001.jpeg,Normal,,,No signs of pneumonia.
1,IM-0127-0001.jpeg,Normal,,,No signs of pneumonia.
2,IM-0125-0001.jpeg,Normal,,,No signs of pneumonia.
3,IM-0122-0001.jpeg,Normal,,,No signs of pneumonia.
4,IM-0119-0001.jpeg,Normal,,,No signs of pneumonia.
5,IM-0117-0001.jpeg,Normal,,,No signs of pneumonia.
6,IM-0115-0001.jpeg,Normal,,,No signs of pneumonia.
7,IM-0189-0001.jpeg,Normal,,,No signs of pneumonia.
8,IM-0187-0001.jpeg,Normal,,,No signs of pneumonia.
9,IM-0185-0001.jpeg,Normal,,,No signs of pneumonia.


In [6]:
# Add full image path using Dataset_type (train/test) and image name
df["image_path"] = df.apply(
    lambda row: RAW_DATA_DIR / "Coronahack-Chest-XRay-Dataset" / "Coronahack-Chest-XRay-Dataset" / row["Dataset_type"] / row["X_ray_image_name"],
    axis=1
)

# Filter out entries with missing files
valid = df["image_path"].apply(lambda p: p.exists())
print(f"Valid image paths: {valid.sum()} / {len(df)}")
df = df[valid]

Valid image paths: 5910 / 5910


explore the result

In [7]:
df["caption"].value_counts()

caption
Pneumonia likely due to bacterial infection.    2772
No signs of pneumonia.                          1576
Pneumonia likely due to viral infection.        1493
Lung opacity consistent with COVID-19.            58
Signs of pneumonia, likely viral origin.          11
Name: count, dtype: int64

Save the output for training

In [9]:
# Save as CSV for later training
out_path = RAW_DATA_DIR / "Coronahack-Chest-XRay-Dataset" / "Coronahack-Chest-XRay-Dataset" / "image_captions.csv"
df.to_csv(out_path, index=False)
print(f"Saved to {out_path}")

Saved to /Users/wyattlamberth/dev/ai/covid-detection/data/raw/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/image_captions.csv
