In [8]:
# Collapse rare labels into broader categories for prototype sanity
# Update: metadata_sample_minimal.csv -> metadata_collapsed.csv

import pandas as pd
import os

# Read the sample metadata table whose 'finding' column this cell relabels.
df = pd.read_csv("../data/metadata_sample_minimal.csv")

# Snapshot of the raw 'finding' column before any collapsing: the first rows,
# the number of distinct label strings, and the ten most frequent ones.
raw_findings = df["finding"]
print("Original Findings Samples:")
print(raw_findings.head())
print(f"\nTotal unique findings: {raw_findings.nunique()}")
print("\nOriginal label distribution:")
print(raw_findings.value_counts().head(10))

# Fine-grained label -> broader category. Declared once per broad category and
# then inverted into the flat lookup table used when collapsing labels.
_labels_by_category = {
    "COVID": ["COVID-19", "SARS", "MERS-CoV"],
    "Pneumonia": ["Pneumonia", "Viral", "Bacterial", "Aspiration"],
    "TB": ["Tuberculosis"],
    "Other": [
        "Mycoplasma",
        "Chlamydophila",
        "Fungal",
        "Aspergillosis",
        "Lipoid",
        "Legionella",
        "MRSA",
        "Staphylococcus",
        "Streptococcus",
        "Klebsiella",
        "E.Coli",
        "Influenza",
        "H1N1",
        "Herpes",
        "Nocardia",
        "Varicella",
        "Unknown",
        "todo",
    ],
    "No Finding": ["No Finding"],
}
collapse_map = {
    fine_label: category
    for category, fine_labels in _labels_by_category.items()
    for fine_label in fine_labels
}

# Keep the untouched labels in 'orig_finding'. Skipped when that column is
# already present, so existing values are not overwritten.
if 'orig_finding' not in df:
    df['orig_finding'] = df['finding']

# Function to collapse each finding string into new categories
def collapse_labels(finding_str, mapping=None):
    """Collapse a '/'-separated finding string into broader categories.

    Each segment is stripped of surrounding whitespace and looked up in
    ``mapping``; a segment that is not a key of the mapping counts as
    "Other". The distinct categories are returned sorted and joined with "/".

    Empty segments (from a leading, trailing or doubled "/", or from a blank
    string) are skipped so they do not add a spurious "Other" next to real
    categories.

    Args:
        finding_str: Raw finding value. May be missing (NaN/None); a
            non-string value is converted with ``str`` before splitting.
        mapping: Optional dict of fine-grained label -> category. When
            omitted, the module-level ``collapse_map`` is used.

    Returns:
        The collapsed category string, or "Other" when the value is missing
        or contains no labels at all.
    """
    if pd.isna(finding_str):
        return "Other"
    if mapping is None:
        mapping = collapse_map

    segments = (part.strip() for part in str(finding_str).split("/"))
    # Blank segments carry no label, so they are dropped rather than mapped.
    collapsed = {mapping.get(segment, "Other") for segment in segments if segment}

    # `collapsed` is empty only when every segment was blank.
    return "/".join(sorted(collapsed)) if collapsed else "Other"

# Replace 'finding' with its collapsed form. The column name is kept so code
# that reads 'finding' continues to work.
df["finding"] = df["finding"].apply(collapse_labels)

# Preview: collapsed label next to the original one.
rule = "=" * 60
print("\n" + rule)
print("COLLAPSED RESULTS")
print(rule)
print("Collapsed Findings Samples:")
label_pairs = df[["finding", "orig_finding"]]
print(label_pairs.head(10))

# Write the full table (still using the 'finding' column name) to a new CSV.
output_path = "../data/metadata_collapsed.csv"
df.to_csv(output_path, index=False)
print(f"\n✅ Saved collapsed dataset to {output_path}")

# Distribution of the collapsed labels.
print("\nLabel counts after collapsing:")
collapsed_counts = df["finding"].value_counts()
print(collapsed_counts)

print(f"\nTotal unique collapsed findings: {df['finding'].nunique()}")

# First 15 distinct (collapsed, original) pairs, printed as original → collapsed.
print("\nExample mappings:")
examples = label_pairs.drop_duplicates().head(15)
for new_label, old_label in zip(examples["finding"], examples["orig_finding"]):
    print(f"  '{old_label}' → '{new_label}'")


Original Findings Samples:
0    Pneumonia/Viral/COVID-19
1    Pneumonia/Viral/COVID-19
2    Pneumonia/Viral/COVID-19
3    Pneumonia/Viral/COVID-19
4    Pneumonia/Viral/COVID-19
Name: finding, dtype: object

Total unique findings: 25

Original label distribution:
finding
Pneumonia/Viral/COVID-19             584
todo                                  83
Pneumonia                             81
Pneumonia/Fungal/Pneumocystis         30
Pneumonia/Bacterial/Streptococcus     22
No Finding                            22
Tuberculosis                          18
Pneumonia/Viral/SARS                  16
Pneumonia/Lipoid                      13
Pneumonia/Bacterial/Mycoplasma        11
Name: count, dtype: int64

COLLAPSED RESULTS
Collapsed Findings Samples:
           finding              orig_finding
0  COVID/Pneumonia  Pneumonia/Viral/COVID-19
1  COVID/Pneumonia  Pneumonia/Viral/COVID-19
2  COVID/Pneumonia  Pneumonia/Viral/COVID-19
3  COVID/Pneumonia  Pneumonia/Viral/COVID-19
4  COVID/Pneumonia  P

In [6]:
import pandas as pd
import os

# --- Configuration ---
# CSV read by this cell; update to your actual filtered file path.
input_filename = "../data/nih_chestxray_filtered.csv"
# CSV this cell writes the single-label dataset to.
output_filename = "../data/final_single_label_dataset.csv"

# --- Priority list (highest to lowest) ---
PRIORITY = ["Infiltration", "Effusion", "Atelectasis", "No Finding"]


def apply_priority_label(finding_string: str) -> str:
    """Reduce a '|'-delimited findings string to a single label.

    The string is split on '|' and each piece is stripped of surrounding
    whitespace. The first entry of PRIORITY that appears among the pieces is
    returned.

    Returns "Unknown" when the value is missing, blank, or contains none of
    the PRIORITY labels.
    """
    if pd.isna(finding_string):
        return "Unknown"

    # A blank string yields only an empty piece, which matches no label and
    # therefore falls through to the "Unknown" default below.
    present = {piece.strip() for piece in finding_string.split('|')}
    return next((label for label in PRIORITY if label in present), "Unknown")


# --- Main Script ---
# Reads the filtered table, derives one label per row with
# apply_priority_label, keeps only rows whose label is in PRIORITY, and writes
# the result to output_filename. A missing input file or a missing
# 'Finding Labels' column is reported with a printed message rather than a
# traceback.
try:
    print(f"Loading data from '{input_filename}'...")
    df = pd.read_csv(input_filename)

    column_name = 'Finding Labels'
    if column_name not in df.columns:
        raise KeyError(f"The required column '{column_name}' was not found in the CSV.")

    print("Applying priority rules to reduce to single-label classification...")
    df['label'] = df[column_name].apply(apply_priority_label)

    # Keep only rows that belong to one of the target classes. Rows labelled
    # "Unknown" are dropped here because "Unknown" is not in PRIORITY.
    df = df[df['label'].isin(PRIORITY)]

    # Save the cleaned single-label dataset
    df.to_csv(output_filename, index=False)

    print(f"\n✅ Successfully created the single-label dataset at: '{output_filename}'")
    print("\nFinal distribution of labels:")
    print(df['label'].value_counts())

except FileNotFoundError:
    print(f"❌ ERROR: Could not find the file '{input_filename}'. Please make sure the path is correct.")
except KeyError as e:
    # str(KeyError) is the repr of its argument, which would wrap the message
    # in an extra pair of quotes; print the raw message instead.
    message = e.args[0] if e.args else e
    print(f"❌ ERROR: {message}")


Loading data from '../data/nih_chestxray_filtered.csv'...
Applying priority rules to reduce to single-label classification...
Applying priority rules to reduce to single-label classification...

✅ Successfully created the single-label dataset at: '../data/final_single_label_dataset.csv'

Final distribution of labels:
label
No Finding      60361
Infiltration    19894
Effusion         9317
Atelectasis      6259
Name: count, dtype: int64

✅ Successfully created the single-label dataset at: '../data/final_single_label_dataset.csv'

Final distribution of labels:
label
No Finding      60361
Infiltration    19894
Effusion         9317
Atelectasis      6259
Name: count, dtype: int64
