In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at /content/drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
import os

# Root folder of the CBIS-DDSM dataset on the mounted Drive
cbis_path = '/content/drive/MyDrive/colab_scripts/repository/datasets/CBIS-DDSM'

# Make the dataset root the working directory so relative paths resolve
# from it, then echo the directory back as a sanity check
os.chdir(cbis_path)
print("Current Directory:", os.getcwd())

Current Directory: /content/drive/MyDrive/colab_scripts/repository/datasets/CBIS-DDSM


In [3]:
import pandas as pd

# Read both CSVs (paths are relative to the current working directory):
#   mass_df  - mass case descriptions for the test set
#   dicom_df - DICOM metadata table used for the JPG path lookup
mass_df, dicom_df = (
    pd.read_csv(csv_file)
    for csv_file in (
        'csv/mass_case_description_test_set.csv',
        'csv/dicom_info.csv',
    )
)

# For each DICOM path column, the pair
#   (name of the JPG path column to create, expected SeriesDescription).
# Insertion order determines the order of the generated JPG columns.
column_mapping = {
    'image file path': (
        'jpg image file path',
        'full mammogram images',
    ),
    'cropped image file path': (
        'jpg cropped image file path',
        'cropped images',
    ),
    'ROI mask file path': (
        'jpg ROI mask file path',
        'ROI mask images',
    ),
}

# Function to extract JPG path with SeriesDescription validation
def extract_jpg_paths(row, dicom_info=None, mapping=None):
    """Resolve the JPG path for every mapped DICOM path column of one row.

    Each DICOM path is stripped and split on '/'; its second and third
    components are used as StudyInstanceUID and SeriesInstanceUID to filter
    the lookup table. Among those rows, only ones whose SeriesDescription
    equals the expected description are accepted, and the ``image_path`` of
    the first accepted row is returned for that column.

    Parameters
    ----------
    row : pandas.Series
        One row of the case-description table. It must contain every key of
        ``mapping``; ``row.name`` is only used in diagnostic messages.
    dicom_info : pandas.DataFrame, optional
        Lookup table with the columns ``StudyInstanceUID``,
        ``SeriesInstanceUID``, ``SeriesDescription`` and ``image_path``.
        Defaults to the module-level ``dicom_df``.
    mapping : dict, optional
        ``{dicom_column: (jpg_column, expected_description)}``. Defaults to
        the module-level ``column_mapping``.

    Returns
    -------
    pandas.Series
        One entry per ``jpg_column``. The value is ``None`` when the source
        path is missing or malformed, or when no lookup row carries the
        expected SeriesDescription (a diagnostic is printed in each case).
    """
    # Fall back to the notebook globals so `df.apply(extract_jpg_paths, axis=1)`
    # keeps working unchanged.
    if dicom_info is None:
        dicom_info = dicom_df
    if mapping is None:
        mapping = column_mapping

    new_row = {}

    for orig_col, (jpg_col, expected_description) in mapping.items():
        dcm_path = row[orig_col]

        # An empty CSV cell is loaded as NaN (a float), which has no .strip();
        # treat any non-string value like a malformed path instead of crashing.
        parts = dcm_path.strip().split('/') if isinstance(dcm_path, str) else []

        if len(parts) < 3:
            new_row[jpg_col] = None
            print(f"⚠️ Row {row.name}: Malformed path '{dcm_path}' in column '{orig_col}'")
            continue

        study_uid = parts[1]
        series_uid = parts[2]

        # Find matches in the DICOM info table
        matches = dicom_info[
            (dicom_info['StudyInstanceUID'] == study_uid) &
            (dicom_info['SeriesInstanceUID'] == series_uid)
        ]

        # Keep only rows carrying the expected SeriesDescription
        valid_match = matches[matches['SeriesDescription'] == expected_description]

        if valid_match.empty:
            print(f"\n❗ Mismatch in row {row.name} ({orig_col}):")
            print(f"  Expected SeriesDescription: '{expected_description}'")
            print(f"  Got {len(matches)} matching rows, but none had expected SeriesDescription")
            print(f"  ➤ StudyInstanceUID: {study_uid}")
            print(f"  ➤ SeriesInstanceUID: {series_uid}")
            print("  ➤ Found:")
            print(matches[['StudyInstanceUID', 'SeriesInstanceUID', 'SeriesDescription']])
            print("  ➤ Please verify manually.\n")
            new_row[jpg_col] = None
        else:
            # Use first match (or adjust if needed)
            new_row[jpg_col] = valid_match['image_path'].iloc[0]

    return pd.Series(new_row)

# Run the lookup once per row; the result holds one column per JPG path
jpg_columns_df = mass_df.apply(extract_jpg_paths, axis='columns')

# Append the new JPG columns to the right of the original columns
final_df = pd.concat((mass_df, jpg_columns_df), axis='columns')

# Write the combined table to a new CSV without the index column
output_path = '/content/drive/MyDrive/colab_scripts/repository/datasets/CBIS-DDSM/csv/mass_test_jpg.csv'
final_df.to_csv(output_path, index=False)

print(f"\n✅ Finished! Saved updated DataFrame to:\n{output_path}")


✅ Finished! Saved updated DataFrame to:
/content/drive/MyDrive/colab_scripts/repository/datasets/CBIS-DDSM/csv/mass_test_jpg.csv
