In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Set the working directory
cbis_path = '/content/drive/MyDrive/colab_scripts/repository/datasets'
os.chdir(cbis_path)

# Confirm current directory
print("Current Directory:", os.getcwd())

Current Directory: /content/drive/MyDrive/colab_scripts/repository/datasets


In [None]:
import pandas as pd
from PIL import Image
import os


# Step 1: Load the CSV file
csv_path = 'CBIS-DDSM/csv/mass_test_jpg.csv'  # Change path if necessary
df = pd.read_csv(csv_path)

# Check that required columns exist
required_cols = ['jpg image file path', 'jpg ROI mask file path']
if not all(col in df.columns for col in required_cols):
    raise ValueError(f"CSV must contain columns: {required_cols}")

# Step 2: Initialize tracking lists
valid_rows = []
excluded_rows = []

# Step 3: Process each row
for idx, row in df.iterrows():
    img_path = row['jpg image file path']
    mask_path = row['jpg ROI mask file path']

    # Debug message
    print(f"Processing row {idx}:\n  Image: {img_path}\n  Mask:  {mask_path}")

    # Check file existence
    if not os.path.exists(img_path):
        print(f"  ‚ùå Image file not found.")
        excluded_rows.append((idx, "Image file not found"))
        continue
    if not os.path.exists(mask_path):
        print(f"  ‚ùå Mask file not found.")
        excluded_rows.append((idx, "Mask file not found"))
        continue

    try:
        # Verify image and reopen to read it fully
        img = Image.open(img_path)
        img.verify()
        img = Image.open(img_path)

        mask = Image.open(mask_path)
        mask.verify()
        mask = Image.open(mask_path)

        # Compare dimensions
        if img.size == mask.size:
            valid_rows.append(row)
        else:
            print(f"  ‚ùå Dimension mismatch: Image {img.size}, Mask {mask.size}")
            excluded_rows.append((idx, f"Dimension mismatch: Image {img.size}, Mask {mask.size}"))

    except Exception as e:
        print(f"  ‚ùå Error loading image or mask: {e}")
        excluded_rows.append((idx, str(e)))

# Step 4: Save valid rows to a new DataFrame
mass_train_jpg2 = pd.DataFrame(valid_rows)
mass_train_jpg2.to_csv('mass_train_jpg2.csv', index=False)
print("\n‚úÖ Valid rows saved to 'mass_train_jpg2.csv'.")

# Step 5: Show summary and optionally export excluded rows
print(f"\nüìä Summary:")
print(f"  Total rows processed: {len(df)}")
print(f"  Rows kept: {len(mass_train_jpg2)}")
print(f"  Rows excluded: {len(excluded_rows)}")

# Optional: Save excluded row information for review
if excluded_rows:
    excluded_df = pd.DataFrame(excluded_rows, columns=['row_index', 'reason'])
    excluded_df.to_csv('excluded_rows_log.csv', index=False)
    print("üìù Excluded rows and reasons saved to 'excluded_rows_log.csv'")


Processing row 0:
  Image: CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.342386194811267636608694132590482924515/1-211.jpg
  Mask:  CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.296736403313792599626368780122205399650/1-250.jpg
Processing row 1:
  Image: CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.359308329312397897125630708681441180834/1-207.jpg
  Mask:  CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.227955274711225756835838775062793186053/2-288.jpg
Processing row 2:
  Image: CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.89180046211022531834352631483669346540/1-250.jpg
  Mask:  CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.429120414011832984817094399141838850375/1-296.jpg
Processing row 3:
  Image: CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.295360926313492745441868049270168300162/1-067.jpg
  Mask:  CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.115134232113001553100559896703407510515/1-245.jpg
Processing row 4:
  Image: CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.410524754913057908920631336070876889890/1-056.jpg
  Mask:  CBIS-DDSM/j

In [2]:
import cv2
import matplotlib.pyplot as plt
import pandas as pd

# üîπ Load your CSV (adjust the path to where your train_plus_test.csv or other file is stored)
csv_path = "/content/drive/MyDrive/colab_scripts/repository/datasets/CBIS-DDSM/csv/mass_test_jpg.csv"
df = pd.read_csv(csv_path)

# üîπ Function to show image and mask for a given row index
def show_image_and_mask(row_index):
    row = df.iloc[row_index]

    img_path = row["jpg image file path"]      # adjust if your column name differs
    mask_path = row["jpg ROI mask file path"]  # adjust if your column name differs

    # Read image and mask
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

    if img is None or mask is None:
        print(f"‚ùå Could not load files at row {row_index}")
        return

    # Check dimensions
    print(f"Row {row_index}")
    print(f"Image shape: {img.shape}")
    print(f"Mask shape:  {mask.shape}")

    # Show side by side
    plt.figure(figsize=(10,5))

    plt.subplot(1,2,1)
    plt.imshow(img, cmap="gray")
    plt.title("Image")
    plt.axis("off")

    plt.subplot(1,2,2)
    plt.imshow(mask, cmap="gray")
    plt.title("ROI Mask")
    plt.axis("off")

    plt.show()

# üîπ Example: pick a row you already know has mismatched dimensions
show_image_and_mask(42)  # change 42 to the row index you want to demonstrate


‚ùå Could not load files at row 42
