In [1]:
from google.colab import drive
# Attach Google Drive to the Colab VM's filesystem. The mount location is
# named once here so it is easy to spot and change.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
import os

# Dataset root on the mounted Drive. The relative 'CBIS-DDSM/...' paths used
# further down resolve against this directory, so it has to be the cwd.
cbis_path = '/content/drive/MyDrive/colab_scripts/repository/datasets'

# Switch the kernel's working directory, then echo it back as a sanity check.
os.chdir(cbis_path)
current_dir = os.getcwd()
print("Current Directory:", current_dir)

Current Directory: /content/drive/MyDrive/colab_scripts/repository/datasets


In [3]:


import pandas as pd
from PIL import Image
import os

# Optional: Mount Google Drive if working with files stored there
# from google.colab import drive
# drive.mount('/content/drive')

# Step 1: Load the CSV file
csv_path = 'CBIS-DDSM/csv/mass_test_jpg.csv'  # Change path if necessary
df = pd.read_csv(csv_path)

# Fail fast if either path column is absent; everything below depends on both.
required_cols = ['jpg image file path', 'jpg ROI mask file path']
if not all(col in df.columns for col in required_cols):
    raise ValueError(f"CSV must contain columns: {required_cols}")


def _verified_size(path):
    """Return the (width, height) of the image at ``path`` after an integrity check.

    The size is taken from the header parsed by ``Image.open``; ``verify()``
    then checks the file for corruption. The ``with`` block releases the file
    handle even when opening or verification raises, so handles do not pile
    up while looping over the whole CSV.

    Args:
        path: Filesystem path of the image to check.

    Returns:
        The ``size`` tuple reported by Pillow for the image.

    Raises:
        Exception: Whatever Pillow raises for an unreadable or corrupt file.
    """
    with Image.open(path) as im:
        size = im.size  # read first: the image object is unusable after verify()
        im.verify()
    return size


# Step 2: Initialize tracking lists
valid_rows = []      # row Series that passed every check
excluded_rows = []   # (row index, reason) for every rejected row

# Step 3: Process each row
for idx, row in df.iterrows():
    img_path = row['jpg image file path']
    mask_path = row['jpg ROI mask file path']

    # Debug message
    print(f"Processing row {idx}:\n  Image: {img_path}\n  Mask:  {mask_path}")

    # A blank CSV cell is read as NaN (a float). os.path.exists() raises
    # TypeError on it, which would abort the whole run, so log the row instead.
    if not isinstance(img_path, str):
        print("  ❌ Image path is missing.")
        excluded_rows.append((idx, "Image path is missing"))
        continue
    if not isinstance(mask_path, str):
        print("  ❌ Mask path is missing.")
        excluded_rows.append((idx, "Mask path is missing"))
        continue

    # Check file existence
    if not os.path.exists(img_path):
        print("  ❌ Image file not found.")
        excluded_rows.append((idx, "Image file not found"))
        continue
    if not os.path.exists(mask_path):
        print("  ❌ Mask file not found.")
        excluded_rows.append((idx, "Mask file not found"))
        continue

    # Deliberately broad: any failure to open or verify either file excludes
    # the row (with the error text as the reason) rather than stopping the run.
    try:
        img_size = _verified_size(img_path)
        mask_size = _verified_size(mask_path)
    except Exception as e:
        print(f"  ❌ Error loading image or mask: {e}")
        excluded_rows.append((idx, str(e)))
        continue

    # Compare dimensions: a mask is only usable if it matches its image exactly.
    if img_size == mask_size:
        valid_rows.append(row)
    else:
        print(f"  ❌ Dimension mismatch: Image {img_size}, Mask {mask_size}")
        excluded_rows.append((idx, f"Dimension mismatch: Image {img_size}, Mask {mask_size}"))

# Step 4: Save valid rows to a new DataFrame
# Passing the original columns keeps the CSV header even when no row passes,
# so the output file can always be read back.
mass_test_jpg2 = pd.DataFrame(valid_rows, columns=df.columns)
mass_test_jpg2.to_csv('mass_test_jpg2.csv', index=False)
print("\n✅ Valid rows saved to 'mass_test_jpg2.csv'.")

# Step 5: Show summary and optionally export excluded rows
print("\n📊 Summary:")
print(f"  Total rows processed: {len(df)}")
print(f"  Rows kept: {len(mass_test_jpg2)}")
print(f"  Rows excluded: {len(excluded_rows)}")

# Optional: Save excluded row information for review
# (only written when at least one row was excluded)
if excluded_rows:
    excluded_df = pd.DataFrame(excluded_rows, columns=['row_index', 'reason'])
    excluded_df.to_csv('excluded_rows_log_test.csv', index=False)
    print("📝 Excluded rows and reasons saved to 'excluded_rows_log_test.csv'")


Processing row 0:
  Image: CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.245063149211255120613007755642780114172/1-271.jpg
  Mask:  CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.30820586311062570442302321942433426184/1-083.jpg
Processing row 1:
  Image: CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.85952214611170506017891429690540035518/1-100.jpg
  Mask:  CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.381440141511137044327302306604206077287/1-084.jpg
Processing row 2:
  Image: CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.22131189612893294827907969600765582967/1-101.jpg
  Mask:  CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.212143028513012144941507232513982203672/2-085.jpg
Processing row 3:
  Image: CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.239949064412092068706566726490415129934/1-102.jpg
  Mask:  CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.15403043813402510742192372832381918984/2-086.jpg
Processing row 4:
  Image: CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.215081818713600536113960661873725083371/1-103.jpg
  Mask:  CBIS-DDSM/jpeg