In [None]:
# Mount Google Drive at /content/drive; the CSV and image ZIP paths configured in
# Cell 2 live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# -----------------------------------------------------------------------------
# Cell 1 (REVISED): Installs and PyTorch/HuggingFace Imports
# -----------------------------------------------------------------------------
print("Installing/Updating PyTorch, Hugging Face, and related libraries for MedGemma...")
# Install PyTorch first (cu118 is common for Colab T4/V100 GPUs)
!pip install -q -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Install Hugging Face and other necessary libraries
# Let pip resolve numpy and scikit-learn based on these packages' needs
!pip install -q -U transformers accelerate bitsandbytes peft pydicom pandas opencv-python Pillow scikit-learn

# No explicit numpy install/uninstall here; let other packages specify their needs.
# pydicom, pandas, opencv-python, Pillow are generally fine with default versions.

print("\nImporting libraries...")
# Python Standard Libraries
import os
import shutil
import zipfile

# Third-party Libraries
import pandas as pd
import numpy as np # Should be a compatible version now
import pydicom
import cv2
from PIL import Image

# PyTorch
import torch
from torch.utils.data import Dataset, DataLoader

# Scikit-learn (should import fine after pip installs a compatible version)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hugging Face Transformers
from transformers import AutoProcessor

# TensorFlow (still imported from your original code, we'll manage its numpy needs)
# If you are ONLY using PyTorch for MedGemma, you can comment out TF imports later.
# For now, keeping them to see if the environment stabilizes.
import tensorflow as tf
# from tensorflow.keras.applications import ResNet50V2 # Not needed for MedGemma
# from tensorflow.keras.applications.resnet_v2 import preprocess_input as resnet_preprocess_input # Not needed
# from tensorflow.keras.utils import Sequence # Not needed for PyTorch Dataset
# from tensorflow.keras import layers, models # Not needed

# Plotting (optional, but often useful)
import matplotlib.pyplot as plt
import seaborn as sns

# Colab specific (if needed)
from google.colab import drive # Moved drive mount to Cell 2

# --- Library version report -------------------------------------------------
# Imported here only to read __version__; the estimators used later are
# imported individually above.
import random
import sklearn

print("--- Library Version Checks ---")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"Scikit-learn version: {sklearn.__version__}")
print(f"TensorFlow Version: {tf.__version__}")
# The PyTorch version is useful on CPU-only runtimes too, so report it unconditionally.
print(f"PyTorch version: {torch.__version__}")
if torch.cuda.is_available():
    print(f"PyTorch CUDA version: {torch.version.cuda}")
    print(f"GPU available for PyTorch: {torch.cuda.get_device_name(0)}")
else:
    print("GPU not available for PyTorch, using CPU.")
print(f"GPU Available for TensorFlow: {tf.config.list_physical_devices('GPU')}")


# --- Reproducibility --------------------------------------------------------
# Seed every random number generator this notebook can touch. torch is imported
# unconditionally above, so no existence check is needed before seeding it.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

print("\nCell 1: Installs and Imports complete.")

Installing/Updating PyTorch, Hugging Face, and related libraries for MedGemma...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.2/23.2 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m875.6/875.6 kB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m118.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m663.9/663.9 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.9/417.9 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.4/168.4 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.1/58.1 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.2/128

In [None]:
# --------------------------------------------------
# Cell 2: Configuration and Unzip Data
# Mounts Drive, copies the clinical CSV to local disk and extracts the image
# ZIP into a fresh local directory. The final message reports whether both
# inputs were actually found, instead of always claiming success.
# --------------------------------------------------
if 'drive' not in globals():  # Cell 1 normally imports it; keeps this cell runnable on its own
    from google.colab import drive
drive.mount('/content/drive', force_remount=True)  # force_remount makes re-running the cell safe

# --- Configuration ---
DRIVE_CSV_PATH = "/content/drive/MyDrive/cp.csv"
DRIVE_ZIP_PATH = "/content/drive/MyDrive/1000-20250517T062750Z-1-001.zip"  # Image ZIP on Drive

LOCAL_EXTRACT_PATH = "/content/medgemma_extracted_images"  # MedGemma-specific extraction dir
LOCAL_IMAGES_ROOT = os.path.join(LOCAL_EXTRACT_PATH, "1000")  # Top-level folder expected inside the ZIP
LOCAL_CSV_PATH = "/content/medgemma_cp.csv"  # MedGemma-specific local CSV copy

# --- Copy the clinical CSV to local disk ---
cell2_csv_ok = os.path.exists(DRIVE_CSV_PATH)
if cell2_csv_ok:
    shutil.copy(DRIVE_CSV_PATH, LOCAL_CSV_PATH)
    print(f"CSV copied to {LOCAL_CSV_PATH}")
else:
    print(f"ERROR: CSV file not found at {DRIVE_CSV_PATH}")

# --- Always extract into an empty directory so stale files from a previous run cannot leak in ---
if os.path.exists(LOCAL_EXTRACT_PATH):
    print(f"Removing existing extraction directory: {LOCAL_EXTRACT_PATH}")
    shutil.rmtree(LOCAL_EXTRACT_PATH)
os.makedirs(LOCAL_EXTRACT_PATH, exist_ok=True)
print(f"Created local extraction directory: {LOCAL_EXTRACT_PATH}")

# --- Unzip the images ---
cell2_images_ok = False
if os.path.exists(DRIVE_ZIP_PATH):
    print(f"Unzipping {DRIVE_ZIP_PATH} to {LOCAL_EXTRACT_PATH}...")
    with zipfile.ZipFile(DRIVE_ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(LOCAL_EXTRACT_PATH)
    print("Unzipping complete.")
    cell2_images_ok = os.path.exists(LOCAL_IMAGES_ROOT)
    if cell2_images_ok:
        print(f"Image root folder found at: {LOCAL_IMAGES_ROOT}")
    else:
        print(f"ERROR: Expected image root folder '{LOCAL_IMAGES_ROOT}' not found after unzipping.")
else:
    print(f"ERROR: ZIP file not found at {DRIVE_ZIP_PATH}")

# --- Honest status line: only claim completion when both inputs are in place ---
if cell2_csv_ok and cell2_images_ok:
    print("\nCell 2: Data unzipping complete.")
else:
    print("\nCell 2: finished WITH ERRORS - see the messages above and fix them before running later cells.")

Mounted at /content/drive
CSV copied to /content/medgemma_cp.csv
Created local extraction directory: /content/medgemma_extracted_images
Unzipping /content/drive/MyDrive/1000-20250517T062750Z-1-001.zip to /content/medgemma_extracted_images...
Unzipping complete.
Image root folder found at: /content/medgemma_extracted_images/1000

Cell 2: Data unzipping complete.


In [None]:
# --------------------------------------------------
# Cell 3: Load and Filter Clinical Data to create image_df
# Reads the local clinical CSV, keeps rows with a positive numeric LDL value,
# and pairs every .dcm file found under LOCAL_IMAGES_ROOT/<person_id>/ with
# that person's LDL. Result: image_df with columns person_id, image_path, LDL.
# --------------------------------------------------
image_df = pd.DataFrame()  # Initialize so later cells can always reference it

if not os.path.exists(LOCAL_CSV_PATH):
    print(f"FATAL ERROR: Clinical CSV file not found at the expected local path: {LOCAL_CSV_PATH}")
else:
    df_raw_from_cell3 = pd.read_csv(LOCAL_CSV_PATH)  # Distinct name: the raw frame is never modified
    print(f"Initial number of rows in clinical data (Cell 3): {len(df_raw_from_cell3)}")

    # IMPORTANT: these column names must match the CSV header exactly
    person_id_col_name_c3 = 'person_id'
    ldl_col_name_c3 = "LDL Cholesterol Calculation (mg/dL)"

    if not (person_id_col_name_c3 in df_raw_from_cell3.columns and ldl_col_name_c3 in df_raw_from_cell3.columns):
        print(f"ERROR: Required columns ('{person_id_col_name_c3}' or '{ldl_col_name_c3}') not found in CSV.")
        print(f"Available columns: {df_raw_from_cell3.columns.tolist()}")
    else:
        # Select and clean in one chain (no in-place mutation, so re-running is idempotent).
        # Non-numeric LDL values become NaN; NaN > 0 is False, so the positivity
        # filter removes both missing and non-positive values.
        # NOTE(review): if pandas parses person_id as float (e.g. because the column
        # has missing values), astype(str) yields '1002.0', which would not match the
        # image folder names - confirm the column dtype.
        df_selected_c3 = (
            df_raw_from_cell3[[person_id_col_name_c3, ldl_col_name_c3]]
            .rename(columns={ldl_col_name_c3: 'LDL_temp'})
            .assign(LDL_temp=lambda d: pd.to_numeric(d['LDL_temp'], errors='coerce'))
            .loc[lambda d: d['LDL_temp'] > 0]
            .assign(**{person_id_col_name_c3: lambda d: d[person_id_col_name_c3].astype(str)})
        )
        print(f"Cleaned clinical data (positive LDLs only): {len(df_selected_c3)} records.")

        # A dict keyed by person_id can hold only one LDL per person. Make the
        # "last row wins" rule explicit and report it instead of collapsing silently.
        n_duplicate_ids_c3 = int(df_selected_c3[person_id_col_name_c3].duplicated().sum())
        if n_duplicate_ids_c3:
            print(f"WARNING: {n_duplicate_ids_c3} duplicate person_id rows in clinical data; "
                  "keeping the last LDL value for each person.")
        ldl_lookup_c3 = (
            df_selected_c3
            .drop_duplicates(subset=person_id_col_name_c3, keep='last')
            .set_index(person_id_col_name_c3)['LDL_temp']
            .to_dict()
        )

        # Map to images
        if not (os.path.exists(LOCAL_IMAGES_ROOT) and os.path.isdir(LOCAL_IMAGES_ROOT)):
            print(f"FATAL ERROR: Images root path '{LOCAL_IMAGES_ROOT}' does not exist or is not a directory.")
        else:
            available_folders_c3 = set(os.listdir(LOCAL_IMAGES_ROOT))
            valid_ids_clinical_c3 = set(ldl_lookup_c3.keys())
            common_person_ids_c3 = sorted(valid_ids_clinical_c3 & available_folders_c3)
            print(f"Found {len(common_person_ids_c3)} common person_ids for mapping.")

            image_records_list = []
            for pid_c3 in common_person_ids_c3:
                folder_path_c3 = os.path.join(LOCAL_IMAGES_ROOT, pid_c3)
                if not os.path.isdir(folder_path_c3):
                    continue  # A plain file that happens to be named like a person_id
                ldl_val_c3 = ldl_lookup_c3[pid_c3]
                # os.listdir returns entries in arbitrary order; sort so the row
                # order of image_df is reproducible from run to run.
                for filename_c3 in sorted(os.listdir(folder_path_c3)):
                    if filename_c3.lower().endswith(".dcm"):
                        image_records_list.append({
                            "person_id": pid_c3,
                            "image_path": os.path.join(folder_path_c3, filename_c3),
                            "LDL": ldl_val_c3,
                        })
            # Passing columns= keeps the expected schema even when no image matched.
            image_df = pd.DataFrame(image_records_list, columns=["person_id", "image_path", "LDL"])
            if not image_df.empty:
                print(f"Final image_df created with {len(image_df)} image-LDL pairs.")
                print(image_df.head())
                print(f"LDL stats in final image_df: min={image_df['LDL'].min()}, max={image_df['LDL'].max()}, mean={image_df['LDL'].mean()}")
            else:
                print("WARNING: image_df is empty after mapping. Check paths and IDs.")

# Only claim completion when there is something for the following cells to use.
if not image_df.empty:
    print("\nCell 3: image_df preparation complete.")
else:
    print("\nCell 3: image_df preparation FAILED - image_df is empty; see the messages above.")

Initial number of rows in clinical data (Cell 3): 1067
Cleaned clinical data (positive LDLs only): 1025 records.
Found 527 common person_ids for mapping.
Final image_df created with 973 image-LDL pairs.
  person_id                                         image_path         LDL
0      1002  /content/medgemma_extracted_images/1000/1002/1...  133.485054
1      1004  /content/medgemma_extracted_images/1000/1004/1...   59.674544
2      1004  /content/medgemma_extracted_images/1000/1004/1...   59.674544
3      1005  /content/medgemma_extracted_images/1000/1005/1...   74.956702
4      1007  /content/medgemma_extracted_images/1000/1007/1...   92.278412
LDL stats in final image_df: min=10.77327021, max=278.5634775, mean=92.26371915419321

Cell 3: image_df preparation complete.


In [None]:
# -----------------------------------------------------------------------------
# Cell 4: Verify image_df and Set MedGemma Model ID
# Sanity-checks the image_df built by the data preparation cell and defines
# MEDGEMMA_PT_MODEL_ID for the processor/model loading cells.
# -----------------------------------------------------------------------------

# VERIFY `image_df` IS READY FROM THE PREVIOUS CELLS
if 'image_df' in locals() and isinstance(image_df, pd.DataFrame) and not image_df.empty:
    print(f"Continuing with 'image_df' which has {len(image_df)} records.")
    print("Columns in image_df:", image_df.columns.tolist())
    print("Sample of image_df:")
    display(image_df.head())  # display() gives the rich DataFrame rendering in Colab

    required_cols = ['person_id', 'image_path', 'LDL']
    if not all(col in image_df.columns for col in required_cols):
        print(f"ERROR: 'image_df' is missing one or more required columns: {required_cols}. Please re-run previous data preparation cells.")
    elif image_df['LDL'].min() <= 0:
        print(f"ERROR: 'image_df' still contains non-positive LDL values. LDL min: {image_df['LDL'].min()}. Please re-run filtering.")
    else:
        print("'image_df' seems okay to proceed.")
else:
    print("ERROR: 'image_df' not found or is empty. Please ensure your data preparation cells (your original Cells 1-3, now adapted) have been run successfully.")
    # Create an empty frame with the expected schema so later cells fail on
    # "empty" checks rather than with a NameError.
    image_df = pd.DataFrame(columns=['person_id', 'image_path', 'LDL'])


# --- MedGemma Model ID Configuration ---
# 4B pre-trained ("pt") MedGemma variant on the Hugging Face Hub:
# https://huggingface.co/models?search=google/medgemma
# The recorded output of Cell 5 shows the processor for this ID loading
# successfully, so it is treated as a real ID, not as a placeholder.
MEDGEMMA_PT_MODEL_ID = "google/medgemma-4b-pt"

print(f"\nConfigured MEDGEMMA_PT_MODEL_ID: {MEDGEMMA_PT_MODEL_ID}")
# Warn only for an empty value or an obvious template sentinel. Comparing
# against the real ID (as before) raised a false alarm on every run.
if not MEDGEMMA_PT_MODEL_ID or "YOUR_VERIFIED_MODEL_ID_HERE" in MEDGEMMA_PT_MODEL_ID:
    print("WARNING: MEDGEMMA_PT_MODEL_ID might still be the placeholder. Please verify this ID on Hugging Face Hub.")

print("\nCell 4: image_df verification and MedGemma Model ID configuration complete.")

Continuing with 'image_df' which has 973 records.
Columns in image_df: ['person_id', 'image_path', 'LDL']
Sample of image_df:


Unnamed: 0,person_id,image_path,LDL
0,1002,/content/medgemma_extracted_images/1000/1002/1...,133.485054
1,1004,/content/medgemma_extracted_images/1000/1004/1...,59.674544
2,1004,/content/medgemma_extracted_images/1000/1004/1...,59.674544
3,1005,/content/medgemma_extracted_images/1000/1005/1...,74.956702
4,1007,/content/medgemma_extracted_images/1000/1007/1...,92.278412


'image_df' seems okay to proceed.

Configured MEDGEMMA_PT_MODEL_ID: google/medgemma-4b-pt

Cell 4: image_df verification and MedGemma Model ID configuration complete.


In [None]:
# Authenticate with the Hugging Face Hub.
# SECURITY: never paste an access token into the notebook - notebooks get
# shared and committed. The token that was previously hard-coded here must be
# treated as leaked: revoke it at https://huggingface.co/settings/tokens.
# The token is read from the HF_TOKEN environment variable when set; otherwise
# it is requested interactively without being echoed or stored in the notebook.
import os
from getpass import getpass

from huggingface_hub import login

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)
del hf_token  # do not keep the secret in the kernel namespace

In [None]:
# -----------------------------------------------------------------------------
# Cell 5: Load MedGemma Processor
# Loads the Hugging Face processor for MEDGEMMA_PT_MODEL_ID and derives the
# image size the model expects (TARGET_SIZE_MEDGEMMA) from it.
# On failure medgemma_processor stays None and the default size is kept.
# -----------------------------------------------------------------------------


def _parse_image_size(size_info):
    """Extract (height, width) from an image processor's ``size`` attribute.

    Accepted forms:
      * dict with 'height'/'width', or with 'shortest_edge' (used for both sides
        when no explicit 'width' is given),
      * list/tuple of two values, read as (height, width),
      * a single int, used for both sides,
      * any other object exposing ``height`` and ``width`` attributes.

    Returns:
        A (height, width) tuple, or None when either value is missing or falsy.
    """
    height = width = None
    if isinstance(size_info, dict):
        height = size_info.get('height', size_info.get('shortest_edge'))
        if height is not None:
            width = size_info.get('width', height if 'shortest_edge' in size_info else None)
    elif isinstance(size_info, (list, tuple)) and len(size_info) == 2:
        height, width = size_info[0], size_info[1]
    elif isinstance(size_info, int):
        height = width = size_info
    elif hasattr(size_info, 'height') and hasattr(size_info, 'width'):
        # Some image-processor implementations expose the size as an object
        # with attributes instead of a plain dict.
        height, width = size_info.height, size_info.width
    return (height, width) if height and width else None


medgemma_processor = None
# Default target size from the model card; overridden below when the processor reports one.
TARGET_SIZE_MEDGEMMA = (896, 896)

# Warn only for an empty value or an obvious template sentinel; the configured
# real model ID must not trigger a placeholder warning.
if not MEDGEMMA_PT_MODEL_ID or "YOUR_VERIFIED_MODEL_ID_HERE" in MEDGEMMA_PT_MODEL_ID:
    print(f"WARNING: MEDGEMMA_PT_MODEL_ID ('{MEDGEMMA_PT_MODEL_ID}') looks like a placeholder.")
    print("Please verify and update it in Cell 4 with the correct model ID from Hugging Face Hub for the 4B pre-trained variant before proceeding.")

try:
    print(f"\nAttempting to load MedGemma processor for: {MEDGEMMA_PT_MODEL_ID}...")
    # trust_remote_code is intentionally NOT set: the recorded run shows the
    # processor resolving to transformers' built-in Gemma3Processor, so there is
    # no need to allow code from the model repository to execute locally.
    medgemma_processor = AutoProcessor.from_pretrained(MEDGEMMA_PT_MODEL_ID)
    print("MedGemma Processor loaded successfully!")

    # Inspect the processor's image_processor component for the expected size
    if hasattr(medgemma_processor, 'image_processor') and hasattr(medgemma_processor.image_processor, 'size'):
        img_proc_size_info = medgemma_processor.image_processor.size
        print(f"Processor's image_processor.size attribute: {img_proc_size_info}")

        parsed_size = _parse_image_size(img_proc_size_info)
        if parsed_size is not None:
            # The processor is the source of truth; override the model-card default.
            TARGET_SIZE_MEDGEMMA = parsed_size
            print(f"Target Image Size for MedGemma (from processor): {TARGET_SIZE_MEDGEMMA}")
            if TARGET_SIZE_MEDGEMMA != (896, 896):  # Compare with the model card expectation
                print(f"Note: Processor-derived size {TARGET_SIZE_MEDGEMMA} differs from model card's typical 896x896. Using processor's size.")
        else:
            print(f"Could not reliably parse size from processor.image_processor.size. Using default from model card: {TARGET_SIZE_MEDGEMMA}")
    else:
        print(f"Warning: MedGemma processor for {MEDGEMMA_PT_MODEL_ID} does not have 'image_processor.size'. Using default: {TARGET_SIZE_MEDGEMMA}")

except Exception as e:
    # Deliberate best-effort: report the problem and leave medgemma_processor as
    # None so the following cells can detect the failure and skip their work.
    print(f"Error loading MedGemma processor for '{MEDGEMMA_PT_MODEL_ID}': {e}")
    print("Ensure the MEDGEMMA_PT_MODEL_ID in Cell 4 is correct and you have internet access.")
    print("If the ID is correct, the model might require specific dependencies or there might be an issue with the Hugging Face Hub or the model's configuration.")

print("\nCell 5: MedGemma Processor loading attempt complete.")

Please verify and update it in Cell 4 with the correct model ID from Hugging Face Hub for the 4B pre-trained variant before proceeding.

Attempting to load MedGemma processor for: google/medgemma-4b-pt...


processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

MedGemma Processor loaded successfully!
Processor's image_processor.size attribute: {'height': 896, 'width': 896}
Target Image Size for MedGemma (from processor): (896, 896)

Cell 5: MedGemma Processor loading attempt complete.


In [None]:
# Quick inspection: show which concrete processor class AutoProcessor resolved to.
print(type(medgemma_processor))

<class 'transformers.models.gemma3.processing_gemma3.Gemma3Processor'>


In [None]:
# -----------------------------------------------------------------------------
# Cell 6: Data Splitting (Patient-Level) and LDL Normalization
# Splits by person_id (never by image) so that all images of one patient land
# in the same subset, then standardises LDL with statistics from train only.
# -----------------------------------------------------------------------------

# Defaults, so the names exist even when the cell has nothing to split.
train_df = pd.DataFrame()
val_df = pd.DataFrame()
test_df = pd.DataFrame()
ldl_scaler = None  # Holds the fitted StandardScaler once normalization has run

if not ('image_df' in locals() and not image_df.empty):
    print("image_df is empty (from Cell 3). Skipping data splitting and LDL normalization.")
else:
    print(f"\nStarting data splitting for {len(image_df)} image-LDL pairs...")
    if 'person_id' in image_df.columns:
        unique_person_ids = image_df['person_id'].unique()
        print(f"Total unique patients for splitting: {len(unique_person_ids)}")

        if len(unique_person_ids) >= 3:
            # Regular case: 70% train, then the remaining 30% halved into val/test.
            train_pids, temp_pids = train_test_split(
                unique_person_ids, test_size=0.30, random_state=RANDOM_SEED
            )
            if len(temp_pids) > 1:
                val_pids, test_pids = train_test_split(
                    temp_pids, test_size=0.50, random_state=RANDOM_SEED
                )
            else:
                # A single leftover patient goes to validation; test stays empty.
                val_pids = temp_pids if len(temp_pids) == 1 else np.array([])
                test_pids = np.array([])
        else:
            print("Warning: Not enough unique patients for a robust 3-way (train/validation/test) split.")
            if len(unique_person_ids) == 2:
                # One patient each for train and validation, none for test.
                train_pids, val_pids = train_test_split(unique_person_ids, test_size=0.5, random_state=RANDOM_SEED)
                test_pids = np.array([])
            else:
                # One patient: everything is training data. Zero patients: all empty.
                train_pids = unique_person_ids if len(unique_person_ids) == 1 else np.array([])
                val_pids, test_pids = np.array([]), np.array([])

        # Materialise one independent DataFrame per subset of patient ids.
        train_df, val_df, test_df = (
            image_df[image_df['person_id'].isin(subset_ids)].copy()
            for subset_ids in (train_pids, val_pids, test_pids)
        )

        print(f"Train set: {len(train_df)} samples from {len(train_pids)} patients.")
        print(f"Validation set: {len(val_df)} samples from {len(val_pids)} patients.")
        print(f"Test set: {len(test_df)} samples from {len(test_pids)} patients.")

        # Leakage guard: no patient may appear in more than one subset.
        if len(train_pids) > 0 and len(val_pids) > 0:
            assert set(train_pids).isdisjoint(val_pids), "Patient overlap train/val!"
        if len(train_pids) > 0 and len(test_pids) > 0:
            assert set(train_pids).isdisjoint(test_pids), "Patient overlap train/test!"
        if len(val_pids) > 0 and len(test_pids) > 0:
            assert set(val_pids).isdisjoint(test_pids), "Patient overlap val/test!"
        print("Patient-level splits verified (no overlap if sets are non-empty).")

        # --- LDL Value Normalization ---
        if train_df.empty or 'LDL' not in train_df.columns:
            print("Train DataFrame is empty or 'LDL' column missing. Skipping LDL normalization.")
        else:
            print("\nNormalizing LDL values using StandardScaler...")
            ldl_scaler = StandardScaler()
            # Fit on the training LDL values only; val/test reuse those statistics.
            train_df['LDL_scaled'] = ldl_scaler.fit_transform(train_df[['LDL']])

            # An empty subset still gets an (empty) LDL_scaled column so all
            # three frames share the same schema.
            val_df['LDL_scaled'] = (
                pd.Series(dtype='float64') if val_df.empty
                else ldl_scaler.transform(val_df[['LDL']])
            )
            test_df['LDL_scaled'] = (
                pd.Series(dtype='float64') if test_df.empty
                else ldl_scaler.transform(test_df[['LDL']])
            )

            print("LDL normalization complete.")
            print("Scaled LDL stats in train_df (should be mean~0, std~1):")
            display(train_df['LDL_scaled'].describe())
            # The fitted scaler lives only in memory (ldl_scaler); persist it
            # (e.g. with joblib) if predictions must be un-scaled in a later session.
    else:
        print("ERROR: 'person_id' column missing in image_df. Cannot perform patient-level split. Please check image_df preparation.")

print("\nCell 6: Data splitting and LDL normalization attempt complete.")


Starting data splitting for 973 image-LDL pairs...
Total unique patients for splitting: 527
Train set: 681 samples from 368 patients.
Validation set: 145 samples from 79 patients.
Test set: 147 samples from 80 patients.
Patient-level splits verified (no overlap if sets are non-empty).

Normalizing LDL values using StandardScaler...
LDL normalization complete.
Scaled LDL stats in train_df (should be mean~0, std~1):


Unnamed: 0,LDL_scaled
count,681.0
mean,2.086763e-16
std,1.000735
min,-2.303724
25%,-0.7148712
50%,-0.04975462
75%,0.6670533
max,2.756859



Cell 6: Data splitting and LDL normalization attempt complete.


In [None]:
# Cell 5.1: Check medgemma_processor
# Reports whether the processor from Cell 5 is available and whether it
# carries an image_processor component.
if locals().get('medgemma_processor') is None:
    # Covers both "name never defined" and "defined but loading failed (None)".
    print("Cell 5.1 Check: medgemma_processor IS NOT LOADED or is None.")
else:
    print(f"Cell 5.1 Check: medgemma_processor IS LOADED. Type: {type(medgemma_processor)}")
    if not hasattr(medgemma_processor, 'image_processor'):
        print("  WARNING: It does NOT have an image_processor attribute.")
    else:
        print(f"  It has an image_processor of type: {type(medgemma_processor.image_processor)}")

Cell 5.1 Check: medgemma_processor IS LOADED. Type: <class 'transformers.models.gemma3.processing_gemma3.Gemma3Processor'>
  It has an image_processor of type: <class 'transformers.models.gemma3.image_processing_gemma3.Gemma3ImageProcessor'>


In [None]:
"""
def load_dicom_image_medgemma_DEBUG(path, processor_obj): # processor_obj is the main Gemma3Processor
    print(f"\n--- Debugging load_dicom_image_medgemma_DEBUG for: {path} ---")
    if processor_obj is None:
        print(f"DEBUG: Main processor object (processor_obj) is None. Cannot proceed.")
        return None

    # Check if the main processor has the image_processor component
    if not hasattr(processor_obj, 'image_processor') or processor_obj.image_processor is None:
        print(f"DEBUG: processor_obj does not have a valid 'image_processor' attribute.")
        return None

    actual_image_processor = processor_obj.image_processor # This is the specific image handler
    print(f"DEBUG: Using actual_image_processor: {type(actual_image_processor)}")

    pil_image_for_processor = None
    numpy_array_for_processor = None

    # 1. Pydicom Read
    try:
        # ... (pydicom reading and dcm.convert_pixel_data() logic remains the same as your last version) ...
        print("DEBUG: Attempting pydicom.dcmread(path)...")
        dcm = pydicom.dcmread(path)
        photometric_interpretation = dcm.get('PhotometricInterpretation', 'N/A')
        print(f"  DICOM PhotometricInterpretation: {photometric_interpretation}")

        print("DEBUG: Attempting to get pixel data via dcm.convert_pixel_data()...")
        dcm.convert_pixel_data()
        img_array = dcm.pixel_array
        print(f"  DEBUG: dcm.pixel_array after convert_pixel_data(). Shape: {img_array.shape}, dtype: {img_array.dtype}")
        numpy_array_for_processor = img_array.astype(np.uint8).copy() # Keep as uint8 if convert_pixel_data worked well
        img_array = img_array.astype(np.float32) # For normalization if needed

    except Exception as e:
        print(f"DEBUG: Pydicom read or convert_pixel_data error for {path}: {e}")
        return None

    # 2. Normalization to 0-255 range (if necessary after convert_pixel_data)
    print("\nDEBUG: Normalizing to 0-255 range (if necessary)...")
    img_min_val = np.min(img_array)
    img_max_val = np.max(img_array)

    # If convert_pixel_data results in uint8 0-255, this normalization might be redundant but harmless
    if not (img_array.dtype == np.uint8 and img_min_val >= 0 and img_max_val <= 255):
        if img_max_val - img_min_val > 1e-7:
            img_normalized_255 = 255.0 * (img_array - img_min_val) / (img_max_val - img_min_val)
            img_to_pil = img_normalized_255.astype(np.uint8)
        else:
            img_to_pil = np.zeros_like(img_array, dtype=np.uint8)
    else:
        img_to_pil = img_array.astype(np.uint8) # Already good

    print(f"  Array for PIL - Shape: {img_to_pil.shape}, dtype: {img_to_pil.dtype}")
    # numpy_array_for_processor = img_to_pil.copy() # This was already set after convert_pixel_data

    # 3. PIL Image Conversion
    print("\nDEBUG: Converting to PIL Image...")
    try:
        if len(img_to_pil.shape) == 3 and img_to_pil.shape[-1] == 3:
            pil_image_for_processor = Image.fromarray(img_to_pil, mode='RGB')
            print(f"  PIL Image created. Mode: {pil_image_for_processor.mode}, Size: {pil_image_for_processor.size}")
            # (Save image logic can remain)
            save_path = f"/content/debug_pil_image_{os.path.basename(path)}.png"
            pil_image_for_processor.save(save_path)
            print(f"  DEBUG: Saved intermediate PIL image to {save_path}")
        else:
            print(f"  Array for PIL has unexpected shape: {img_to_pil.shape}. Cannot create RGB PIL image.")
            return None
    except Exception as e_pil_create:
        print(f"  DEBUG: Error creating PIL image: {e_pil_create}")
        return None

    # 4. MedGemma Processor - Attempt 1: Using direct image_processor with PIL Image
    if pil_image_for_processor:
        print("\nDEBUG: Applying actual_image_processor (Attempt 1: with PIL Image)...")
        try:
            # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
            # CRITICAL CHANGE: Use actual_image_processor directly
            # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
            inputs_pil = actual_image_processor(images=pil_image_for_processor, return_tensors="pt")
            # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

            if inputs_pil is None: # Some image processors might return None on failure
                print("  DEBUG (PIL with actual_image_processor): actual_image_processor returned None.")
                # Try with NumPy array if PIL failed
            elif 'pixel_values' not in inputs_pil or inputs_pil['pixel_values'] is None:
                print("  DEBUG (PIL with actual_image_processor): 'pixel_values' not in output or is None.")
                # Try with NumPy array
            else:
                processed_tensor_pil = inputs_pil['pixel_values'].squeeze(0) # Squeeze only if batched
                if len(processed_tensor_pil.shape) == 4 and processed_tensor_pil.shape[0] == 1: # Check if it added a batch dim
                    processed_tensor_pil = processed_tensor_pil.squeeze(0)

                print("  DEBUG (PIL with actual_image_processor): Processing successful.")
                print(f"    Processed tensor shape: {processed_tensor_pil.shape}, dtype: {processed_tensor_pil.dtype}")
                print(f"    Processed tensor min: {processed_tensor_pil.min().item()}, max: {processed_tensor_pil.max().item()}")
                if not torch.all(processed_tensor_pil == 0):
                    return processed_tensor_pil # SUCCESS!
                else:
                    print("    WARNING (PIL with actual_image_processor): Processed tensor is ALL ZEROS.")
                    # Fall through to try NumPy array
        except Exception as e_proc_pil_direct:
            print(f"  DEBUG (PIL with actual_image_processor): Error: {e_proc_pil_direct}")
            # Fall through to try NumPy array

    # 4. MedGemma Processor - Attempt 2: Using direct image_processor with NumPy array
    if numpy_array_for_processor is not None and \
       (len(numpy_array_for_processor.shape) == 3 and numpy_array_for_processor.shape[-1] == 3):
        print("\nDEBUG: Applying actual_image_processor (Attempt 2: with NumPy Array)...")
        print(f"  NumPy array shape: {numpy_array_for_processor.shape}, dtype: {numpy_array_for_processor.dtype} (This is likely uint8 HWC)")
        try:
            # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
            # CRITICAL CHANGE: Use actual_image_processor directly
            # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
            inputs_np = actual_image_processor(images=numpy_array_for_processor, return_tensors="pt")
            # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

            if inputs_np is None:
                print("  DEBUG (NumPy with actual_image_processor): actual_image_processor returned None.")
                return None
            elif 'pixel_values' not in inputs_np or inputs_np['pixel_values'] is None:
                print("  DEBUG (NumPy with actual_image_processor): 'pixel_values' not in output or is None.")
                return None
            else:
                processed_tensor_np = inputs_np['pixel_values'] # Don't squeeze yet
                if len(processed_tensor_np.shape) == 4 and processed_tensor_np.shape[0] == 1: # Check if it added a batch dim
                    processed_tensor_np = processed_tensor_np.squeeze(0)

                print("  DEBUG (NumPy with actual_image_processor): Processing successful.")
                print(f"    Processed tensor shape: {processed_tensor_np.shape}, dtype: {processed_tensor_np.dtype}")
                print(f"    Processed tensor min: {processed_tensor_np.min().item()}, max: {processed_tensor_np.max().item()}")
                if not torch.all(processed_tensor_np == 0):
                    return processed_tensor_np # SUCCESS!
                else:
                    print("    WARNING (NumPy with actual_image_processor): Processed tensor is ALL ZEROS.")
                    return None # Return None if still zeros
        except Exception as e_proc_np_direct:
            print(f"  DEBUG (NumPy with actual_image_processor): Error: {e_proc_np_direct}")
            return None
    else:
        print("\nDEBUG: NumPy array not suitable for processor attempt (not 3-channel RGB).")
        return None

    print("DEBUG: All attempts with actual_image_processor failed or resulted in zeros.")
    return None


# --- The block to call the DEBUG function remains the same ---
# (in Cell 7.1, after the function definition)
if 'train_df' in locals() and not train_df.empty and medgemma_processor is not None:
    if len(train_df) > 0:
        path_to_debug = train_df.iloc[0]['image_path']
        print(f"\n>>> Initiating DEBUG for path: {path_to_debug} <<<")
        debug_output_tensor = load_dicom_image_medgemma_DEBUG(path_to_debug, medgemma_processor) # Pass the main processor
        if debug_output_tensor is not None:
            print(f"\n>>> DEBUG Result for {path_to_debug}: Tensor received, shape {debug_output_tensor.shape}, min {debug_output_tensor.min().item():.2f}, max {debug_output_tensor.max().item():.2f}")
        else:
            print(f"\n>>> DEBUG Result for {path_to_debug}: Function returned None (Error occurred)")
    else:
        print("train_df is empty, cannot select a path for debugging.")
else:
    print("Could not run debug: train_df or medgemma_processor (main) not available.")

"""



>>> Initiating DEBUG for path: /content/medgemma_extracted_images/1000/1004/1004_eidon_mosaic_cfp_r_1.2.826.0.1.3680043.8.641.1.20230809.2448.36605.dcm <<<

--- Debugging load_dicom_image_medgemma_DEBUG for: /content/medgemma_extracted_images/1000/1004/1004_eidon_mosaic_cfp_r_1.2.826.0.1.3680043.8.641.1.20230809.2448.36605.dcm ---
DEBUG: Using actual_image_processor: <class 'transformers.models.gemma3.image_processing_gemma3.Gemma3ImageProcessor'>
DEBUG: Attempting pydicom.dcmread(path)...
  DICOM PhotometricInterpretation: YBR_FULL_422
DEBUG: Attempting to get pixel data via dcm.convert_pixel_data()...
  DEBUG: dcm.pixel_array after convert_pixel_data(). Shape: (1804, 3223, 3), dtype: uint8

DEBUG: Normalizing to 0-255 range (if necessary)...
  Array for PIL - Shape: (1804, 3223, 3), dtype: uint8

DEBUG: Converting to PIL Image...
  PIL Image created. Mode: RGB, Size: (3223, 1804)
  DEBUG: Saved intermediate PIL image to /content/debug_pil_image_1004_eidon_mosaic_cfp_r_1.2.826.0.1.36

In [None]:
# -----------------------------------------------------------------------------
# Cell 7: PyTorch Dataset and DataLoader Implementation
# This cell contains the FINAL working versions of the image loading function
# and the PyTorch Dataset class, based on our debugging.
# -----------------------------------------------------------------------------

# --- FINAL Image Loading Function for MedGemma ---
def _scale_to_uint8(img_array):
    """Min-max rescale a numeric array into 0-255 and return it as uint8 (uint8 input is returned as-is)."""
    if img_array.dtype == np.uint8:
        return img_array
    if np.issubdtype(img_array.dtype, np.floating) or np.issubdtype(img_array.dtype, np.integer):
        # Do the arithmetic in float64: with narrow signed integers (e.g. int16)
        # both `max - min` and `array - min` can wrap around and corrupt the image.
        scaled = img_array.astype(np.float64)
        img_min = scaled.min()
        img_max = scaled.max()
        if img_max - img_min > 1e-6:
            scaled = 255.0 * (scaled - img_min) / (img_max - img_min)
        else:
            scaled = np.zeros_like(scaled)  # Flat image
        return scaled.astype(np.uint8)
    # Any other dtype (e.g. bool) is cast directly.
    return img_array.astype(np.uint8)


def load_dicom_image_medgemma(path, processor_obj): # processor_obj is the main Gemma3Processor
    """
    Load one DICOM file and return it preprocessed for MedGemma.

    Steps: read the file with pydicom, decode its pixel data, rescale to uint8
    when the decoded array has another dtype, wrap it in an RGB PIL image and run
    ``processor_obj.image_processor`` on that image.

    Args:
        path: Path of the DICOM file to read.
        processor_obj: Main processor object. It must expose a non-None
            ``image_processor`` attribute callable as
            ``image_processor(images=..., return_tensors="pt")``.

    Returns:
        The 3-dimensional ``pixel_values`` result (a leading batch dimension of
        size 1 is squeezed away), or ``None`` when any step fails. Failures are
        never raised; each one is reported through ``logging`` at WARNING level
        so that placeholder substitution by callers can be audited.
    """
    import logging  # local import keeps this cell self-contained
    logger = logging.getLogger(__name__)

    if processor_obj is None:
        logger.warning("Main processor object is None for path %s.", path)
        return None

    if not hasattr(processor_obj, 'image_processor') or processor_obj.image_processor is None:
        logger.warning("Main processor %s does not have 'image_processor' for path %s.", type(processor_obj), path)
        return None

    actual_image_processor = processor_obj.image_processor

    # 1. Pydicom read and pixel decoding
    try:
        dcm = pydicom.dcmread(path, force=True) # force=True can help with some slightly non-compliant files
        # convert_pixel_data() only decodes the (possibly compressed) pixel data into the
        # array exposed as dcm.pixel_array; it does NOT apply Modality or VOI LUTs.
        # NOTE(review): whether YBR_* data arrives here already converted to RGB depends on
        # the pydicom version / pixel handler in use -- confirm for this dataset.
        dcm.convert_pixel_data()
        # PIL needs uint8; this is a plain min-max rescale, so MONOCHROME images that
        # require proper windowing may need something more sophisticated.
        img_array = _scale_to_uint8(dcm.pixel_array)
    except Exception as e_dicom:
        logger.warning("Pydicom read or convert_pixel_data error for %s: %s", path, e_dicom)
        return None

    # 2. PIL Image Conversion
    pil_image = None
    try:
        if len(img_array.shape) == 3 and img_array.shape[-1] == 3: # Expecting HWC uint8 (RGB)
            pil_image = Image.fromarray(img_array, mode='RGB')
        elif len(img_array.shape) == 2: # Grayscale (e.g., MONOCHROME2)
            pil_image = Image.fromarray(img_array, mode='L').convert('RGB') # Convert L to RGB for consistency
        else:
            logger.warning("Array for PIL has unexpected shape: %s for path %s.", img_array.shape, path)
            return None
    except Exception as e_pil:
        logger.warning("Error creating PIL image for %s: %s", path, e_pil)
        return None

    if pil_image is None:
        return None # Should have been caught by returns above, but as a safeguard

    # 3. Using the actual_image_processor (the image_processor component of the main processor)
    try:
        inputs = actual_image_processor(images=pil_image, return_tensors="pt")

        if inputs is None or 'pixel_values' not in inputs or inputs['pixel_values'] is None:
            logger.warning("actual_image_processor returned None or no pixel_values for path %s.", path)
            return None

        processed_tensor = inputs['pixel_values']
        # The processor might return a batched tensor [1, C, H, W] or unbatched [C, H, W].
        # Squeeze if a batch dimension of 1 was added.
        if len(processed_tensor.shape) == 4 and processed_tensor.shape[0] == 1:
            processed_tensor = processed_tensor.squeeze(0)

        # Final check for the expected 3 dimensions [C, H, W]
        if len(processed_tensor.shape) != 3:
            logger.warning("Processed tensor has unexpected shape %s for path %s.", processed_tensor.shape, path)
            return None

        return processed_tensor
    except Exception as e_proc:
        logger.warning("Error during actual_image_processor call for %s: %s", path, e_proc)
        return None

# --- Custom PyTorch Dataset ---
class RetinalLdlDatasetPyTorch(Dataset):
    """
    Map-style dataset yielding ``{"pixel_values": tensor, "labels": tensor}`` per row.

    Each row of the DataFrame supplies an ``image_path`` (loaded through
    ``load_dicom_image_medgemma``) and a scaled LDL target.

    Args:
        df_input: DataFrame with an ``image_path`` column and the scaled LDL column.
        processor_ref: The main MedGemma processor handed to the image loader.
        scaled_ldl_col_name: Name of the column holding the scaled LDL target.
        placeholder_size: Optional ``(height, width)`` of the all-zero image used
            when loading fails. When ``None`` the module-level
            ``TARGET_SIZE_MEDGEMMA`` is used, as before.

    Raises:
        ValueError: if the processor is None or a required column is missing.

    Note:
        When an image cannot be loaded, ``__getitem__`` returns an all-zero image
        with label 0.0 instead of raising, and logs a warning naming the file.
        Such samples still take part in training, so watch the log for them.
    """

    def __init__(self, df_input, processor_ref, scaled_ldl_col_name='LDL_scaled', placeholder_size=None):
        self.df = df_input.reset_index(drop=True)
        self.processor = processor_ref # This is the main medgemma_processor (Gemma3Processor instance)
        self.scaled_ldl_col = scaled_ldl_col_name
        self.placeholder_size = placeholder_size

        if self.processor is None:
            raise ValueError("MedGemma processor (processor_ref) has not been loaded or passed correctly. Check Cell 5.")
        if self.scaled_ldl_col not in self.df.columns:
            raise ValueError(f"Scaled LDL column '{self.scaled_ldl_col}' not found in DataFrame. Check Cell 6.")
        # Fail at construction time rather than with a KeyError inside a DataLoader worker.
        if 'image_path' not in self.df.columns:
            raise ValueError("Column 'image_path' not found in DataFrame.")

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        row = self.df.iloc[idx]
        image_path = row['image_path']

        image_tensor = load_dicom_image_medgemma(image_path, self.processor)

        if image_tensor is None:
            import logging  # local import keeps this cell self-contained

            # Channel count: taken from the image processor when it exposes one, else RGB.
            num_channels = getattr(getattr(self.processor, 'image_processor', None), 'num_channels', 3)

            if self.placeholder_size is not None:
                height, width = self.placeholder_size
            else:
                height, width = TARGET_SIZE_MEDGEMMA # This variable is from Cell 5

            logging.getLogger(__name__).warning(
                "Failed to load image %s for index %s. Returning zeros of shape (%s, %s, %s).",
                image_path, idx, num_channels, height, width,
            )
            image_tensor = torch.zeros((num_channels, height, width), dtype=torch.float32)
            ldl_value_scaled = torch.tensor(0.0, dtype=torch.float32) # Neutral placeholder for label
        else:
            ldl_value_scaled = torch.tensor(row[self.scaled_ldl_col], dtype=torch.float32)

        return {"pixel_values": image_tensor, "labels": ldl_value_scaled}

# --- Create Dataset and DataLoader instances ---
# These will be re-initialized here using the final load_dicom_image_medgemma
train_dataset_pytorch, val_dataset_pytorch, test_dataset_pytorch = None, None, None
train_dataloader, val_dataloader, test_dataloader = None, None, None


def _build_split_loader(split_df, split_key, split_label, processor, batch_size, num_workers, is_train):
    """
    Build the (Dataset, DataLoader) pair for one data split.

    Args:
        split_df: DataFrame of the split, or None when it has not been created.
        split_key: Short name used in messages ('train', 'val', 'test').
        split_label: Human-readable name used in messages ('Train', 'Validation', 'Test').
        processor: Main MedGemma processor passed on to the dataset.
        batch_size: Batch size for the DataLoader.
        num_workers: Number of DataLoader worker processes.
        is_train: When True the loader shuffles and drops the last incomplete batch.

    Returns:
        ``(dataset, loader)``; either element is None if it could not be created.
        Problems are reported with print() rather than raised.
    """
    if split_df is None or split_df.empty or 'LDL_scaled' not in split_df.columns:
        print(f"{split_label} DataFrame not ready or 'LDL_scaled' missing. Cannot create {split_key}_dataset_pytorch.")
        return None, None

    dataset, loader = None, None
    try:
        dataset = RetinalLdlDatasetPyTorch(
            df_input=split_df,
            processor_ref=processor,
            scaled_ldl_col_name='LDL_scaled'
        )
        loader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=is_train,
            num_workers=num_workers,
            pin_memory=torch.cuda.is_available(),
            drop_last=is_train,
        )
        print(f"{split_label} PyTorch Dataset created with {len(dataset)} samples.")
    except ValueError as e:
        print(f"Error creating {split_key}_dataset_pytorch: {e}")
    return dataset, loader


# Ensure processor is loaded (from Cell 5) and DataFrames are ready (from Cell 6)
if 'medgemma_processor' in globals() and medgemma_processor is not None:
    MEDGEMMA_BATCH_SIZE = 4 # You can adjust this
    # Set num_workers=0 if you encounter issues with multiprocessing on Colab, especially on CPU runtime.
    # Defined up front (not inside the train branch) so the val/test loaders can be
    # built even when the train split is unavailable.
    num_dataloader_workers = 2 if torch.cuda.is_available() else 0
    print(f"\nRe-creating Datasets and DataLoaders with final image loading logic. Batch size: {MEDGEMMA_BATCH_SIZE}")

    train_dataset_pytorch, train_dataloader = _build_split_loader(
        globals().get('train_df'), 'train', 'Train',
        medgemma_processor, MEDGEMMA_BATCH_SIZE, num_dataloader_workers, is_train=True,
    )
    val_dataset_pytorch, val_dataloader = _build_split_loader(
        globals().get('val_df'), 'val', 'Validation',
        medgemma_processor, MEDGEMMA_BATCH_SIZE, num_dataloader_workers, is_train=False,
    )
    # Test DataLoader is optional here but good for completeness
    test_dataset_pytorch, test_dataloader = _build_split_loader(
        globals().get('test_df'), 'test', 'Test',
        medgemma_processor, MEDGEMMA_BATCH_SIZE, num_dataloader_workers, is_train=False,
    )
else:
    print("MedGemma processor not loaded (Cell 5 likely failed). Cannot create PyTorch Datasets/DataLoaders.")


# --- Optional: Test one batch from DataLoader ---
# Smoke test: pull a single batch, report its shapes/dtypes/value range and
# count how many images in it are all-zero placeholders.
if train_dataloader is None or len(train_dataloader) <= 0:
    print("\nTrain DataLoader not created or is empty, cannot test a batch.")
else:
    print("\nAttempting to get one batch from train_dataloader (with final logic)...")
    try:
        sample_batch = next(iter(train_dataloader))
        images = sample_batch['pixel_values']
        labels = sample_batch['labels']
        print(
            "Sample batch loaded successfully.",
            f"  Images shape: {images.shape}",
            f"  Images dtype: {images.dtype}",
            f"  Labels shape: {labels.shape}",
            f"  Labels dtype: {labels.dtype}",
            sep="\n",
        )
        if images.numel() > 0:
            print(f"  First image min/max/mean: {images[0].min().item():.2f} / {images[0].max().item():.2f} / {images[0].mean().item():.2f}")
        if labels.numel() > 0:
            print(f"  First label (scaled): {labels[0].item():.2f}")

        # An image that is zero everywhere is the dataset's failed-load placeholder
        zero_images_in_batch = sum(1 for single_image in images if torch.all(single_image == 0))
        if zero_images_in_batch > 0:
            print(f"  WARNING: {zero_images_in_batch}/{images.shape[0]} images in this batch are placeholders (all zeros).")

    except Exception as e:
        print(f"Error while testing train_dataloader: {e}")

print("\nCell 7: Final PyTorch Dataset and DataLoader implementation complete.")


Re-creating Datasets and DataLoaders with final image loading logic. Batch size: 4
Train PyTorch Dataset created with 681 samples.
Validation PyTorch Dataset created with 145 samples.
Test PyTorch Dataset created with 147 samples.

Attempting to get one batch from train_dataloader (with final logic)...
Sample batch loaded successfully.
  Images shape: torch.Size([4, 3, 896, 896])
  Images dtype: torch.float32
  Labels shape: torch.Size([4])
  Labels dtype: torch.float32
  First image min/max/mean: -1.00 / 1.00 / -0.54
  First label (scaled): -0.13

Cell 7: Final PyTorch Dataset and DataLoader implementation complete.


In [None]:
"""
Congratulations! You now have a fully functional, end-to-end PyTorch data pipeline that:
- Correctly loads your DICOM images.
- Handles the YBR_FULL_422 photometric interpretation using dcm.convert_pixel_data().
- Converts them to PIL images.
- Uses the specific image_processor component of the medgemma_processor (which is a Gemma3ImageProcessor configured for MedGemma's vision needs) to preprocess the images into the correct format, size, and normalization range for MedGemma.
- Normalizes your LDL target values.
- Batches the data efficiently using DataLoader.

This is a major milestone. The "data" part, which is often the trickiest, is now solid.
"""

In [1]:
!git clone https://github.com/Ravikrishnan05/PrediscanMedtech_project.git

Cloning into 'PrediscanMedtech_project'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 9 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (9/9), 12.99 KiB | 302.00 KiB/s, done.


In [6]:
!ls

PrediscanMedtech_project  sample_data


In [2]:
!mv Megamma_from_scrach_with_google.ipynb PrediscanMedtech_project/

mv: cannot stat 'Megamma_from_scrach_with_google.ipynb': No such file or directory


In [None]:
# Temporary inspection cell (run after Cell 7): print shape, value range and label
# for the first few training samples and flag the ones that look like placeholders.
if train_dataset_pytorch:
    print("\nInspecting first few samples from train_dataset_pytorch:")
    n_to_inspect = min(5, len(train_dataset_pytorch))
    for i in range(n_to_inspect):
        try:
            sample = train_dataset_pytorch[i]
            img_tensor, lbl_tensor = sample['pixel_values'], sample['labels']
            lowest = img_tensor.min().item()
            highest = img_tensor.max().item()
            label_value = lbl_tensor.item()
            print(f"Sample {i}: img_shape={img_tensor.shape}, img_min={lowest:.2f}, img_max={highest:.2f}, scaled_label={label_value:.2f}")
            looks_like_placeholder = (lowest == 0.0) and (highest == 0.0) and (label_value == 0.0)
            if looks_like_placeholder:
                # Look up the source file so the problematic image can be identified
                original_row = train_df.iloc[i]
                print(f"  ^-- Might be a placeholder. Original path: {original_row['image_path']}")
        except Exception as e:
            print(f"Error inspecting sample {i}: {e}")


Inspecting first few samples from train_dataset_pytorch:
Sample 0: img_shape=torch.Size([3, 896, 896]), img_min=0.00, img_max=0.00, scaled_label=0.00
  ^-- Might be a placeholder. Original path: /content/medgemma_extracted_images/1000/1004/1004_eidon_mosaic_cfp_l_1.2.826.0.1.3680043.8.641.1.20230809.2436.96446.dcm
Sample 1: img_shape=torch.Size([3, 896, 896]), img_min=0.00, img_max=0.00, scaled_label=0.00
  ^-- Might be a placeholder. Original path: /content/medgemma_extracted_images/1000/1004/1004_eidon_mosaic_cfp_r_1.2.826.0.1.3680043.8.641.1.20230809.2448.36605.dcm
Sample 2: img_shape=torch.Size([3, 896, 896]), img_min=0.00, img_max=0.00, scaled_label=0.00
  ^-- Might be a placeholder. Original path: /content/medgemma_extracted_images/1000/1007/1007_eidon_mosaic_cfp_r_1.2.826.0.1.3680043.8.641.1.20230824.20355.67485.dcm
Sample 3: img_shape=torch.Size([3, 896, 896]), img_min=0.00, img_max=0.00, scaled_label=0.00
  ^-- Might be a placeholder. Original path: /content/medgemma_extracte

In [None]:
# Load the clinical CSV and show which header sits at column position 20.
# TODO: drop every column other than the person id and LDL.
df = pd.read_csv(LOCAL_CSV_PATH)

print(df.columns.tolist()[20])

LDL Cholesterol Calculation (mg/dL)


In [None]:
# Number of rows whose LDL value is missing
print(df["LDL Cholesterol Calculation (mg/dL)"].isna().sum())

41


In [None]:
# Keep only the rows that carry an LDL value, then verify none are missing
df_clean = df[df["LDL Cholesterol Calculation (mg/dL)"].notna()]
print(df_clean.shape)  # Should be 41 rows fewer

print(df_clean["LDL Cholesterol Calculation (mg/dL)"].isna().sum())

(1026, 111)
0


In [None]:
# Count the sub-folders directly under LOCAL_IMAGES_ROOT.
# Only directories are counted, so stray files in the root do not inflate the number.
import os
folder_count = sum(
    os.path.isdir(os.path.join(LOCAL_IMAGES_ROOT, entry))
    for entry in os.listdir(LOCAL_IMAGES_ROOT)
)
print(f"Number of folders in {LOCAL_IMAGES_ROOT}: {folder_count}")

Number of folders in /content/extracted_images/1000: 541


In [None]:
# --------------------------------------------------
# Cell 3: Load and Filter Clinical Data (STEP 1)
# --------------------------------------------------
# Builds `image_df`: one row per DICOM file (person_id, image_path, LDL) for every
# person that has both a numeric LDL value in the clinical CSV and an image folder
# under `images_root_path`. Problems are reported with print(); when a check fails,
# `image_df` is not assigned by this cell.

if not os.path.exists(clinical_csv_path):
    print(f"FATAL ERROR: Clinical CSV file not found at the expected local path: {clinical_csv_path}")
    # You might need to re-run Cell 2 or check paths
else:
    df = pd.read_csv(clinical_csv_path)
    print(f"Initial number of rows in clinical data: {len(df)}")

    ldl_column_name = "LDL Cholesterol Calculation (mg/dL)" # Make sure this matches your CSV header

    # Data Cleaning
    if ldl_column_name not in df.columns:
        print(f"ERROR: LDL column '{ldl_column_name}' not found in CSV. Available columns: {df.columns.tolist()}")
    else:
        print(f"Original LDL dtype: {df[ldl_column_name].dtype}")
        # Non-numeric entries become NaN and are dropped together with the missing ones
        df[ldl_column_name] = pd.to_numeric(df[ldl_column_name], errors='coerce')
        print(f"Number of NaNs in '{ldl_column_name}' before explicit drop: {df[ldl_column_name].isnull().sum()}")
        df = df.dropna(subset=[ldl_column_name])
        print(f"Number of rows after dropping NaNs in '{ldl_column_name}': {len(df)}")
        if df[ldl_column_name].isnull().sum() > 0:
            print(f"WARNING: NaNs still present in '{ldl_column_name}' after dropna.")
        else:
            print(f"Successfully removed/handled NaNs from '{ldl_column_name}'.")

        # Ensure person_id is string for matching with folder names
        if 'person_id' not in df.columns:
            print(f"ERROR: 'person_id' column not found in CSV. Available columns: {df.columns.tolist()}")
        else:
            df['person_id'] = df['person_id'].astype(str)
            # person_id -> LDL value (if an id occurs more than once, the last row wins)
            ldl_lookup = df.set_index('person_id')[ldl_column_name].to_dict()

            # Filter valid person_ids based on available image folders
            if not os.path.exists(images_root_path) or not os.path.isdir(images_root_path):
                print(f"FATAL ERROR: Images root path '{images_root_path}' does not exist or is not a directory.")
            else:
                available_folders = set(os.listdir(images_root_path))
                print(f"Found {len(available_folders)} folders in images_root_path: {list(available_folders)[:5]}...") # Print a few

                valid_ids_clinical = set(ldl_lookup.keys())
                print(f"Found {len(valid_ids_clinical)} unique person_ids with LDL data in CSV.")

                valid_ids = sorted(list(valid_ids_clinical & available_folders))
                print(f"Found {len(valid_ids)} common person_ids between CSV and image folders.")

                if not valid_ids:
                    print("ERROR: No common person_ids found. Check 'person_id' format in CSV and folder names, and paths.")
                else:
                    image_records = []
                    # Report progress roughly every 10% of the patients (every patient for small sets)
                    progress_step = len(valid_ids)//10 if len(valid_ids) > 10 else 1
                    for idx, person_id in enumerate(valid_ids, 1):
                        folder_path = os.path.join(images_root_path, person_id)
                        ldl_value = ldl_lookup[person_id]
                        if pd.isna(ldl_value): continue # Should be caught already

                        if os.path.isdir(folder_path): # Ensure it's a directory
                            # os.listdir() returns entries in arbitrary order; sort them so the
                            # row order of image_df (and any seeded split built on it) is reproducible.
                            for filename in sorted(os.listdir(folder_path)):
                                if filename.lower().endswith(".dcm"):
                                    image_path = os.path.join(folder_path, filename)
                                    image_records.append({
                                        "person_id": person_id,
                                        "image_path": image_path,
                                        "LDL": ldl_value
                                    })
                        if idx % progress_step == 0 or idx == len(valid_ids): # Progress print
                             print(f"[{idx}/{len(valid_ids)}] Processed patient ID: {person_id}")


                    image_df = pd.DataFrame(image_records)
                    print(f"Total image samples mapped: {len(image_df)}")
                    if not image_df.empty:
                        print(f"Number of NaNs in final image_df['LDL']: {image_df['LDL'].isnull().sum()}")
                        if image_df['LDL'].isnull().sum() > 0:
                            image_df = image_df.dropna(subset=['LDL'])
                            print(f"Total image samples after final LDL NaN drop: {len(image_df)}")
                        print("Sample of image_df:")
                        print(image_df.head())
                    else:
                        print("ERROR: image_df is empty. No DICOM images found or linked.")

Initial number of rows in clinical data: 1067
Original LDL dtype: float64
Number of NaNs in 'LDL Cholesterol Calculation (mg/dL)' before explicit drop: 41
Number of rows after dropping NaNs in 'LDL Cholesterol Calculation (mg/dL)': 1026
Successfully removed/handled NaNs from 'LDL Cholesterol Calculation (mg/dL)'.
Found 541 folders in images_root_path: ['7162', '7398', '1157', '7192', '1072']...
Found 1026 unique person_ids with LDL data in CSV.
Found 528 common person_ids between CSV and image folders.
[52/528] Processed patient ID: 1129
[104/528] Processed patient ID: 1222
[156/528] Processed patient ID: 1313
[208/528] Processed patient ID: 4009
[260/528] Processed patient ID: 4180
[312/528] Processed patient ID: 4298
[364/528] Processed patient ID: 7093
[416/528] Processed patient ID: 7185
[468/528] Processed patient ID: 7286
[520/528] Processed patient ID: 7398
[528/528] Processed patient ID: 7409
Total image samples mapped: 974
Number of NaNs in final image_df['LDL']: 0
Sample of i

In [None]:
# Inspect the mapped image table before filtering.
image_df.info()

# Keep only rows with a strictly positive LDL value (drops zero and negative entries).
image_df = image_df[image_df["LDL"] > 0]

# Sanity check: this should print an empty DataFrame.
print(image_df[image_df["LDL"] <= 0])
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
image_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 973 entries, 0 to 973
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   person_id   973 non-null    object 
 1   image_path  973 non-null    object 
 2   LDL         973 non-null    float64
dtypes: float64(1), object(2)
memory usage: 30.4+ KB
Empty DataFrame
Columns: [person_id, image_path, LDL]
Index: []
Empty DataFrame
Columns: [person_id, image_path, LDL]
Index: []
<class 'pandas.core.frame.DataFrame'>
Index: 973 entries, 0 to 973
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   person_id   973 non-null    object 
 1   image_path  973 non-null    object 
 2   LDL         973 non-null    float64
dtypes: float64(1), object(2)
memory usage: 30.4+ KB
None
