<a href="https://colab.research.google.com/github/Ravikrishnan05/PrediscanMedtech_project/blob/main/Unsloth_ptmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# To run this, press "Runtime" and press "Run all" on a free Tesla T4 Google Colab instance!

#    Join Discord if you need help + ⭐ Star us on Github ⭐
# To install Unsloth on your own computer, follow the installation instructions on our Github page here.

# You will learn how to do data prep, how to train, how to run the model, & how to save it

# News
# Unsloth now supports Text-to-Speech (TTS) models. Read our guide here.

# Read our Qwen3 Guide and check out our new Dynamic 2.0 quants which outperforms other quantization methods!

# Visit our docs for all our model uploads and notebooks.

# -----------------------------------------------------------------------------
# Cell 0.1: Unsloth Installation
# -----------------------------------------------------------------------------
#%%capture # Use %%capture to hide pip outputs if desired
import os
# Environment heuristic: all environment-variable names are concatenated and searched
# for the substring "COLAB_". If it is absent, the notebook is treated as running
# outside Google Colab.
if "COLAB_" not in "".join(os.environ.keys()):
    print("Installing Unsloth for local environment...")
    # Non-Colab path: one pip command, with pip resolving Unsloth's dependencies itself.
    !pip install "unsloth[colab-new]@git+https://github.com/unslothai/unsloth.git" # Adapting for general install
else:
    print("Installing Unsloth for Colab environment...")
    # Do this only in Colab notebooks! Otherwise use pip install "unsloth[colab-new]@git+https://github.com/unslothai/unsloth.git"
    # --no-deps installs only the packages named on the command line, without their
    # dependencies, and none of them is version-pinned.
    # NOTE(review): presumably this relies on Colab's preinstalled torch/CUDA stack — confirm.
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
    !pip install --no-deps "unsloth[colab-new]@git+https://github.com/unslothai/unsloth.git"

In [None]:
# -----------------------------------------------------------------------------
# Cell 0.2: Additional Library Installations
# -----------------------------------------------------------------------------
print("\nInstalling additional libraries for data processing and DICOM handling...")
# -q keeps pip's output short. Only huggingface_hub, hf_transfer and datasets carry
# minimum-version constraints; every other package here is installed unpinned.
!pip install -q pydicom pandas opencv-python Pillow scikit-learn matplotlib seaborn "huggingface_hub>=0.23.0" "hf_transfer>=0.1.6" "datasets>=2.16.0" sentencepiece protobuf

In [None]:
# Install unsloth_zoo
print("\nInstalling unsloth_zoo...")
# No version specifier is given, so the installed release depends on when the cell runs.
!pip install unsloth_zoo

In [None]:
# Unsloth FastModel supports loading nearly any model now! This includes Vision and Text models!

# -----------------------------------------------------------------------------
# Cell 0.3: Unsloth Model Loading
# -----------------------------------------------------------------------------
from unsloth import FastLanguageModel # Changed from FastModel to FastLanguageModel as per recent Unsloth examples for language models
import torch

In [None]:
# IMPORTANT: MODEL SELECTION FOR YOUR TASK
# The model "unsloth/gemma-3-4b-it" is a TEXT-BASED instruct model.
# Your original code used MedGemma, a VISION-LANGUAGE model, and processed images.
# If your task involves processing images to predict LDL, you MUST select a vision-language model.
# Examples:
#   - Search for Unsloth-quantized vision models: https://huggingface.co/unsloth
#   - Try loading a standard HF vision model (e.g., "google/medgemma-4b-pt", "llava-hf/llava-1.5-7b-hf", "microsoft/phi-3-vision-128k-instruct")
#     FastLanguageModel might support them. If so, set `finetune_vision_layers = True` in the PEFT setup.
# The Unsloth template defaults to a Gemma 3 instruct checkpoint; this notebook
# instead selects "google/medgemma-4b-pt" below because the task is image-based.
# If you switch back to a text-only model, you must adapt the data processing
# (especially the image handling in the Dataset) or change the model_name again.

# List of potential models (mostly text, check for vision capabilities if needed)
# Reference catalogue of candidate checkpoints. It is informational only: nothing in
# this cell reads it, the checkpoint actually loaded is `selected_model_name` below.
# A vision checkpoint such as "google/medgemma-4b-pt" could be appended here as well.
fourbit_models = [
    f"unsloth/gemma-3-{size}-it-unsloth-bnb-4bit"   # the "4b" entry is the template default
    for size in ("1b", "4b", "12b", "27b")
] + [
    "unsloth/Llama-3.1-8B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
]

# Checkpoint used by the rest of the notebook (a vision model is needed for this task).
# Template alternative: "unsloth/gemma-3-4b-it-unsloth-bnb-4bit" (Unsloth's example model).
selected_model_name = "google/medgemma-4b-pt"

# Loader options gathered in one place so they are easy to tweak.
load_options = {
    "model_name": selected_model_name,
    "max_seq_length": 2048,
    "dtype": None,          # None asks Unsloth to auto-detect the dtype
    "load_in_4bit": True,
    # "token": "hf_...",    # supply one when the checkpoint is gated
}

print(f"Attempting to load model: {selected_model_name}")
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)
print(f"Model {selected_model_name} loaded.")

# Note: For vision models, the 'tokenizer' might be a composite object
# or you might access an image processor via `model.processor` or `tokenizer.image_processor`.
# This depends on how Unsloth handles vision models.

In [None]:
# -----------------------------------------------------------------------------
# Cell 0.4: Unsloth PEFT (LoRA) Setup
# -----------------------------------------------------------------------------
# We now add LoRA adapters so we only need to update a small amount of parameters!
# LoRA hyper-parameters, collected in a dict so the adapter configuration can be read
# (and edited) at a glance. Only the adapter weights are trained.
lora_settings = dict(
    r = 8,
    lora_alpha = 8,                          # rule of thumb: alpha == r or 2*r
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
    random_state = 42,
    # Typical explicit choices, for reference:
    #   Gemma / Llama / Mistral: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
    #   Phi-3:                   ["q_proj", "k_proj", "v_proj", "o_proj", "dense", "fc1", "fc2"]
    # For a vision model, vision-specific modules may need to be listed explicitly.
    # NOTE(review): this notebook assumes None makes Unsloth pick the modules
    # automatically — confirm against the installed Unsloth version.
    target_modules = None,
)

# NOTE(review): `finetune_vision_layers` is meant for vision checkpoints; verify that
# FastLanguageModel.get_peft_model accepts it in the installed Unsloth version.
# Re-running this cell passes the already-wrapped `model` back in.
model = FastLanguageModel.get_peft_model(
    model,
    finetune_vision_layers = True,
    **lora_settings,
)
print("LoRA adapters added to the model.")

In [None]:
# -----------------------------------------------------------------------------
# Cell 1: PyTorch/HuggingFace Imports and Setup (Adapted from user's Cell 1)
# -----------------------------------------------------------------------------
print("\nImporting libraries...")
# Python Standard Libraries
import shutil # os, zipfile already imported or not needed here
import zipfile

# Third-party Libraries
import pandas as pd
import numpy as np
import pydicom
import cv2 # OpenCV
from PIL import Image

# PyTorch
# import torch # Already imported
from torch.utils.data import Dataset, DataLoader

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hugging Face (tokenizer is already loaded by Unsloth)
# from transformers import AutoProcessor # Replaced by Unsloth's tokenizer

# Plotting (optional, but often useful)
import matplotlib.pyplot as plt
import seaborn as sns

# Colab specific
from google.colab import drive

# Report the versions of the main libraries so a run can be reproduced later.
print("--- Library Version Checks ---")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
import sklearn
print(f"Scikit-learn version: {sklearn.__version__}")

# Query CUDA availability once and reuse the answer for reporting and seeding.
cuda_ready = torch.cuda.is_available()
if not cuda_ready:
    print("GPU not available for PyTorch, using CPU.")
else:
    print(f"PyTorch version: {torch.__version__}")
    print(f"PyTorch CUDA version: {torch.version.cuda}")
    print(f"GPU available for PyTorch: {torch.cuda.get_device_name(0)}")

# For reproducibility: seed NumPy and PyTorch (CPU, plus every GPU when present).
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if cuda_ready:
    torch.cuda.manual_seed_all(RANDOM_SEED)

print("\nCell 1: Imports and basic setup complete.")

In [None]:
# --------------------------------------------------
# Cell 2: Configuration and Unzip Data (From user's Cell 2)
# --------------------------------------------------
# Mount Google Drive (forcing a remount) so the CSV and image archive are reachable.
drive.mount('/content/drive', force_remount=True)

# --- Configuration ---
DRIVE_CSV_PATH = "/content/drive/MyDrive/cp.csv"
DRIVE_ZIP_PATH = "/content/drive/MyDrive/1000-20250517T062750Z-1-001.zip" # image ZIP on Drive

LOCAL_EXTRACT_PATH = "/content/medgemma_extracted_images"
LOCAL_IMAGES_ROOT = os.path.join(LOCAL_EXTRACT_PATH, "1000") # top-level folder inside the ZIP
LOCAL_CSV_PATH = "/content/medgemma_cp.csv"

# --- Stage the clinical CSV on local disk ---
if not os.path.exists(DRIVE_CSV_PATH):
    print(f"ERROR: CSV file not found at {DRIVE_CSV_PATH}")
else:
    shutil.copy(DRIVE_CSV_PATH, LOCAL_CSV_PATH)
    print(f"CSV copied to {LOCAL_CSV_PATH}")

# --- Always start from a clean extraction directory ---
if os.path.exists(LOCAL_EXTRACT_PATH):
    print(f"Removing existing extraction directory: {LOCAL_EXTRACT_PATH}")
    shutil.rmtree(LOCAL_EXTRACT_PATH)
os.makedirs(LOCAL_EXTRACT_PATH, exist_ok=True)
print(f"Created local extraction directory: {LOCAL_EXTRACT_PATH}")

# --- Extract the image archive and confirm the expected root folder appeared ---
if not os.path.exists(DRIVE_ZIP_PATH):
    print(f"ERROR: ZIP file not found at {DRIVE_ZIP_PATH}")
else:
    print(f"Unzipping {DRIVE_ZIP_PATH} to {LOCAL_EXTRACT_PATH}...")
    with zipfile.ZipFile(DRIVE_ZIP_PATH) as archive:
        archive.extractall(LOCAL_EXTRACT_PATH)
    print("Unzipping complete.")
    if not os.path.exists(LOCAL_IMAGES_ROOT):
        print(f"ERROR: Expected image root folder '{LOCAL_IMAGES_ROOT}' not found after unzipping. Check ZIP structure.")
        print(f"Contents of {LOCAL_EXTRACT_PATH}: {os.listdir(LOCAL_EXTRACT_PATH)}")
    else:
        print(f"Image root folder found at: {LOCAL_IMAGES_ROOT}")

print("\nCell 2: Data unzipping complete.")


In [None]:
# --------------------------------------------------
# Cell 3: Load and Filter Clinical Data to create image_df (From user's Cell 3)
# --------------------------------------------------
# Build `image_df`: one row per DICOM file, carrying the patient's id and LDL value.
# `image_df` is initialised first so later cells can reference it even if mapping fails.
image_df = pd.DataFrame()

if not os.path.exists(LOCAL_CSV_PATH):
    print(f"FATAL ERROR: Clinical CSV file not found at the expected local path: {LOCAL_CSV_PATH}")
else:
    df_raw_from_cell3 = pd.read_csv(LOCAL_CSV_PATH)
    print(f"Initial number of rows in clinical data (Cell 3): {len(df_raw_from_cell3)}")

    person_id_col_name_c3 = 'person_id'
    ldl_col_name_c3 = "LDL Cholesterol Calculation (mg/dL)" # must match the CSV header exactly

    if not (person_id_col_name_c3 in df_raw_from_cell3.columns and ldl_col_name_c3 in df_raw_from_cell3.columns):
        print(f"ERROR: Required columns ('{person_id_col_name_c3}' or '{ldl_col_name_c3}') not found in CSV.")
        print(f"Available columns: {df_raw_from_cell3.columns.tolist()}")
    else:
        # Keep only id + LDL, coerce LDL to numeric, and drop missing / non-positive values.
        df_selected_c3 = (
            df_raw_from_cell3[[person_id_col_name_c3, ldl_col_name_c3]]
            .rename(columns={ldl_col_name_c3: 'LDL_temp'})
        )
        df_selected_c3['LDL_temp'] = pd.to_numeric(df_selected_c3['LDL_temp'], errors='coerce')
        df_selected_c3 = df_selected_c3.dropna(subset=['LDL_temp'])
        df_selected_c3 = df_selected_c3[df_selected_c3['LDL_temp'] > 0].copy()
        # Folder names are strings, so ids are compared as strings.
        df_selected_c3[person_id_col_name_c3] = df_selected_c3[person_id_col_name_c3].astype(str)
        print(f"Cleaned clinical data (positive LDLs only): {len(df_selected_c3)} records.")

        # The id -> LDL dict keeps only the last row per id; make that visible
        # instead of collapsing duplicates silently.
        duplicate_rows_c3 = int(df_selected_c3[person_id_col_name_c3].duplicated().sum())
        if duplicate_rows_c3:
            print(f"WARNING: {duplicate_rows_c3} duplicate '{person_id_col_name_c3}' rows found; "
                  "the last LDL value per person is used.")

        ldl_lookup_c3 = df_selected_c3.set_index(person_id_col_name_c3)['LDL_temp'].to_dict()

        if not (os.path.exists(LOCAL_IMAGES_ROOT) and os.path.isdir(LOCAL_IMAGES_ROOT)):
            print(f"FATAL ERROR: Images root path '{LOCAL_IMAGES_ROOT}' does not exist or is not a directory.")
        else:
            available_folders_c3 = set(os.listdir(LOCAL_IMAGES_ROOT))
            valid_ids_clinical_c3 = set(ldl_lookup_c3.keys())
            common_person_ids_c3 = sorted(list(valid_ids_clinical_c3 & available_folders_c3))
            print(f"Found {len(common_person_ids_c3)} common person_ids for mapping.")

            image_records_list = []
            for pid_c3 in common_person_ids_c3:
                folder_path_c3 = os.path.join(LOCAL_IMAGES_ROOT, pid_c3)
                ldl_val_c3 = ldl_lookup_c3[pid_c3]
                if os.path.isdir(folder_path_c3):
                    # os.listdir returns entries in arbitrary order; sort so the row
                    # order of image_df is reproducible across runs and machines.
                    for filename_c3 in sorted(os.listdir(folder_path_c3)):
                        if filename_c3.lower().endswith(".dcm"):
                            image_path_c3 = os.path.join(folder_path_c3, filename_c3)
                            image_records_list.append({
                                "person_id": pid_c3,
                                "image_path": image_path_c3,
                                "LDL": ldl_val_c3
                            })
            image_df = pd.DataFrame(image_records_list)
            if not image_df.empty:
                print(f"Final image_df created with {len(image_df)} image-LDL pairs.")
                from IPython.display import display # rich table rendering in Colab
                display(image_df.head())
                print(f"LDL stats in final image_df: min={image_df['LDL'].min()}, max={image_df['LDL'].max()}, mean={image_df['LDL'].mean()}")
            else:
                print("WARNING: image_df is empty after mapping. Check paths, IDs, and DICOM file existence.")
print("\nCell 3: image_df preparation complete.")

In [None]:
# -----------------------------------------------------------------------------
# Cell 4: Verify image_df (Adapted from user's Cell 4)
# -----------------------------------------------------------------------------
# Sanity-check the frame produced by the data-preparation cells before using it.
if 'image_df' not in locals() or not isinstance(image_df, pd.DataFrame) or image_df.empty:
    print("ERROR: 'image_df' not found or is empty. Please ensure your data preparation cells have been run successfully.")
    # Fall back to an empty frame with the expected schema so later cells do not hit a NameError.
    image_df = pd.DataFrame(columns=['person_id', 'image_path', 'LDL'])
else:
    print(f"\nContinuing with 'image_df' which has {len(image_df)} records.")
    print("Columns in image_df:", image_df.columns.tolist())
    print("Sample of image_df:")
    from IPython.display import display
    display(image_df.head())

    required_cols = ['person_id', 'image_path', 'LDL']
    absent_cols = [col for col in required_cols if col not in image_df.columns]
    if absent_cols:
        print(f"ERROR: 'image_df' is missing one or more required columns: {required_cols}. Please re-run previous data preparation cells.")
    else:
        # Only positive LDL values are expected at this point.
        lowest_ldl = image_df['LDL'].min() # type: ignore
        if lowest_ldl <= 0:
            print(f"ERROR: 'image_df' still contains non-positive LDL values. LDL min: {lowest_ldl}. Please re-run filtering.")
        else:
            print("'image_df' seems okay to proceed.")

print(f"\nUsing Unsloth loaded model: {selected_model_name}") # Was MEDGEMMA_PT_MODEL_ID
print("\nCell 4: image_df verification and Model ID check complete.")

In [None]:
# -----------------------------------------------------------------------------
# Cell 5: Unsloth Tokenizer/Processor Info (Adapted from user's Cell 5)
# -----------------------------------------------------------------------------
# The `medgemma_processor` is now replaced by the `tokenizer` from Unsloth.
# For vision models, this tokenizer might wrap an image processor,
# or `model.processor` might be set by Unsloth.

def parse_image_processor_size(size_info):
    """Turn an image processor's ``size`` attribute into a ``(height, width)`` tuple.

    Supported layouts:
      * dict with 'height'/'width', or with 'shortest_edge' (treated as a square size)
      * list/tuple of length two, read as (height, width)
      * single int, read as a square size

    Returns:
        tuple | None: ``(height, width)`` when both values are found and truthy,
        otherwise ``None``.
    """
    height, width = None, None
    if isinstance(size_info, dict):
        height = size_info.get('height', size_info.get('shortest_edge'))
        if height is not None:
            # Without an explicit width, only 'shortest_edge' justifies assuming a square.
            width = size_info.get('width', height if 'shortest_edge' in size_info else None)
    elif isinstance(size_info, (list, tuple)) and len(size_info) == 2:
        height, width = size_info[0], size_info[1]
    elif isinstance(size_info, int):
        height = width = size_info

    if height and width:
        return (height, width)
    return None


# The former `medgemma_processor` is replaced by the `tokenizer` returned by Unsloth.
# This cell looks for an image processor on the tokenizer first, then on
# `model.processor`, and adopts its configured input size when one can be parsed.
TARGET_SIZE_FOR_IMAGES = (896, 896) # Default, e.g. from MedGemma card
print(f"\nDefault TARGET_SIZE_FOR_IMAGES: {TARGET_SIZE_FOR_IMAGES}")

if 'tokenizer' in locals() and tokenizer is not None:
    print(f"Unsloth tokenizer type: {type(tokenizer)}")

    image_processor_found = False
    # Check 1: tokenizer.image_processor (common in Hugging Face for multimodal)
    if hasattr(tokenizer, 'image_processor') and tokenizer.image_processor is not None:
        print("Found `tokenizer.image_processor`.")
        try:
            img_proc_size_info = tokenizer.image_processor.size
            print(f"  tokenizer.image_processor.size attribute: {img_proc_size_info}")
            parsed_size = parse_image_processor_size(img_proc_size_info)
            if parsed_size is not None:
                TARGET_SIZE_FOR_IMAGES = parsed_size
                print(f"  Target Image Size updated from tokenizer.image_processor: {TARGET_SIZE_FOR_IMAGES}")
                image_processor_found = True
            else:
                print(f"  Could not reliably parse size from tokenizer.image_processor.size. Using default: {TARGET_SIZE_FOR_IMAGES}")
        except AttributeError:
            print("  `tokenizer.image_processor` does not have a 'size' attribute or expected structure.")
        except Exception as e:
            print(f"  Error accessing/parsing tokenizer.image_processor.size: {e}")

    # Check 2: model.processor (another place the processor might be stored).
    # `model` is guarded the same way as `tokenizer` so a missing name cannot raise here.
    if (not image_processor_found and 'model' in locals()
            and hasattr(model, 'processor') and model.processor is not None):
        print("Found `model.processor`.")
        if hasattr(model.processor, 'image_processor') and model.processor.image_processor is not None: # combined processor
            image_proc_component = model.processor.image_processor
        else: # model.processor *is* the image processor
            image_proc_component = model.processor

        if hasattr(image_proc_component, 'size'):
            try:
                img_proc_size_info = image_proc_component.size
                print(f"  model.processor's image component size attribute: {img_proc_size_info}")
                # Same parsing rules as for the tokenizer's image processor.
                parsed_size = parse_image_processor_size(img_proc_size_info)
                if parsed_size is not None:
                    TARGET_SIZE_FOR_IMAGES = parsed_size
                    print(f"  Target Image Size updated from model.processor: {TARGET_SIZE_FOR_IMAGES}")
                    image_processor_found = True
                else:
                    print(f"  Could not reliably parse size from model.processor. Using default: {TARGET_SIZE_FOR_IMAGES}")
            except Exception as e:
                print(f"  Error accessing/parsing model.processor's image component size: {e}")
        else:
            print("  `model.processor` (or its image component) does not have a 'size' attribute.")


    if not image_processor_found:
        print(f"No image processor with size information explicitly found in Unsloth's tokenizer or model.processor.")
        print(f"Using default TARGET_SIZE_FOR_IMAGES: {TARGET_SIZE_FOR_IMAGES}.")
        print("IMPORTANT: If you are using a vision model, ensure images are preprocessed to the model's expected input size.")
        print(f"The current model '{selected_model_name}' is likely a TEXT model. Image processing capabilities might be limited or absent.")

else:
    print("Unsloth tokenizer not loaded. Cannot determine image processing details.")
    print(f"Using default TARGET_SIZE_FOR_IMAGES: {TARGET_SIZE_FOR_IMAGES}.")

print("\nCell 5: Unsloth tokenizer/processor check complete.")

In [None]:
# -----------------------------------------------------------------------------
# Cell 6: Data Splitting (Patient-Level) and LDL Normalization (From user's Cell 6)
# -----------------------------------------------------------------------------
# Start from empty frames / no scaler so these names exist even when the split is skipped.
train_df, val_df, test_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
ldl_scaler = None

if 'image_df' not in locals() or image_df.empty:
    print("image_df is empty. Skipping data splitting and LDL normalization.")
else:
    print(f"\nStarting data splitting for {len(image_df)} image-LDL pairs...")
    if 'person_id' in image_df.columns:
        # Split on patients, not images, so that no patient appears in two sets.
        unique_person_ids = image_df['person_id'].unique()
        n_patients = len(unique_person_ids)
        print(f"Total unique patients for splitting: {n_patients}")

        if n_patients >= 3:
            # 70% train; the remaining 30% is halved into validation and test.
            train_pids, temp_pids = train_test_split(unique_person_ids, test_size=0.30, random_state=RANDOM_SEED)
            n_held_out = len(temp_pids)
            if n_held_out > 1:
                val_pids, test_pids = train_test_split(temp_pids, test_size=0.50, random_state=RANDOM_SEED)
            elif n_held_out == 1:
                val_pids, test_pids = temp_pids, np.array([])
            else:
                val_pids, test_pids = np.array([]), np.array([])
        else:
            print("Warning: Not enough unique patients for a robust 3-way (train/validation/test) split.")
            if n_patients == 2:
                train_pids, val_pids = train_test_split(unique_person_ids, test_size=0.5, random_state=RANDOM_SEED)
                test_pids = np.array([])
            elif n_patients == 1:
                train_pids = unique_person_ids
                val_pids, test_pids = np.array([]), np.array([])
            else:
                train_pids, val_pids, test_pids = np.array([]), np.array([]), np.array([])

        # Materialise each split as an independent copy of the matching rows.
        patient_column = image_df['person_id']
        train_df = image_df.loc[patient_column.isin(train_pids)].copy()
        val_df = image_df.loc[patient_column.isin(val_pids)].copy()
        test_df = image_df.loc[patient_column.isin(test_pids)].copy()

        print(f"Train set: {len(train_df)} samples from {len(train_pids)} patients.")
        print(f"Validation set: {len(val_df)} samples from {len(val_pids)} patients.")
        print(f"Test set: {len(test_df)} samples from {len(test_pids)} patients.")

        # Leakage guard: the patient-id sets must be pairwise disjoint.
        if len(train_pids) > 0 and len(val_pids) > 0:
            assert set(train_pids).isdisjoint(val_pids), "Patient overlap train/val!"
        if len(train_pids) > 0 and len(test_pids) > 0:
            assert set(train_pids).isdisjoint(test_pids), "Patient overlap train/test!"
        if len(val_pids) > 0 and len(test_pids) > 0:
            assert set(val_pids).isdisjoint(test_pids), "Patient overlap val/test!"
        print("Patient-level splits verified (no overlap if sets are non-empty).")

        if train_df.empty or 'LDL' not in train_df.columns:
            print("Train DataFrame is empty or 'LDL' column missing. Skipping LDL normalization.")
        else:
            print("\nNormalizing LDL values using StandardScaler...")
            # Fit on the training split only, then apply the same transform elsewhere.
            ldl_scaler = StandardScaler()
            train_df['LDL_scaled'] = ldl_scaler.fit_transform(train_df[['LDL']])
            for held_out_df in (val_df, test_df):
                if held_out_df.empty:
                    # Keep the column present (and float-typed) on empty splits.
                    held_out_df['LDL_scaled'] = pd.Series(dtype='float64')
                else:
                    held_out_df['LDL_scaled'] = ldl_scaler.transform(held_out_df[['LDL']])

            print("LDL normalization complete.")
            print("Scaled LDL stats in train_df (should be mean~0, std~1):")
            from IPython.display import display
            display(train_df['LDL_scaled'].describe())
            # To persist the scaler for inference:
            # import joblib
            # scaler_filename = 'ldl_scaler_unsloth.joblib'
            # joblib.dump(ldl_scaler, scaler_filename)
            # print(f"LDL scaler saved to {scaler_filename}")
    else:
        print("ERROR: 'person_id' column missing in image_df. Cannot perform patient-level split.")

print("\nCell 6: Data splitting and LDL normalization attempt complete.")

In [None]:
# -----------------------------------------------------------------------------
# Cell 5.1 (from user, now Cell 6.1): Check Unsloth tokenizer/model.processor
# -----------------------------------------------------------------------------
# Report which image-processing components are reachable from the loaded objects.
print("\n--- Sanity Check for Unsloth Components (Cell 6.1) ---")
if 'tokenizer' not in locals() or tokenizer is None:
    print("Unsloth tokenizer IS NOT LOADED or is None.")
else:
    print(f"Unsloth tokenizer IS LOADED. Type: {type(tokenizer)}")

    # Image processor hanging directly off the tokenizer (None when absent).
    tokenizer_image_proc = getattr(tokenizer, 'image_processor', None)
    if tokenizer_image_proc is None:
        print("  It does NOT have a direct `tokenizer.image_processor` attribute (or it's None).")
    else:
        print(f"  It has a tokenizer.image_processor of type: {type(tokenizer_image_proc)}")

    # Processor attached to the model, possibly wrapping its own image processor.
    model_proc = getattr(model, 'processor', None)
    if model_proc is None:
        print("  The model does NOT have a `model.processor` attribute (or it's None).")
    else:
        print(f"Unsloth model.processor IS LOADED. Type: {type(model_proc)}")
        nested_image_proc = getattr(model_proc, 'image_processor', None)
        if nested_image_proc is not None:
            print(f"  model.processor has an image_processor component of type: {type(nested_image_proc)}")

    # Warn only when neither location exposes an image processor.
    model_exposes_image_proc = model_proc is not None and hasattr(model_proc, 'image_processor')
    if not (tokenizer_image_proc is not None or model_exposes_image_proc):
        print(f"  WARNING: No obvious image processor found. The model '{selected_model_name}' may be text-only.")
        print("  If your task requires image input, ensure you've selected a vision-language model and that Unsloth loads its image processor correctly.")

In [None]:
# -----------------------------------------------------------------------------
# Cell 7: Custom PyTorch Dataset for DICOM Images and LDL (New Cell)
# -----------------------------------------------------------------------------
class DICOM_LDL_Dataset(Dataset):
    """Map-style dataset yielding a processed DICOM image and its scaled LDL target.

    Each item is a dict with:
        "pixel_values": tensor produced by ``tokenizer.image_processor`` (batch dim removed)
        "labels":       float32 scalar tensor holding the row's 'LDL_scaled' value
        "image_path":   path of the source DICOM file (kept for debugging)
    """

    def __init__(self, dataframe, image_root_dir, tokenizer, target_size=(896, 896)):
        """
        Args:
            dataframe (pd.DataFrame): DataFrame with 'image_path' and 'LDL_scaled' columns.
            image_root_dir (str): Root directory of the extracted images. Stored only;
                the paths in the dataframe are used as-is.
            tokenizer: Tokenizer/processor object exposing a non-None ``image_processor``.
            target_size (tuple): Desired image size (height, width). Stored only;
                resizing is left to ``tokenizer.image_processor``.

        Raises:
            ValueError: If ``tokenizer`` has no usable ``image_processor`` attribute.
        """
        self.dataframe = dataframe
        self.image_root_dir = image_root_dir
        self.tokenizer = tokenizer
        self.target_size = target_size

        if not hasattr(self.tokenizer, 'image_processor') or self.tokenizer.image_processor is None:
             raise ValueError("Tokenizer must have a loaded 'image_processor' attribute.")

    def __len__(self):
        """Number of rows (image/LDL pairs) in the backing dataframe."""
        return len(self.dataframe)

    @staticmethod
    def _to_rgb_uint8(pixel_array, photometric_interpretation="", source="in-memory array"):
        """Convert a decoded DICOM pixel array into an (H, W, 3) uint8 array.

        Args:
            pixel_array (np.ndarray): 2-D, 3-D or 4-D array of decoded pixels.
            photometric_interpretation (str): DICOM PhotometricInterpretation value;
                a 3-D array whose value starts with "MONOCHROME" is treated as
                (frames, H, W), any other 3-D array as (H, W, channels).
            source (str): Label used in error messages (normally the file path).

        Returns:
            np.ndarray: Contiguous uint8 array. 2-D, single-channel and 4-channel
            inputs come back with three channels.

        Raises:
            ValueError: If the array has an unsupported number of dimensions.
        """
        # --- Frame selection: only the first frame of multi-frame data is used ---
        if pixel_array.ndim == 4:      # (frames, height, width, channels)
            image = pixel_array[0]
        elif pixel_array.ndim == 3:
            if str(photometric_interpretation).startswith("MONOCHROME"):
                # Grayscale data cannot carry a channel axis, so the leading axis is
                # frames: take frame 0 (previously every frame was kept).
                image = pixel_array[0]
            else:                      # assume (height, width, channels)
                image = pixel_array
        elif pixel_array.ndim == 2:    # (height, width) grayscale
            image = pixel_array
        else:
            raise ValueError(f"Unsupported pixel array dimension: {pixel_array.ndim} for {source}")

        # --- Intensity scaling: min-max stretch to 0..255 for non-uint8 data ---
        if image.dtype != np.uint8:
            image = image.astype(np.float32)
            low, high = image.min(), image.max()
            if high > low:
                image = (image - low) / (high - low) * 255.0
            else:
                # Constant image: avoid the 0/0 division that would yield NaNs.
                image = np.zeros_like(image)
            image = image.astype(np.uint8)

        # --- Channel layout: always hand three channels to the image processor ---
        if image.ndim == 2:
            image = np.stack([image, image, image], axis=-1)
        elif image.ndim == 3 and image.shape[2] == 1:   # (H, W, 1) grayscale
            image = np.repeat(image, 3, axis=2)
        elif image.ndim == 3 and image.shape[2] == 4:   # (H, W, 4) RGBA -> drop alpha
            image = image[:, :, :3]

        return np.ascontiguousarray(image)

    def __getitem__(self, idx):
        """Load, convert and process one DICOM image together with its LDL target.

        Any error raised while reading or processing the image is printed with the
        offending path and then re-raised, so data problems surface immediately.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        row = self.dataframe.iloc[idx]
        img_path = row['image_path']
        ldl_scaled = row['LDL_scaled']

        # --- Image Loading and Processing ---
        try:
            dicom_data = pydicom.dcmread(img_path)
            if 'PixelData' not in dicom_data:
                 raise ValueError(f"No PixelData found in DICOM file: {img_path}")

            rgb_array = self._to_rgb_uint8(
                dicom_data.pixel_array,
                dicom_data.get("PhotometricInterpretation", ""),
                source=img_path,
            )
            image = Image.fromarray(rgb_array)

            # The image processor applies the model-specific preprocessing and
            # returns a mapping that contains 'pixel_values'.
            processed_images = self.tokenizer.image_processor(images=image, return_tensors="pt")
            image_tensor = processed_images['pixel_values'].squeeze(0) # Remove batch dim added by processor

        except Exception as e:
            print(f"Error processing image {img_path}: {e}")
            # Re-raise so data loading issues are not silently skipped.
            raise

        # Convert LDL to tensor
        ldl_tensor = torch.tensor(ldl_scaled, dtype=torch.float32)

        # Only the image and the regression target are prepared here; any text
        # prompt is expected to be added later (collation / training).
        return {
            "pixel_values": image_tensor,
            "labels": ldl_tensor, # Using 'labels' as is standard in HF for supervised tasks
            "image_path": img_path # Keep path for debugging
        }

print("\nCell 7: Custom Dataset class defined.")