<a href="https://colab.research.google.com/github/Ravikrishnan05/PrediscanMedtech_project/blob/main/Innerengineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [3]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", # Llama 3.2 vision support
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit", # Can fit in a 80GB card!
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",

    "unsloth/Pixtral-12B-2409-bnb-4bit",              # Pixtral fits in 16GB!
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",         # Pixtral base model

    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",          # Qwen2 VL support
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",

    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",      # Any Llava variant works!
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, processor = FastVisionModel.from_pretrained(
    "google/medgemma-4b-pt",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.3: Fast Gemma3 patching. Transformers: 4.53.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [4]:
# 1. Get Vision Feature Dimension
# ----------------------------------------------------------------------------
print("--- 1. Determining Vision Feature Dimension ---")
# Since we know MedGemma is a Gemma3-based model, we can rely on its config.
if hasattr(model.config, 'vision_config') and hasattr(model.config.vision_config, 'hidden_size'):
    vision_feature_dim = model.config.vision_config.hidden_size
else:
    # Fallback just in case, this is the known value for MedGemma 4B's SigLIP vision tower.
    vision_feature_dim = 1024
print(f"Vision feature dimension is: {vision_feature_dim}\n")

--- 1. Determining Vision Feature Dimension ---
Vision feature dimension is: 1152



In [5]:
#install pydicom
!pip install pydicom

Collecting pydicom
  Downloading pydicom-3.0.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pydicom-3.0.1-py3-none-any.whl (2.4 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.4/2.4 MB[0m [31m76.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-3.0.1


In [7]:
"""
# Cell B: Architecture Definition, Verification, and Forward Pass

import torch
import torch.nn as nn
import pydicom
from PIL import Image

# This cell assumes `model` and `processor` were loaded in the previous cell.
# The `processor` object from FastVisionModel contains the image_processor and tokenizer.

# 1. Get Vision Feature Dimension
# ----------------------------------------------------------------------------
print("--- 1. Determining Vision Feature Dimension ---")
# Since we know MedGemma is a Gemma3-based model, we can rely on its config.
if hasattr(model.config, 'vision_config') and hasattr(model.config.vision_config, 'hidden_size'):
    vision_feature_dim = model.config.vision_config.hidden_size
else:
    # Fallback just in case, this is the known value for MedGemma 4B's SigLIP vision tower.
    vision_feature_dim = 1024
print(f"Vision feature dimension is: {vision_feature_dim}\n")


# 2. Define the Vision Regressor Wrapper
# ----------------------------------------------------------------------------
print("--- 2. Defining the MedGemmaVisionRegressor Wrapper ---")
class MedGemmaVisionRegressor(nn.Module):
    """A wrapper to isolate the vision_tower and add a regression head."""
    def __init__(self, base_vlm_model, vision_feature_dim_input: int):
        super().__init__()
        self.base_vlm = base_vlm_model
        # The model's operating dtype is now float32 due to the Unsloth fallback.
        self.target_dtype = self.base_vlm.dtype
        print(f"[Regressor Init] Base model operating dtype: {self.target_dtype}")

        self.regression_head = nn.Sequential(
            nn.Linear(vision_feature_dim_input, vision_feature_dim_input // 2),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(vision_feature_dim_input // 2, 1)
        ).to(dtype=self.target_dtype) # Cast head to the model's operating dtype (float32)

    def forward(self, pixel_values: torch.Tensor, return_vision_features=False):
        # Access the base model. FastVisionModel does not add a `.model` wrapper like PEFT.
        base_model = self.base_vlm

        # Process image through the Vision Tower (SigLIP Vision Transformer)
        vision_outputs = base_model.vision_tower(pixel_values=pixel_values, return_dict=True)

        # Extract the final pooled feature vector representing the image
        image_features = vision_outputs.pooler_output

        # Pass vision features through the Regression Head
        ldl_prediction = self.regression_head(image_features)

        if return_vision_features:
            return ldl_prediction, image_features
        else:
            return ldl_prediction


# 3. Instantiate the Full Model and Print Architecture
# ----------------------------------------------------------------------------
print("--- 3. Instantiating the Regressor and Printing Architecture ---")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
regressor_model = MedGemmaVisionRegressor(model, vision_feature_dim)
regressor_model.to(device)

print("\n\n=== MODEL ARCHITECTURE ===")
print(regressor_model)
print("==========================\n")
"""

--- 1. Determining Vision Feature Dimension ---
Vision feature dimension is: 1152

--- 2. Defining the MedGemmaVisionRegressor Wrapper ---
--- 3. Instantiating the Regressor and Printing Architecture ---
[Regressor Init] Base model operating dtype: torch.float16


=== MODEL ARCHITECTURE ===
MedGemmaVisionRegressor(
  (base_vlm): Gemma3ForConditionalGeneration(
    (model): Gemma3Model(
      (vision_tower): SiglipVisionModel(
        (vision_model): SiglipVisionTransformer(
          (embeddings): SiglipVisionEmbeddings(
            (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
            (position_embedding): Embedding(4096, 1152)
          )
          (encoder): SiglipEncoder(
            (layers): ModuleList(
              (0-26): 27 x SiglipEncoderLayer(
                (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
                (self_attn): SiglipAttention(
                  (k_proj): Linear4bit(in_features=1152

In [12]:
"""
# Cell B: Standalone Verification with User-Uploaded PNG/JPEG Image

import torch
import torch.nn as nn
from PIL import Image # Pillow library for standard images
import os
from google.colab import files # Import the files module for uploading
import numpy as np

# This cell assumes `regressor_model`, `processor`, `device`, and `model` were created in the previous cell.

# 1. Upload a Sample Image (PNG, JPG, etc.) from Your Computer
# ----------------------------------------------------------------------------
print("--- 1. Please Upload an Image (PNG, JPG, etc.) for Verification ---")
# This will open a file upload dialog in your browser.
uploaded = files.upload()

# Check if a file was uploaded
if not uploaded:
    raise RuntimeError("No file was uploaded. Please run the cell again and select a file.")

# Get the filename of the uploaded file
sample_image_filename = next(iter(uploaded))
print(f"\nSuccessfully uploaded '{sample_image_filename}'.")
# The path to the uploaded file is just its filename in the current directory
sample_image_path = sample_image_filename
print(f"Using uploaded image at path: ./{sample_image_path}\n")


# 2. Pre-process the Uploaded Image
# ----------------------------------------------------------------------------
print("--- 2. Pre-processing the Image ---")

def preprocess_standard_image(image_path, processor_obj):
    Loads a standard image (PNG, JPG) and uses the model's processor.

    # Open the image using Pillow
    try:
        pil_image = Image.open(image_path)
    except Exception as e:
        raise RuntimeError(f"Failed to open image file '{image_path}'. Error: {e}")

    # Ensure the image is in RGB format, as most vision models expect 3 channels.
    # This will convert grayscale or RGBA images correctly.
    pil_image_rgb = pil_image.convert("RGB")

    # The `processor` object contains `image_processor` which handles everything else:
    # resizing, normalization, and tensor conversion.
    processed_output = processor_obj.image_processor(images=pil_image_rgb, return_tensors="pt")

    # Move the final tensor to the correct device and ensure its dtype matches the model.
    # The model's operating dtype is likely float32 due to the Unsloth fallback on T4.
    pixel_values = processed_output.pixel_values.to(device=device, dtype=model.dtype)

    print(f"Image processed into a tensor.")
    print(f"Tensor Shape: {pixel_values.shape}")
    print(f"Tensor Dtype: {pixel_values.dtype}\n")
    return pixel_values

# Prepare the input tensor from our uploaded sample
image_tensor = preprocess_standard_image(sample_image_path, processor)


# 3. Perform the Forward Pass and Inspect Outputs
# ----------------------------------------------------------------------------
print("--- 3. Performing Forward Pass and Inspecting Outputs ---")
regressor_model.eval() # Set model to evaluation mode

with torch.no_grad(): # No need to track gradients for verification
    # Call the model with our special flag to get intermediate features
    final_prediction, vision_features = regressor_model(
        image_tensor,
        return_vision_features=True
    )

print("\n=== VERIFICATION RESULTS ===")
# --- Output 1: After the Vision Transformer ---
print(f"\n[OUTPUT 1] Intermediate Features from Vision Transformer (Vision Tower)")
print(f"  - Shape: {vision_features.shape}")
print(f"  - Dtype: {vision_features.dtype}")
print(f"  - Device: {vision_features.device}")
print(f"  - Sample Values (first 10 features): \n{vision_features[0, :10].cpu().numpy()}")
print("  - This is the feature vector representing the image. Its shape should be (1, vision_feature_dim).")

# --- Output 2: After the Regression Head ---
print(f"\n[OUTPUT 2] Final Numeric Prediction from Regression Head")
print(f"  - Shape: {final_prediction.shape}")
print(f"  - Dtype: {final_prediction.dtype}")
print(f"  - Device: {final_prediction.device}")
print(f"  - Predicted (Scaled) Value: {final_prediction.item():.4f}")
print("  - This is the final output, representing the predicted scaled LDL value.")

print("\n==========================")
print("\nVerification complete. The data flow from your uploaded image to a single number is working.")
print("WARNING: The model is running in float32, which may cause memory issues during full training.")
"""



In [6]:
# Final Combined Cell: Architecture and Verification with Robust Feature Extraction

import torch
import torch.nn as nn
from PIL import Image
import os
from google.colab import files
import numpy as np
try:
    import pydicom
    DICOM_AVAILABLE = True
except ImportError:
    DICOM_AVAILABLE = False
    print("pydicom library not found. DICOM file uploads will not work.")

# Assumes `model` and `processor` were loaded successfully in the first cell.

# 1. Get Vision Feature Dimension
# ----------------------------------------------------------------------------
print("--- 1. Determining Vision Feature Dimension ---")
if hasattr(model.config, 'vision_config') and hasattr(model.config.vision_config, 'hidden_size'):
    vision_feature_dim = model.config.vision_config.hidden_size
else:
    vision_feature_dim = 1152 # Fallback to the detected dimension
print(f"Vision feature dimension is: {vision_feature_dim}\n")


# 2. Define the Vision Regressor Wrapper (with ROBUST forward pass)
# ----------------------------------------------------------------------------
print("--- 2. Defining the MedGemmaVisionRegressor Wrapper ---")
class MedGemmaVisionRegressor(nn.Module):
    """A wrapper with a robust forward pass for different Vision Transformer outputs."""
    def __init__(self, base_vlm_model, vision_feature_dim_input: int):
        super().__init__()
        self.base_vlm = base_vlm_model
        # Use the model's ACTUAL dtype, which is float32 because of the Unsloth fallback
        self.target_dtype = self.base_vlm.dtype
        print(f"[Regressor Init] Base model operating dtype: {self.target_dtype}")

        self.regression_head = nn.Sequential(
            nn.Linear(vision_feature_dim_input, vision_feature_dim_input // 2),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(vision_feature_dim_input // 2, 1)
        ).to(dtype=self.target_dtype)

    def forward(self, pixel_values: torch.Tensor, return_vision_features=False):
        base_model = self.base_vlm

        # --- ROBUST FEATURE EXTRACTION ---
        # Call the vision tower
        vision_outputs = base_model.vision_tower(pixel_values=pixel_values, return_dict=True)

        # Try to get pooler_output first.
        image_features = vision_outputs.get("pooler_output") # Use .get() for safety

        # If pooler_output is None (as we just discovered), fall back to last_hidden_state.
        if image_features is None:
            print("`pooler_output` is None. Falling back to using `last_hidden_state`.")
            if hasattr(vision_outputs, "last_hidden_state"):
                # The shape is (batch_size, num_patches, hidden_size).
                # We take the embedding of the first token ([CLS] token) as the image representation.
                image_features = vision_outputs.last_hidden_state[:, 0, :]
            else:
                raise RuntimeError("The `vision_tower` output has neither `.pooler_output` nor `.last_hidden_state`. Cannot extract features.")

        # Pass features through the Regression Head
        ldl_prediction = self.regression_head(image_features)

        if return_vision_features:
            return ldl_prediction, image_features
        else:
            return ldl_prediction


# 3. Instantiate the Full Model
# ----------------------------------------------------------------------------
print("--- 3. Instantiating the Regressor ---")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
regressor_model = MedGemmaVisionRegressor(model, vision_feature_dim)
regressor_model.to(device)
print("Regressor model created and moved to device.")


# 4. Upload and Pre-process a Sample Image
# ----------------------------------------------------------------------------
print("\n--- 4. Please Upload an Image (PNG, JPG, DCM, etc.) for Verification ---")
uploaded = files.upload()
if not uploaded: raise RuntimeError("No file was uploaded.")
sample_image_filename = next(iter(uploaded))
print(f"\nSuccessfully uploaded '{sample_image_filename}'.\n")

def preprocess_any_image(image_path, processor_obj):
    """Loads a standard image or DICOM and uses the model's processor."""
    try:
        pil_image = Image.open(image_path).convert("RGB")
        print(f"Successfully loaded '{image_path}' as a standard image.")
    except Exception:
        if DICOM_AVAILABLE:
            print(f"Could not open with Pillow, trying to load '{image_path}' as a DICOM file...")
            dicom_file = pydicom.dcmread(image_path)
            pixel_array = dicom_file.pixel_array; pixel_array = ((pixel_array - np.min(pixel_array)) / (np.max(pixel_array) - np.min(pixel_array)) * 255.0).astype(np.uint8)
            pil_image = Image.fromarray(pixel_array).convert('RGB')
        else: raise RuntimeError(f"Could not open '{image_path}' and pydicom is not available.")

    processed_output = processor_obj.image_processor(images=pil_image, return_tensors="pt")
    # VERY IMPORTANT: The log shows the model fell back to float32. We must match this.
    pixel_values = processed_output.pixel_values.to(device=device, dtype=torch.float32)

    print(f"Image processed into a tensor.")
    print(f"Tensor Shape: {pixel_values.shape}")
    print(f"Tensor Dtype: {pixel_values.dtype}\n")
    return pixel_values

image_tensor = preprocess_any_image(sample_image_filename, processor)


# 5. Perform the Forward Pass and Inspect Outputs
# ----------------------------------------------------------------------------
print("--- 5. Performing Forward Pass and Inspecting Outputs ---")
regressor_model.eval()
with torch.no_grad():
    final_prediction, vision_features = regressor_model(
        image_tensor,
        return_vision_features=True
    )

print("\n=== VERIFICATION RESULTS ===")
print(f"\n[OUTPUT 1] Intermediate Features from Vision Transformer")
print(f"  - Shape: {vision_features.shape}")
print(f"  - Dtype: {vision_features.dtype}")
print(f"  - Sample Values (first 10 features): \n{vision_features[0, :10].cpu().numpy()}")

print(f"\n[OUTPUT 2] Final Numeric Prediction from Regression Head")
print(f"  - Shape: {final_prediction.shape}")
print(f"  - Dtype: {final_prediction.dtype}")
print(f"  - Predicted (Scaled) Value: {final_prediction.item():.4f}")

print("\n==========================")
print("\nVerification complete. Data flow is now working correctly.")

--- 1. Determining Vision Feature Dimension ---
Vision feature dimension is: 1152

--- 2. Defining the MedGemmaVisionRegressor Wrapper ---
--- 3. Instantiating the Regressor ---
[Regressor Init] Base model operating dtype: torch.float16
Regressor model created and moved to device.

--- 4. Please Upload an Image (PNG, JPG, DCM, etc.) for Verification ---


Saving 1.png to 1 (3).png

Successfully uploaded '1 (3).png'.

Successfully loaded '1 (3).png' as a standard image.
Image processed into a tensor.
Tensor Shape: torch.Size([1, 3, 896, 896])
Tensor Dtype: torch.float32

--- 5. Performing Forward Pass and Inspecting Outputs ---
`pooler_output` is None. Falling back to using `last_hidden_state`.

=== VERIFICATION RESULTS ===

[OUTPUT 1] Intermediate Features from Vision Transformer
  - Shape: torch.Size([1, 1152])
  - Dtype: torch.float16
  - Sample Values (first 10 features): 
[ 0.8525  -0.1282   1.039   -0.207   -0.337    0.2384   0.1703  -0.4177
  0.04437  0.1947 ]

[OUTPUT 2] Final Numeric Prediction from Regression Head
  - Shape: torch.Size([1, 1])
  - Dtype: torch.float16
  - Predicted (Scaled) Value: 0.4641


Verification complete. Data flow is now working correctly.
