<a href="https://colab.research.google.com/github/Ravikrishnan05/PrediscanMedtech_project/blob/main/Innerengineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Install Unsloth and its dependencies; %%capture hides the pip output of this cell.
import os
# The presence of "COLAB_" in the concatenated environment-variable names is
# used as the signal that the notebook is running inside Google Colab.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The first and last commands use --no-deps, so pip does not resolve or
    # replace the dependencies of those packages; xformers is pinned.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# NOTE: this list is a reference catalogue only. The load call below passes a
# literal checkpoint name and does not read `fourbit_models`.
fourbit_models = [
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", # Llama 3.2 vision support
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit", # Can fit in a 80GB card!
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",

    "unsloth/Pixtral-12B-2409-bnb-4bit",              # Pixtral fits in 16GB!
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",         # Pixtral base model

    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",          # Qwen2 VL support
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",

    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",      # Any Llava variant works!
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# Load the MedGemma 4B checkpoint together with its processor. Later cells read
# the module-level names `model` and `processor` produced here.
model, processor = FastVisionModel.from_pretrained(
    "google/medgemma-4b-pt",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

In [None]:
# 1. Get Vision Feature Dimension
# ----------------------------------------------------------------------------
# The regression head's input width must equal the width of the vectors emitted
# by the vision tower, so read it from the loaded model's configuration.
print("--- 1. Determining Vision Feature Dimension ---")
if hasattr(model.config, 'vision_config') and hasattr(model.config.vision_config, 'hidden_size'):
    vision_feature_dim = model.config.vision_config.hidden_size
else:
    # Fallback used only when the config does not expose the value. 1152 is the
    # dimension detected for this model's vision tower and is the same fallback
    # used by the final combined cell; the previous value of 1024 disagreed
    # with it and would have produced a head that does not fit the features.
    vision_feature_dim = 1152
print(f"Vision feature dimension is: {vision_feature_dim}\n")

In [None]:
# Install pydicom, which the verification cell uses to read DICOM uploads.
!pip install pydicom

In [None]:
"""
# Cell B: Architecture Definition, Verification, and Forward Pass

import torch
import torch.nn as nn
import pydicom
from PIL import Image

# This cell assumes `model` and `processor` were loaded in the previous cell.
# The `processor` object from FastVisionModel contains the image_processor and tokenizer.

# 1. Get Vision Feature Dimension
# ----------------------------------------------------------------------------
print("--- 1. Determining Vision Feature Dimension ---")
# Since we know MedGemma is a Gemma3-based model, we can rely on its config.
if hasattr(model.config, 'vision_config') and hasattr(model.config.vision_config, 'hidden_size'):
    vision_feature_dim = model.config.vision_config.hidden_size
else:
    # Fallback just in case, this is the known value for MedGemma 4B's SigLIP vision tower.
    vision_feature_dim = 1024
print(f"Vision feature dimension is: {vision_feature_dim}\n")


# 2. Define the Vision Regressor Wrapper
# ----------------------------------------------------------------------------
print("--- 2. Defining the MedGemmaVisionRegressor Wrapper ---")
class MedGemmaVisionRegressor(nn.Module):
    A wrapper to isolate the vision_tower and add a regression head.
    def __init__(self, base_vlm_model, vision_feature_dim_input: int):
        super().__init__()
        self.base_vlm = base_vlm_model
        # The model's operating dtype is now float32 due to the Unsloth fallback.
        self.target_dtype = self.base_vlm.dtype
        print(f"[Regressor Init] Base model operating dtype: {self.target_dtype}")

        self.regression_head = nn.Sequential(
            nn.Linear(vision_feature_dim_input, vision_feature_dim_input // 2),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(vision_feature_dim_input // 2, 1)
        ).to(dtype=self.target_dtype) # Cast head to the model's operating dtype (float32)

    def forward(self, pixel_values: torch.Tensor, return_vision_features=False):
        # Access the base model. FastVisionModel does not add a `.model` wrapper like PEFT.
        base_model = self.base_vlm

        # Process image through the Vision Tower (SigLIP Vision Transformer)
        vision_outputs = base_model.vision_tower(pixel_values=pixel_values, return_dict=True)

        # Extract the final pooled feature vector representing the image
        image_features = vision_outputs.pooler_output

        # Pass vision features through the Regression Head
        ldl_prediction = self.regression_head(image_features)

        if return_vision_features:
            return ldl_prediction, image_features
        else:
            return ldl_prediction


# 3. Instantiate the Full Model and Print Architecture
# ----------------------------------------------------------------------------
print("--- 3. Instantiating the Regressor and Printing Architecture ---")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
regressor_model = MedGemmaVisionRegressor(model, vision_feature_dim)
regressor_model.to(device)

print("\n\n=== MODEL ARCHITECTURE ===")
print(regressor_model)
print("==========================\n")
"""

In [None]:
"""
# Cell B: Standalone Verification with User-Uploaded PNG/JPEG Image

import torch
import torch.nn as nn
from PIL import Image # Pillow library for standard images
import os
from google.colab import files # Import the files module for uploading
import numpy as np

# This cell assumes `regressor_model`, `processor`, `device`, and `model` were created in the previous cell.

# 1. Upload a Sample Image (PNG, JPG, etc.) from Your Computer
# ----------------------------------------------------------------------------
print("--- 1. Please Upload an Image (PNG, JPG, etc.) for Verification ---")
# This will open a file upload dialog in your browser.
uploaded = files.upload()

# Check if a file was uploaded
if not uploaded:
    raise RuntimeError("No file was uploaded. Please run the cell again and select a file.")

# Get the filename of the uploaded file
sample_image_filename = next(iter(uploaded))
print(f"\nSuccessfully uploaded '{sample_image_filename}'.")
# The path to the uploaded file is just its filename in the current directory
sample_image_path = sample_image_filename
print(f"Using uploaded image at path: ./{sample_image_path}\n")


# 2. Pre-process the Uploaded Image
# ----------------------------------------------------------------------------
print("--- 2. Pre-processing the Image ---")

def preprocess_standard_image(image_path, processor_obj):
    Loads a standard image (PNG, JPG) and uses the model's processor.

    # Open the image using Pillow
    try:
        pil_image = Image.open(image_path)
    except Exception as e:
        raise RuntimeError(f"Failed to open image file '{image_path}'. Error: {e}")

    # Ensure the image is in RGB format, as most vision models expect 3 channels.
    # This will convert grayscale or RGBA images correctly.
    pil_image_rgb = pil_image.convert("RGB")

    # The `processor` object contains `image_processor` which handles everything else:
    # resizing, normalization, and tensor conversion.
    processed_output = processor_obj.image_processor(images=pil_image_rgb, return_tensors="pt")

    # Move the final tensor to the correct device and ensure its dtype matches the model.
    # The model's operating dtype is likely float32 due to the Unsloth fallback on T4.
    pixel_values = processed_output.pixel_values.to(device=device, dtype=model.dtype)

    print(f"Image processed into a tensor.")
    print(f"Tensor Shape: {pixel_values.shape}")
    print(f"Tensor Dtype: {pixel_values.dtype}\n")
    return pixel_values

# Prepare the input tensor from our uploaded sample
image_tensor = preprocess_standard_image(sample_image_path, processor)


# 3. Perform the Forward Pass and Inspect Outputs
# ----------------------------------------------------------------------------
print("--- 3. Performing Forward Pass and Inspecting Outputs ---")
regressor_model.eval() # Set model to evaluation mode

with torch.no_grad(): # No need to track gradients for verification
    # Call the model with our special flag to get intermediate features
    final_prediction, vision_features = regressor_model(
        image_tensor,
        return_vision_features=True
    )

print("\n=== VERIFICATION RESULTS ===")
# --- Output 1: After the Vision Transformer ---
print(f"\n[OUTPUT 1] Intermediate Features from Vision Transformer (Vision Tower)")
print(f"  - Shape: {vision_features.shape}")
print(f"  - Dtype: {vision_features.dtype}")
print(f"  - Device: {vision_features.device}")
print(f"  - Sample Values (first 10 features): \n{vision_features[0, :10].cpu().numpy()}")
print("  - This is the feature vector representing the image. Its shape should be (1, vision_feature_dim).")

# --- Output 2: After the Regression Head ---
print(f"\n[OUTPUT 2] Final Numeric Prediction from Regression Head")
print(f"  - Shape: {final_prediction.shape}")
print(f"  - Dtype: {final_prediction.dtype}")
print(f"  - Device: {final_prediction.device}")
print(f"  - Predicted (Scaled) Value: {final_prediction.item():.4f}")
print("  - This is the final output, representing the predicted scaled LDL value.")

print("\n==========================")
print("\nVerification complete. The data flow from your uploaded image to a single number is working.")
print("WARNING: The model is running in float32, which may cause memory issues during full training.")
"""

In [None]:
# Final Combined Cell: Architecture and Verification with Robust Feature Extraction

import torch
import torch.nn as nn
from PIL import Image
import os
from google.colab import files
import numpy as np
try:
    import pydicom
    DICOM_AVAILABLE = True
except ImportError:
    DICOM_AVAILABLE = False
    print("pydicom library not found. DICOM file uploads will not work.")

# Assumes `model` and `processor` were loaded successfully in the first cell.

# 1. Vision feature dimension
# ----------------------------------------------------------------------------
# Width of the vectors produced by the vision tower. It is read from
# `model.config.vision_config.hidden_size` when both attributes exist and
# otherwise defaults to 1152 (the dimension detected for this model).
print("--- 1. Determining Vision Feature Dimension ---")
vision_feature_dim = getattr(
    getattr(model.config, 'vision_config', None),
    'hidden_size',
    1152,
)
print(f"Vision feature dimension is: {vision_feature_dim}\n")


# 2. Vision regressor wrapper (class definition follows)
# ----------------------------------------------------------------------------
print("--- 2. Defining the MedGemmaVisionRegressor Wrapper ---")
class MedGemmaVisionRegressor(nn.Module):
    """Vision-only regressor: the base model's vision tower followed by an MLP head.

    The wrapper sends `pixel_values` through `base_vlm_model.vision_tower`,
    reduces the result to one feature vector per image and maps that vector to
    a single scalar with a two-layer regression head.

    Feature extraction:
        * If the vision tower returns a non-None `pooler_output`, it is used as is.
        * Otherwise `last_hidden_state` (one embedding per token, see
          `_pool_tokens`) is reduced according to `pooling`.

    Args:
        base_vlm_model: Loaded vision-language model. It must expose `.dtype`
            and a callable `.vision_tower(pixel_values=..., return_dict=True)`.
        vision_feature_dim_input: Width of the feature vector fed to the head.
        pooling: How `last_hidden_state` is reduced when `pooler_output` is
            missing. "mean" (default) averages over all tokens. "first" takes
            token 0, which reproduces the earlier behaviour of this class.

    Raises:
        ValueError: If `pooling` is not "mean" or "first".
    """

    def __init__(self, base_vlm_model, vision_feature_dim_input: int, pooling: str = "mean"):
        super().__init__()
        if pooling not in ("mean", "first"):
            raise ValueError(f"Unsupported pooling '{pooling}'; expected 'mean' or 'first'.")

        # Keep the pre-trained model as a sub-module so it travels with the wrapper.
        self.base_vlm = base_vlm_model
        self.pooling = pooling

        # Build the head in the dtype reported by the base model so that the
        # head's weights start out matching the backbone.
        self.target_dtype = self.base_vlm.dtype
        print(f"[Regressor Init] Base model operating dtype: {self.target_dtype}")

        hidden_dim = vision_feature_dim_input // 2
        self.regression_head = nn.Sequential(
            nn.Linear(vision_feature_dim_input, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, 1),
        ).to(dtype=self.target_dtype)

        # The fallback notice is printed once, not on every forward call.
        self._fallback_reported = False

    @staticmethod
    def _read_output(vision_outputs, key):
        """Return `vision_outputs[key]` / `vision_outputs.key`, or None when absent."""
        if isinstance(vision_outputs, dict):
            return vision_outputs.get(key)
        return getattr(vision_outputs, key, None)

    def _pool_tokens(self, last_hidden_state: torch.Tensor) -> torch.Tensor:
        """Reduce token embeddings to one vector per image.

        `last_hidden_state` is indexed as (batch, tokens, hidden).

        NOTE(review): SigLIP-style vision towers do not prepend a [CLS] token,
        so token 0 is presumably just the first image patch. That is why the
        default is a mean over all tokens rather than `[:, 0, :]`.
        """
        if self.pooling == "first":
            return last_hidden_state[:, 0, :]
        return last_hidden_state.mean(dim=1)

    def forward(self, pixel_values: torch.Tensor, return_vision_features=False):
        """Predict one scalar per image.

        Args:
            pixel_values: Pre-processed image batch accepted by the vision tower.
            return_vision_features: When True, also return the pooled features.

        Returns:
            The prediction tensor, or `(prediction, image_features)` when
            `return_vision_features` is True.

        Raises:
            RuntimeError: If the vision tower output provides neither
                `pooler_output` nor `last_hidden_state`.
        """
        vision_outputs = self.base_vlm.vision_tower(pixel_values=pixel_values, return_dict=True)

        # Prefer the tower's own pooled summary when it provides one.
        image_features = self._read_output(vision_outputs, "pooler_output")

        if image_features is None:
            token_states = self._read_output(vision_outputs, "last_hidden_state")
            if token_states is None:
                raise RuntimeError("The `vision_tower` output has neither `.pooler_output` nor `.last_hidden_state`. Cannot extract features.")
            image_features = self._pool_tokens(token_states)
            if not self._fallback_reported:
                print("`pooler_output` is None. Falling back to using `last_hidden_state`.")
                print(f"Shape of image_features ({self.pooling} pooling):", image_features.shape)
                self._fallback_reported = True

        # The tower may emit a different dtype than the head's weights; align
        # them so the first Linear layer does not fail on a dtype mismatch.
        image_features = image_features.to(dtype=self.regression_head[0].weight.dtype)

        ldl_prediction = self.regression_head(image_features)

        if return_vision_features:
            return ldl_prediction, image_features
        return ldl_prediction


# 3. Build the regressor and place it on the compute device
# ----------------------------------------------------------------------------
# CUDA is used when available, otherwise the CPU.
print("--- 3. Instantiating the Regressor ---")
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
regressor_model = MedGemmaVisionRegressor(model, vision_feature_dim)
regressor_model.to(device)
print("Regressor model created and moved to device.")


# 4. Ask for a sample image
# ----------------------------------------------------------------------------
# `files.upload()` opens Colab's upload dialog; only the first uploaded
# filename is kept for the verification run.
print("\n--- 4. Please Upload an Image (PNG, JPG, DCM, etc.) for Verification ---")
uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No file was uploaded.")
sample_image_filename = list(uploaded)[0]
print(f"\nSuccessfully uploaded '{sample_image_filename}'.\n")

def _scale_to_uint8(pixel_array):
    """Min-max scale an array to 0-255 and return it as uint8.

    The arithmetic is done in float64 so that signed 16-bit data cannot wrap
    around during the subtraction. A constant image (max == min) maps to all
    zeros, which avoids dividing by zero.
    """
    as_float = np.asarray(pixel_array, dtype=np.float64)
    lowest = as_float.min()
    value_range = as_float.max() - lowest
    if value_range == 0:
        return np.zeros(as_float.shape, dtype=np.uint8)
    return ((as_float - lowest) / value_range * 255.0).astype(np.uint8)


def preprocess_any_image(image_path, processor_obj, target_device=None, target_dtype=torch.float32):
    """Load a standard image or a DICOM file and run it through the model's image processor.

    Pillow is tried first. If it cannot open the file and pydicom is available,
    the file is read as DICOM and its pixel data is min-max scaled to 8 bits.

    Args:
        image_path: Path of the file to load.
        processor_obj: Object exposing `image_processor(images=..., return_tensors="pt")`.
        target_device: Device for the returned tensor. Defaults to the
            notebook-level `device`.
        target_dtype: Dtype for the returned tensor. Defaults to float32, the
            dtype this notebook's model was observed to run in.

    Returns:
        The `pixel_values` tensor on `target_device` in `target_dtype`.

    Raises:
        RuntimeError: If Pillow cannot open the file and pydicom is not installed.
    """
    if target_device is None:
        target_device = device

    try:
        # The context manager closes the file handle; convert() returns an
        # independent in-memory copy that stays valid afterwards.
        with Image.open(image_path) as raw_image:
            pil_image = raw_image.convert("RGB")
        print(f"Successfully loaded '{image_path}' as a standard image.")
    except Exception as pil_error:
        if not DICOM_AVAILABLE:
            # Chain the Pillow error so the real cause is visible in the traceback.
            raise RuntimeError(f"Could not open '{image_path}' and pydicom is not available.") from pil_error
        print(f"Could not open with Pillow, trying to load '{image_path}' as a DICOM file...")
        dicom_file = pydicom.dcmread(image_path)
        # NOTE(review): assumes a single-frame greyscale image; multi-frame data
        # and MONOCHROME1 inversion are not handled here. Confirm against the data.
        pil_image = Image.fromarray(_scale_to_uint8(dicom_file.pixel_array)).convert('RGB')

    processed_output = processor_obj.image_processor(images=pil_image, return_tensors="pt")
    # The input dtype must match what the vision tower runs in.
    pixel_values = processed_output.pixel_values.to(device=target_device, dtype=target_dtype)

    print(f"Image processed into a tensor.")
    print(f"Tensor Shape: {pixel_values.shape}")
    print(f"Tensor Dtype: {pixel_values.dtype}\n")
    return pixel_values

image_tensor = preprocess_any_image(sample_image_filename, processor)


# 5. Perform the Forward Pass and Inspect Outputs
# ----------------------------------------------------------------------------
# eval() turns off dropout in the regression head and no_grad() skips gradient
# tracking, since this run only checks that data flows end to end.
print("--- 5. Performing Forward Pass and Inspecting Outputs ---")
regressor_model.eval()
with torch.no_grad():
    final_prediction, vision_features = regressor_model(
        image_tensor,
        return_vision_features=True
    )

print("\n=== VERIFICATION RESULTS ===")
print(f"\n[OUTPUT 1] Intermediate Features from Vision Transformer")
print(f"  - Shape: {vision_features.shape}")
print(f"  - Dtype: {vision_features.dtype}")
# NumPy has no bfloat16 dtype, so `.numpy()` raises on bf16 tensors. Upcasting
# the small slice with `.float()` first makes this report work for any dtype
# and prints the same values for float32 features.
print(f"  - Sample Values (first 10 features): \n{vision_features[0, :10].float().cpu().numpy()}")

print(f"\n[OUTPUT 2] Final Numeric Prediction from Regression Head")
print(f"  - Shape: {final_prediction.shape}")
print(f"  - Dtype: {final_prediction.dtype}")
# `.item()` requires exactly one element, i.e. a single uploaded image.
print(f"  - Predicted (Scaled) Value: {final_prediction.item():.4f}")

print("\n==========================")
print("\nVerification complete. Data flow is now working correctly.")

Explanation Breakdown
1. The Constructor: __init__
This method sets up a new "instance" or object of our class. It runs once when we create the model, like regressor_model = MedGemmaVisionRegressor(...).
Technical Terms:
class MedGemmaVisionRegressor(nn.Module): We declare a new class named MedGemmaVisionRegressor that inherits from torch.nn.Module. This inheritance gives our class all the powerful capabilities of a standard PyTorch model (like tracking parameters, moving to a GPU with .to(device), etc.).
super().__init__(): This line is essential. It calls the constructor of the parent class (nn.Module) to properly initialize all the background machinery that PyTorch provides.
Layman's Terms (Building a Custom Car):
Think of nn.Module as a standard, high-quality car chassis that already has an engine mount, suspension, and electrical wiring.
class MedGemmaVisionRegressor(...) is you declaring, "I'm going to build a new, custom car model based on this standard chassis."
super().__init__() is the first step where you take the chassis off the factory line. Now you're ready to add your custom parts.
2. Storing Components as Attributes: self.base_vlm = base_vlm_model
OOP Concept: Composition & Instance Attributes
Our custom model is "composed of" other, pre-existing parts. This line takes the massive, pre-trained MedGemma model (base_vlm_model) that we loaded with Unsloth and stores it as a part of our new object.
self.base_vlm becomes an instance attribute—a variable that belongs to this specific object. This makes the MedGemma model accessible from anywhere within our class.
Layman's Terms (Installing the Engine):
This is where you take a powerful, pre-built jet engine (the base_vlm_model) and bolt it onto your car chassis. It's now a permanent part of your custom car.
3. Defining the Custom Regression Head
This is where we build the new, small component that performs our specific task (predicting a single number).
Technical Terms:
self.regression_head: We create another instance attribute to hold our custom part.
nn.Sequential: A PyTorch container that chains layers together in a sequence. The output of one layer automatically becomes the input to the next.
nn.Linear(in_features, out_features): A standard fully-connected neural network layer. It performs a linear transformation (matrix multiplication) on the input.
The first nn.Linear takes the image feature vector (e.g., of size 1152) and maps it to a smaller intermediate size.
The second nn.Linear takes the intermediate vector and maps it to a single output (out_features=1), which is our final prediction.
nn.ReLU(): A non-linear activation function. It's like a switch that adds complexity, allowing the model to learn more than just simple linear relationships.
nn.Dropout(0.1): A regularization technique used during training to prevent overfitting. It randomly sets a fraction (10%) of input units to 0 at each update, forcing the network to learn more robust features.
Layman's Terms (Building a Custom Dashboard):
The jet engine (base_vlm) produces thousands of complex readings (the image_features). This is too much information for a simple speedometer.
You build a custom dashboard (self.regression_head) to interpret these readings.
The nn.Linear layers are like "converters" that process and simplify the data.
The nn.ReLU is like a set of "logic gates" that help make sense of the signals.
The final nn.Linear(..., 1) is the actual speedometer needle itself—it takes all the processed information and displays it as a single, final number.
4. The Forward Pass: forward(...)
This method defines the "assembly line"—the actual path data takes through the model when you make a prediction.
OOP Concept: Methods
A method defines an object's behavior. When you call regressor_model(image_tensor), PyTorch automatically executes this forward method.
Layman's Terms (The Car's Operation Manual):
This is the step-by-step guide for how your custom car works. It tells you what happens when you turn the key (i.e., provide an input).
5. Accessing the Vision Tower: vision_outputs = base_model.vision_tower(...)
Technical Terminology:
base_model = self.base_vlm: We get our "jet engine" component.
base_model.vision_tower: We access the specific part of the MedGemma model responsible for image processing. This vision_tower is the Vision Transformer (ViT).
We call it like a function, passing the pixel_values (our prepared image tensor) to it.
Layman's Analogy:
Step 1 of the operation: The image (pixel_values) is sent to the jet engine's main computer (vision_tower). The computer analyzes the image and generates a detailed diagnostic report (vision_outputs).
6. Extracting the Image Feature Vector
This step unpacks the diagnostic report to get the one piece of information we need.
Technical Terminology:
vision_outputs.get("pooler_output"): We first try to get the pooler_output, which is a pre-packaged summary vector.
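vision_outputs.last_hidden_state[:, 0, :]: If the summary isn't available (as was our case), we fall back to a more general method: reducing the detailed per-token outputs (last_hidden_state, shape (batch_size, num_patches, hidden_size)) to a single vector per image. Selecting only the very first one ([:, 0, :]) follows the [CLS]-token convention used by many transformer models. Caveat: SigLIP-style vision towers such as MedGemma's do not prepend a [CLS] token, so position 0 is simply the first image patch; averaging over all patches (last_hidden_state.mean(dim=1)) summarises the entire image instead.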
Layman's Analogy:
The diagnostic report (vision_outputs) is 500 pages long. We don't need all of it. We first look for the "Executive Summary" page (pooler_output).
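Since that page is missing, we go to the main report (last_hidden_state) and take the very first paragraph ([:, 0, :]) as a stand-in for the conclusion. (For a SigLIP-style engine this is really just the first section of the report rather than a dedicated summary, so averaging all the paragraphs is the more reliable shortcut.) This paragraph is our image_features.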
7. The Connection: ldl_prediction = self.regression_head(image_features)
This is the most critical connection point.
Technical Terminology: The output from the previous step (image_features)—a tensor representing the image—is now passed as the input to our custom regression_head.
Layman's Analogy:
You take the final conclusion paragraph (image_features) from the engine's diagnostic report and hand it to your custom dashboard (regression_head). The dashboard reads this summary and moves its speedometer needle (ldl_prediction) to the correct position.
8. Returning the Final Result: return ldl_prediction
Technical Terminology: The forward method completes by returning the final tensor produced by the regression_head. This is the output of our entire custom model.
Layman's Analogy:
The car's operation is complete. The final reading on the speedometer (ldl_prediction) is the result.

In [None]:
# Rebuild the regressor on the available device and show its layer structure.
print("--- 3. Instantiating the Regressor and Printing Architecture ---")
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
regressor_model = MedGemmaVisionRegressor(model, vision_feature_dim)
regressor_model.to(device)

# One print call with a newline separator emits the same three lines as
# printing the banner, the module and the footer separately.
print(
    "\n\n=== MODEL ARCHITECTURE ===",
    regressor_model,
    "==========================\n",
    sep="\n",
)