In [1]:
# List the contents of the current working directory via the shell.
!ls


'=1.21.0'
'=4.21.0'
'=8.0.0'
 calibration_data
 clip_assets.pt
 create_deployment_assets.py
 cuda-tegra-repo-ubuntu2204-12-9-local_12.9.1-1_arm64.deb
 cudatest.py
 final_eval.py
 finetuned_tinyclip_multilabel.pt
 install_cusparselt.sh
 install_cusparselt.sh.1
 int8_calibration.cache
 libcusparse_lt-linux-sbsa-0.5.2.1-archive
 libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
 onnx2trt.py
 requirements.txt
 sliding_window.ipynb
 tinyclip_dynamic.onnx
 tinyclip_fp16_dynamic.trt
 tinyclip_fp16.trt
 tinyclip_int8_dynamic.trt
 tinyclip_int8.trt
 tinyCLIPval.py
 tinyclip_vision_model_complete.onnx
 tmp_cusparselt
 torch-1.10.0-cp38-cp38-linux_aarch64.whl
 torch-1.13.0-cp38-cp38-linux_aarch64.whl
 torch-1.8.0-cp36-cp36m-linux_aarch64.whl
 torch-2.0.0-cp38-cp38-linux_aarch64.whl
 torch-2.0.0+nv23.05-cp38-cp38-linux_aarch64.whl
 torch-2.1.0a0+41361538.nv23.06-cp38-cp38-linux_aarch64.whl
 torch-2.1.0-cp38-cp38-linux_aarch64.whl
 trt_env
 valid
 valid_crops


In [24]:
# --- IMPORTS ---
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor
import json
import os
import warnings
import numpy as np
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import time

# --- NEW: TensorRT and PyCUDA Imports ---
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit # This is needed for initializing CUDA driver

# Ignore UserWarning messages that originate from PIL modules.
warnings.filterwarnings("ignore", category=UserWarning, module='PIL')

# --- 1. CONFIGURATION ---
# Model identifier handed to the Hugging Face `from_pretrained` loaders.
MODEL_ID = "wkcn/TinyCLIP-ViT-61M-32-Text-29M-LAION400M"

# Serialized TensorRT engine and the validation folder, both relative to the
# notebook's working directory.
SAVED_MODEL_PATH = "tinyclip_int8_dynamic.trt"
VALIDATION_DIR = "valid"

# Label order defines the column order of every score/prediction tensor.
CLASS_LABELS = ["calyx", "fruitlet", "peduncle", "negative"]

# A class counts as predicted when its probability exceeds this value.
CONFIDENCE_THRESHOLD = 0.5

# Patches are sent through the engine in chunks of this many to bound memory use.
BATCH_SIZE = 8

# Sliding-window geometry: square patches whose side is PATCH_SIZE pixels,
# advanced by half a patch so that neighbouring windows overlap by 50 %.
PATCH_SIZE = 224
STRIDE = PATCH_SIZE // 2

# The TensorRT engine runs on the GPU, so the torch side is pinned to CUDA too.
device = "cuda"
print("using device:", device)

using device: cuda


In [25]:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np

class TensorRTInference:
    """Minimal runner for a serialized TensorRT engine with one input and one output tensor.

    The engine is deserialized once in ``__init__``. ``infer`` sets the input
    shape on every call, so an engine built with a dynamic batch dimension can
    be fed batches of different sizes.
    """

    def __init__(self, engine_path):
        """Deserialize the engine at ``engine_path`` and record its I/O tensor metadata.

        Args:
            engine_path: path of the serialized engine file (opened in binary mode).

        Raises:
            RuntimeError: if the engine cannot be deserialized, no execution
                context can be created, or the engine does not expose exactly
                one input and one output tensor.
        """
        print(f"Loading TensorRT engine from: {engine_path}")
        self.logger = trt.Logger(trt.Logger.WARNING)

        with open(engine_path, "rb") as f, trt.Runtime(self.logger) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())

        if self.engine is None:
            raise RuntimeError("Failed to load TensorRT engine")

        self.context = self.engine.create_execution_context()
        if self.context is None:
            raise RuntimeError("Failed to create TensorRT execution context")
        self.stream = cuda.Stream()

        print("Engine loaded successfully!")
        print(f"Number of IO tensors: {self.engine.num_io_tensors}")

        # name -> {'shape', 'dtype'} for every input / output tensor of the engine.
        self.input_info = {}
        self.output_info = {}

        for i in range(self.engine.num_io_tensors):
            tensor_name = self.engine.get_tensor_name(i)
            tensor_shape = self.engine.get_tensor_shape(tensor_name)
            tensor_dtype = trt.nptype(self.engine.get_tensor_dtype(tensor_name))
            tensor_mode = self.engine.get_tensor_mode(tensor_name)

            print(f"Tensor '{tensor_name}': shape={tensor_shape}, dtype={tensor_dtype}")

            if tensor_mode == trt.TensorIOMode.INPUT:
                self.input_info[tensor_name] = {
                    'shape': tensor_shape,
                    'dtype': tensor_dtype
                }
                self.input_tensor_name = tensor_name
                print("  -> INPUT tensor")
            else:
                self.output_info[tensor_name] = {
                    'shape': tensor_shape,
                    'dtype': tensor_dtype
                }
                self.output_tensor_name = tensor_name
                print("  -> OUTPUT tensor")

        print(f"Initialized {len(self.input_info)} inputs and {len(self.output_info)} outputs")

        # infer() binds exactly one input and one output buffer; fail early
        # instead of silently using whichever tensor happened to be listed last.
        if len(self.input_info) != 1 or len(self.output_info) != 1:
            raise RuntimeError(
                "TensorRTInference supports engines with exactly one input and one output tensor, "
                f"got {len(self.input_info)} input(s) and {len(self.output_info)} output(s)"
            )

    def infer(self, pixel_values: np.ndarray):
        """Run the engine on one batch and return the output as a NumPy array.

        Args:
            pixel_values: input batch. It is converted to a C-contiguous array
                of the engine's input dtype before being copied to the device.

        Returns:
            A newly allocated NumPy array with the output shape reported by the
            execution context for this input shape and the engine's output dtype.

        Raises:
            ValueError: if ``pixel_values`` contains no elements.
            RuntimeError: if the output shape cannot be resolved for this input
                shape, or the engine reports an execution failure.
        """
        input_dtype = self.input_info[self.input_tensor_name]['dtype']
        output_dtype = self.output_info[self.output_tensor_name]['dtype']

        # The host->device copy transfers raw bytes, so the array must already
        # have the engine's dtype and a contiguous layout.
        pixel_values = np.ascontiguousarray(pixel_values, dtype=input_dtype)
        if pixel_values.size == 0:
            raise ValueError("pixel_values must contain at least one element")

        # The input shape has to be set before output shapes can be queried
        # and before tensor addresses are bound.
        self.context.set_input_shape(self.input_tensor_name, list(pixel_values.shape))

        output_shape = tuple(self.context.get_tensor_shape(self.output_tensor_name))
        if any(dim < 0 for dim in output_shape):
            raise RuntimeError(
                f"Could not resolve output shape {output_shape} for input shape "
                f"{tuple(pixel_values.shape)}"
            )
        output_nbytes = int(np.prod(output_shape)) * np.dtype(output_dtype).itemsize
        output_host = np.empty(output_shape, dtype=output_dtype)

        d_input = cuda.mem_alloc(int(pixel_values.nbytes))
        d_output = None
        try:
            d_output = cuda.mem_alloc(output_nbytes)

            self.context.set_tensor_address(self.input_tensor_name, int(d_input))
            self.context.set_tensor_address(self.output_tensor_name, int(d_output))

            cuda.memcpy_htod_async(d_input, pixel_values, self.stream)

            if not self.context.execute_async_v3(stream_handle=self.stream.handle):
                raise RuntimeError("TensorRT inference execution failed")

            cuda.memcpy_dtoh_async(output_host, d_output, self.stream)
            # Everything above is queued on self.stream; wait for it before the
            # host array is handed back.
            self.stream.synchronize()
        finally:
            # Release the device buffers on every exit path, including errors,
            # so a failing batch does not leak GPU memory.
            d_input.free()
            if d_output is not None:
                d_output.free()

        return output_host

In [26]:
# --- 2. MODEL AND PROCESSOR SETUP ---
print("--- Loading TENSORRT Vision Model for Stage 2 ---")
if os.path.exists(SAVED_MODEL_PATH):
    # Image features come from the TensorRT engine wrapper.
    trt_model = TensorRTInference(SAVED_MODEL_PATH)
else:
    raise FileNotFoundError(f"TensorRT model not found at {SAVED_MODEL_PATH}.")

# The Hugging Face processor handles both image preprocessing and prompt tokenization.
processor = CLIPProcessor.from_pretrained(MODEL_ID)

# The prompts never change, so they are embedded a single time with the
# PyTorch CLIP model and the result is reused for every image batch.
print("--- Encoding text prompts ONCE using PyTorch CLIP model ---")
full_clip_model = CLIPModel.from_pretrained(MODEL_ID)
full_clip_model = full_clip_model.to(device)

text_prompts = [f"a photo of a {label}" for label in CLASS_LABELS]
text_inputs = processor(text=text_prompts, return_tensors="pt", padding=True)
text_inputs = text_inputs.to(device)

with torch.no_grad():
    text_features = full_clip_model.get_text_features(
        input_ids=text_inputs['input_ids'],
        attention_mask=text_inputs['attention_mask'],
    )
    # Scale every prompt embedding to unit L2 norm so that a dot product with
    # a unit-norm image embedding is a cosine similarity.
    text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)

print(f"Encoded text features shape: {text_features.shape}")
print(f"Using device: {device}\n")

--- Loading TENSORRT Vision Model for Stage 2 ---
Loading TensorRT engine from: tinyclip_int8_dynamic.trt
Engine loaded successfully!
Number of IO tensors: 2
Tensor 'input': shape=(-1, 3, 224, 224), dtype=<class 'numpy.float32'>
  -> INPUT tensor
Tensor 'output': shape=(-1, 512), dtype=<class 'numpy.float32'>
  -> OUTPUT tensor
Initialized 1 inputs and 1 outputs
--- Encoding text prompts ONCE using PyTorch CLIP model ---
Encoded text features shape: torch.Size([4, 512])
Using device: cuda



In [27]:
# --- 3. LOAD AND PROCESS GROUND TRUTH ---
print("--- Loading and Processing COCO Ground Truth ---")
annotation_file_path = os.path.join(VALIDATION_DIR, '_annotations.coco.json')
if not os.path.exists(annotation_file_path):
    raise FileNotFoundError(f"Annotation file not found at {annotation_file_path}.")

with open(annotation_file_path, 'r') as f:
    coco_data = json.load(f)

# COCO category id -> category name, exactly as stored in the annotation file.
coco_id_to_name = dict((cat['id'], cat['name']) for cat in coco_data['categories'])

# Positive classes only; "negative" is derived per patch, never annotated.
train_class_labels = list(filter(lambda lbl: lbl != "negative", CLASS_LABELS))
name_to_class_idx = dict(zip(train_class_labels, range(len(train_class_labels))))

# COCO category id -> column index, restricted to categories this notebook scores.
coco_id_to_class_idx = {
    coco_id: name_to_class_idx[name]
    for coco_id, name in coco_id_to_name.items()
    if name in name_to_class_idx
}

image_id_to_filename = dict((img['id'], img['file_name']) for img in coco_data['images'])
# Shallow copy of the full mapping; the later cells iterate over this one.
image_id_to_filename_subset = dict(image_id_to_filename)

# Start every image with its own empty list so unannotated images are present too.
image_id_to_annotations = {img_id: [] for img_id in image_id_to_filename.keys()}
for ann in coco_data['annotations']:
    image_id_to_annotations[ann['image_id']].append(ann)

print(f"Processed ground truth for {len(image_id_to_filename)} images.\n")


# --- 4. HELPER FUNCTION (Unchanged) ---
def get_patch_ground_truth(patch_box, image_annotations, overlap_threshold=0.1):
    """Build the multi-label ground-truth vector for one patch.

    Args:
        patch_box: ``(x1, y1, x2, y2)`` corners of the patch.
        image_annotations: annotation dicts of the image; each one is read
            through its ``'bbox'`` (``x, y, width, height``) and
            ``'category_id'`` keys.
        overlap_threshold: an annotation counts for the patch only when the
            intersection area divided by the patch area is strictly greater
            than this value.

    Returns:
        A list with one 0/1 flag per entry of ``train_class_labels`` followed
        by a final flag that is 1 only when none of the class flags is set.
    """
    left, top, right, bottom = patch_box
    area = (right - left) * (bottom - top)
    labels = [0] * len(train_class_labels)

    for annotation in image_annotations:
        box_left, box_top, box_width, box_height = annotation['bbox']
        box_right = box_left + box_width
        box_bottom = box_top + box_height

        # Width and height of the intersection rectangle, clamped at zero
        # when the boxes do not overlap along that axis.
        overlap_w = max(0, min(right, box_right) - max(left, box_left))
        overlap_h = max(0, min(bottom, box_bottom) - max(top, box_top))

        if (overlap_w * overlap_h) / area > overlap_threshold:
            # Categories outside the mapping yield None and are ignored.
            idx = coco_id_to_class_idx.get(annotation['category_id'])
            if idx is not None:
                labels[idx] = 1

    negative_flag = 0 if any(labels) else 1
    return labels + [negative_flag]

--- Loading and Processing COCO Ground Truth ---
Processed ground truth for 60 images.



In [28]:



# --- 5. SLIDING WINDOW INFERENCE: SMOKE TEST ON A SINGLE BATCH ---
# This cell only checks that preprocessing, the TensorRT engine and the
# post-processing agree on shapes. The two limits below keep the run short;
# raise them to cover more data.
DEBUG_MAX_IMAGES = 1
DEBUG_MAX_BATCHES = 1

print(f"--- Running OPTIMIZED Sliding Window Analysis on All {len(image_id_to_filename_subset)} Images ---")

for image_id, filename in list(image_id_to_filename_subset.items())[:DEBUG_MAX_IMAGES]:
    print(f"\n\n=========================================================")
    print(f"Processing Image: {filename}")
    print(f"=============================================================")
    start_time = time.time()

    image_path = os.path.join(VALIDATION_DIR, filename)
    if not os.path.exists(image_path):
        print(f"--> SKIPPING: File not found at {image_path}")
        continue

    # convert() returns an independent in-memory image, so the file handle can
    # be closed as soon as the conversion is done.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")
    image_width, image_height = image.size
    current_image_annotations = image_id_to_annotations[image_id]

    # Row-major grid of PATCH_SIZE x PATCH_SIZE crops, STRIDE pixels apart.
    patches = []
    for y in range(0, image_height - PATCH_SIZE + 1, STRIDE):
        for x in range(0, image_width - PATCH_SIZE + 1, STRIDE):
            patches.append(image.crop((x, y, x + PATCH_SIZE, y + PATCH_SIZE)))

    if not patches:
        print("--> SKIPPING: No patches were generated for this image.")
        continue

    print(f"Extracted {len(patches)} patches. Processing them in batches of {BATCH_SIZE}...")

    all_probs = []
    image_patch_predictions = []

    # Start offsets of the batches this debug run is allowed to process.
    batch_starts = range(0, len(patches), BATCH_SIZE)[:DEBUG_MAX_BATCHES]

    with torch.no_grad():
        for i in batch_starts:
            batch = patches[i:i + BATCH_SIZE]
            print(f"\n--- Processing batch {i//BATCH_SIZE + 1} with {len(batch)} patches ---")

            # 1. Preprocess the crops. The tensors stay on the CPU because the
            #    TensorRT wrapper takes a NumPy array and does its own
            #    host->device copy; moving them to the GPU first would only
            #    add a round trip.
            inputs = processor(images=batch, return_tensors="pt")

            print(f"Preprocessed inputs keys: {inputs.keys()}")
            print(f"pixel_values shape: {inputs['pixel_values'].shape}")

            # 2. Hand the pixel values to the engine as a NumPy array.
            pixel_values_np = inputs['pixel_values'].numpy()

            print(f"NumPy pixel_values shape: {pixel_values_np.shape}")
            print(f"pixel_values dtype: {pixel_values_np.dtype}")

            try:
                # 3. The engine returns one embedding per patch.
                image_features_np = trt_model.infer(pixel_values=pixel_values_np)
                print(f"TensorRT output (image_features) shape: {image_features_np.shape}")

                # 4. Back to torch for post-processing; unit-normalise so the
                #    dot product below is a cosine similarity.
                image_features = torch.from_numpy(image_features_np).to(device)
                image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)

                # 5. Similarity of every patch with every pre-encoded prompt.
                logits = torch.matmul(image_features, text_features.T)

                # Independent sigmoid per class (multi-label), then threshold.
                probs = logits.sigmoid()
                predictions = (probs > CONFIDENCE_THRESHOLD).int()

                all_probs.append(probs.cpu())
                image_patch_predictions.append(predictions.cpu())

                print(f"Batch processed successfully!")
                print(f"Probabilities shape: {probs.shape}")
                print(f"Predictions shape: {predictions.shape}")

            except Exception as e:
                print(f"Error during TensorRT inference: {e}")
                print(f"Error type: {type(e)}")
                import traceback
                traceback.print_exc()
                break  # Stop at the first failing batch

    if all_probs:  # Only report when at least one batch succeeded
        full_probs_tensor = torch.cat(all_probs)
        full_predictions_tensor = torch.cat(image_patch_predictions)

        print(f"\nFinal concatenated results:")
        print(f"full_probs_tensor shape: {full_probs_tensor.shape}")
        print(f"full_predictions_tensor shape: {full_predictions_tensor.shape}")

    end_time = time.time()
    print(f"--> Image processing finished in {end_time - start_time:.2f} seconds.")

print("\n\n--- Debug analysis complete. ---")

--- Running OPTIMIZED Sliding Window Analysis on All 60 Images ---


Processing Image: IMG_2197_JPEG.rf.912aa4bfc6ecb5926978c385be331efe.jpg
Extracted 910 patches. Processing them in batches of 8...

--- Processing batch 1 with 8 patches ---
Preprocessed inputs keys: dict_keys(['pixel_values'])
pixel_values shape: torch.Size([8, 3, 224, 224])
NumPy pixel_values shape: (8, 3, 224, 224)
pixel_values dtype: float32
TensorRT output (image_features) shape: (8, 512)
Batch processed successfully!
Probabilities shape: torch.Size([8, 4])
Predictions shape: torch.Size([8, 4])

Final concatenated results:
full_probs_tensor shape: torch.Size([8, 4])
full_predictions_tensor shape: torch.Size([8, 4])
--> Image processing finished in 0.40 seconds.


--- Debug analysis complete. ---


In [32]:
# --- 5. OPTIMIZED SLIDING WINDOW INFERENCE LOOP WITH HEATMAP GENERATION ---
# Number of validation images analysed by this cell.
MAX_IMAGES = 15

# Colour range shared by all heatmaps: a narrow band around 0.5.
custom_vmin = 0.45
custom_vmax = 0.55


def _extract_patches(image, patch_size, stride):
    """Crop a row-major grid of square patches from ``image``.

    Returns ``(patches, boxes, rows, cols)`` where ``boxes`` holds the
    ``(x1, y1, x2, y2)`` crop box of every patch and ``rows * cols`` equals
    ``len(patches)``.
    """
    width, height = image.size
    y_starts = range(0, height - patch_size + 1, stride)
    x_starts = range(0, width - patch_size + 1, stride)

    patches, boxes = [], []
    for y in y_starts:
        for x in x_starts:
            box = (x, y, x + patch_size, y + patch_size)
            patches.append(image.crop(box))
            boxes.append(box)
    return patches, boxes, len(y_starts), len(x_starts)


def _classify_patches(patches, batch_size):
    """Score every patch against the pre-encoded text prompts.

    Returns ``(probs, predictions)`` as CPU tensors with one row per patch and
    one column per entry of ``CLASS_LABELS``, or ``None`` if inference failed
    for any batch (a partial result cannot be laid out on the patch grid).
    """
    batch_probs = []
    batch_predictions = []

    with torch.no_grad():
        for start in range(0, len(patches), batch_size):
            batch = patches[start:start + batch_size]
            batch_num = start // batch_size + 1

            # 1. Preprocess on the CPU. The TensorRT wrapper takes a NumPy
            #    array and performs its own host->device copy, so sending the
            #    tensor to the GPU first would only add a round trip.
            inputs = processor(images=batch, return_tensors="pt")
            pixel_values_np = inputs['pixel_values'].numpy()

            try:
                # 2. The engine returns one embedding per patch.
                image_features_np = trt_model.infer(pixel_values=pixel_values_np)

                # 3. Unit-normalise so the dot product is a cosine similarity.
                image_features = torch.from_numpy(image_features_np).to(device)
                image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)

                # 4. Similarity of every patch with every prompt.
                logits = torch.matmul(image_features, text_features.T)

                # NOTE(review): both operands are unit-norm, so these logits lie
                # in [-1, 1] and the sigmoid stays within roughly [0.27, 0.73].
                # CLIP-style heads usually multiply by a learned logit scale
                # before the activation - confirm whether the fine-tuned model
                # expects that here.
                probs = logits.sigmoid()  # independent score per class (multi-label)
                predictions = (probs > CONFIDENCE_THRESHOLD).int()

                batch_probs.append(probs.cpu())
                batch_predictions.append(predictions.cpu())

                print(f"Batch {batch_num} processed successfully!")

            except Exception as e:
                print(f"Error during TensorRT inference in batch {batch_num}: {e}")
                import traceback
                traceback.print_exc()
                return None

    return torch.cat(batch_probs), torch.cat(batch_predictions)


print(f"--- Running OPTIMIZED Sliding Window Analysis on All {len(image_id_to_filename_subset)} Images ---")

# Wall-clock seconds spent on each successfully processed image.
image_processing_times = []

for image_id, filename in list(image_id_to_filename_subset.items())[:MAX_IMAGES]:
    print(f"\n\n=========================================================")
    print(f"Processing Image: {filename}")
    print(f"=============================================================")
    start_time = time.time()

    image_path = os.path.join(VALIDATION_DIR, filename)
    if not os.path.exists(image_path):
        print(f"--> SKIPPING: File not found at {image_path}")
        continue

    # convert() returns an independent in-memory image, so the file handle can
    # be closed right away.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")
    image_width, image_height = image.size
    current_image_annotations = image_id_to_annotations[image_id]

    # --- Extract all patches first ---
    patches, patch_coordinates, num_patches_y, num_patches_x = _extract_patches(
        image, PATCH_SIZE, STRIDE
    )

    if not patches:
        print("--> SKIPPING: No patches were generated for this image.")
        continue

    print(f"Extracted {len(patches)} patches. Processing them in batches of {BATCH_SIZE}...")
    print(f"Grid dimensions: {num_patches_y} x {num_patches_x}")

    results = _classify_patches(patches, BATCH_SIZE)
    if results is None:
        # A partial result would not match the (rows, cols) grid and the
        # reshape below would fail, so the whole image is skipped.
        print("--> SKIPPING: Inference failed before all patches were processed.")
        continue

    full_probs_tensor, full_predictions_tensor = results

    print(f"\nFinal results:")
    print(f"full_probs_tensor shape: {full_probs_tensor.shape}")
    print(f"full_predictions_tensor shape: {full_predictions_tensor.shape}")
    print(f"Expected patches: {num_patches_y * num_patches_x}")

    # --- HEATMAP GENERATION ---
    # Patches were produced row by row, so the flat results reshape directly to
    # (rows, cols, classes); permuting gives one (rows, cols) map per class.
    heatmap_tensor = full_probs_tensor.view(num_patches_y, num_patches_x, len(CLASS_LABELS))
    heatmap = heatmap_tensor.permute(2, 0, 1)

    # Timing covers loading, patching, inference and heatmap assembly, but not
    # the report and plotting below.
    end_time = time.time()
    elapsed_time = end_time - start_time
    image_processing_times.append(elapsed_time)
    print(f"--> Image processing finished in {elapsed_time:.2f} seconds.")

    # --- Display Per-Image Report ---
    print(f"\nImage Report Card for: {filename}")
    print("-" * 60)
    print(f"{'Class':<12} | {'Ground Truth Count':<20} | {'Predicted Patch Count':<22}")
    print("-" * 60)

    # Number of annotated objects per class in this image.
    true_counts = [0] * len(train_class_labels)
    for ann in current_image_annotations:
        class_idx = coco_id_to_class_idx.get(ann['category_id'])
        if class_idx is not None:
            true_counts[class_idx] += 1

    # Number of patches flagged per class (the trailing "negative" column is excluded).
    predicted_counts = torch.sum(full_predictions_tensor[:, :len(train_class_labels)], dim=0).numpy()
    for i, label in enumerate(train_class_labels):
        print(f"{label:<12} | {true_counts[i]:<20} | {predicted_counts[i]:<22}")
    print("-" * 60)

    # --- Display Per-Image Heatmaps ---
    print(f"\nHeatmap Visualizations for: {filename}")
    heatmap_np = heatmap.numpy()
    # Constrained layout makes room for a colorbar that spans several axes;
    # tight_layout does not support that arrangement.
    fig, axes = plt.subplots(1, len(CLASS_LABELS) + 1, figsize=(20, 5), constrained_layout=True)

    axes[0].imshow(image)
    axes[0].set_title("Original Image")
    axes[0].axis('off')

    for i, class_name in enumerate(CLASS_LABELS):
        im = axes[i+1].imshow(heatmap_np[i], cmap='viridis', interpolation='nearest',
                               vmin=custom_vmin, vmax=custom_vmax)
        axes[i+1].set_title(f"Heatmap for '{class_name}'")
        axes[i+1].axis('off')

    # All heatmaps share vmin/vmax, so one colorbar describes every panel.
    fig.colorbar(im, ax=axes.ravel().tolist())
    plt.show()
    # Release the figure so memory does not grow with the number of images.
    plt.close(fig)

if image_processing_times:
    mean_time = sum(image_processing_times) / len(image_processing_times)
    print(f"\n\n--- Mean time to process a full image: {mean_time:.2f} seconds ---")
else:
    print("\n\n--- No images were processed to calculate mean time. ---")

print("\n\n--- Analysis of validation images complete. ---")

--- Running OPTIMIZED Sliding Window Analysis on All 60 Images ---


Processing Image: IMG_2197_JPEG.rf.912aa4bfc6ecb5926978c385be331efe.jpg
Extracted 910 patches. Processing them in batches of 8...
Grid dimensions: 35 x 26
Batch 1 processed successfully!
Batch 2 processed successfully!
Batch 3 processed successfully!
Batch 4 processed successfully!
Batch 5 processed successfully!
Batch 6 processed successfully!
Batch 7 processed successfully!
Batch 8 processed successfully!
Batch 9 processed successfully!
Batch 10 processed successfully!
Batch 11 processed successfully!
Batch 12 processed successfully!
Batch 13 processed successfully!
Batch 14 processed successfully!
Batch 15 processed successfully!
Batch 16 processed successfully!
Batch 17 processed successfully!
Batch 18 processed successfully!
Batch 19 processed successfully!
Batch 20 processed successfully!
Batch 21 processed successfully!
Batch 22 processed successfully!
Batch 23 processed successfully!
Batch 24 processed success

KeyboardInterrupt: 