# Install required packages

In [None]:
!pip install gradio ultralytics paddlepaddle paddleocr transformers torch torchvision accelerate bitsandbytes

# Libraries

In [None]:
import numpy as np
import pandas as pd
import torch
import cv2
import json
from PIL import Image
import gradio as gr
from ultralytics import YOLO
from paddleocr import PaddleOCR
from transformers import AutoModelForImageTextToText, AutoProcessor, BitsAndBytesConfig
import warnings
import re
warnings.filterwarnings("ignore")

import os
# Print the full path of every file found under /kaggle/input
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Load Models

In [None]:
# Global variables to store loaded models
# load_models() fills these in and returns them directly on later calls.
MODELS_LOADED = False  # flips to True after a successful load_models() run
yolo_model = None      # YOLO license plate detector
llava_model = None     # LLaVA-NeXT model used for scene descriptions
processor = None       # processor paired with llava_model
ocr_model = None       # PaddleOCR instance used to read plate text

In [None]:
def load_models():
    """Load all required models once and cache them in module globals.

    The first successful call loads the YOLO plate detector, the 4-bit
    quantized LLaVA-NeXT model with its processor, and PaddleOCR. Later calls
    return the cached objects without reloading.

    Returns:
        Tuple ``(yolo_model, llava_model, processor, ocr_model)``.

    Raises:
        Exception: whatever the failing loader raised. The error is printed
            and re-raised so callers see the real cause instead of failing to
            unpack an implicit ``None``.
    """
    global MODELS_LOADED, yolo_model, llava_model, processor, ocr_model

    if MODELS_LOADED:
        return yolo_model, llava_model, processor, ocr_model

    print("Loading models...")

    try:
        # Load YOLO model for license plate detection
        yolo_model_path = "/kaggle/input/license_plate_detect_yolo11/pytorch/default/1/best.pt"
        detector = YOLO(yolo_model_path)
        print("✅ YOLO model loaded successfully")

        # Load LLaVA-NeXT model with quantization for faster inference
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4"
        )

        vlm_processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
        vlm = AutoModelForImageTextToText.from_pretrained(
            "llava-hf/llava-v1.6-mistral-7b-hf",
            quantization_config=quant_config,
            device_map="auto",
            torch_dtype=torch.float16
        )
        print("✅ LLaVA-NeXT model loaded successfully")

        # Initialize PaddleOCR
        ocr = PaddleOCR(
            use_angle_cls=True,
            lang='en'
        )
        print("✅ PaddleOCR model loaded successfully")

    except Exception as e:
        print(f"Error loading models: {str(e)}")
        # Propagate: every caller unpacks four values, so returning None here
        # would only surface later as an unrelated TypeError.
        raise

    # Publish to the globals only after everything loaded, so a failure
    # half-way through never leaves a partially filled cache behind.
    yolo_model, llava_model, processor, ocr_model = detector, vlm, vlm_processor, ocr
    MODELS_LOADED = True
    return yolo_model, llava_model, processor, ocr_model

In [None]:
# Load the models eagerly at notebook start so the first request does not pay the load cost
yolo_model, llava_model, processor, ocr_model = load_models()
# load_models() already sets this flag on its success path; it is restated here
MODELS_LOADED = True

# Detect License Plates

In [None]:
def detect_license_plates(image, yolo_model, confidence_threshold=0.5):
    """Detect license plates using YOLO model with adjustable confidence threshold.

    Args:
        image: Image to analyse. A PIL image is passed to the model unchanged.
            Anything else (e.g. a numpy array) is also passed through as-is.
            NOTE(review): Ultralytics documents numpy input as BGR, so numpy
            callers should supply BGR data — confirm against callers.
        yolo_model: Callable detector; invoked as
            ``yolo_model(source, conf=..., imgsz=..., half=..., device=..., verbose=...)``.
        confidence_threshold: Minimum detection confidence forwarded as ``conf``.

    Returns:
        List of ``{'bbox': [x1, y1, x2, y2], 'confidence': float}`` dicts with
        integer pixel coordinates. Boxes not larger than 15x8 pixels are
        dropped. Returns ``[]`` when inference raises.
    """
    try:
        # Do not convert PIL images to numpy here. Ultralytics documents PIL
        # input as RGB and numpy input as BGR, so np.array(pil_image) would
        # feed the detector channel-swapped pixels.
        source = image

        print(f"Running YOLO detection with confidence threshold: {confidence_threshold}")

        # Run YOLO inference with user-specified confidence threshold
        results = yolo_model(
            source,
            conf=confidence_threshold,
            imgsz=640,
            half=False,  # Disable FP16 for better compatibility
            device='cuda' if torch.cuda.is_available() else 'cpu',
            verbose=True
        )

        detections = []
        for result in results:
            boxes = result.boxes
            if boxes is None:
                print("No boxes detected by YOLO")
                continue

            print(f"Found {len(boxes)} potential detections")
            for i, box in enumerate(boxes):
                # Get bounding box coordinates
                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
                confidence = box.conf[0].cpu().numpy()

                width = x2 - x1
                height = y2 - y1
                print(f"Detection {i+1}: bbox=({x1:.1f},{y1:.1f},{x2:.1f},{y2:.1f}), size=({width:.1f}x{height:.1f}), conf={confidence:.3f}")

                # Lenient minimum size filter: drop boxes too small to hold text
                if width > 15 and height > 8:
                    detections.append({
                        'bbox': [int(x1), int(y1), int(x2), int(y2)],
                        'confidence': float(confidence)
                    })
                    print(f"✅ Added detection {i+1}")
                else:
                    print(f"❌ Filtered out detection {i+1} (too small)")

        print(f"Final detections: {len(detections)}")
        return detections
    except Exception as e:
        print(f"Error in license plate detection: {str(e)}")
        return []

# Preprocess License Plates

In [None]:
def preprocess_plate_image(plate_image):
    """Turn a cropped plate into a high-contrast binary image for OCR.

    Steps: grayscale, upscale small crops, CLAHE contrast enhancement,
    light Gaussian blur, Otsu threshold, then expand back to three channels.

    Returns None for a missing or empty input, the processed 3-channel image
    on success, and the unmodified input if any step raises.
    """
    try:
        # Nothing to do without pixels
        if plate_image is None or plate_image.size == 0:
            return None

        # Collapse colour input to a single channel; 2-D input is used as-is
        has_channels = len(plate_image.shape) == 3
        mono = cv2.cvtColor(plate_image, cv2.COLOR_RGB2GRAY) if has_channels else plate_image

        # Small crops are enlarged by at least 2x, and far enough to reach 64x32
        rows, cols = mono.shape
        if rows < 32 or cols < 64:
            factor = max(32/rows, 64/cols, 2.0)
            target_rows, target_cols = int(rows * factor), int(cols * factor)
            mono = cv2.resize(mono, (target_cols, target_rows), interpolation=cv2.INTER_CUBIC)

        # Local contrast boost, then a small blur to suppress noise
        equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
        smoothed = cv2.GaussianBlur(equalizer.apply(mono), (3, 3), 0)

        # Otsu picks the binarisation threshold automatically
        binary = cv2.threshold(smoothed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

        # PaddleOCR is fed a 3-channel image
        return cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)

    except Exception as err:
        print(f"Error preprocessing plate image: {str(err)}")
        return plate_image

# Extract Plate Text

In [None]:
def extract_text_from_plate(image, bbox, ocr_model, ocr_confidence_threshold=0.1):
    """Extract text from license plate using OCR with adjustable confidence threshold"""
    try:
        x1, y1, x2, y2 = bbox

        # Convert PIL to numpy if needed
        if isinstance(image, Image.Image):
            image_np = np.array(image)
        else:
            image_np = image

        # Get original image dimensions
        h, w = image_np.shape[:2]
        print(f"Original image size: {w}x{h}")
        print(f"Raw bbox: ({x1},{y1},{x2},{y2})")

        # Clamp coordinates to image boundaries
        x1_clamped = max(0, min(x1, w-1))
        y1_clamped = max(0, min(y1, h-1))
        x2_clamped = max(x1_clamped+1, min(x2, w))
        y2_clamped = max(y1_clamped+1, min(y2, h))

        print(f"Clamped bbox: ({x1_clamped},{y1_clamped},{x2_clamped},{y2_clamped})")

        # Check if bbox is still valid after clamping
        if x2_clamped <= x1_clamped or y2_clamped <= y1_clamped:
            print("Invalid bbox coordinates after clamping")
            return "", 0.0

        # Crop the license plate region with some padding
        padding = 10
        x1_pad = max(0, x1_clamped - padding)
        y1_pad = max(0, y1_clamped - padding)
        x2_pad = min(w, x2_clamped + padding)
        y2_pad = min(h, y2_clamped + padding)

        print(f"Final crop region: ({x1_pad},{y1_pad},{x2_pad},{y2_pad})")

        plate_image = image_np[y1_pad:y2_pad, x1_pad:x2_pad]

        # Check if plate_image is valid
        if plate_image.size == 0:
            print("Empty plate image after cropping")
            return "", 0.0

        print(f"Cropped plate image size: {plate_image.shape}")

        # Ensure the cropped image has the right format
        if len(plate_image.shape) != 3:
            print("Converting grayscale to RGB")
            if len(plate_image.shape) == 2:
                plate_image = cv2.cvtColor(plate_image, cv2.COLOR_GRAY2RGB)

        # Try OCR on original cropped image first
        best_text = ""
        best_confidence = 0.0

        try:
            print(f"Trying OCR on original image with confidence threshold: {ocr_confidence_threshold}")
            ocr_results_original = ocr_model.ocr(plate_image)

            # Handle the new PaddleOCR dict format
            if isinstance(ocr_results_original, dict):
                rec_texts = ocr_results_original.get('rec_texts', [])
                rec_scores = ocr_results_original.get('rec_scores', [])

                print(f"Found rec_texts: {rec_texts}")
                print(f"Found rec_scores: {rec_scores}")

                for i, text in enumerate(rec_texts):
                    if i < len(rec_scores):
                        conf = rec_scores[i]
                        if text and len(str(text).strip()) > 0 and conf > ocr_confidence_threshold:
                            print(f"  Original - Text: '{text}', Confidence: {conf:.3f}")

                            # Clean up the text
                            cleaned_text = re.sub(r'[^A-Z0-9]', '', str(text).upper())

                            if cleaned_text and len(cleaned_text) >= 2:
                                if conf > best_confidence:
                                    best_text = cleaned_text
                                    best_confidence = conf
                                    print(f"  ✅ New best result: '{best_text}' (conf: {best_confidence:.3f})")

            # Handle list format (legacy or different version)
            elif isinstance(ocr_results_original, list) and len(ocr_results_original) > 0:
                print("Handling list format...")

                # Check if it's a list of detection results
                if isinstance(ocr_results_original[0], list):
                    for line in ocr_results_original[0]:
                        if line and len(line) >= 2 and line[1]:
                            text = str(line[1][0]).strip()
                            conf = float(line[1][1]) if len(line[1]) >= 2 else 0.0

                            print(f"  Original (list) - Text: '{text}', Confidence: {conf:.3f}")

                            if conf > ocr_confidence_threshold and len(text) > 0:
                                cleaned_text = re.sub(r'[^A-Z0-9]', '', text.upper())

                                if cleaned_text and len(cleaned_text) >= 2:
                                    if conf > best_confidence:
                                        best_text = cleaned_text
                                        best_confidence = conf
                                        print(f"  ✅ New best result: '{best_text}' (conf: {best_confidence:.3f})")

                # Direct list of results
                else:
                    for item in ocr_results_original:
                        if isinstance(item, dict):
                            rec_texts = item.get('rec_texts', [])
                            rec_scores = item.get('rec_scores', [])

                            for i, text in enumerate(rec_texts):
                                if i < len(rec_scores):
                                    conf = rec_scores[i]
                                    if text and len(str(text).strip()) > 0 and conf > ocr_confidence_threshold:
                                        cleaned_text = re.sub(r'[^A-Z0-9]', '', str(text).upper())
                                        if cleaned_text and len(cleaned_text) >= 2:
                                            if conf > best_confidence:
                                                best_text = cleaned_text
                                                best_confidence = conf
                                                print(f"  ✅ New best result: '{best_text}' (conf: {best_confidence:.3f})")

        except Exception as e:
            print(f"Error with original OCR: {str(e)}")


        # Try OCR on preprocessed image
        if best_confidence < 0.8:
            try:
                print("Trying OCR on preprocessed image...")
                processed_plate = preprocess_plate_image(plate_image)

                if processed_plate is not None:
                    ocr_results_processed = ocr_model.ocr(processed_plate)

                    print(f"Processed OCR result type: {type(ocr_results_processed)}")

                    # Handle the new PaddleOCR dict format
                    if isinstance(ocr_results_processed, dict):
                        rec_texts = ocr_results_processed.get('rec_texts', [])
                        rec_scores = ocr_results_processed.get('rec_scores', [])

                        print(f"Processed rec_texts: {rec_texts}")
                        print(f"Processed rec_scores: {rec_scores}")

                        for i, text in enumerate(rec_texts):
                            if i < len(rec_scores):
                                conf = rec_scores[i]
                                if text and len(str(text).strip()) > 0 and conf > ocr_confidence_threshold:
                                    print(f"  Processed - Text: '{text}', Confidence: {conf:.3f}")

                                    # Clean up the text
                                    cleaned_text = re.sub(r'[^A-Z0-9]', '', str(text).upper())

                                    if cleaned_text and len(cleaned_text) >= 2:
                                        if conf > best_confidence:
                                            best_text = cleaned_text
                                            best_confidence = conf
                                            print(f"  ✅ New best result: '{best_text}' (conf: {best_confidence:.3f})")

                    # Handle other formats...
                    elif isinstance(ocr_results_processed, list) and len(ocr_results_processed) > 0:
                        # Similar logic as above for processed results
                        if isinstance(ocr_results_processed[0], list):
                            for line in ocr_results_processed[0]:
                                if line and len(line) >= 2 and line[1]:
                                    text = str(line[1][0]).strip()
                                    conf = float(line[1][1]) if len(line[1]) >= 2 else 0.0

                                    if conf > ocr_confidence_threshold and len(text) > 0:
                                        cleaned_text = re.sub(r'[^A-Z0-9]', '', text.upper())
                                        if cleaned_text and len(cleaned_text) >= 2:
                                            if conf > best_confidence:
                                                best_text = cleaned_text
                                                best_confidence = conf
                                                print(f"  ✅ New best result: '{best_text}' (conf: {best_confidence:.3f})")

            except Exception as e:
                print(f"Error with preprocessed OCR: {str(e)}")

        print(f"Final result: '{best_text}' (conf: {best_confidence:.3f})")
        return best_text, best_confidence

    except Exception as e:
        print(f"Error in OCR text extraction: {str(e)}")
        return "", 0.0

# Traffic Scene Description

In [None]:
def describe_traffic_scene(image, llava_model, processor, temperature=0.7, top_p=0.9):
    """Generate traffic scene description using LLaVA-NeXT with adjustable temperature and top_p.

    Args:
        image: Scene image. A PIL image larger than 1024 px on either side is
            downscaled on a private copy, so the caller's image is never
            modified.
        llava_model: Model exposing ``parameters()`` and ``generate(...)``.
        processor: Matching processor, called with ``images=`` / ``text=``.
            Must also expose ``decode`` and ``tokenizer.eos_token_id``.
        temperature: Sampling temperature; ``0.0`` switches to greedy decoding.
        top_p: Nucleus sampling cutoff, used only when sampling.

    Returns:
        The generated description, ``"Unable to generate scene description."``
        when nothing could be extracted, or an ``"Error generating scene
        description: ..."`` string when any step raised.
    """
    try:
        # Resize image if too large to speed up inference
        if isinstance(image, Image.Image):
            width, height = image.size
            if width > 1024 or height > 1024:
                # thumbnail() resizes in place. Work on a copy so the caller's
                # image keeps its resolution: bounding boxes computed on it
                # are still used for cropping afterwards.
                image = image.copy()
                image.thumbnail((1024, 1024), Image.Resampling.LANCZOS)

        # NOTE(review): the llava-v1.6-mistral model card shows an
        # "[INST] <image>\n... [/INST]" prompt; this USER/ASSISTANT layout is
        # kept as-is because the extraction below depends on it — confirm.
        prompt = """
        USER: <image>
        Analyze this traffic scene in detail. Describe:
        1. Types of vehicles present (cars, trucks, motorcycles, etc.)
        2. Traffic signs, signals, and road markings visible
        3. Road conditions and infrastructure
        4. Weather and lighting conditions
        5. Overall traffic flow and density
        6. Any notable safety considerations or hazards

        ASSISTANT:"""

        # Keyword arguments: the positional order of text/images has differed
        # between transformers releases.
        inputs = processor(images=image, text=prompt, return_tensors="pt")

        # Move to appropriate device
        device = next(llava_model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        print(f"Generating scene description with temperature={temperature}, top_p={top_p}")

        sampling = temperature > 0.0
        with torch.inference_mode():
            # Sampling parameters are only passed when sampling is enabled
            output = llava_model.generate(
                **inputs,
                max_new_tokens=150,
                do_sample=sampling,
                temperature=temperature if sampling else None,
                top_p=top_p if sampling else None,
                pad_token_id=processor.tokenizer.eos_token_id
            )

        full_response = processor.decode(output[0], skip_special_tokens=True)

        # Extract only the assistant's response (the decoded text includes the prompt)
        description = ""
        if "ASSISTANT:" in full_response:
            description = full_response.split("ASSISTANT:")[-1].strip()

        # Clean up any remaining template artifacts
        if description.startswith(("[INST]", "USER:")):
            # Try alternative extraction: everything after the ASSISTANT: line
            lines = full_response.split('\n')
            for i, line in enumerate(lines):
                if "ASSISTANT:" in line and i + 1 < len(lines):
                    description = '\n'.join(lines[i+1:]).strip()
                    break

        # Remove any remaining template markers
        description = re.sub(r'\[/?INST\]', '', description).strip()
        description = re.sub(r'USER:.*?ASSISTANT:', '', description, flags=re.DOTALL).strip()

        if not description:
            description = "Unable to generate scene description."

        return description

    except Exception as e:
        return f"Error generating scene description: {str(e)}"

# Process Traffic Image

In [None]:
def process_traffic_image(image, yolo_confidence=0.5, ocr_confidence=0.1, vllm_temperature=0.7, vllm_top_p=0.9):
    """Main processing function with adjustable parameters.

    Runs plate detection, OCR on every detected plate and scene description
    on one image, then formats the results for the Gradio outputs.

    Args:
        image: Uploaded image, or ``None`` when nothing was uploaded.
        yolo_confidence: Detection confidence threshold for the plate detector.
        ocr_confidence: Minimum OCR score for accepting recognised text.
        vllm_temperature: Sampling temperature for the scene description.
        vllm_top_p: Nucleus sampling cutoff for the scene description.

    Returns:
        ``(scene_text, plates_text, json_output)`` strings. On failure the
        first element carries the error message and the other two are empty.
    """
    if image is None:
        return "Please upload an image", "", ""

    try:
        # Load models
        yolo_model, llava_model, processor, ocr_model = load_models()

        # Process tasks with user-specified parameters
        print("Processing image with parameters:")
        print(f"  YOLO confidence: {yolo_confidence}")
        print(f"  OCR confidence: {ocr_confidence}")
        print(f"  VLLM temperature: {vllm_temperature}")
        print(f"  VLLM top_p: {vllm_top_p}")

        # Detect license plates with adjustable confidence
        print("Detecting license plates...")
        plate_detections = detect_license_plates(image, yolo_model, yolo_confidence)

        # OCR runs before the scene description on purpose: the bounding boxes
        # refer to the image exactly as the detector saw it, and the
        # description step is free to resize the image it is given.
        print(f"Processing {len(plate_detections)} detected plates...")
        processed_plates = []
        for i, detection in enumerate(plate_detections, 1):
            try:
                bbox = detection['bbox']
                # Plain floats only: numpy scalars are not JSON serialisable
                detection_conf = float(detection['confidence'])
                plate_text, ocr_conf = extract_text_from_plate(image, bbox, ocr_model, ocr_confidence)
                ocr_conf = float(ocr_conf)

                print(f"Final OCR result for plate {i}: text='{plate_text}', conf={ocr_conf:.3f}")

                # Include plates with any readable text OR high detection confidence
                if plate_text and len(plate_text) >= 2:
                    processed_plates.append({
                        'bbox': bbox,
                        'detection_confidence': detection_conf,
                        'plate_text': plate_text,
                        'ocr_confidence': ocr_conf
                    })
                    print(f"✅ Added readable plate {i}: '{plate_text}' (conf: {ocr_conf:.3f})")
                elif detection_conf > 0.7:  # High detection confidence even if OCR failed
                    processed_plates.append({
                        'bbox': bbox,
                        'detection_confidence': detection_conf,
                        'plate_text': "[High confidence detection - OCR failed]",
                        'ocr_confidence': 0.0
                    })
                    print(f"⚠️ Added high-confidence unreadable plate {i}")
                else:
                    # Low confidence detection
                    processed_plates.append({
                        'bbox': bbox,
                        'detection_confidence': detection_conf,
                        'plate_text': f"[Low confidence detection - conf: {detection_conf:.3f}]",
                        'ocr_confidence': 0.0
                    })
                    print(f"⚠️ Added low-confidence plate {i}")
            except Exception as e:
                print(f"Error processing plate {i}: {str(e)}")
                import traceback
                traceback.print_exc()
                # Still add the detection as unreadable
                processed_plates.append({
                    'bbox': detection['bbox'],
                    'detection_confidence': detection['confidence'],
                    'plate_text': "[Processing error]",
                    'ocr_confidence': 0.0
                })

        # Generate scene description with adjustable parameters
        print("Generating scene description...")
        scene_description = describe_traffic_scene(image, llava_model, processor, vllm_temperature, vllm_top_p)

        # Create final JSON output with parameter info
        final_result = {
            'scene_description': scene_description,
            'total_plates_detected': len(processed_plates),
            'license_plates': processed_plates,
            'parameters_used': {
                'yolo_confidence_threshold': yolo_confidence,
                'ocr_confidence_threshold': ocr_confidence,
                'vllm_temperature': vllm_temperature,
                'vllm_top_p': vllm_top_p
            },
        }

        # Format outputs for Gradio
        scene_text = f"Scene Description (temp={vllm_temperature}, top_p={vllm_top_p}):\n{scene_description}"

        report = [
            f"License Plates Detected: {len(processed_plates)} (from {len(plate_detections)} YOLO detections)\n",
            f"YOLO Confidence Threshold: {yolo_confidence:.2f}\n",
            f"OCR Confidence Threshold: {ocr_confidence:.2f}\n\n",
        ]
        if processed_plates:
            for i, plate in enumerate(processed_plates, 1):
                report.append(f"Plate {i}:\n")
                report.append(f"  Text: '{plate['plate_text']}'\n")
                report.append(f"  Detection Confidence: {plate['detection_confidence']:.3f}\n")
                report.append(f"  OCR Confidence: {plate['ocr_confidence']:.3f}\n")
                report.append(f"  Bounding Box: {plate['bbox']}\n\n")
        else:
            report.append("No license plates detected by YOLO model.\n")
        plates_text = "".join(report)

        json_output = json.dumps(final_result, indent=2)

        return scene_text, plates_text, json_output

    except Exception as e:
        return f"Error processing image: {str(e)}", "", ""

# Gradio

In [None]:
def create_gradio_interface():
    """Assemble the Gradio UI: one image upload, four tuning sliders, three text outputs.

    The inputs map positionally onto the parameters of process_traffic_image
    and the outputs onto its three returned strings.
    """
    # --- Inputs -----------------------------------------------------------
    image_input = gr.Image(type="pil", label="Upload Traffic Scene Image")

    yolo_slider = gr.Slider(
        minimum=0.1,
        maximum=1.0,
        value=0.5,
        step=0.05,
        label="YOLO Detection Confidence Threshold",
        info="Higher values = fewer but more confident detections"
    )
    ocr_slider = gr.Slider(
        minimum=0.01,
        maximum=1.0,
        value=0.1,
        step=0.01,
        label="OCR Confidence Threshold",
        info="Minimum confidence required to accept OCR text results"
    )
    temperature_slider = gr.Slider(
        minimum=0.0,
        maximum=1.0,
        value=0.7,
        step=0.1,
        label="VLLM Temperature",
        info="Controls randomness in scene description"
    )
    top_p_slider = gr.Slider(
        minimum=0.1,
        maximum=1.0,
        value=0.9,
        step=0.05,
        label="VLLM Top-p",
        info="Controls diversity of scene description vocabulary"
    )

    # --- Outputs ----------------------------------------------------------
    scene_box = gr.Textbox(label="Traffic Scene Description", lines=8, max_lines=15)
    plates_box = gr.Textbox(label="License Plate Detection Results", lines=10, max_lines=20)
    json_box = gr.Textbox(label="Complete JSON Output", lines=15, max_lines=25)

    # Markdown shown under the title; the indentation is part of the string
    app_description = """
        Upload a traffic scene image and adjust parameters to customize the analysis:

        - **🎯 YOLO Detection**: Controls how confident the model needs to be to detect license plates
        - **📝 OCR Confidence**: Sets minimum confidence for accepting text recognition results
        - **🧠 VLLM Temperature**: Controls creativity vs consistency in scene descriptions
        - **🎨 VLLM Top-p**: Controls vocabulary diversity in scene descriptions

        **Outputs:**
        1. **Scene Description**: AI-generated description of the traffic scene
        2. **License Plate Detection**: Automatic detection and text extraction from license plates
        3. **JSON Output**: Structured data combining both results with parameter information
        """

    return gr.Interface(
        fn=process_traffic_image,
        inputs=[image_input, yolo_slider, ocr_slider, temperature_slider, top_p_slider],
        outputs=[scene_box, plates_box, json_box],
        title="🚗 Traffic Scene Analyzer with Adjustable Parameters",
        description=app_description,
        theme=gr.themes.Soft(),
        allow_flagging="never"
    )

# Launch App

In [None]:
# Main execution
# Builds the Gradio interface and starts serving it when this file is run as a script.
if __name__ == "__main__":
    print("🚀 Starting Enhanced Traffic Scene Analyzer...")
    # Create and launch the interface
    interface = create_gradio_interface()

    # Launch with public sharing for Colab
    # NOTE(review): share=True combined with server_name="0.0.0.0" and debug=True
    # makes the app reachable beyond localhost — confirm this is intended
    # outside a throwaway notebook session.
    interface.launch(
        share=True,
        debug=True,
        server_name="0.0.0.0",
        server_port=7860
    )