# In [None]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
from transformers import ViTFeatureExtractor, AutoTokenizer, VisionEncoderDecoderModel
import onnxruntime as ort
from paddleocr import PaddleOCR

class ImageProcessingPipeline:
    """Run object detection, classification, OCR and captioning on one image.

    All four models are loaded once in ``__init__``; ``process_image`` then
    runs them in sequence on a single image file and returns the combined
    results as a dict.

    Args:
        device: Device used for the captioning model and to decide whether
            PaddleOCR / ONNX Runtime should try to use the GPU. Defaults to
            ``'cuda'`` when CUDA is available, otherwise ``'cpu'``.
    """

    # Edge length used for the detector input when the ONNX model declares
    # dynamic (non-integer) spatial dimensions. 640 is assumed to match the
    # default YOLOv5 export size -- confirm against how yolov5n.onnx was built.
    _DEFAULT_DETECTION_SIZE = 640

    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.device = device
        self.preprocess = self._init_preprocessing()
        self.object_detection = self._init_object_detection()
        self.classification = self._init_classification()
        self.ocr = self._init_ocr()
        self.captioning = self._init_captioning()

    def _uses_cuda(self):
        """Return True for any CUDA device spec (``'cuda'``, ``'cuda:0'``, ...)."""
        return str(self.device).startswith('cuda')

    def _init_preprocessing(self):
        """Build the 224x224 ImageNet-normalised transform used by the classifier."""
        return transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def _init_object_detection(self):
        """Create the ONNX Runtime session for the YOLOv5-nano detector."""
        # GPU builds of ONNX Runtime (>= 1.9) refuse to create a session unless
        # providers are listed explicitly, so always pass them and keep the CPU
        # provider as the fallback.
        providers = []
        if self._uses_cuda() and 'CUDAExecutionProvider' in ort.get_available_providers():
            providers.append('CUDAExecutionProvider')
        providers.append('CPUExecutionProvider')
        return ort.InferenceSession("yolov5n.onnx", providers=providers)

    def _init_classification(self):
        """Load MobileNetV3-Small and dynamically quantize its Linear layers."""
        model = torch.hub.load('pytorch/vision:v0.10.0', 'mobilenet_v3_small', pretrained=True)
        model.eval()
        model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
        # Dynamically quantized operators only have CPU kernels, so the model
        # is deliberately left on the CPU regardless of self.device.
        return model

    def _init_ocr(self):
        """Initialize PaddleOCR (English, no angle classifier)."""
        return PaddleOCR(use_angle_cls=False, lang='en', use_gpu=self._uses_cuda())

    def _init_captioning(self):
        """Load the ViT-GPT2 captioning model, feature extractor and tokenizer.

        Returns:
            A ``(model, feature_extractor, tokenizer)`` tuple; the model is
            moved to ``self.device``.
        """
        model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
        feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
        tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
        return model.to(self.device), feature_extractor, tokenizer

    @classmethod
    def _detection_hw(cls, shape):
        """Return ``(height, width)`` from the last two dims of an input shape.

        Dimensions that are not positive integers (symbolic names or ``None``
        for dynamic axes) fall back to ``_DEFAULT_DETECTION_SIZE``.
        """
        default = cls._DEFAULT_DETECTION_SIZE
        dims = list(shape or [])[-2:]
        if len(dims) != 2:
            return default, default
        height, width = (
            dim if isinstance(dim, int) and dim > 0 else default
            for dim in dims
        )
        return height, width

    @staticmethod
    def _prepare_detection_input(image, height, width):
        """Convert a PIL RGB image to a float32 ``(1, 3, height, width)`` array in [0, 1].

        NOTE(review): this is a plain resize (no letterboxing), so box
        coordinates in the detector output refer to the resized frame.
        """
        resized = image.resize((width, height))
        pixels = np.asarray(resized, dtype=np.float32) / 255.0  # HWC
        return np.ascontiguousarray(pixels.transpose(2, 0, 1)[np.newaxis, ...])

    @staticmethod
    def _is_ocr_line(entry):
        """Return True if ``entry`` looks like ``[box, (text, score)]``."""
        return (
            isinstance(entry, (list, tuple))
            and len(entry) == 2
            and isinstance(entry[1], (list, tuple))
            and len(entry[1]) >= 1
            and isinstance(entry[1][0], str)
        )

    @classmethod
    def _extract_ocr_text(cls, ocr_result):
        """Collect recognised strings from a PaddleOCR result.

        Accepts both layouts: a flat list of ``[box, (text, score)]`` lines
        and a per-image list of such lists. ``None`` entries (an image with
        no detected text) and a ``None`` result yield no strings.
        """
        texts = []
        for entry in ocr_result or []:
            if entry is None:
                continue
            if cls._is_ocr_line(entry):
                texts.append(entry[1][0])
            else:
                texts.extend(line[1][0] for line in entry if cls._is_ocr_line(line))
        return texts

    @torch.no_grad()
    def process_image(self, image_path):
        """Run every model on the image at ``image_path``.

        Args:
            image_path: Path of the image file; also passed to PaddleOCR as-is.

        Returns:
            dict with keys:
              * ``'object_detection'``: first raw output of the ONNX session
                (no post-processing is applied here).
              * ``'classification'``: index of the highest-scoring class.
              * ``'ocr'``: list of recognised text strings.
              * ``'caption'``: generated caption string.
        """
        # Close the file handle deterministically once the pixels are decoded.
        with Image.open(image_path) as source:
            image = source.convert('RGB')

        results = {}

        # Object Detection: use the input name/size the session declares
        # instead of reusing the 224x224 ImageNet-normalised classifier tensor.
        od_meta = self.object_detection.get_inputs()[0]
        height, width = self._detection_hw(od_meta.shape)
        od_input = self._prepare_detection_input(image, height, width)
        od_outputs = self.object_detection.run(None, {od_meta.name: od_input})
        results['object_detection'] = od_outputs[0]

        # Image Classification: the quantized model lives on the CPU, so the
        # input tensor stays there too.
        input_tensor = self.preprocess(image).unsqueeze(0)
        class_output = self.classification(input_tensor)
        results['classification'] = torch.argmax(class_output, dim=1).item()

        # OCR
        ocr_result = self.ocr.ocr(image_path, cls=False)
        results['ocr'] = self._extract_ocr_text(ocr_result)

        # Image Captioning
        model, feature_extractor, tokenizer = self.captioning
        pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(self.device)
        output_ids = model.generate(pixel_values, max_length=16, num_beams=4)
        preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        results['caption'] = preds[0].strip()

        return results

# Usage
if __name__ == "__main__":
    # Guarded so that importing this module does not load every model and
    # try to open the placeholder image path below.
    pipeline = ImageProcessingPipeline()
    results = pipeline.process_image('path_to_your_image.jpg')
    print(results)