# In [None]:
import torch
from transformers import MobileBertTokenizer, MobileBertModel
from langdetect import detect
import re
from typing import List, Dict
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK data this module relies on when it is imported: the Punkt
# tokenizer models used by word_tokenize and the stopword lists.
# Recent NLTK releases load Punkt data from the 'punkt_tab' package instead of
# the pickled 'punkt' one, so both are requested; otherwise word_tokenize
# raises LookupError on those releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

class TextProcessingPipeline:
    """Clean text, detect its language and embed it with MobileBERT.

    ``process_image_output`` applies the pipeline to the textual parts
    (``'ocr'`` and ``'caption'``) of an image-processing result dict.
    """

    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu'):
        """Load the MobileBERT tokenizer/model and the English stopword set.

        Args:
            device: Torch device the model and its inputs are moved to. The
                default is evaluated once, when the class is defined.
        """
        self.device = device
        self.tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
        self.model = MobileBertModel.from_pretrained('google/mobilebert-uncased').to(self.device)
        # The model is only used for inference, so keep it in evaluation mode.
        self.model.eval()
        # A set gives O(1) membership tests during stopword filtering.
        self.stopwords = set(stopwords.words('english'))

    def preprocess_text(self, text: str) -> str:
        """Lowercase ``text``, strip non-letters and drop English stopwords.

        Args:
            text: Raw input text.

        Returns:
            The remaining tokens joined by single spaces (possibly ``''``).
        """
        # Convert to lowercase
        text = text.lower()
        # Remove everything that is not an ASCII letter or whitespace
        # (digits, punctuation and non-ASCII characters are deleted, not
        # replaced by a space).
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Tokenize
        tokens = word_tokenize(text)
        # Remove stopwords
        tokens = [token for token in tokens if token not in self.stopwords]
        # Join tokens back into a string
        return ' '.join(tokens)

    def detect_language(self, text: str) -> str:
        """Return the language code reported by langdetect, or ``'unknown'``.

        Detection is best-effort: any error raised by the detector (e.g. when
        the text has no usable features) yields ``'unknown'``.
        """
        try:
            return detect(text)
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt and
            # SystemExit must still propagate to the caller.
            return 'unknown'

    @torch.no_grad()
    def generate_embedding(self, text: str) -> torch.Tensor:
        """Embed ``text`` as the mean of the model's last hidden states.

        Args:
            text: Text to embed; it is truncated to at most 512 tokens.

        Returns:
            A CPU tensor; for a single input string its leading batch axis is
            removed, leaving one vector of hidden size.
        """
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        outputs = self.model(**inputs)
        # Average over dim 1 (the sequence axis of last_hidden_state per the
        # transformers docs), then drop only the leading batch axis. A bare
        # squeeze() would also collapse any other axis that happens to be 1.
        return outputs.last_hidden_state.mean(dim=1).squeeze(0).cpu()

    def process_text(self, text: str) -> Dict:
        """Run preprocessing, language detection and embedding on ``text``.

        Returns:
            A dict with keys ``'preprocessed_text'``, ``'language'`` and
            ``'embedding'``.
        """
        preprocessed_text = self.preprocess_text(text)
        # Detect the language on the raw text: preprocessing deletes every
        # non-ASCII letter and the English stopwords, which removes most of
        # the signal a language detector relies on.
        language = self.detect_language(text)
        embedding = self.generate_embedding(preprocessed_text)

        return {
            'preprocessed_text': preprocessed_text,
            'language': language,
            'embedding': embedding
        }

    def process_image_output(self, image_output: Dict) -> Dict:
        """Process the textual fields of an image-pipeline result.

        Args:
            image_output: Dict that may contain ``'ocr'`` (an iterable of
                strings, or a single string) and/or ``'caption'`` (a string).
                Other keys are ignored.

        Returns:
            A dict with ``'ocr_processed'`` and/or ``'caption_processed'``,
            each holding the result of ``process_text``; empty when neither
            input key is present.
        """
        results = {}

        # Process OCR text
        if 'ocr' in image_output:
            ocr = image_output['ocr']
            # A plain string is already the full text; joining it would put a
            # space between every character.
            ocr_text = ocr if isinstance(ocr, str) else ' '.join(ocr)
            results['ocr_processed'] = self.process_text(ocr_text)

        # Process image caption
        if 'caption' in image_output:
            results['caption_processed'] = self.process_text(image_output['caption'])

        return results

# Usage
def main():
    """Demo: push a hard-coded image-pipeline result through the text pipeline.

    Builds a sample result dict, processes its OCR and caption fields with
    ``TextProcessingPipeline`` and prints both processed entries.
    """
    # Stand-in for what the Image Processing Pipeline would hand over.
    sample_output = {
        'object_detection': [[0, 0, 100, 100, 0.9, 1]],  # Example output
        'classification': 5,  # Example class ID
        'ocr': ['Hello', 'World'],
        'caption': 'A computer screen displaying text'
    }

    processed = TextProcessingPipeline().process_image_output(sample_output)

    # (heading, result key) pairs, printed in this order.
    sections = (
        ("OCR Processed:", 'ocr_processed'),
        ("\nCaption Processed:", 'caption_processed'),
    )
    for heading, key in sections:
        print(heading)
        print(processed[key])

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()