MULTIMODAL VISION-LANGUAGE PROJECT TEMPLATE
===========================================
Use Case: Image Captioning, VQA, Visual Document Understanding, OCR

# 1. PROJECT SETUP & ENVIRONMENT

## 1.1 Install Required Libraries

In [None]:
# !pip install transformers torch torchvision
# !pip install pillow opencv-python
# !pip install datasets evaluate
# !pip install pytesseract easyocr
# !pip install gradio streamlit
# !pip install rouge-score nltk

## 1.2 Import Libraries

In [None]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
import cv2
import warnings
warnings.filterwarnings('ignore')

# HuggingFace
import torch
from transformers import (
    VisionEncoderDecoderModel,
    ViTImageProcessor,
    AutoTokenizer,
    BlipProcessor,
    BlipForConditionalGeneration,
    BlipForQuestionAnswering,
    AutoProcessor,
    Pix2StructForConditionalGeneration,
    TrOCRProcessor,
    VisionEncoderDecoderModel as TrOCRModel
)
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader

# Evaluation
from evaluate import load as load_metric
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

In [None]:
# Prefer the GPU whenever CUDA is usable; otherwise run everything on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

## 1.3 Configuration

In [None]:
# Central settings dictionary; later cells read it through CONFIG[...] lookups.
CONFIG = {
    # Task configuration
    # NOTE(review): 'task' is not read by any cell visible in this file.
    'task': 'image_captioning',  # 'image_captioning', 'vqa', 'ocr', 'document_understanding'
    
    # Model configuration
    # Checkpoint loaded by the captioning cell (section 3.1).
    'model_name': 'Salesforce/blip-image-captioning-base',
    # Alternatives:
    # - 'nlpconnect/vit-gpt2-image-captioning'
    # - 'Salesforce/blip-vqa-base'
    # - 'microsoft/trocr-base-handwritten'
    # - 'google/pix2struct-docvqa-base'
    
    # Data configuration
    'data_dir': './data/images',  # directory passed to load_image_dataset
    'batch_size': 8,              # per-device train/eval batch size for fine-tuning
    'num_workers': 4,             # NOTE(review): not read by any cell visible in this file
    'max_length': 128,            # used as max_new_tokens by extract_text_trocr
    
    # Training configuration (forwarded to TrainingArguments in section 10.2)
    'learning_rate': 5e-5,
    'num_epochs': 5,
    'warmup_steps': 500,
    
    # Generation configuration (forwarded to model.generate by generate_caption)
    'max_new_tokens': 50,
    'num_beams': 4,
    'temperature': 1.0,
    
    'random_seed': 42
}

# Seed the torch and NumPy random generators from the configured seed.
torch.manual_seed(CONFIG['random_seed'])
np.random.seed(CONFIG['random_seed'])

# 2. DATA LOADING & EXPLORATION

## 2.1 Load Image Dataset

In [None]:
def load_image_dataset(data_dir, dataset_type='custom'):
    """Load an image/caption dataset.

    Args:
        data_dir: Directory searched recursively for ``*.jpg`` and ``*.png``
            files. An optional ``annotations.json`` in this directory maps
            images to captions. Ignored when ``dataset_type`` is
            ``'huggingface'``.
        dataset_type: ``'huggingface'`` loads the example
            ``nlphuji/flickr30k`` dataset; any other value loads from
            ``data_dir``.

    Returns:
        The object returned by ``load_dataset`` for the HuggingFace branch,
        otherwise a ``Dataset`` with ``image_path`` and ``caption`` columns
        (caption is ``""`` when no annotation is found).
    """
    if dataset_type == 'huggingface':
        # Example dataset; swap the identifier for your own.
        return load_dataset('nlphuji/flickr30k')

    root = Path(data_dir)

    # Sort so that row order (and therefore dataset[0], random sampling, ...)
    # is reproducible; raw glob order depends on the filesystem.
    image_paths = sorted(
        path
        for pattern in ('**/*.jpg', '**/*.png')
        for path in root.glob(pattern)
    )

    # Load annotations if available
    annotations = {}
    annotations_file = root / 'annotations.json'
    if annotations_file.exists():
        # Explicit encoding: the platform default is not always UTF-8.
        with open(annotations_file, 'r', encoding='utf-8') as f:
            annotations = json.load(f)

    data = {
        'image_path': [str(p) for p in image_paths],
        # Prefer the full-path key, then fall back to the bare file name.
        'caption': [
            annotations.get(str(p), annotations.get(p.name, ""))
            for p in image_paths
        ],
    }

    return Dataset.from_dict(data)

# Load dataset
# dataset_type is left at its default, so the custom-directory branch of
# load_image_dataset runs against CONFIG['data_dir'].
dataset = load_image_dataset(CONFIG['data_dir'])
print(f"Dataset size: {len(dataset)}")

## 2.2 Visualize Sample Images

In [None]:
def visualize_samples(dataset, num_samples=6):
    """Show the first ``num_samples`` images of ``dataset`` in a 3-column grid.

    Args:
        dataset: Indexable collection of dict-like samples. Each sample
            provides either an ``image`` entry or an ``image_path`` to open,
            and optionally a ``caption`` or ``text`` entry used as the title.
        num_samples: Maximum number of samples to draw.

    Returns:
        None. The figure is displayed with ``plt.show()``; nothing is drawn
        when there are no samples to show.
    """
    count = min(num_samples, len(dataset))
    if count <= 0:
        return

    # Size the grid from the number of images so num_samples > 6 no longer
    # indexes past a fixed 2x3 grid. Six samples still give 2x3 at (15, 10).
    n_cols = 3
    n_rows = (count + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.ravel()

    for i in range(count):
        sample = dataset[i]

        # Load image
        if 'image' in sample:
            image = sample['image']
        else:
            # convert() returns a loaded copy, so the file can be closed here.
            with Image.open(sample['image_path']) as opened:
                image = opened.convert('RGB')

        axes[i].imshow(image)
        axes[i].axis('off')

        # Show caption if available
        if 'caption' in sample or 'text' in sample:
            caption = sample.get('caption', sample.get('text', ''))
            # Some datasets store several captions per image; show the first.
            if isinstance(caption, (list, tuple)):
                caption = caption[0] if caption else ''
            title = caption[:50] + '...' if len(caption) > 50 else caption
            axes[i].set_title(title, fontsize=10)

    # Hide grid cells that received no image.
    for ax in axes[count:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Preview the first few dataset images (default num_samples=6).
visualize_samples(dataset)

## 2.3 Analyze Dataset Statistics

In [None]:
# Analyze image dimensions
def analyze_images(dataset, sample_size=100):
    """Plot and print width/height/aspect-ratio statistics for a random sample.

    Args:
        dataset: Indexable collection of dict-like samples, each providing
            either an ``image`` entry (an object with a ``size`` attribute)
            or an ``image_path`` to open.
        sample_size: Maximum number of images to sample without replacement.

    Returns:
        None. Three histograms are shown and the averages are printed.
    """
    total = len(dataset)
    if total == 0:
        # Averages of nothing would be NaN; report the situation instead.
        print("Dataset is empty; no image statistics to compute")
        return

    widths = []
    heights = []
    aspects = []

    sample_indices = np.random.choice(total, min(sample_size, total), replace=False)

    for idx in sample_indices:
        sample = dataset[int(idx)]

        if 'image' in sample:
            w, h = sample['image'].size
        else:
            # Only the size is needed; close the file right away so a large
            # sample does not exhaust file descriptors.
            with Image.open(sample['image_path']) as image:
                w, h = image.size

        if h == 0:
            # Degenerate image: aspect ratio is undefined, so skip it.
            continue

        widths.append(w)
        heights.append(h)
        aspects.append(w / h)

    # Visualize
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    axes[0].hist(widths, bins=30, edgecolor='black')
    axes[0].set_title('Width Distribution')
    axes[0].set_xlabel('Width (pixels)')

    axes[1].hist(heights, bins=30, edgecolor='black')
    axes[1].set_title('Height Distribution')
    axes[1].set_xlabel('Height (pixels)')

    axes[2].hist(aspects, bins=30, edgecolor='black')
    axes[2].set_title('Aspect Ratio Distribution')
    axes[2].set_xlabel('Width / Height')

    plt.tight_layout()
    plt.show()

    print(f"Average width: {np.mean(widths):.0f}px")
    print(f"Average height: {np.mean(heights):.0f}px")
    print(f"Average aspect ratio: {np.mean(aspects):.2f}")

# Report size statistics on a random sample (default sample_size=100).
analyze_images(dataset)

# 3. IMAGE CAPTIONING

## 3.1 Load Captioning Model

In [None]:
# Load the BLIP captioning checkpoint named in CONFIG['model_name'] together
# with its processor, and move the model to the selected device.
processor = BlipProcessor.from_pretrained(CONFIG['model_name'])
model = BlipForConditionalGeneration.from_pretrained(CONFIG['model_name']).to(device)

print(f"Model loaded: {CONFIG['model_name']}")
# Parameter count is reported in millions.
print(f"Parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M")

## 3.2 Generate Captions

In [None]:
def generate_caption(image_path, model, processor, device):
    """Generate a caption for a single image.

    Args:
        image_path: A filesystem path (``str`` or ``os.PathLike``) or an
            already loaded image. PIL images are converted to RGB; any other
            object is handed to the processor unchanged.
        model: Captioning model exposing ``generate``.
        processor: Matching processor, used for preprocessing and decoding.
        device: Device the processed inputs are moved to.

    Returns:
        The decoded caption string with special tokens removed.
    """
    # Load image
    if isinstance(image_path, (str, os.PathLike)):
        # convert() returns a loaded copy, so the file can be closed here.
        with Image.open(image_path) as opened:
            image = opened.convert('RGB')
    elif isinstance(image_path, Image.Image):
        # Normalize grayscale/palette/RGBA inputs the same way as file inputs.
        image = image_path.convert('RGB')
    else:
        image = image_path

    # Process
    inputs = processor(image, return_tensors="pt").to(device)

    # Generate
    # NOTE(review): temperature is normally only honoured by sampling
    # strategies; with beam search and no do_sample it likely has no effect.
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=CONFIG['max_new_tokens'],
            num_beams=CONFIG['num_beams'],
            temperature=CONFIG['temperature']
        )

    # Decode
    return processor.decode(output[0], skip_special_tokens=True)

## 3.3 Test Captioning on Samples

In [None]:
# Caption the first few dataset images and show them in a 2x2 grid
print("Generating captions for sample images...\n")

num_samples = 4
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

shown = min(num_samples, len(dataset))
for i in range(shown):
    sample = dataset[i]

    # Load image
    if 'image' in sample:
        image = sample['image']
    else:
        # convert() returns a loaded copy, so the file can be closed here.
        with Image.open(sample['image_path']) as opened:
            image = opened.convert('RGB')

    # Generate caption
    caption = generate_caption(image, model, processor, device)

    # Display
    axes[i].imshow(image)
    axes[i].axis('off')
    axes[i].set_title(f"Generated: {caption}", fontsize=10, wrap=True)

    print(f"Image {i+1}:")
    print(f"Generated Caption: {caption}")
    if 'caption' in sample:
        print(f"Ground Truth: {sample['caption']}")
    print()

# Hide grid cells that received no image (dataset smaller than the grid).
for ax in axes[shown:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

# 4. VISUAL QUESTION ANSWERING (VQA)

## 4.1 Load VQA Model

In [None]:
# Load the BLIP VQA checkpoint and its processor; the model is moved to the
# selected device. These are kept separate from the captioning model/processor.
vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)

print("VQA model loaded")

## 4.2 Answer Questions about Images

In [None]:
def answer_visual_question(image_path, question, model, processor, device):
    """Answer a free-text question about an image.

    Args:
        image_path: A filesystem path (``str`` or ``os.PathLike``) or an
            already loaded image. PIL images are converted to RGB; any other
            object is handed to the processor unchanged.
        question: The question text.
        model: VQA model exposing ``generate``.
        processor: Matching processor, used for preprocessing and decoding.
        device: Device the processed inputs are moved to.

    Returns:
        The decoded answer string with special tokens removed.
    """
    # Load image
    if isinstance(image_path, (str, os.PathLike)):
        # convert() returns a loaded copy, so the file can be closed here.
        with Image.open(image_path) as opened:
            image = opened.convert('RGB')
    elif isinstance(image_path, Image.Image):
        image = image_path.convert('RGB')
    else:
        image = image_path

    # Process
    inputs = processor(image, question, return_tensors="pt").to(device)

    # Generate answer
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=50)

    # Decode
    return processor.decode(output[0], skip_special_tokens=True)

## 4.3 Interactive VQA Demo

In [None]:
# Ask a handful of canned questions about the first image in the dataset
sample_image = dataset[0]
sample_image = (
    sample_image['image']
    if 'image' in sample_image
    else Image.open(sample_image['image_path'])
)

questions = [
    "What is in the image?",
    "What color is the main object?",
    "How many people are in the image?",
    "What is the setting of this image?",
]

print("Visual Question Answering Demo", "="*50, sep="\n")

# Show the image the questions refer to
plt.figure(figsize=(8, 6))
plt.imshow(sample_image)
plt.title("Query Image")
plt.axis('off')
plt.show()

for question in questions:
    answer = answer_visual_question(sample_image, question, vqa_model, vqa_processor, device)
    print(f"Q: {question}\nA: {answer}\n")

# 5. OPTICAL CHARACTER RECOGNITION (OCR)

## 5.1 Load TrOCR Model

In [None]:
# Load the TrOCR handwritten-text checkpoint and its processor; the model is
# moved to the selected device.
# NOTE: the import cell also binds VisionEncoderDecoderModel to the alias
# TrOCRModel; that alias is not used here.
trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(device)

print("TrOCR model loaded")

## 5.2 Extract Text from Images

In [None]:
def extract_text_trocr(image_path, model, processor, device):
    """Extract text from an image with a TrOCR-style model.

    Args:
        image_path: A filesystem path (``str`` or ``os.PathLike``) or an
            already loaded image. PIL images are converted to RGB; any other
            object is handed to the processor unchanged.
        model: Encoder-decoder OCR model exposing ``generate``.
        processor: Matching processor; must expose ``pixel_values`` on its
            output and provide ``batch_decode``.
        device: Device the pixel values are moved to.

    Returns:
        The decoded text of the first (only) generated sequence.
    """
    # Load image
    if isinstance(image_path, (str, os.PathLike)):
        # convert() returns a loaded copy, so the file can be closed here.
        with Image.open(image_path) as opened:
            image = opened.convert('RGB')
    elif isinstance(image_path, Image.Image):
        # Scans are often grayscale or palette images; normalize to RGB the
        # same way the file branch does.
        image = image_path.convert('RGB')
    else:
        image = image_path

    # Process
    pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)

    # Generate
    with torch.no_grad():
        generated_ids = model.generate(pixel_values, max_new_tokens=CONFIG['max_length'])

    # Decode
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

## 5.3 Alternative: EasyOCR

In [None]:
import easyocr

# Initialize EasyOCR
# English-only reader, created once at module level and reused by
# extract_text_easyocr; GPU use follows CUDA availability.
reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())

def extract_text_easyocr(image_path):
    """Extract text from an image using the module-level EasyOCR ``reader``.

    Args:
        image_path: A filesystem path (``str`` or ``os.PathLike``), a PIL
            image, or an array-like in RGB channel order.

    Returns:
        A ``(text, results)`` tuple: ``text`` joins the second element of
        every entry in ``results`` with spaces, and ``results`` is the raw
        list returned by ``reader.readtext``.

    Raises:
        ValueError: If a path is given but the file cannot be read as an image.
    """
    if isinstance(image_path, (str, os.PathLike)):
        image = cv2.imread(os.fspath(image_path))
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise ValueError(f"Could not read image: {image_path}")
    else:
        if isinstance(image_path, Image.Image):
            # The colour conversion below expects colour channels; grayscale
            # and palette images would otherwise fail.
            image_path = image_path.convert('RGB')
        image = cv2.cvtColor(np.array(image_path), cv2.COLOR_RGB2BGR)

    # Extract text
    results = reader.readtext(image)

    # Combine text
    text = " ".join(result[1] for result in results)

    return text, results

# Test OCR
# text = extract_text_trocr(sample_image, trocr_model, trocr_processor, device)
# print(f"Extracted Text: {text}")

# 6. DOCUMENT UNDERSTANDING

## 6.1 Load Document VQA Model

In [None]:
# Load the Pix2Struct DocVQA checkpoint and its processor; the model is moved
# to the selected device.
doc_processor = AutoProcessor.from_pretrained("google/pix2struct-docvqa-base")
doc_model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-base").to(device)

print("Document understanding model loaded")

## 6.2 Answer Questions about Documents

In [None]:
def answer_document_question(image_path, question, model, processor, device):
    """Answer a question about a document image.

    Args:
        image_path: A filesystem path (``str`` or ``os.PathLike``) or an
            already loaded image. PIL images are converted to RGB; any other
            object is handed to the processor unchanged.
        question: The question text.
        model: Document-VQA model exposing ``generate``.
        processor: Matching processor, called with ``images=`` and ``text=``.
        device: Device the processed inputs are moved to.

    Returns:
        The decoded answer of the first (only) generated sequence.
    """
    # Load image
    if isinstance(image_path, (str, os.PathLike)):
        # convert() returns a loaded copy, so the file can be closed here.
        with Image.open(image_path) as opened:
            image = opened.convert('RGB')
    elif isinstance(image_path, Image.Image):
        image = image_path.convert('RGB')
    else:
        image = image_path

    # Process
    inputs = processor(images=image, text=question, return_tensors="pt").to(device)

    # Generate
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=100)

    # Decode
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Test document understanding
# document_image = Image.open("invoice.png")
# question = "What is the total amount?"
# answer = answer_document_question(document_image, question, doc_model, doc_processor, device)
# print(f"Answer: {answer}")

# 7. BATCH PROCESSING

## 7.1 Batch Captioning

In [None]:
def batch_generate_captions(image_paths, model, processor, device, batch_size=8):
    """Generate captions for many images, ``batch_size`` at a time.

    Args:
        image_paths: Sequence of filesystem paths (``str`` or
            ``os.PathLike``) and/or already loaded images. PIL images are
            converted to RGB; other objects are passed through unchanged.
        model: Captioning model exposing ``generate``.
        processor: Matching processor, used for preprocessing and decoding.
        device: Device the processed inputs are moved to.
        batch_size: Number of images per forward pass; must be positive.

    Returns:
        A list of caption strings in the same order as ``image_paths``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step would silently yield no batches at all.
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    all_captions = []

    for start in range(0, len(image_paths), batch_size):
        batch_paths = image_paths[start:start + batch_size]

        # Load images
        images = []
        for path in batch_paths:
            if isinstance(path, (str, os.PathLike)):
                # convert() returns a loaded copy, so the file can be closed.
                with Image.open(path) as opened:
                    img = opened.convert('RGB')
            elif isinstance(path, Image.Image):
                img = path.convert('RGB')
            else:
                img = path
            images.append(img)

        # Process batch
        inputs = processor(images, return_tensors="pt", padding=True).to(device)

        # Generate with the same beam setting as generate_caption so batch
        # and single-image captions agree.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=CONFIG['max_new_tokens'],
                num_beams=CONFIG['num_beams'],
            )

        # Decode
        all_captions.extend(processor.batch_decode(outputs, skip_special_tokens=True))

    return all_captions

# Test batch processing
# sample_images = [dataset[i]['image_path'] for i in range(min(10, len(dataset)))]
# captions = batch_generate_captions(sample_images, model, processor, device)

# 8. EVALUATION METRICS

## 8.1 BLEU Score

In [None]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

def calculate_bleu(references, hypotheses):
    """Calculate corpus-level BLEU-1 through BLEU-4.

    Args:
        references: One entry per hypothesis. Each entry is either a single
            reference string or a list/tuple of reference strings.
        hypotheses: Generated strings, aligned with ``references``.

    Returns:
        Dict with keys ``'BLEU-1'`` .. ``'BLEU-4'``. All scores are ``0.0``
        when both inputs are empty.
    """
    weight_sets = {
        'BLEU-1': (1.0, 0, 0, 0),
        'BLEU-2': (0.5, 0.5, 0, 0),
        # Exact thirds: 0.33 * 3 does not sum to 1 and skews the mean.
        'BLEU-3': (1 / 3, 1 / 3, 1 / 3, 0),
        'BLEU-4': (0.25, 0.25, 0.25, 0.25),
    }

    if not references and not hypotheses:
        # Nothing to score; avoid handing an empty corpus to the scorer.
        return {name: 0.0 for name in weight_sets}

    # Tokenize on whitespace; a bare string is one reference for its item.
    refs = [
        [r.split() for r in ([ref] if isinstance(ref, str) else ref)]
        for ref in references
    ]
    hyps = [hyp.split() for hyp in hypotheses]

    # Calculate BLEU
    return {
        name: corpus_bleu(refs, hyps, weights=weights)
        for name, weights in weight_sets.items()
    }

## 8.2 ROUGE Score

In [None]:
def calculate_rouge(references, hypotheses):
    """Calculate mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    Args:
        references: One entry per hypothesis. Each entry is either a single
            reference string or a list/tuple of reference strings, in which
            case the best F-measure over them is used.
        hypotheses: Generated strings, aligned with ``references``.

    Returns:
        Dict with keys ``'rouge1'``, ``'rouge2'``, ``'rougeL'`` holding the
        average F-measure. All scores are ``0.0`` for empty input.

    Raises:
        ValueError: If the two inputs differ in length.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']

    references = list(references)
    hypotheses = list(hypotheses)
    if len(references) != len(hypotheses):
        # zip() would silently drop the surplus and misreport the scores.
        raise ValueError(
            f"references and hypotheses differ in length: "
            f"{len(references)} != {len(hypotheses)}"
        )
    if not references:
        # The mean of an empty list is NaN; report zeros instead.
        return {name: 0.0 for name in metric_names}

    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    scores = {name: [] for name in metric_names}

    for ref, hyp in zip(references, hypotheses):
        candidates = [ref] if isinstance(ref, str) else list(ref)
        per_reference = [scorer.score(candidate, hyp) for candidate in candidates]
        for name in metric_names:
            best = max((s[name].fmeasure for s in per_reference), default=0.0)
            scores[name].append(best)

    # Average scores
    return {name: np.mean(values) for name, values in scores.items()}

## 8.3 METEOR and CIDEr

In [None]:
# Load evaluation metrics
try:
    meteor = load_metric('meteor')
    # cider = load_metric('cider')
except Exception as exc:
    # Best-effort: keep the notebook running without METEOR, but report why
    # loading failed. A bare ``except:`` would also swallow KeyboardInterrupt.
    meteor = None
    print("Meteor metric not available")
    print(f"  Reason: {exc}")


def calculate_meteor(references, predictions):
    """Calculate METEOR score.

    Args:
        references: Reference texts, aligned with ``predictions``.
        predictions: Generated texts.

    Returns:
        The value stored under ``'meteor'`` in the metric's result.

    Raises:
        RuntimeError: If the METEOR metric could not be loaded.
    """
    if meteor is None:
        raise RuntimeError("Meteor metric not available")
    score = meteor.compute(predictions=predictions, references=references)
    return score['meteor']

## 8.4 Evaluate Model

In [None]:
def evaluate_captioning_model(dataset, model, processor, device, num_samples=100):
    """Evaluate a captioning model with BLEU and ROUGE on a random sample.

    Args:
        dataset: Indexable collection of dict-like samples providing an
            ``image`` or ``image_path`` entry and a ``caption`` or ``text``
            reference.
        model: Captioning model passed to ``generate_caption``.
        processor: Processor passed to ``generate_caption``.
        device: Device passed to ``generate_caption``.
        num_samples: Maximum number of samples drawn without replacement.

    Returns:
        A ``(bleu_scores, rouge_scores)`` tuple of dicts. Both are empty when
        no sampled item carries a non-empty reference.
    """
    references = []
    hypotheses = []

    total = len(dataset)
    sample_indices = (
        np.random.choice(total, min(num_samples, total), replace=False)
        if total else []
    )

    for idx in sample_indices:
        sample = dataset[int(idx)]

        # Resolve the reference first so no generation time is spent on
        # samples that cannot be scored.
        if 'caption' in sample:
            reference = sample['caption']
        elif 'text' in sample:
            reference = sample['text']
        else:
            continue
        if not reference:
            # Unannotated images carry "" and would drag every metric down.
            continue

        # Load image
        if 'image' in sample:
            image = sample['image']
        else:
            # convert() returns a loaded copy, so the file can be closed here.
            with Image.open(sample['image_path']) as opened:
                image = opened.convert('RGB')

        # Generate caption
        caption = generate_caption(image, model, processor, device)

        references.append(reference)
        hypotheses.append(caption)

    if not references:
        print("\nNo samples with reference captions found; nothing to evaluate.")
        return {}, {}

    # Calculate metrics
    bleu_scores = calculate_bleu(references, hypotheses)
    rouge_scores = calculate_rouge(references, hypotheses)

    print("\nEvaluation Results:")
    print("="*50)
    print("\nBLEU Scores:")
    for key, value in bleu_scores.items():
        print(f"  {key}: {value:.4f}")

    print("\nROUGE Scores:")
    for key, value in rouge_scores.items():
        print(f"  {key}: {value:.4f}")

    return bleu_scores, rouge_scores

# Evaluate model
# bleu_scores, rouge_scores = evaluate_captioning_model(dataset, model, processor, device)

# 9. GRADIO INTERFACE

## 9.1 Image Captioning Interface

In [None]:
import gradio as gr

def create_captioning_interface(model, processor, device):
    """Build a Gradio app that captions an uploaded image.

    The given model, processor and device are captured by the callback and
    forwarded to ``generate_caption``. Returns the ``gr.Interface``; it is
    not launched here.
    """
    image_input = gr.Image(type="pil", label="Upload Image")
    caption_output = gr.Textbox(label="Generated Caption")

    def caption_image(image):
        return generate_caption(image, model, processor, device)

    # Add example image paths to this list to show them under the input.
    example_images = []

    return gr.Interface(
        fn=caption_image,
        inputs=image_input,
        outputs=caption_output,
        title="Image Captioning",
        description="Upload an image to generate a caption",
        examples=example_images,
    )

# Launch interface
# interface = create_captioning_interface(model, processor, device)
# interface.launch(share=True)

## 9.2 VQA Interface

In [None]:
def create_vqa_interface(model, processor, device):
    """Build a Gradio app that answers questions about an uploaded image.

    The given model, processor and device are captured by the callback and
    forwarded to ``answer_visual_question``. Returns the ``gr.Interface``;
    it is not launched here.
    """
    image_input = gr.Image(type="pil", label="Upload Image")
    question_input = gr.Textbox(label="Ask a Question")
    answer_output = gr.Textbox(label="Answer")

    def answer_question(image, question):
        return answer_visual_question(image, question, model, processor, device)

    return gr.Interface(
        fn=answer_question,
        inputs=[image_input, question_input],
        outputs=answer_output,
        title="Visual Question Answering",
        description="Upload an image and ask questions about it",
    )

# Launch VQA interface
# vqa_interface = create_vqa_interface(vqa_model, vqa_processor, device)
# vqa_interface.launch(share=True)

## 9.3 Multi-Task Interface

In [None]:
def create_multitask_interface():
    """Build a tabbed Gradio app for captioning, VQA and OCR.

    The callbacks use the module-level models and processors (``model`` /
    ``processor``, ``vqa_model`` / ``vqa_processor``, ``trocr_model`` /
    ``trocr_processor``) and ``device``. Returns the ``gr.Blocks`` app; it
    is not launched here.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Multimodal Vision-Language System")

        # Tab 1: caption an uploaded image
        with gr.Tab("Image Captioning"):
            caption_input = gr.Image(type="pil")
            caption_trigger = gr.Button("Generate Caption")
            caption_box = gr.Textbox(label="Caption")

            caption_trigger.click(
                fn=lambda img: generate_caption(img, model, processor, device),
                inputs=caption_input,
                outputs=caption_box,
            )

        # Tab 2: answer a question about an uploaded image
        with gr.Tab("Visual QA"):
            qa_input = gr.Image(type="pil")
            qa_question = gr.Textbox(label="Question")
            qa_trigger = gr.Button("Get Answer")
            qa_box = gr.Textbox(label="Answer")

            qa_trigger.click(
                fn=lambda img, q: answer_visual_question(img, q, vqa_model, vqa_processor, device),
                inputs=[qa_input, qa_question],
                outputs=qa_box,
            )

        # Tab 3: read text from an uploaded image with TrOCR
        with gr.Tab("OCR"):
            text_input = gr.Image(type="pil")
            text_trigger = gr.Button("Extract Text")
            text_box = gr.Textbox(label="Extracted Text")

            text_trigger.click(
                fn=lambda img: extract_text_trocr(img, trocr_model, trocr_processor, device),
                inputs=text_input,
                outputs=text_box,
            )

    return demo

# Launch multi-task interface
# multi_interface = create_multitask_interface()
# multi_interface.launch(share=True)

# 10. MODEL FINE-TUNING

## 10.1 Prepare Training Data

In [None]:
# Aliased on purpose: a plain ``Dataset`` import would rebind the name and
# shadow ``datasets.Dataset``, which load_image_dataset relies on.
from torch.utils.data import Dataset as TorchDataset


class CaptioningDataset(TorchDataset):
    """Torch dataset that turns image/caption samples into model inputs.

    Each item is the processor's output for one sample with the leading batch
    dimension removed, plus a ``labels`` tensor derived from ``input_ids``.
    """

    def __init__(self, data, processor, max_length=128):
        """Store the samples, the processor and the caption length limit.

        Args:
            data: Indexable collection of dict-like samples providing an
                ``image`` or ``image_path`` entry and optionally a
                ``caption`` or ``text`` entry.
            processor: Callable accepting ``images``, ``text``, ``padding``,
                ``max_length``, ``truncation`` and ``return_tensors``.
            max_length: Length captions are padded or truncated to.
        """
        self.data = data
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the processed tensors for sample ``idx`` as a dict."""
        sample = self.data[idx]

        # Load image
        if 'image' in sample:
            image = sample['image']
        else:
            # convert() returns a loaded copy, so the file can be closed here.
            with Image.open(sample['image_path']) as opened:
                image = opened.convert('RGB')

        # Get caption
        caption = sample.get('caption', sample.get('text', ''))

        # Process
        encoding = self.processor(
            images=image,
            text=caption,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt"
        )

        # Remove only the batch dimension; a bare squeeze() would also drop
        # any other size-1 axis (e.g. a sequence of length 1).
        item = {k: v.squeeze(0) for k, v in encoding.items()}

        # The Trainer needs the model to return a loss, which requires
        # labels. Use the caption tokens and mask padding with -100, the
        # default ignore index of torch's cross-entropy loss.
        if 'input_ids' in item and 'labels' not in item:
            labels = item['input_ids'].clone()
            attention_mask = item.get('attention_mask')
            if attention_mask is not None:
                labels[attention_mask == 0] = -100
            item['labels'] = labels

        return item

# Create dataset
# train_dataset = CaptioningDataset(dataset, processor)

## 10.2 Training Loop

In [None]:
from transformers import Trainer, TrainingArguments

def train_captioning_model(model, train_dataset, eval_dataset, output_dir='./finetuned_model'):
    """Fine-tune a captioning model with the HuggingFace ``Trainer``.

    Epochs, batch size, warmup steps and learning rate come from ``CONFIG``.

    Args:
        model: Model to fine-tune.
        train_dataset: Training dataset.
        eval_dataset: Evaluation dataset, or ``None`` to train without
            per-epoch evaluation.
        output_dir: Directory for checkpoints.

    Returns:
        The ``Trainer`` instance after ``train()`` has completed.
    """
    # Per-epoch evaluation and best-checkpoint reloading both need an
    # evaluation set; turn them off together when none is supplied.
    has_eval = eval_dataset is not None

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=CONFIG['num_epochs'],
        per_device_train_batch_size=CONFIG['batch_size'],
        per_device_eval_batch_size=CONFIG['batch_size'],
        warmup_steps=CONFIG['warmup_steps'],
        learning_rate=CONFIG['learning_rate'],
        logging_steps=100,
        eval_strategy='epoch' if has_eval else 'no',
        save_strategy='epoch',
        load_best_model_at_end=has_eval,
        # Mixed precision only when a CUDA device is available.
        fp16=torch.cuda.is_available(),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    trainer.train()

    return trainer

# Fine-tune model
# trainer = train_captioning_model(model, train_dataset, eval_dataset)

# 11. DEPLOYMENT

In [None]:
# Persist model and processor side by side
def save_model(model, processor, output_dir='./saved_model'):
    """Write the model, then the processor, into ``output_dir``."""
    for artifact in (model, processor):
        artifact.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")

# Restore a previously saved BLIP captioning model
def load_saved_model(model_dir):
    """Load the BLIP model (moved to the global ``device``) and its processor.

    Returns a ``(model, processor)`` tuple read from ``model_dir``.
    """
    restored_model = BlipForConditionalGeneration.from_pretrained(model_dir)
    restored_model = restored_model.to(device)
    restored_processor = BlipProcessor.from_pretrained(model_dir)
    return restored_model, restored_processor

# 12. CONCLUSIONS & NEXT STEPS

## Summary:
- Task: {CONFIG['task']}
- Model: {CONFIG['model_name']}
- Generated captions for X images
- Average BLEU score: X.XX

## Next Steps:
- [ ] Fine-tune on domain-specific data
- [ ] Implement attention visualization
- [ ] Add support for video captioning
- [ ] Implement multi-language support
- [ ] Optimize inference speed
- [ ] Deploy as REST API
- [ ] Add real-time webcam support
- [ ] Implement image generation from text