# Advanced Multimodal Techniques

This notebook covers advanced techniques in multimodal learning, including attention mechanisms, cross-modal transformers, and practical applications.

## Setup

In [None]:
!pip install torch torchvision transformers

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

## Cross-Modal Attention

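Attention mechanisms allow models to focus on relevant parts of one modality based on information from another. In the implementation below, the queries are computed from the first modality and the keys and values from the second, so every position in the first modality receives a softmax-weighted mix of the second modality's features.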

In [None]:
class CrossModalAttention(nn.Module):
    """Single-head scaled dot-product attention from one modality onto another.

    Queries are computed from ``x1``; keys and values are computed from
    ``x2``. Each position of ``x1`` therefore receives a softmax-weighted
    mix of the projected ``x2`` features.
    """

    def __init__(self, dim):
        """Create the three ``dim -> dim`` projections and the score scale.

        Args:
            dim: Size of the last (feature) dimension of both inputs.
        """
        super().__init__()
        self.query = nn.Linear(dim, dim)
        self.key = nn.Linear(dim, dim)
        self.value = nn.Linear(dim, dim)
        # 1 / sqrt(dim): keeps the dot products from growing with the
        # feature size before they reach the softmax.
        self.scale = dim ** -0.5

    def forward(self, x1, x2):
        """Attend from ``x1`` over ``x2``.

        Args:
            x1: Modality 1 features, shape ``(..., n1, dim)``.
            x2: Modality 2 features, shape ``(..., n2, dim)``.

        Returns:
            Tensor of shape ``(..., n1, dim)``: for every ``x1`` position,
            the attention-weighted sum of the projected ``x2`` values.
        """
        queries = self.query(x1)
        keys = self.key(x2)
        values = self.value(x2)

        # (..., n1, dim) @ (..., dim, n2) -> (..., n1, n2) similarity scores.
        scores = (queries @ keys.transpose(-2, -1)) * self.scale
        # Normalise over the x2 positions so each row sums to 1.
        weights = scores.softmax(dim=-1)

        return weights @ values

# Example usage: x1 supplies the queries, x2 supplies the keys and values.
attention = CrossModalAttention(dim=256)
x1 = torch.randn(1, 10, 256)  # Text features: random stand-in, 10 positions of 256 features
x2 = torch.randn(1, 49, 256)  # Image features: random stand-in, 49 positions of 256 features
output = attention(x1, x2)
# One output row per x1 position, so this prints torch.Size([1, 10, 256]).
print(f"Output shape: {output.shape}")

## Multimodal Fusion Network

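A complete network for fusing two modalities: text and image features are projected to a shared hidden size, the text attends over the image via cross-modal attention, and the concatenated result is mean-pooled over the text positions and classified.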

In [None]:
class MultimodalFusionNetwork(nn.Module):
    """Classifier that fuses text and image features via cross-modal attention.

    Both modalities are projected to ``hidden_dim``. The projected text then
    attends over the projected image, the text and attended-image features
    are concatenated, averaged over axis 1, and passed to a small MLP head.
    """

    def __init__(self, text_dim, image_dim, hidden_dim, num_classes):
        """Build the projections, the attention module and the MLP head.

        Args:
            text_dim: Feature size of the incoming text features.
            image_dim: Feature size of the incoming image features.
            hidden_dim: Shared feature size after projection.
            num_classes: Number of output logits.
        """
        super().__init__()

        # Per-modality projections into the shared hidden size.
        self.text_proj = nn.Linear(text_dim, hidden_dim)
        self.image_proj = nn.Linear(image_dim, hidden_dim)

        # Text is passed as x1 (queries), image as x2 (keys/values).
        self.attention = CrossModalAttention(hidden_dim)

        # The head receives [text ; attended image], hence the doubled width.
        fused_width = hidden_dim * 2
        head_layers = [
            nn.Linear(fused_width, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, num_classes),
        ]
        self.fusion = nn.Sequential(*head_layers)

    def forward(self, text_features, image_features):
        """Return class logits for a batch of paired text/image features.

        Args:
            text_features: Tensor with last dimension ``text_dim``; axis 1 is
                averaged away, e.g. ``(batch, text_len, text_dim)``.
            image_features: Tensor with last dimension ``image_dim``, e.g.
                ``(batch, num_regions, image_dim)``.

        Returns:
            Logits with last dimension ``num_classes``; for 3-D inputs the
            shape is ``(batch, num_classes)``.
        """
        text_hidden = self.text_proj(text_features)
        image_hidden = self.image_proj(image_features)

        # One image-context vector per text position.
        image_context = self.attention(text_hidden, image_hidden)

        fused = torch.cat((text_hidden, image_context), dim=-1)
        # Collapse axis 1 so the head sees a single vector per example.
        pooled = fused.mean(dim=1)

        return self.fusion(pooled)

# Example: build a 10-class fusion model and run one random text/image pair.
model = MultimodalFusionNetwork(
    text_dim=768,
    image_dim=2048,
    hidden_dim=256,
    num_classes=10
)

# Random stand-ins for real encoder outputs; the last dimension of each
# tensor must match the text_dim / image_dim passed to the model above.
text_feat = torch.randn(1, 20, 768)
image_feat = torch.randn(1, 49, 2048)
output = model(text_feat, image_feat)
# Axis 1 is mean-pooled inside the model, so this prints torch.Size([1, 10]).
print(f"Output shape: {output.shape}")

## Applications

Multimodal learning is used in:
- **Image Captioning**: Generating text descriptions of images
- **Visual Question Answering**: Answering questions about images
- **Video Understanding**: Analyzing video content with audio and text
- **Cross-modal Retrieval**: Finding images from text queries or vice versa
- **Multimodal Sentiment Analysis**: Analyzing sentiment from text, audio, and video

## Conclusion

This tutorial has covered:
1. Introduction to multimodal learning concepts
2. Processing techniques for visual and text data
3. Advanced fusion and attention mechanisms

For more information, check out the data folder for sample datasets and additional resources.