# Visual and Text Processing

This notebook explores techniques for processing visual and textual data, including feature extraction and representation learning.

## Setup

Install and import the necessary libraries.

In [None]:
!pip install torch torchvision transformers pillow

In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from torchvision import models, transforms
from PIL import Image
import numpy as np

## Text Processing with BERT

We'll use a pre-trained BERT model (`bert-base-uncased`) to extract contextual, per-token features from a sample sentence.

In [None]:
# The tokenizer and the encoder are both loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Sentence used to demonstrate text feature extraction.
text = "This is an example of multimodal learning."

# Convert the sentence into PyTorch tensors that the model accepts.
inputs = tokenizer(
    text,
    return_tensors='pt',
    padding=True,
    truncation=True,
)

# Forward pass only; gradient tracking is disabled because nothing is trained here.
with torch.no_grad():
    outputs = bert_model(**inputs)

# Use the hidden states of the final encoder layer as the text features.
text_features = outputs.last_hidden_state

print(f"Text features shape: {text_features.shape}")

## Visual Processing with ResNet

We'll use a pre-trained ResNet-50 model, with its final classification layer removed, to extract visual features from an image.

In [None]:
# Load ResNet-50 with the IMAGENET1K_V1 pretrained weights and switch it to
# evaluation mode, since the model is only used for inference here.
resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
resnet.eval()

# Drop the last child module (the classification layer) so the network returns
# features instead of class scores.
feature_extractor = nn.Sequential(*list(resnet.children())[:-1])

# Image preprocessing: resize, center-crop to 224x224, convert to a tensor,
# then normalize each channel with the mean/std values below.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Create a reproducible random sample image.
# - A seeded generator yields the same image on every Restart & Run All.
# - The upper bound of `integers` is exclusive, so 256 is required to cover
#   the full uint8 range 0..255 (an upper bound of 255 would never emit 255).
sample_rng = np.random.default_rng(42)
img = Image.fromarray(sample_rng.integers(0, 256, size=(224, 224, 3), dtype=np.uint8))
img_tensor = preprocess(img).unsqueeze(0)  # add a leading batch dimension

# Extract features; gradients are not needed for inference.
with torch.no_grad():
    image_features = feature_extractor(img_tensor)

print(f"Image features shape: {image_features.shape}")

## Feature Fusion

Combining visual and text features is a key step in multimodal learning. The cell below only lists common fusion strategies; it does not perform the fusion itself.

In [None]:
# Overview of fusion strategies. No features are actually combined in this cell.
# Note: a real implementation must first reconcile the differing feature dimensions.
fusion_overview_lines = (
    "Visual and text features can be combined using:",
    "- Concatenation",
    "- Attention mechanisms",
    "- Cross-modal transformers",
    "- Bilinear pooling",
)
# A single print with a newline separator emits exactly one line per entry.
print(*fusion_overview_lines, sep="\n")