This script documents the framework used in the paper "Multimodal Critique of Authority – Humor as a Function of Dissent during the COVID-19 Pandemic" (paper in progress).

The framework in question can be summarized as follows:

### Step 1: Contrastive image-text loss for representation learning
### Step 2: Hybrid-modal attention for cross-modal fusion
### Step 3: Humor classification
### Step 4: Mutual learning

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import AutoModel, AutoTokenizer, CLIPProcessor, CLIPModel
from PIL import Image
from torchvision import transforms
import clip

In [None]:
# Load the M-CLIP checkpoint; it is used for the text embeddings.
# NOTE(review): M-CLIP's published examples load this checkpoint through the
# `multilingual_clip` package -- confirm that AutoModel resolves it as intended
# and that its output size matches the CLIP image embeddings used below.
model_name = "M-CLIP/XLM-Roberta-Large-Vit-B-32"
text_model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the CLIP checkpoint; it is used for the image embeddings.
# `model_name` is rebound here, so from this point on it names the CLIP
# checkpoint rather than the M-CLIP one.
model_name = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(model_name).eval()  # put the image model in evaluation mode
clip_processor = CLIPProcessor.from_pretrained(model_name)

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# Step 1: Contrastive loss

In [None]:
def get_text_embedding(text):
    """Return a mean-pooled text embedding from the M-CLIP text model.

    Args:
        text: A string, or a list of strings, accepted by ``tokenizer``.

    Returns:
        A tensor with one row per input text, produced without gradient
        tracking (this helper is meant for inference).
    """
    # Follow whatever device the model currently lives on so the forward pass
    # never mixes CPU inputs with GPU weights.
    model_device = next(text_model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}

    with torch.no_grad():
        hidden_states = text_model(**inputs).last_hidden_state

    # The tokenizer pads batched input, so a plain mean over the sequence axis
    # would average in the pad positions. Weight by the attention mask instead;
    # for an unpadded sequence this is the ordinary mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    text_features = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return text_features


def get_image_embedding(image_path):
    """Return the CLIP image embedding for the image stored at ``image_path``.

    Args:
        image_path: Path (or file object) that ``PIL.Image.open`` can read.

    Returns:
        A tensor holding a single row (one image), produced without gradient
        tracking and located on the device ``clip_model`` lives on.
    """
    # Use the model's own device instead of a hard-coded "cuda" so the function
    # also works on CPU-only machines.
    model_device = next(clip_model.parameters()).device

    # The context manager closes the file handle; convert to RGB so greyscale,
    # palette and RGBA images are handled uniformly by the processor.
    with Image.open(image_path) as image:
        processed = clip_processor(images=image.convert("RGB"), return_tensors="pt")
    pixel_values = processed["pixel_values"].to(model_device)

    with torch.no_grad():
        # Hugging Face's CLIPModel exposes `get_image_features`; `encode_image`
        # belongs to the OpenAI `clip` package's model class.
        image_features = clip_model.get_image_features(pixel_values=pixel_values)
    return image_features


class ContrastiveLoss(nn.Module):
    """Symmetric cross-entropy loss over image-text similarity scores.

    The i-th image and the i-th text are treated as the matching pair; the
    loss is the average of the image-to-text and text-to-image cross-entropy.
    """

    def __init__(self, temperature=0.07):
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss()
        self.temperature = temperature

    def forward(self, image_embeddings, text_embeddings):
        # Rows index images, columns index texts.
        logits_per_image = (image_embeddings @ text_embeddings.T) / self.temperature
        logits_per_text = logits_per_image.T

        # The correct "class" for row i is column i.
        num_pairs = image_embeddings.shape[0]
        targets = torch.arange(num_pairs, device=logits_per_image.device)

        image_to_text = self.loss_fn(logits_per_image, targets)
        text_to_image = self.loss_fn(logits_per_text, targets)
        return (image_to_text + text_to_image) / 2

In [None]:
# Example training loop
def _pooled_text_embeddings(texts):
    """Encode ``texts`` with ``text_model`` while keeping the autograd graph."""
    model_device = next(text_model.parameters()).device
    encoded = tokenizer(list(texts), return_tensors="pt", padding=True, truncation=True)
    encoded = {name: tensor.to(model_device) for name, tensor in encoded.items()}
    hidden_states = text_model(**encoded).last_hidden_state
    # Mask-weighted mean so that pad positions do not contribute to the average.
    mask = encoded["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    return (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)


def train_contrastive_model(image_paths, texts, epochs=10, lr=1e-4):
    """Fine-tune ``text_model`` so its embeddings align with CLIP image embeddings.

    Every epoch performs one full-batch optimisation step over all pairs.

    Args:
        image_paths: Image paths; the i-th image is paired with the i-th text.
        texts: Strings paired one-to-one with ``image_paths``.
        epochs: Number of optimisation steps.
        lr: Learning rate for the Adam optimiser.

    Returns:
        The (globally shared) ``text_model`` after training.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    image_paths = list(image_paths)
    texts = list(texts)
    if not texts or len(texts) != len(image_paths):
        raise ValueError(
            "image_paths and texts must be non-empty and of equal length "
            f"(got {len(image_paths)} images and {len(texts)} texts)"
        )

    contrastive_loss = ContrastiveLoss().to(device)
    optimizer = optim.Adam(text_model.parameters(), lr=lr)

    # Only text_model's parameters are optimised and the image embeddings are
    # computed without gradients, so they are identical in every epoch:
    # compute and normalise them once.
    image_embeddings = torch.cat([get_image_embedding(path) for path in image_paths]).to(device)
    image_embeddings = F.normalize(image_embeddings, dim=1)

    # NOTE(review): the similarity matmul needs text and image embeddings to
    # share the same feature size -- confirm text_model's output width matches
    # the CLIP image projection.
    for epoch in range(epochs):
        optimizer.zero_grad()

        # The text forward pass must run with autograd enabled; embeddings
        # produced under torch.no_grad() carry no graph, which makes
        # loss.backward() fail and leaves text_model untouched.
        text_embeddings = _pooled_text_embeddings(texts).to(device)
        text_embeddings = F.normalize(text_embeddings, dim=1)

        loss = contrastive_loss(image_embeddings, text_embeddings)
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

    print("Training completed.")
    return text_model

# Step 2: Hybrid-modal attention