In [1]:
from snorkel.labeling import labeling_function
import json
import os
import numpy as np
from snorkel.labeling import LFApplier
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import LabelModel
from snorkel.analysis import metric_score
from snorkel.utils import probs_to_preds
import os
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from transformers import CLIPProcessor, CLIPModel
import random



In [2]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
import numpy as np

def calculate_metrics(y_true, y_pred, abstain_class=-1):
    # Filter out samples where prediction is -1
    valid_indices = y_pred != abstain_class
    y_true_filtered = y_true[valid_indices]
    y_pred_filtered = y_pred[valid_indices]

    # Compute metrics
    conf_matrix = confusion_matrix(y_true_filtered, y_pred_filtered)
    precision = precision_score(y_true_filtered, y_pred_filtered)
    recall = recall_score(y_true_filtered, y_pred_filtered)
    f1 = f1_score(y_true_filtered, y_pred_filtered)
    accuracy = accuracy_score(y_true_filtered, y_pred_filtered)

    return {
        'Confusion Matrix': conf_matrix,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Accuracy': accuracy
    }

In [3]:
POSITIVE = 1
NEGATIVE = 0
ABSTAIN = -1

@labeling_function()
def llava_7b(image_name):
    root_path = '../prompting_framework/prompting_results/hateful/total_results/'
    llava_7b_results = 'llava:7b_results_hateful.json'
    path_to_llava_7b_results = os.path.join(root_path,llava_7b_results)
    with open(path_to_llava_7b_results, 'r') as file:
        data = json.load(file)

    return data[image_name] if data[image_name] is not None else -1

@labeling_function()
def llava_13b(image_name):
    root_path = '../prompting_framework/prompting_results/hateful/total_results/'
    llava_7b_results = 'llava 13b-allsamples-results.json'
    path_to_llava_7b_results = os.path.join(root_path,llava_7b_results)
    with open(path_to_llava_7b_results, 'r') as file:
        data = json.load(file)

    return data[image_name] if data[image_name] is not None else -1

@labeling_function()
def bakllava(image_name):
    root_path = '../prompting_framework/prompting_results/hateful/total_results/'
    llava_7b_results = 'bakllava-allsamples-results.json'
    path_to_llava_7b_results = os.path.join(root_path,llava_7b_results)
    with open(path_to_llava_7b_results, 'r') as file:
        data = json.load(file)

    return data[image_name] if data[image_name] is not None else -1

@labeling_function()
def llava_llama3(image_name):
    root_path = '../prompting_framework/prompting_results/hateful/total_results/'
    llava_7b_results = 'llava-llama3-allsamples-results.json'
    path_to_llava_7b_results = os.path.join(root_path,llava_7b_results)
    with open(path_to_llava_7b_results, 'r') as file:
        data = json.load(file)

    return data[image_name] if data[image_name] is not None else -1

@labeling_function()
def llava_phi3(image_name):
    root_path = '../prompting_framework/prompting_results/hateful/total_results/'
    llava_7b_results = 'llava-phi3-allsamples-results.json'
    path_to_llava_7b_results = os.path.join(root_path,llava_7b_results)
    with open(path_to_llava_7b_results, 'r') as file:
        data = json.load(file)

    return data[image_name] if data[image_name] is not None else -1


@labeling_function()
def moondream(image_name):
    root_path = '../prompting_framework/prompting_results/hateful/total_results/'
    llava_7b_results = 'moondream-allsamples-results.json'
    path_to_llava_7b_results = os.path.join(root_path,llava_7b_results)
    with open(path_to_llava_7b_results, 'r') as file:
        data = json.load(file)

    return data[image_name] if data[image_name] is not None else -1

@labeling_function()
def llava_34b(image_name):
    root_path = '../prompting_framework/prompting_results/hateful/total_results/'
    llava_7b_results = 'llava 34b-allsamples-results.json'
    path_to_llava_7b_results = os.path.join(root_path,llava_7b_results)
    with open(path_to_llava_7b_results, 'r') as file:
        data = json.load(file)

    return data[image_name] if data[image_name] is not None else -1

In [4]:
train_data_json_path = '../prompting_framework/prompting_results/hateful/simplified_train.json'
dev_data_json_path = '../prompting_framework/prompting_results/hateful/simplified_dev.json'

with open(train_data_json_path, 'r') as file:
    train_data = json.load(file)

# Extract and pad image names, ensuring they are 5 digits long before the '.png'
train_image_names = []
for entry in train_data:
    img_name, ext = entry['img'].split('.')
    padded_img_name = img_name.zfill(5)  # Pad the image name to 5 digits
    train_image_names.append(f"{padded_img_name}.{ext}")

with open(dev_data_json_path, 'r') as file:
    dev_data = json.load(file)
    
dev_image_names = []
Y_dev = []
for entry in dev_data:
    Y_dev.append(entry['label'])
    img_name, ext = entry['img'].split('.')
    padded_img_name = img_name.zfill(5)  # Pad the image name to 5 digits
    dev_image_names.append(f"{padded_img_name}.{ext}")

print(f"There are {len(train_image_names)} images in the Train set.")
print(f"There are {len(dev_image_names)} images in the dev set.")
print(f"There are {len(Y_dev)} labels in the dev set.")


There are 8500 images in the Train set.
There are 500 images in the dev set.
There are 500 labels in the dev set.


In [5]:

lfs = [moondream,
       llava_34b,
       llava_13b,
       llava_phi3,
       bakllava,
       llava_7b,
       llava_llama3
       ]

applier = LFApplier(lfs)

In [6]:
L_dev = applier.apply(dev_image_names)
L_train = applier.apply(train_image_names)

500it [00:07, 69.58it/s]
8500it [01:10, 121.43it/s]


In [7]:
Y_dev = np.array(Y_dev)
LFAnalysis(L_dev, lfs).lf_summary(Y_dev)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
moondream,0,"[0, 1]",1.0,1.0,0.752,246,254,0.492
llava_34b,1,"[0, 1]",1.0,1.0,0.752,269,231,0.538
llava_13b,2,"[0, 1]",1.0,1.0,0.752,288,212,0.576
llava_phi3,3,"[0, 1]",0.99,0.99,0.744,267,228,0.539394
bakllava,4,"[0, 1]",0.95,0.95,0.71,276,199,0.581053
llava_7b,5,"[0, 1]",1.0,1.0,0.752,298,202,0.596
llava_llama3,6,"[0, 1]",1.0,1.0,0.752,278,222,0.556


In [8]:
label_model = LabelModel(cardinality=2, verbose=False)
label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500, seed=12345)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 4048.92epoch/s]


# Parameter Search

In [9]:
seeds = [1, 12, 123, 1234, 12345]
epochs = [100, 200, 300, 400, 800, 1000, 2000, 5000]

In [10]:
all_results = []

for seed in seeds:
    for epoch in epochs:
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L_train, Y_dev, n_epochs=epoch, log_freq=500, seed=seed)

        probs_dev = label_model.predict_proba(L_dev)
        preds_dev = probs_to_preds(probs_dev)
        
        metrics = calculate_metrics(Y_dev, preds_dev)

        all_results.append([metrics['F1 Score'], epoch, seed])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 4202.08epoch/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 4303.99epoch/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:00<00:00, 4204.24epoch/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:00<00:00, 4364.38epoch/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 800/800 [00:00<00:00, 4398.93epoch/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 4421.53epoch/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<0

In [None]:
all_results

In [11]:

label_model = LabelModel(cardinality=2, verbose=False)
label_model.fit(L_train, Y_dev, n_epochs=100, log_freq=500, seed=12)

probs_dev = label_model.predict_proba(L_dev)
preds_dev = probs_to_preds(probs_dev)

metrics = calculate_metrics(Y_dev, preds_dev)
for metric, value in metrics.items():
    print(f"{metric}: {value}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 4219.96epoch/s]

Confusion Matrix: [[165  85]
 [122 128]]
Precision: 0.6009389671361502
Recall: 0.512
F1 Score: 0.5529157667386609
Accuracy: 0.586





In [12]:
import torch.nn.functional as F

def expected_cross_entropy_loss(logits, target_distributions):
    """
    Computes the expected cross-entropy loss for a batch of predictions and target distributions.

    Parameters:
    logits (torch.Tensor): The raw output from the model of shape (batch_size, num_classes).
    target_distributions (torch.Tensor): The target class distributions of shape (batch_size, num_classes),
                                         where each row is a probability distribution over classes.

    Returns:
    torch.Tensor: The expected cross-entropy loss.
    """
    # Convert logits to log probabilities
    log_probs = F.log_softmax(logits, dim=1)
    
    # Compute the element-wise product between target distributions and log probabilities
    # Then, sum across classes to get the cross-entropy for each instance
    cross_entropy = -torch.sum(target_distributions * log_probs, dim=1)
    
    # Take the mean over the batch
    loss = cross_entropy.mean()
    
    return loss
    
class HatefulMemesDataset(Dataset):
    def __init__(self, image_names, root_dir, labels, target_dists, processor):
        """
        Args:
            data_frame (DataFrame): DataFrame containing image names and labels.
            image_dir (str): Directory where the images are stored.
            processor (CLIPProcessor): CLIP processor for preprocessing images.
        """
        self.image_names = image_names
        self.root_dir = root_dir
        self.labels = labels
        self.target_dists = target_dists
        self.processor = processor

    def __len__(self):
        return len(self.image_names)

    def __getitem__(self, idx):
        # Get image name and label from the dataframe
        img_name = os.path.join(self.root_dir, self.image_names[idx])
        label = self.labels[idx]
        target_dist = self.target_dists[idx]

        # Load and process image
        image = Image.open(img_name).convert('RGB')
        inputs = self.processor(images=image, return_tensors="pt")

        # Return image and label
        return inputs['pixel_values'].squeeze(0), torch.tensor(label, dtype=torch.long), torch.tensor(target_dist)

# MLP head to be added after the CLIP model
class MLPHead(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MLPHead, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )

    def forward(self, x):
        return self.fc(x)

# CLIP model with MLP head for binary classification
class CLIPWithMLP(nn.Module):
    def __init__(self, clip_model, mlp_head):
        super(CLIPWithMLP, self).__init__()
        self.clip_model = clip_model
        self.mlp_head = mlp_head

        # Freeze CLIP's parameters
        for param in self.clip_model.parameters():
            param.requires_grad = False

    def forward(self, image):
        # Extract image features from CLIP
        image_features = self.clip_model.get_image_features(pixel_values=image)
        # Pass through the MLP head
        outputs = self.mlp_head(image_features)
        return outputs

# Training function
def train_model(model, train_loader, dev_loader, criterion, optimizer, device, epochs=5):
    model.train()  # Set model to training mode
    for epoch in range(epochs):
        running_loss = 0.0
        for images, labels, target_dist in tqdm(train_loader):
            images, labels, target_dist = images.to(device), labels.to(device), target_dist.to(device)

            optimizer.zero_grad()

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            # loss = expected_cross_entropy_loss(outputs, target_dist)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {running_loss / len(train_loader):.4f}")
        evaluate_model(model, dev_loader, device)

# Evaluation function to compute precision, recall, and F1-score
def evaluate_model(model, dev_loader, device):
    model.eval()  # Set model to evaluation mode
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for images, labels, target_dist in tqdm(dev_loader):
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            _, predicted = torch.max(outputs, 1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predicted.cpu().numpy())

    # Compute metrics
    metrics = calculate_metrics(np.array(all_labels), np.array(all_preds))
    for metric, value in metrics.items():
        print(f"{metric}: {value}")

    return metrics

In [15]:
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

root_dir = "/home1/pupil/goowfd/CVPR_2025/hateful_memes/img/"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")


probs_train = label_model.predict_proba(L_train)
preds_train = probs_to_preds(probs_train)

# Create datasets and dataloaders
train_dataset = HatefulMemesDataset(image_names=train_image_names, 
                                    root_dir=root_dir, 
                                    labels=preds_train,
                                    target_dists=probs_train,
                                    processor=processor)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=16)

dev_dataset = HatefulMemesDataset(image_names=dev_image_names, 
                                  root_dir=root_dir, 
                                  labels=Y_dev, 
                                  target_dists=probs_dev,
                                  processor=processor)
dev_loader = DataLoader(dev_dataset, batch_size=8, shuffle=False)

# Define MLP head (the dimension is based on CLIP output size)
# mlp_head = MLPHead(input_dim=512, output_dim=2)  # Binary classification, so output_dim = 2
mlp_head = MLPHead(input_dim=768, output_dim=2)  # Binary classification, so output_dim = 2

# Create the full model with CLIP + MLP
model = CLIPWithMLP(clip_model=clip_model, mlp_head=mlp_head)
model.to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.mlp_head.parameters(), lr=0.0001)

# Train the model
epochs = 1
train_model(model, train_loader, dev_loader, criterion, optimizer, device, epochs=epochs)

# Evaluate the model
evaluate_model(model, dev_loader, device)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [01:53<00:00,  6.66s/it]


Epoch [1/1], Loss: 0.6774


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63/63 [00:11<00:00,  5.44it/s]


Confusion Matrix: [[ 84 166]
 [ 52 198]]
Precision: 0.5439560439560439
Recall: 0.792
F1 Score: 0.6449511400651465
Accuracy: 0.564


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63/63 [00:10<00:00,  6.01it/s]

Confusion Matrix: [[ 84 166]
 [ 52 198]]
Precision: 0.5439560439560439
Recall: 0.792
F1 Score: 0.6449511400651465
Accuracy: 0.564





{'Confusion Matrix': array([[ 84, 166],
        [ 52, 198]]),
 'Precision': 0.5439560439560439,
 'Recall': 0.792,
 'F1 Score': 0.6449511400651465,
 'Accuracy': 0.564}

In [None]:
evaluate_model(model, dev_loader, device)