In [1]:
from snorkel.labeling import labeling_function
import json
import os
import numpy as np

# Defining the Labelers

In [2]:
POSITIVE = 1
NEGATIVE = 0
ABSTAIN = -1

@labeling_function()
def llava_7b(image_name):
    root_path = '/home/macula/SMATousi/CVPR/ViGIR_CVPR_LLM/prompting_framework/prompting_results/hateful/total_results/'
    llava_7b_results = 'llava:7b_results_hateful.json'
    path_to_llava_7b_results = os.path.join(root_path,llava_7b_results)
    with open(path_to_llava_7b_results, 'r') as file:
        data = json.load(file)

    return data[image_name] if data[image_name] is not None else -1

@labeling_function()
def llava_13b(image_name):
    root_path = '/home/macula/SMATousi/CVPR/ViGIR_CVPR_LLM/prompting_framework/prompting_results/hateful/total_results/'
    llava_7b_results = 'llava 13b-allsamples-results.json'
    path_to_llava_7b_results = os.path.join(root_path,llava_7b_results)
    with open(path_to_llava_7b_results, 'r') as file:
        data = json.load(file)

    return data[image_name] if data[image_name] is not None else -1

@labeling_function()
def bakllava(image_name):
    root_path = '/home/macula/SMATousi/CVPR/ViGIR_CVPR_LLM/prompting_framework/prompting_results/hateful/total_results/'
    llava_7b_results = 'bakllava-allsamples-results.json'
    path_to_llava_7b_results = os.path.join(root_path,llava_7b_results)
    with open(path_to_llava_7b_results, 'r') as file:
        data = json.load(file)

    return data[image_name] if data[image_name] is not None else -1

@labeling_function()
def llava_llama3(image_name):
    root_path = '/home/macula/SMATousi/CVPR/ViGIR_CVPR_LLM/prompting_framework/prompting_results/hateful/total_results/'
    llava_7b_results = 'llava-llama3-allsamples-results.json'
    path_to_llava_7b_results = os.path.join(root_path,llava_7b_results)
    with open(path_to_llava_7b_results, 'r') as file:
        data = json.load(file)

    return data[image_name] if data[image_name] is not None else -1

@labeling_function()
def llava_phi3(image_name):
    root_path = '/home/macula/SMATousi/CVPR/ViGIR_CVPR_LLM/prompting_framework/prompting_results/hateful/total_results/'
    llava_7b_results = 'llava-phi3-allsamples-results.json'
    path_to_llava_7b_results = os.path.join(root_path,llava_7b_results)
    with open(path_to_llava_7b_results, 'r') as file:
        data = json.load(file)

    return data[image_name] if data[image_name] is not None else -1


@labeling_function()
def moondream(image_name):
    root_path = '/home/macula/SMATousi/CVPR/ViGIR_CVPR_LLM/prompting_framework/prompting_results/hateful/total_results/'
    llava_7b_results = 'moondream-allsamples-results.json'
    path_to_llava_7b_results = os.path.join(root_path,llava_7b_results)
    with open(path_to_llava_7b_results, 'r') as file:
        data = json.load(file)

    return data[image_name] if data[image_name] is not None else -1

In [3]:
print(llava_7b("48132.png"))
print(llava_llama3("48132.png"))

0
0


# Train Dataset

In [4]:
train_data_json_path = '/home/macula/SMATousi/CVPR/ViGIR_CVPR_LLM/prompting_framework/prompting_results/hateful/simplified_train.json'
dev_data_json_path = '/home/macula/SMATousi/CVPR/ViGIR_CVPR_LLM/prompting_framework/prompting_results/hateful/simplified_dev.json'

with open(train_data_json_path, 'r') as file:
    train_data = json.load(file)

# Extract and pad image names, ensuring they are 5 digits long before the '.png'
train_image_names = []
for entry in train_data:
    img_name, ext = entry['img'].split('.')
    padded_img_name = img_name.zfill(5)  # Pad the image name to 5 digits
    train_image_names.append(f"{padded_img_name}.{ext}")

with open(dev_data_json_path, 'r') as file:
    dev_data = json.load(file)
    
dev_image_names = []
Y_dev = []
for entry in dev_data:
    Y_dev.append(entry['label'])
    img_name, ext = entry['img'].split('.')
    padded_img_name = img_name.zfill(5)  # Pad the image name to 5 digits
    dev_image_names.append(f"{padded_img_name}.{ext}")

print(f"There are {len(train_image_names)} images in the Train set.")
print(f"There are {len(dev_image_names)} images in the dev set.")
print(f"There are {len(Y_dev)} labels in the dev set.")


There are 8500 images in the Train set.
There are 500 images in the dev set.
There are 500 labels in the dev set.


# Applying the LFs

In [5]:
from snorkel.labeling import LFApplier

lfs = [llava_7b,
       llava_13b,
       moondream,
       llava_llama3,
       llava_phi3,
       bakllava
       ]

applier = LFApplier(lfs)

In [6]:
from snorkel.labeling import LFAnalysis

L_dev = applier.apply(dev_image_names)
L_train = applier.apply(train_image_names)

500it [00:06, 72.30it/s]
8500it [00:57, 147.71it/s]


In [7]:
Y_dev = np.array(Y_dev)
LFAnalysis(L_dev, lfs).lf_summary(Y_dev)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
llava_7b,0,"[0, 1]",1.0,1.0,0.738,298,202,0.596
llava_13b,1,"[0, 1]",1.0,1.0,0.738,288,212,0.576
moondream,2,"[0, 1]",1.0,1.0,0.738,246,254,0.492
llava_llama3,3,"[0, 1]",1.0,1.0,0.738,278,222,0.556
llava_phi3,4,"[0, 1]",0.99,0.99,0.73,267,228,0.539394
bakllava,5,"[0, 1]",0.95,0.95,0.696,276,199,0.581053


F1 of the labelers:

In [9]:
from sklearn.metrics import precision_score, recall_score, f1_score

for i in range(L_dev.shape[1]):
    
    precision = precision_score(Y_dev, L_dev[:,i])
    recall = recall_score(Y_dev, L_dev[:,i])
    f1 = f1_score(Y_dev, L_dev[:,i])

    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-score: {f1:.2f}")

Precision: 0.59
Recall: 0.62
F1-score: 0.60
Precision: 0.60
Recall: 0.45
F1-score: 0.52
Precision: 0.40
Recall: 0.03
F1-score: 0.06
Precision: 0.54
Recall: 0.70
F1-score: 0.61


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [8]:
L_dev.shape

(500, 6)

# Optimization for Label Model

In [8]:
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500, seed=12345)

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|                                                                                              | 0/5000 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=5.341]
  6%|█████                                                                             | 311/5000 [00:00<00:02, 1586.26epoch/s]INFO:root:[500 epochs]: TRAIN:[loss=0.002]
 14%|███████████▏                                                                      | 684/5000 [00:00<00:01, 2488.42epoch/s]INFO:root:[1000 epochs]: TRAIN:[loss=0.002]
 29%|███████████████████████▎                                                         | 1436/5000 [00:00<00:01, 3241.34epoch/s]INFO:root:[1500 epochs]: TRAIN:[loss=0.002]
 36%|████████████████████████████▉                                                    | 1786/5000 [00:00<00:00, 3324.92epoch/s]INFO:root:[2000 epochs]: TRAIN:[loss=0.002]
 43%|███████████████████████████████████                                              | 2162/500

In [9]:
from snorkel.analysis import metric_score
from snorkel.utils import probs_to_preds

probs_dev = label_model.predict_proba(L_dev)
preds_dev = probs_to_preds(probs_dev)
print(
    f"Label model f1 score: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')}"
)
print(
    f"Label model roc-auc: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')}"
)

Label model f1 score: 0.5539112050739958
Label model roc-auc: 0.605344


In [11]:
print(label_model.get_weights())

[0.82491001 0.76428977 0.54531989 0.78372908 0.82443517 0.83430104]


# Training the End Model

The dataloader for PyTorch

In [42]:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Custom dataset class for loading images
class ImageDataset(Dataset):
    def __init__(self, image_names, root_dir, labels, transform=None):
        """
        Args:
            image_names (list): List of image file names.
            root_dir (string): Directory where images are stored.
            transform (callable, optional): Optional transform to be applied on an image.
        """
        self.image_names = image_names
        self.root_dir = root_dir
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_names)

    def __getitem__(self, idx):
        # Build the full path of the image file
        img_name = os.path.join(self.root_dir, self.image_names[idx])
        label = self.labels[idx]
        image = Image.open(img_name).convert('RGB')  # Load image as RGB

        # Apply any transformations (e.g., resize, normalization)
        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label)

In [35]:
root_dir = "/home1/pupil/goowfd/CVPR_2025/hateful_memes/img/"

transform = transforms.Compose([
        transforms.Resize((224, 224)),  # Resize images to 224x224 (example)
        transforms.ToTensor(),          # Convert images to PyTorch tensors
        # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize with ImageNet mean/std
    ])

probs_dev = label_model.predict_proba(L_train)
label_model_predictions = probs_to_preds(probs_dev)

In [43]:
dataset = ImageDataset(image_names=train_image_names, root_dir=root_dir, labels=label_model_predictions, transform=transform)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)



A basic ResNet

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image

class MLPHead(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MLPHead, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, output_dim)
        )

    def forward(self, x):
        return self.fc(x)

# Pretrained ResNet with MLP head for binary classification
class ResNetWithMLP(nn.Module):
    def __init__(self, num_classes=2):
        super(ResNetWithMLP, self).__init__()
        # Load a pretrained ResNet (e.g., ResNet18)
        self.resnet = models.resnet18(pretrained=True)
        
        # Freeze the ResNet layers if you don't want to train them
        for param in self.resnet.parameters():
            param.requires_grad = False

        # Replace the last fully connected layer with a custom MLP head
        num_features = self.resnet.fc.in_features  # Get the number of features in the last layer
        self.resnet.fc = MLPHead(input_dim=num_features, output_dim=num_classes)

    def forward(self, x):
        return self.resnet(x)

In [32]:
model = ResNetWithMLP(num_classes=2)



In [40]:
from tqdm import tqdm
def train(model, train_loader, criterion, optimizer, device, epochs):
    model.train()  # Set the model to training mode
    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0

        for images, labels in tqdm(train_loader):
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()  # Clear the gradients

            outputs = model(images)  # Forward pass
            loss = criterion(outputs, labels)  # Compute the loss

            loss.backward()  # Backward pass
            optimizer.step()  # Update the weights

            # Calculate running loss and accuracy
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        epoch_loss = running_loss / len(train_loader)
        accuracy = 100 * correct / total

        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.2f}%")


In [41]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()  # Binary classification loss
optimizer = optim.Adam(model.resnet.fc.parameters(), lr=0.001)  # Only optimize the MLP parameters

# Train the model
epochs = 10
train(model, dataloader, criterion, optimizer, device, epochs)


100%|██████████████████████████████████████████████████████████████████████████████████████| 1063/1063 [00:27<00:00, 38.82it/s]


Epoch [1/10], Loss: 0.6554, Accuracy: 61.88%


100%|██████████████████████████████████████████████████████████████████████████████████████| 1063/1063 [00:17<00:00, 60.41it/s]


Epoch [2/10], Loss: 0.6356, Accuracy: 64.39%


100%|██████████████████████████████████████████████████████████████████████████████████████| 1063/1063 [00:16<00:00, 62.97it/s]


Epoch [3/10], Loss: 0.6321, Accuracy: 64.47%


100%|██████████████████████████████████████████████████████████████████████████████████████| 1063/1063 [00:16<00:00, 64.39it/s]


Epoch [4/10], Loss: 0.6299, Accuracy: 64.69%


100%|██████████████████████████████████████████████████████████████████████████████████████| 1063/1063 [00:16<00:00, 64.87it/s]


Epoch [5/10], Loss: 0.6236, Accuracy: 65.01%


100%|██████████████████████████████████████████████████████████████████████████████████████| 1063/1063 [00:16<00:00, 64.02it/s]


Epoch [6/10], Loss: 0.6240, Accuracy: 64.79%


100%|██████████████████████████████████████████████████████████████████████████████████████| 1063/1063 [00:16<00:00, 64.38it/s]


Epoch [7/10], Loss: 0.6198, Accuracy: 65.80%


100%|██████████████████████████████████████████████████████████████████████████████████████| 1063/1063 [00:16<00:00, 64.64it/s]


Epoch [8/10], Loss: 0.6191, Accuracy: 65.20%


100%|██████████████████████████████████████████████████████████████████████████████████████| 1063/1063 [00:16<00:00, 64.90it/s]


Epoch [9/10], Loss: 0.6137, Accuracy: 65.92%


100%|██████████████████████████████████████████████████████████████████████████████████████| 1063/1063 [00:16<00:00, 64.68it/s]

Epoch [10/10], Loss: 0.6127, Accuracy: 66.81%





Evaluation ...

In [47]:
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Function to evaluate the model on the dev set
def evaluate(model, dev_loader, device):
    model.eval()  # Set the model to evaluation mode
    all_labels = []
    all_preds = []

    with torch.no_grad():  # Disable gradient computation
        for images, labels in tqdm(dev_loader):
            images, labels = images.to(device), labels.to(device)

            # Forward pass to get outputs
            outputs = model(images)

            # Get predictions (class with the highest score)
            _, predicted = torch.max(outputs.data, 1)

            # Store true labels and predictions
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predicted.cpu().numpy())

    return np.array(all_labels), np.array(all_preds)

# Function to calculate precision, recall, and F1 score
def calculate_metrics(labels, predictions):
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return precision, recall, f1

In [48]:
dev_dataset = ImageDataset(image_names=dev_image_names, root_dir=root_dir, labels=Y_dev, transform=transform)
dev_dataloader = DataLoader(dev_dataset, batch_size=8, shuffle=False, num_workers=4)

In [50]:
labels, predictions = evaluate(model, dev_dataloader, device)

# Calculate and print precision, recall, and F1-score
calculate_metrics(labels, predictions)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 31.80it/s]

Precision: 0.5491
Recall: 0.4920
F1 Score: 0.5190





(0.5491071428571429, 0.492, 0.5189873417721519)

Training the CLIP+MLP model