<a href="https://colab.research.google.com/github/Tahnees/PR_Assignment/blob/main/SnifferPR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import zipfile

zip_path = "/content/drive/MyDrive/News_clippings.zip"
extract_to = "/content/visual_news_dataset"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)


In [None]:
import os

for root, dirs, files in os.walk(extract_to):
    print(f"{root}:\n  Dirs: {dirs}\n  Files: {files[:5]}\n")


/content/visual_news_dataset:
  Dirs: ['News_clippings']
  Files: []

/content/visual_news_dataset/News_clippings:
  Dirs: ['data', 'embeddings']
  Files: []

/content/visual_news_dataset/News_clippings/data:
  Dirs: ['scene_resnet_place', 'merged_balanced', 'semantics_clip_text_image', 'semantics_clip_text_text', 'person_sbert_text_text', 'full']
  Files: []

/content/visual_news_dataset/News_clippings/data/scene_resnet_place:
  Dirs: []
  Files: ['test.json', 'train.json', 'val.json']

/content/visual_news_dataset/News_clippings/data/merged_balanced:
  Dirs: []
  Files: ['test.json', 'train.json', 'val.json']

/content/visual_news_dataset/News_clippings/data/semantics_clip_text_image:
  Dirs: []
  Files: ['test.json', 'train.json', 'val.json']

/content/visual_news_dataset/News_clippings/data/semantics_clip_text_text:
  Dirs: []
  Files: ['test.json', 'train.json', 'val.json']

/content/visual_news_dataset/News_clippings/data/person_sbert_text_text:
  Dirs: []
  Files: ['test.json', 

In [None]:
import json, random

train_path = "/content/visual_news_dataset/News_clippings/data/semantics_clip_text_image/train.json"
limited_train_path = "/content/visual_news_dataset/News_clippings/data/semantics_clip_text_image/train_5k.json"


with open(train_path, 'r') as f:
    full_data = json.load(f)

annotations = full_data["annotations"]
print(f"Total samples: {len(annotations)}")

sampled_data = annotations

with open(limited_train_path, 'w') as f:
    json.dump({"annotations": sampled_data}, f, indent=2)

print("Example entry:\n", sampled_data[0])


Total samples: 453128
Example entry:
 {'id': 728421, 'image_id': 728421, 'similarity_score': 1, 'falsified': False}


In [None]:
import json
from sklearn.model_selection import train_test_split

full_path = "/content/visual_news_dataset/News_clippings/data/semantics_clip_text_image/train.json"

with open(full_path, 'r') as f:
    full_data = json.load(f)

all_samples = full_data["annotations"]

labels = [int(sample["falsified"]) for sample in all_samples]
train_data, val_data = train_test_split(all_samples, test_size=0.1, random_state=42, stratify=labels)

with open("/content/train_clean.json", "w") as f:
    json.dump({"annotations": train_data}, f, indent=2)

with open("/content/val_clean.json", "w") as f:
    json.dump({"annotations": val_data}, f, indent=2)

print(f"Train samples: {len(train_data)}")
print(f"Val samples: {len(val_data)}")


Train samples: 407815
Val samples: 45313


In [None]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from tqdm import tqdm
import pickle
import numpy as np
from scipy.spatial.distance import cosine

TRAIN_JSON = "/content/train_clean.json"
VAL_JSON = "/content/val_clean.json"

EMBED_PATHS = {
    "clip_image": "/content/visual_news_dataset/News_clippings/embeddings/clip_image_embeddings/clip_image_embeddings_train.pkl",
    "clip_text": "/content/visual_news_dataset/News_clippings/embeddings/clip_text_embeddings/clip_text_embeddings_train.pkl",
    "sbert": "/content/visual_news_dataset/News_clippings/embeddings/sbert_embeddings/sbert_embeddings_train.pkl"
}

VAL_EMBED_PATHS = {
    "clip_image": "/content/visual_news_dataset/News_clippings/embeddings/clip_image_embeddings/clip_image_embeddings_val.pkl",
    "clip_text": "/content/visual_news_dataset/News_clippings/embeddings/clip_text_embeddings/clip_text_embeddings_val.pkl",
    "sbert": "/content/visual_news_dataset/News_clippings/embeddings/sbert_embeddings/sbert_embeddings_val.pkl"
}

EXPECTED_SIZES = {
    "clip_image": 512,
    "clip_text": 512,
    "sbert": 768
}

BATCH_SIZE = 32
EPOCHS = 5
LR = 1e-3

class SnifferEmbedDataset(Dataset):
    def __init__(self, json_path, embed_paths):
        with open(json_path, 'r') as f:
            self.data = json.load(f)["annotations"]

        self.embeddings = {}
        for name, path in embed_paths.items():
            with open(path, 'rb') as f:
                self.embeddings[name] = pickle.load(f)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = self.data[idx]
        eid = entry["id"]
        label = int(entry.get("falsified", False))
        caption = entry.get("caption", "")

        vecs = {}
        feature_parts = []
        for name in self.embeddings:
            expected_dim = EXPECTED_SIZES[name]
            vec = self.embeddings[name].get(eid)
            if vec is None:
                vec = np.zeros(expected_dim, dtype=np.float32)
            else:
                vec = np.array(vec, dtype=np.float32)
                if vec.shape[0] != expected_dim:
                    vec = np.pad(vec, (0, expected_dim - vec.shape[0]))
            vecs[name] = vec
            feature_parts.append(vec)

        feature_vector = np.concatenate(feature_parts)
        features = torch.tensor(feature_vector, dtype=torch.float32)
        return features, torch.tensor(label, dtype=torch.long), caption, eid, vecs


class SnifferEmbedModel(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 2)
        )

    def forward(self, x):
        return self.classifier(x)


def train_model(model, dataloader, optimizer, loss_fn):
    model.train()
    for features, labels, *_ in tqdm(dataloader):
        features, labels = features.to(device), labels.to(device)
        outputs = model(features)
        loss = loss_fn(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


def evaluate_model(model, dataloader):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for features, labels, *_ in dataloader:
            features = features.to(device)
            outputs = model(features)
            preds = outputs.argmax(dim=1).cpu().tolist()
            y_pred.extend(preds)
            y_true.extend(labels.tolist())
    acc = accuracy_score(y_true, y_pred)
    print(f"\n Accuracy: {acc:.4f}")
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred, target_names=["Real", "Out-of-context"], zero_division=0))
    return y_true, y_pred


def explain_predictions(model, dataset, device, num_samples=5):
    model.eval()
    print("\n Sample Paper-style Explanations:\n")
    count = 0
    for i in range(len(dataset)):
        features, label, caption, eid, vecs = dataset[i]
        features = features.unsqueeze(0).to(device)
        pred = model(features).argmax().item()

        label_text = "Out-of-context" if label.item() == 1 else "Real"
        pred_text = "Out-of-context" if pred == 1 else "Real"

        clip_sim = 1 - cosine(vecs["clip_image"], vecs["clip_text"])
        sbert_score = np.linalg.norm(vecs["sbert"])

        if pred_text == "Out-of-context":
            explanation = (
                f"No, the image is wrongly used in a different news context. "
                f"On one hand, the caption discusses a specific context, but the person or scene in the image may not correspond. "
                f"The similarity between the caption and image is low (cosine={clip_sim:.2f}), and the contextual coherence score is weak (sbert norm={sbert_score:.2f}). "
                f"Therefore, the image is more likely to be misleading in this captioned context."
            )
        else:
            explanation = (
                f"Yes, the image and caption likely match. The similarity score is high (cosine={clip_sim:.2f}) and the contextual coherence (sbert norm={sbert_score:.2f}) indicates consistency between the scene and text."
            )

        print(f" ID: {eid}")
        print(f" Caption: {caption}")
        print(f" Prediction: {pred_text} | Ground Truth: {label_text}")
        print(f" Explanation: {explanation}\n")

        count += 1
        if count >= num_samples:
            break


input_dim = sum(EXPECTED_SIZES.values())
train_set = SnifferEmbedDataset(TRAIN_JSON, EMBED_PATHS)
val_set = SnifferEmbedDataset(VAL_JSON, VAL_EMBED_PATHS)

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SnifferEmbedModel(input_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(EPOCHS):
    print(f"\n Epoch {epoch+1}/{EPOCHS}")
    train_model(model, train_loader, optimizer, loss_fn)
    evaluate_model(model, val_loader)

explain_predictions(model, val_set, device)



 Epoch 1/5


100%|██████████| 12745/12745 [00:47<00:00, 266.01it/s]



 Accuracy: 0.5000
[[22657     0]
 [22656     0]]
                precision    recall  f1-score   support

          Real       0.50      1.00      0.67     22657
Out-of-context       0.00      0.00      0.00     22656

      accuracy                           0.50     45313
     macro avg       0.25      0.50      0.33     45313
  weighted avg       0.25      0.50      0.33     45313


 Epoch 2/5


100%|██████████| 12745/12745 [00:47<00:00, 269.27it/s]



 Accuracy: 0.5000
[[    0 22657]
 [    0 22656]]
                precision    recall  f1-score   support

          Real       0.00      0.00      0.00     22657
Out-of-context       0.50      1.00      0.67     22656

      accuracy                           0.50     45313
     macro avg       0.25      0.50      0.33     45313
  weighted avg       0.25      0.50      0.33     45313


 Epoch 3/5


100%|██████████| 12745/12745 [00:47<00:00, 268.72it/s]



 Accuracy: 0.5000
[[22657     0]
 [22656     0]]
                precision    recall  f1-score   support

          Real       0.50      1.00      0.67     22657
Out-of-context       0.00      0.00      0.00     22656

      accuracy                           0.50     45313
     macro avg       0.25      0.50      0.33     45313
  weighted avg       0.25      0.50      0.33     45313


 Epoch 4/5


100%|██████████| 12745/12745 [00:47<00:00, 270.46it/s]



 Accuracy: 0.5000
[[22657     0]
 [22656     0]]
                precision    recall  f1-score   support

          Real       0.50      1.00      0.67     22657
Out-of-context       0.00      0.00      0.00     22656

      accuracy                           0.50     45313
     macro avg       0.25      0.50      0.33     45313
  weighted avg       0.25      0.50      0.33     45313


 Epoch 5/5


100%|██████████| 12745/12745 [00:46<00:00, 274.15it/s]



 Accuracy: 0.5000
[[22657     0]
 [22656     0]]
                precision    recall  f1-score   support

          Real       0.50      1.00      0.67     22657
Out-of-context       0.00      0.00      0.00     22656

      accuracy                           0.50     45313
     macro avg       0.25      0.50      0.33     45313
  weighted avg       0.25      0.50      0.33     45313


 Sample Paper-style Explanations:

 ID: 1466565
 Caption: 
 Prediction: Real | Ground Truth: Real
 Explanation: Yes, the image and caption likely match. The similarity score is high (cosine=nan) and the contextual coherence (sbert norm=0.00) indicates consistency between the scene and text.

 ID: 203211
 Caption: 
 Prediction: Real | Ground Truth: Out-of-context
 Explanation: Yes, the image and caption likely match. The similarity score is high (cosine=nan) and the contextual coherence (sbert norm=0.00) indicates consistency between the scene and text.

 ID: 94787
 Caption: 
 Prediction: Real | Ground T

  dist = 1.0 - uv / math.sqrt(uu * vv)
