Now we are gonna be getting the similairty for FAKE VS REAL Dataset

In [1]:
import os
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

data_folder = r"D:\ML League\Intelligence-Sig-Recs-2025\NonMandatoryTasks\SemanticDatasetComparison\train"

real = os.path.join(data_folder, 'REAL')
fake = os.path.join(data_folder, 'FAKE')

In [2]:
def list_images(folder):
    exts = ('.png','.jpg','.jpeg','.bmp')
    return sorted([os.path.join(folder,f) for f in os.listdir(folder) if f.lower().endswith(exts)])

real_img = list_images(real)
fake_img = list_images(fake)

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
from tqdm import tqdm 

BATCH_SIZE = 128

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32",use_fast = 'True')

def get_clip_embeddings(files):
    all_feats = []
    clip_model.eval()
    for i in tqdm(range(0, len(files), BATCH_SIZE), desc=f"Processing {len(files)} images"):
        batch_files = files[i:i+BATCH_SIZE]
        imgs = [Image.open(f).convert("RGB") for f in batch_files]
        inputs = clip_processor(images=imgs, return_tensors="pt").to(device)
        with torch.no_grad():
            feats = clip_model.get_image_features(**inputs)
        all_feats.append(feats.cpu())
    return torch.cat(all_feats, dim=0)

In [6]:
'''fake_emb = get_clip_embeddings(fake_img)
real_emb = get_clip_embeddings(real_img)

torch.save(fake_emb, 'fake_clip_embeddings.pt')
torch.save(real_emb, 'real_clip_embeddings.pt')'''

"fake_emb = get_clip_embeddings(fake_img)\nreal_emb = get_clip_embeddings(real_img)\n\ntorch.save(fake_emb, 'fake_clip_embeddings.pt')\ntorch.save(real_emb, 'real_clip_embeddings.pt')"

In [7]:
fake_emb = torch.load('fake_clip_embeddings.pt')
real_emb = torch.load('real_clip_embeddings.pt')

In [8]:
import torch.nn.functional as F

real_norm = F.normalize(real_emb, dim=1)
fake_norm = F.normalize(fake_emb, dim=1)


In [9]:
c_real = real_norm.mean(dim=0, keepdim=True)
c_fake = fake_norm.mean(dim=0, keepdim=True)

centroid_cos = F.cosine_similarity(c_real, c_fake).item()
print(f"Centroid Cosine Similarity (REAL vs FAKE): {centroid_cos:.4f}")


Centroid Cosine Similarity (REAL vs FAKE): 0.9913


In [10]:
N = min(len(real_norm), len(fake_norm), 1000)
sample_real = real_norm[:N]
sample_fake = fake_norm[:N]

avg_pairwise_cos = (sample_real @ sample_fake.T).mean().item()
print(f"Average Pairwise Cosine Similarity: {avg_pairwise_cos:.4f}")


Average Pairwise Cosine Similarity: 0.7882


The centroid similarity shows that the FAKE dataset captures the overall semantic distribution of REAL images almost perfectly. However, the lower average pairwise similarity might be due to the unavialability of the labels and mismatch similairty matching between different labels which led to a lesser overall score

So i think we can try Frechet Inception distance as it gives a more statsitical comparison between those images and we can get a better idea of it

In [11]:
from torchvision import models, transforms
from tqdm import tqdm
from torch import nn

inception = models.inception_v3(weights=models.Inception_V3_Weights.IMAGENET1K_V1,
                                aux_logits=True, transform_input=False)
inception.fc = nn.Identity()
inception.to(device).eval()

preprocess = transforms.Compose([
    transforms.Resize((299, 299)),  # Inception input size
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

In [12]:
def load_images_from_folder(folder):
    paths = [os.path.join(folder, f) for f in os.listdir(folder) 
             if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    return paths

def get_inception_embeddings_gpu(image_paths, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(image_paths), batch_size)):
        batch_paths = image_paths[i:i+batch_size]
        imgs = []
        for p in batch_paths:
            img = Image.open(p).convert('RGB')
            img = preprocess(img)
            imgs.append(img)
        imgs = torch.stack(imgs).to(device)  # push batch to GPU
        with torch.no_grad():
            feat = inception(imgs)  # [B, 2048] on GPU
        embeddings.append(feat.cpu().numpy())  # move to CPU for saving
    embeddings = np.vstack(embeddings)
    return embeddings

real_folder = r"D:\ML League\Intelligence-Sig-Recs-2025\NonMandatoryTasks\SemanticDatasetComparison\train\REAL"
fake_folder = r"D:\ML League\Intelligence-Sig-Recs-2025\NonMandatoryTasks\SemanticDatasetComparison\train\FAKE"

real_paths = load_images_from_folder(real_folder)
fake_paths = load_images_from_folder(fake_folder)

real_emb = get_inception_embeddings_gpu(real_paths)
fake_emb = get_inception_embeddings_gpu(fake_paths)

# Save embeddings
np.save('real_inception_embeddings.npy', real_emb)
np.save('fake_inception_embeddings.npy', fake_emb)

print("Embeddings extracted:", real_emb.shape, fake_emb.shape)

100%|██████████| 1563/1563 [30:37<00:00,  1.18s/it]
100%|██████████| 1563/1563 [28:49<00:00,  1.11s/it]


Embeddings extracted: (50000, 2048) (50000, 2048)


In [None]:
import numpy as np
from scipy.linalg import sqrtm

def calculate_fid(real_embeddings: np.ndarray, fake_embeddings: np.ndarray):
    mu_real = np.mean(real_embeddings, axis=0)
    mu_fake = np.mean(fake_embeddings, axis=0)
    sigma_real = np.cov(real_embeddings, rowvar=False)
    sigma_fake = np.cov(fake_embeddings, rowvar=False)

    diff = mu_real - mu_fake
    diff_squared = diff.dot(diff)

    covmean, _ = sqrtm(sigma_real.dot(sigma_fake), disp=False)

    if np.iscomplexobj(covmean):
        covmean = covmean.real

    fid_value = diff_squared + np.trace(sigma_real + sigma_fake - 2 * covmean)
    return fid_value

real_emb = np.load("real_inception_embeddings.npy")
fake_emb = np.load("fake_inception_embeddings.npy")

fid_score = calculate_fid(real_emb, fake_emb)
print(f"FID (REAL vs FAKE): {fid_score:.4f}")


  covmean, _ = sqrtm(sigma_real.dot(sigma_fake), disp=False)


FID (REAL vs FAKE): 26.0692


In [14]:
real_folder = r"D:\ML League\Intelligence-Sig-Recs-2025\NonMandatoryTasks\SemanticDatasetComparison\test\REAL"
fake_folder = r"D:\ML League\Intelligence-Sig-Recs-2025\NonMandatoryTasks\SemanticDatasetComparison\test\FAKE"

real_paths_test = load_images_from_folder(real_folder)
fake_paths_test = load_images_from_folder(fake_folder)

real_emb_test = get_inception_embeddings_gpu(real_paths_test)
fake_emb_test = get_inception_embeddings_gpu(fake_paths_test)


100%|██████████| 313/313 [04:22<00:00,  1.19it/s]
100%|██████████| 313/313 [04:20<00:00,  1.20it/s]


In [15]:
np.save('real_inception_embeddings_test.npy', real_emb_test)
np.save('fake_inception_embeddings_test.npy', fake_emb_test)

In [17]:
num_samples = 10000  # number of random samples

# Make sure we don't sample more than available
num_samples_real = min(num_samples, real_emb.shape[0])
num_samples_fake = min(num_samples, fake_emb.shape[0])

# Random indices
real_indices = np.random.choice(real_emb.shape[0], num_samples_real, replace=False)
fake_indices = np.random.choice(fake_emb.shape[0], num_samples_fake, replace=False)

# Sampled embeddings
real_sampled = real_emb[real_indices]
fake_sampled = fake_emb[fake_indices]

print("Sampled real embeddings:", real_sampled.shape)
print("Sampled fake embeddings:", fake_sampled.shape)


Sampled real embeddings: (10000, 2048)
Sampled fake embeddings: (10000, 2048)


In [19]:
fid_score = calculate_fid(real_emb_test, fake_emb_test)
print(f"FID TEST (REAL vs FAKE): {fid_score:.4f}")

# Just to see how it comes out on test data

fid_score = calculate_fid(real_sampled, real_emb_test)
print(f"FID (REAL vs REAL_TEST): {fid_score:.4f}")

fid_score = calculate_fid(fake_sampled, fake_emb_test)
print(f"FID (FAKE vs FAKE_TEST): {fid_score:.4f}")

  covmean, _ = sqrtm(sigma_real.dot(sigma_fake), disp=False)


FID TEST (REAL vs FAKE): 30.0561
FID (REAL vs REAL_TEST): 4.7516
FID (FAKE vs FAKE_TEST): 4.6602


Statistically confirms that distribution of fake embeddings differs from real embeddings. These calculations of FID between classes was just to showcase the how the FID Score is given out for semantically similar class.If a class is perfectly similar semantci similarity is close to 0.So even though the FAKE dataset may have reached a very good similiarity with the REAL Dataset there are some imperfections or flaws in them which make them different from REAL dataset