In [1]:
!pip install -q datasets==3.5.1 pytorch-fid lpips

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m276.5/491.4 kB[0m [31m9.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m86.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import sys
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import math
from tqdm import tqdm
import argparse
from datasets import load_dataset
import warnings
import json
from datetime import datetime

from pytorch_fid import fid_score
from pytorch_fid.inception import InceptionV3
import lpips


warnings.filterwarnings('ignore')


In [None]:
class DatasetWrapper(Dataset):
    """Wrapper for HuggingFace datasets to work with PyTorch DataLoader"""

    def __init__(self, hf_dataset, transform=None, max_samples=None):
        self.dataset = hf_dataset
        self.transform = transform
        self.max_samples = max_samples

    def __len__(self):
        if self.max_samples:
            return min(len(self.dataset), self.max_samples)
        return len(self.dataset)

    def __getitem__(self, idx):
        if idx >= len(self):
            raise IndexError

        item = self.dataset[idx]
        image = item['image']

        if image.mode != 'RGB':
            image = image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image

class MetricsCalculator:
    """Main class for calculating FID, IS, and LPIPS metrics"""

    def __init__(self, device='cuda', batch_size=32, num_workers=4):
        self.device = device if torch.cuda.is_available() else 'cpu'
        self.batch_size = batch_size
        self.num_workers = num_workers

        print(f"Using device: {self.device}")

        self.lpips_model = lpips.LPIPS(net='alex').to(self.device)
        self.lpips_model.eval()

        self.inception_model = InceptionV3([InceptionV3.BLOCK_INDEX_BY_DIM[2048]]).to(self.device)
        self.inception_model.eval()

        self.transform = transforms.Compose([
            transforms.Resize((299, 299)),  # Inception input size
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                               std=[0.229, 0.224, 0.225])
        ])

        self.lpips_transform = transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5],
                               std=[0.5, 0.5, 0.5])
        ])

    def load_datasets(self, real_max_samples=None, fake_max_samples=None):
        """Load the HuggingFace datasets"""
        print("Loading real dataset (PH2)...")
        real_dataset = load_dataset("Shah1st/PH2", split="train")

        print("Loading generated dataset...")
        fake_dataset = load_dataset("Shah1st/skin-cancer-flux.1-dev-images", split="train")

        print(f"Real dataset size: {len(real_dataset)}")
        print(f"Generated dataset size: {len(fake_dataset)}")

        self.real_dataset = DatasetWrapper(real_dataset, self.transform, real_max_samples)
        self.fake_dataset = DatasetWrapper(fake_dataset, self.transform, fake_max_samples)

        self.real_lpips_dataset = DatasetWrapper(real_dataset, self.lpips_transform, real_max_samples)
        self.fake_lpips_dataset = DatasetWrapper(fake_dataset, self.lpips_transform, fake_max_samples)

        print(f"Using {len(self.real_dataset)} real samples")
        print(f"Using {len(self.fake_dataset)} fake samples")

    def extract_features(self, dataset, desc="Extracting features"):
        """Extract Inception features from dataset"""
        dataloader = DataLoader(dataset, batch_size=self.batch_size,
                               shuffle=False, num_workers=self.num_workers)

        features = []

        with torch.no_grad():
            for batch in tqdm(dataloader, desc=desc):
                batch = batch.to(self.device)
                pred = self.inception_model(batch)[0]

                if pred.size(2) != 1 or pred.size(3) != 1:
                    pred = F.adaptive_avg_pool2d(pred, output_size=(1, 1))

                features.append(pred.cpu().numpy().reshape(pred.size(0), -1))

        return np.concatenate(features, axis=0)

    def calculate_fid(self):
        """Calculate Fréchet Inception Distance"""
        print("\n" + "="*50)
        print("CALCULATING FID (Fréchet Inception Distance)")
        print("="*50)

        real_features = self.extract_features(self.real_dataset, "Extracting real features")
        fake_features = self.extract_features(self.fake_dataset, "Extracting fake features")

        mu_real = np.mean(real_features, axis=0)
        sigma_real = np.cov(real_features, rowvar=False)

        mu_fake = np.mean(fake_features, axis=0)
        sigma_fake = np.cov(fake_features, rowvar=False)

        diff = mu_real - mu_fake
        covmean, _ = scipy.linalg.sqrtm(sigma_real.dot(sigma_fake), disp=False)

        if np.iscomplexobj(covmean):
            covmean = covmean.real

        fid = diff.dot(diff) + np.trace(sigma_real + sigma_fake - 2*covmean)

        print(f"FID Score: {fid:.4f}")
        print("Lower is better - measures quality and diversity of generated images")

        return fid

    def calculate_kid(self, max_subset_size=1000):
      """Calculate Kernel Inception Distance"""
      print("\n" + "="*50)
      print("CALCULATING KID (Kernel Inception Distance)")
      print("="*50)

      real_features = self.extract_features(self.real_dataset, "Extracting real features")
      fake_features = self.extract_features(self.fake_dataset, "Extracting fake features")

      if len(real_features) > max_subset_size:
          indices = np.random.choice(len(real_features), max_subset_size, replace=False)
          real_features = real_features[indices]

      if len(fake_features) > max_subset_size:
          indices = np.random.choice(len(fake_features), max_subset_size, replace=False)
          fake_features = fake_features[indices]

      kid_value = self._compute_kid(real_features, fake_features)

      print(f"KID Score: {kid_value:.6f}")
      print("Lower is better - measures distribution difference (scale: ~0.001-0.1)")

      return kid_value

    def _compute_kid(self, X_real, X_fake, gamma=None):
        """Compute KID between two sets of features"""
        m = X_real.shape[0]
        n = X_fake.shape[0]

        if gamma is None:
            XX = np.dot(X_real, X_real.T)
            XY = np.dot(X_real, X_fake.T)
            YY = np.dot(X_fake, X_fake.T)

            X_diag = np.diag(XX).reshape(-1, 1)
            Y_diag = np.diag(YY).reshape(-1, 1)

            X_dist = X_diag + X_diag.T - 2 * XX
            Y_dist = Y_diag + Y_diag.T - 2 * YY
            XY_dist = X_diag + Y_diag.T - 2 * XY

            distances = np.concatenate([X_dist.flatten(), Y_dist.flatten(), XY_dist.flatten()])
            distances = distances[distances > 0]
            gamma = 1.0 / (2 * np.median(distances))

        def rbf_kernel(X, Y, gamma):
            """RBF kernel computation"""
            X_norm = np.sum(X**2, axis=1, keepdims=True)
            Y_norm = np.sum(Y**2, axis=1, keepdims=True)
            distances = X_norm + Y_norm.T - 2 * np.dot(X, Y.T)
            return np.exp(-gamma * distances)

        Kxx = rbf_kernel(X_real, X_real, gamma)
        Kyy = rbf_kernel(X_fake, X_fake, gamma)
        Kxy = rbf_kernel(X_real, X_fake, gamma)

        # KID calculation
        kid = (np.sum(Kxx) / (m * m) + np.sum(Kyy) / (n * n) - 2 * np.sum(Kxy) / (m * n))

        return kid

    def calculate_is(self, splits=10):
      """Calculate Inception Score"""
      print("\n" + "="*50)
      print("CALCULATING IS (Inception Score)")
      print("="*50)

      import torchvision.models as models
      is_model = models.inception_v3(pretrained=True, transform_input=False).to(self.device)
      is_model.eval()

      is_transform = transforms.Compose([
          transforms.Resize((299, 299)),
          transforms.ToTensor(),
          transforms.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
      ])

      # Create IS-specific dataset wrapper
      is_dataset = DatasetWrapper(self.fake_dataset.dataset, is_transform,
                                self.fake_dataset.max_samples)

      # We only calculate IS on the fake dataset
      dataloader = DataLoader(is_dataset, batch_size=self.batch_size,
                            shuffle=False, num_workers=self.num_workers)

      predictions = []

      with torch.no_grad():
          for batch in tqdm(dataloader, desc="Getting IS predictions"):
              batch = batch.to(self.device)

              pred = is_model(batch)

              if isinstance(pred, tuple):
                  pred = pred[0]

              pred = F.softmax(pred, dim=1)
              predictions.append(pred.cpu().numpy())

      predictions = np.concatenate(predictions, axis=0)

      N = predictions.shape[0]
      split_size = N // splits

      scores = []
      for i in range(splits):
          start_idx = i * split_size
          end_idx = start_idx + split_size if i < splits - 1 else N

          split_predictions = predictions[start_idx:end_idx]

          py = np.mean(split_predictions, axis=0)  # Marginal distribution
          kl_div = split_predictions * (np.log(split_predictions + 1e-16) - np.log(py + 1e-16))
          kl_div = np.mean(np.sum(kl_div, axis=1))

          scores.append(np.exp(kl_div))

      is_mean = np.mean(scores)
      is_std = np.std(scores)

      print(f"IS Score: {is_mean:.4f} ± {is_std:.4f}")
      print("Higher is better - measures quality and diversity")

      return is_mean, is_std

    def calculate_lpips(self, num_pairs=1000):
        """Calculate LPIPS (Learned Perceptual Image Patch Similarity)"""
        print("\n" + "="*50)
        print("CALCULATING LPIPS (Learned Perceptual Image Patch Similarity)")
        print("="*50)

        real_loader = DataLoader(self.real_lpips_dataset, batch_size=1,
                                shuffle=True, num_workers=self.num_workers)
        fake_loader = DataLoader(self.fake_lpips_dataset, batch_size=1,
                                shuffle=True, num_workers=self.num_workers)

        lpips_scores = []

        real_iter = iter(real_loader)
        fake_iter = iter(fake_loader)

        for i in tqdm(range(min(num_pairs, len(self.real_lpips_dataset), len(self.fake_lpips_dataset))),
                      desc="Calculating LPIPS"):
            try:
                real_img = next(real_iter).to(self.device)
                fake_img = next(fake_iter).to(self.device)

                distance = self.lpips_model(real_img, fake_img)
                lpips_scores.append(distance.item())

            except StopIteration:
                break

        lpips_mean = np.mean(lpips_scores)
        lpips_std = np.std(lpips_scores)

        print(f"LPIPS Score: {lpips_mean:.4f} ± {lpips_std:.4f}")
        print("Lower is better - measures perceptual similarity between real and fake images")

        return lpips_mean, lpips_std

    def run_evaluation(self, real_max_samples=None, fake_max_samples=1000,
                      save_results=True, output_file=None):
        """Run complete evaluation pipeline"""
        print("SKIN CANCER IMAGE GENERATION EVALUATION")
        print("="*50)
        print(f"Real Dataset: Shah1st/PH2")
        print(f"Generated Dataset: Shah1st/skin-cancer-flux.1-dev-images")
        print(f"Device: {self.device}")
        print("="*50)

        self.load_datasets(real_max_samples, fake_max_samples)

        results = {}

        # FID
        try:
            fid_score = self.calculate_fid()
            results['fid'] = fid_score
        except Exception as e:
            print(f"Error calculating FID: {e}")
            results['fid'] = None

        # KID
        try:
            kid_score = self.calculate_kid()
            results['kid'] = kid_score
        except Exception as e:
            print(f"Error calculating KID: {e}")
            results['kid'] = None

        # IS
        try:
            is_mean, is_std = self.calculate_is()
            results['is_mean'] = is_mean
            results['is_std'] = is_std
        except Exception as e:
            print(f"Error calculating IS: {e}")
            results['is_mean'] = None
            results['is_std'] = None

        # LPIPS
        try:
            lpips_mean, lpips_std = self.calculate_lpips()
            results['lpips_mean'] = lpips_mean
            results['lpips_std'] = lpips_std
        except Exception as e:
            print(f"Error calculating LPIPS: {e}")
            results['lpips_mean'] = None
            results['lpips_std'] = None

        print("\n" + "="*50)
        print("EVALUATION SUMMARY")
        print("="*50)
        if results.get('fid') is not None:
            print(f"FID Score: {results['fid']:.4f}")
        if results.get('kid') is not None:
            print(f"KID Score: {results['kid']:.6f}")
        if results.get('is_mean') is not None:
            print(f"IS Score: {results['is_mean']:.4f} ± {results['is_std']:.4f}")
        if results.get('lpips_mean') is not None:
            print(f"LPIPS Score: {results['lpips_mean']:.4f} ± {results['lpips_std']:.4f}")
        print("="*50)

        if save_results:
            if output_file is None:
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                output_file = f"evaluation_results_{timestamp}.json"

            json_results = {}
            for key, value in results.items():
                if value is not None:
                    if isinstance(value, np.ndarray):
                        json_results[key] = value.tolist()
                    elif isinstance(value, (np.float32, np.float64)):
                        json_results[key] = float(value)
                    elif isinstance(value, (np.int32, np.int64)):
                        json_results[key] = int(value)
                    else:
                        json_results[key] = value
                else:
                    json_results[key] = None

            with open(output_file, 'w') as f:
                json.dump({
                    'timestamp': datetime.now().isoformat(),
                    'real_dataset': 'Shah1st/PH2',
                    'generated_dataset': 'Shah1st/skin-cancer-flux.1-dev-images',
                    'real_samples_used': len(self.real_dataset),
                    'fake_samples_used': len(self.fake_dataset),
                    'results': json_results
                }, f, indent=2)

            print(f"Results saved to: {output_file}")

        return results

try:
    import scipy.linalg
except ImportError:
    print("Installing scipy...")
    os.system("pip install scipy")
    import scipy.linalg



In [3]:
batch_size = 64

device = "cuda"
real_samples = None
fake_samples = 200
output = '200-fake.json'
save_results = True


calculator = MetricsCalculator(
    device=device,
    batch_size=batch_size,
    num_workers=2
)

results = calculator.run_evaluation(
    real_max_samples=real_samples,
    fake_max_samples=fake_samples,
    save_results=save_results,
    output_file=output
)

Using device: cuda
Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]


Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to /root/.cache/torch/hub/checkpoints/alexnet-owt-7be5be79.pth
100%|██████████| 233M/233M [00:01<00:00, 232MB/s]


Loading model from: /usr/local/lib/python3.11/dist-packages/lpips/weights/v0.1/alex.pth


Downloading: "https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth" to /root/.cache/torch/hub/checkpoints/pt_inception-2015-12-05-6726825d.pth
100%|██████████| 91.2M/91.2M [00:01<00:00, 70.9MB/s]


SKIN CANCER IMAGE GENERATION EVALUATION
Real Dataset: Shah1st/PH2
Generated Dataset: Shah1st/skin-cancer-flux.1-dev-images
Device: cuda
Loading real dataset (PH2)...


README.md:   0%|          | 0.00/517 [00:00<?, ?B/s]

(…)-00000-of-00001-9f7fbbd66f09fd25.parquet:   0%|          | 0.00/263M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/200 [00:00<?, ? examples/s]

Loading generated dataset...


README.md:   0%|          | 0.00/361 [00:00<?, ?B/s]

train-00000-of-00008.parquet:   0%|          | 0.00/448M [00:00<?, ?B/s]

train-00001-of-00008.parquet:   0%|          | 0.00/448M [00:00<?, ?B/s]

train-00002-of-00008.parquet:   0%|          | 0.00/447M [00:00<?, ?B/s]

train-00003-of-00008.parquet:   0%|          | 0.00/447M [00:00<?, ?B/s]

train-00004-of-00008.parquet:   0%|          | 0.00/497M [00:00<?, ?B/s]

train-00005-of-00008.parquet:   0%|          | 0.00/496M [00:00<?, ?B/s]

train-00006-of-00008.parquet:   0%|          | 0.00/497M [00:00<?, ?B/s]

train-00007-of-00008.parquet:   0%|          | 0.00/496M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Real dataset size: 200
Generated dataset size: 10000
Using 200 real samples
Using 200 fake samples

CALCULATING FID (Fréchet Inception Distance)


Extracting real features: 100%|██████████| 4/4 [00:01<00:00,  2.16it/s]
Extracting fake features: 100%|██████████| 4/4 [00:02<00:00,  1.79it/s]


FID Score: 142.6101
Lower is better - measures quality and diversity of generated images

CALCULATING KID (Kernel Inception Distance)


Extracting real features: 100%|██████████| 4/4 [00:01<00:00,  2.91it/s]
Extracting fake features: 100%|██████████| 4/4 [00:02<00:00,  1.79it/s]


KID Score: 0.112115
Lower is better - measures distribution difference (scale: ~0.001-0.1)

CALCULATING IS (Inception Score)


Downloading: "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth" to /root/.cache/torch/hub/checkpoints/inception_v3_google-0cc3c7bd.pth
100%|██████████| 104M/104M [00:00<00:00, 205MB/s] 
Getting IS predictions: 100%|██████████| 4/4 [00:02<00:00,  1.75it/s]

IS Score: 1.9186 ± 0.1806
Higher is better - measures quality and diversity

CALCULATING LPIPS (Learned Perceptual Image Patch Similarity)



Calculating LPIPS: 100%|██████████| 200/200 [00:01<00:00, 107.12it/s]


LPIPS Score: 0.5483 ± 0.0754
Lower is better - measures perceptual similarity between real and fake images

EVALUATION SUMMARY
FID Score: 142.6101
KID Score: 0.112115
IS Score: 1.9186 ± 0.1806
LPIPS Score: 0.5483 ± 0.0754
Results saved to: 200-fake.json


In [4]:
batch_size = 64

device = "cuda"
real_samples = 100
fake_samples = 100
output = '100-fake.json'
save_results = True


calculator = MetricsCalculator(
    device=device,
    batch_size=batch_size,
    num_workers=2
)

results = calculator.run_evaluation(
    real_max_samples=real_samples,
    fake_max_samples=fake_samples,
    save_results=save_results,
    output_file=output
)

Using device: cuda
Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]
Loading model from: /usr/local/lib/python3.11/dist-packages/lpips/weights/v0.1/alex.pth
SKIN CANCER IMAGE GENERATION EVALUATION
Real Dataset: Shah1st/PH2
Generated Dataset: Shah1st/skin-cancer-flux.1-dev-images
Device: cuda
Loading real dataset (PH2)...
Loading generated dataset...
Real dataset size: 200
Generated dataset size: 10000
Using 100 real samples
Using 100 fake samples

CALCULATING FID (Fréchet Inception Distance)


Extracting real features: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s]
Extracting fake features: 100%|██████████| 2/2 [00:01<00:00,  1.39it/s]


FID Score: 146.4651
Lower is better - measures quality and diversity of generated images

CALCULATING KID (Kernel Inception Distance)


Extracting real features: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s]
Extracting fake features: 100%|██████████| 2/2 [00:01<00:00,  1.38it/s]


KID Score: 0.105913
Lower is better - measures distribution difference (scale: ~0.001-0.1)

CALCULATING IS (Inception Score)


Getting IS predictions: 100%|██████████| 2/2 [00:01<00:00,  1.36it/s]


IS Score: 1.7624 ± 0.1730
Higher is better - measures quality and diversity

CALCULATING LPIPS (Learned Perceptual Image Patch Similarity)


Calculating LPIPS: 100%|██████████| 100/100 [00:00<00:00, 120.13it/s]


LPIPS Score: 0.5400 ± 0.0693
Lower is better - measures perceptual similarity between real and fake images

EVALUATION SUMMARY
FID Score: 146.4651
KID Score: 0.105913
IS Score: 1.7624 ± 0.1730
LPIPS Score: 0.5400 ± 0.0693
Results saved to: 100-fake.json


In [5]:
batch_size = 64

device = "cuda"
real_samples = None
fake_samples = 1000
output = '1000-fake.json'
save_results = True


calculator = MetricsCalculator(
    device=device,
    batch_size=batch_size,
    num_workers=2
)

results = calculator.run_evaluation(
    real_max_samples=real_samples,
    fake_max_samples=fake_samples,
    save_results=save_results,
    output_file=output
)


Using device: cuda
Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]
Loading model from: /usr/local/lib/python3.11/dist-packages/lpips/weights/v0.1/alex.pth
SKIN CANCER IMAGE GENERATION EVALUATION
Real Dataset: Shah1st/PH2
Generated Dataset: Shah1st/skin-cancer-flux.1-dev-images
Device: cuda
Loading real dataset (PH2)...
Loading generated dataset...
Real dataset size: 200
Generated dataset size: 10000
Using 200 real samples
Using 1000 fake samples

CALCULATING FID (Fréchet Inception Distance)


Extracting real features: 100%|██████████| 4/4 [00:01<00:00,  2.79it/s]
Extracting fake features: 100%|██████████| 16/16 [00:07<00:00,  2.03it/s]


FID Score: 138.2395
Lower is better - measures quality and diversity of generated images

CALCULATING KID (Kernel Inception Distance)


Extracting real features: 100%|██████████| 4/4 [00:01<00:00,  2.78it/s]
Extracting fake features: 100%|██████████| 16/16 [00:08<00:00,  1.98it/s]


KID Score: 0.137252
Lower is better - measures distribution difference (scale: ~0.001-0.1)

CALCULATING IS (Inception Score)


Getting IS predictions: 100%|██████████| 16/16 [00:07<00:00,  2.01it/s]

IS Score: 2.0077 ± 0.0859
Higher is better - measures quality and diversity

CALCULATING LPIPS (Learned Perceptual Image Patch Similarity)



Calculating LPIPS: 100%|██████████| 200/200 [00:01<00:00, 114.39it/s]


LPIPS Score: 0.5503 ± 0.0825
Lower is better - measures perceptual similarity between real and fake images

EVALUATION SUMMARY
FID Score: 138.2395
KID Score: 0.137252
IS Score: 2.0077 ± 0.0859
LPIPS Score: 0.5503 ± 0.0825
Results saved to: 1000-fake.json


In [6]:
batch_size = 64

device = "cuda"
real_samples = None
fake_samples = 10000
output = '10k-fake.json'
save_results = True


calculator = MetricsCalculator(
    device=device,
    batch_size=batch_size,
    num_workers=2
)

results = calculator.run_evaluation(
    real_max_samples=real_samples,
    fake_max_samples=fake_samples,
    save_results=save_results,
    output_file=output
)


Using device: cuda
Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]
Loading model from: /usr/local/lib/python3.11/dist-packages/lpips/weights/v0.1/alex.pth
SKIN CANCER IMAGE GENERATION EVALUATION
Real Dataset: Shah1st/PH2
Generated Dataset: Shah1st/skin-cancer-flux.1-dev-images
Device: cuda
Loading real dataset (PH2)...
Loading generated dataset...
Real dataset size: 200
Generated dataset size: 10000
Using 200 real samples
Using 10000 fake samples

CALCULATING FID (Fréchet Inception Distance)


Extracting real features: 100%|██████████| 4/4 [00:01<00:00,  2.81it/s]
Extracting fake features: 100%|██████████| 157/157 [01:12<00:00,  2.16it/s]


FID Score: 123.8974
Lower is better - measures quality and diversity of generated images

CALCULATING KID (Kernel Inception Distance)


Extracting real features: 100%|██████████| 4/4 [00:01<00:00,  2.77it/s]
Extracting fake features: 100%|██████████| 157/157 [01:12<00:00,  2.16it/s]


KID Score: 0.100224
Lower is better - measures distribution difference (scale: ~0.001-0.1)

CALCULATING IS (Inception Score)


Getting IS predictions: 100%|██████████| 157/157 [01:12<00:00,  2.17it/s]


IS Score: 2.0571 ± 0.0295
Higher is better - measures quality and diversity

CALCULATING LPIPS (Learned Perceptual Image Patch Similarity)


Calculating LPIPS: 100%|██████████| 200/200 [00:01<00:00, 116.49it/s]


LPIPS Score: 0.5540 ± 0.0649
Lower is better - measures perceptual similarity between real and fake images

EVALUATION SUMMARY
FID Score: 123.8974
KID Score: 0.100224
IS Score: 2.0571 ± 0.0295
LPIPS Score: 0.5540 ± 0.0649
Results saved to: 10k-fake.json
