In [None]:
"""
Image Similarity v3 — Pairwise Multi-Descriptor Fusion
========================================================
Compares EVERY image with EVERY other image in the test folder.

Three complementary descriptors are extracted once per image and compared for each pair:
  1. Deep CNN features  — EfficientNet-B4 at 480px
  2. Color histogram    — HSV histogram
  3. Texture (LBP)      — Local Binary Patterns

Output DataFrame columns:
  image1 | image2 | sim_deep | sim_color | sim_texture | sim_fused
"""

import os
import cv2
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from itertools import combinations

import torch
import torchvision.transforms as T
import torchvision.models as models
from sklearn.metrics.pairwise import cosine_similarity
from skimage.feature import local_binary_pattern




# ── Configuration ──
# Directory scanned for .jpg/.jpeg/.png files, and the CSV the pairwise
# results table is written to.
TEST_DIR   = "/kaggle/input/datasets/kagglertw/dal-shemagh/dal-shemagh-detection-challenge/images/test"
OUTPUT_CSV = "pairwise_similarity_results.csv"


# Fusion weights: sim_fused = W_DEEP*sim_deep + W_COLOR*sim_color
#                             + W_TEXTURE*sim_texture.
# They currently sum to 1.0, so the fused score is a weighted average.
W_DEEP    = 0.50
W_COLOR   = 0.25
W_TEXTURE = 0.25


# Side length (pixels) every image is resized to before the CNN forward pass.
CNN_SIZE  = 480


# Local Binary Pattern sampling: circle radius in pixels and the number of
# sampling points on that circle (8 per unit of radius).
LBP_RADIUS   = 3
LBP_N_POINTS = 8 * LBP_RADIUS




# ── Model setup ──
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")

# ImageNet-pretrained EfficientNet-B4 with its classification head swapped for
# an identity layer, so a forward pass returns the pooled feature vector.
backbone = models.efficientnet_b4(
    weights=models.EfficientNet_B4_Weights.IMAGENET1K_V1
)
backbone.classifier = torch.nn.Identity()
backbone = backbone.to(device)
backbone = backbone.eval()  # inference mode for dropout / batch-norm layers

# Preprocessing: fixed-size square resize, tensor conversion, then per-channel
# normalisation with the mean/std below.
_resize_step    = T.Resize((CNN_SIZE, CNN_SIZE))
_normalize_step = T.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
cnn_transform = T.Compose([_resize_step, T.ToTensor(), _normalize_step])

def deep_feat(img_path: str) -> np.ndarray | None:
    """EfficientNet-B4 → 1792-D feature vector."""
    try:
        img = Image.open(img_path).convert("RGB")
        t   = cnn_transform(img).unsqueeze(0).to(device)
        with torch.no_grad():
            f = backbone(t)
        return f.squeeze().cpu().numpy()
    except Exception as e:
        print(f"  ⚠ deep_feat failed for {img_path}: {e}")
        return None




def color_hist(img_path: str, bins=(16, 8, 8)) -> np.ndarray | None:
    """HSV histogram: H=16 bins, S=8, V=8 → 1024-D normalised vector."""
    try:
        bgr = cv2.imread(img_path)
        if bgr is None:
            raise IOError("cv2.imread returned None")
        hsv = cv2.cvtColor(bgr, cv2.COLOR_BGR2HSV)
        hist = cv2.calcHist(
            [hsv], [0, 1, 2], None,
            [bins[0], bins[1], bins[2]],
            [0, 180, 0, 256, 0, 256]
        )
        hist = hist.flatten().astype(np.float32)
        norm = np.linalg.norm(hist)
        return hist / norm if norm > 0 else hist
    except Exception as e:
        print(f"  ⚠ color_hist failed for {img_path}: {e}")
        return None




def texture_lbp(img_path: str) -> np.ndarray | None:
    """LBP on grayscale image → normalised histogram."""
    try:
        bgr  = cv2.imread(img_path)
        if bgr is None:
            raise IOError("cv2.imread returned None")
        gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

        h, w   = gray.shape
        n_bins = LBP_N_POINTS + 2
        hists  = []

        
        for row in range(3):
            for col in range(3):
                patch = gray[row*h//3:(row+1)*h//3,
                             col*w//3:(col+1)*w//3]
                lbp   = local_binary_pattern(patch, LBP_N_POINTS,
                                             LBP_RADIUS, method="uniform")
                hist, _ = np.histogram(lbp.ravel(), bins=n_bins,
                                       range=(0, n_bins), density=True)
                hists.append(hist)

        feat = np.concatenate(hists).astype(np.float32)
        norm = np.linalg.norm(feat)
        return feat / norm if norm > 0 else feat
    except Exception as e:
        print(f"  ⚠ texture_lbp failed for {img_path}: {e}")
        return None




# ── Discover images ──
# Keep only files with a recognised image extension, in sorted name order so
# the pair ordering downstream is reproducible.
all_images = [
    entry for entry in os.listdir(TEST_DIR)
    if entry.lower().endswith((".jpg", ".jpeg", ".png"))
]
all_images.sort()
print(f"\n── Found {len(all_images)} images ──")


# ── Extract and cache descriptors ──
# Maps file name -> {'deep', 'color', 'texture'} descriptor vectors. An image
# is cached only when all three extractors succeeded.
features_cache = {}

print("\n── Extracting features for all images ──")
for fname in tqdm(all_images, desc="Feature extraction"):
    fpath = os.path.join(TEST_DIR, fname)

    # All three extractors always run (deep, then colour, then texture), so
    # each one gets the chance to report its own failure for this file.
    descriptors = {
        'deep': deep_feat(fpath),
        'color': color_hist(fpath),
        'texture': texture_lbp(fpath),
    }

    if any(vec is None for vec in descriptors.values()):
        print(f"  ⚠ Skipping {fname} due to feature extraction failure")
        continue

    features_cache[fname] = descriptors

valid_images = list(features_cache)
print(f"\n✓ Successfully extracted features for {len(valid_images)} images")




# ── Pairwise similarity ──
# One cosine-similarity matrix per descriptor is computed over all images at
# once, instead of three sklearn calls per pair. The per-call input
# validation dominated the run time of the pair loop for a few hundred
# images. Scores may differ from a pair-by-pair computation only at
# floating-point rounding level.
n_valid = len(valid_images)
total_pairs = n_valid * (n_valid - 1) // 2
print(f"\n── Computing {total_pairs:,} pairwise similarities ──")

records = []
if total_pairs > 0:  # fewer than two valid images -> nothing to compare
    # Indices of the strict upper triangle in row-major order, i.e. the same
    # (i, j) sequence itertools.combinations(valid_images, 2) would yield.
    first_idx, second_idx = np.triu_indices(n_valid, k=1)

    pair_sims = {}
    for key in ("deep", "color", "texture"):
        stacked = np.stack([features_cache[name][key] for name in valid_images])
        sim_matrix = cosine_similarity(stacked)
        # Widen to float64 and convert to Python floats so the fusion and
        # rounding below use the same arithmetic as plain float values.
        pair_sims[key] = (
            sim_matrix[first_idx, second_idx].astype(np.float64).tolist()
        )

    pair_iter = zip(
        first_idx.tolist(),
        second_idx.tolist(),
        pair_sims["deep"],
        pair_sims["color"],
        pair_sims["texture"],
    )
    for i, j, sim_d, sim_c, sim_t in tqdm(pair_iter,
                                          total=total_pairs,
                                          desc="Pairwise comparison"):
        # Weighted late fusion of the three descriptor similarities.
        sim_f = W_DEEP * sim_d + W_COLOR * sim_c + W_TEXTURE * sim_t

        records.append({
            "image1"      : valid_images[i],
            "image2"      : valid_images[j],
            "sim_deep"    : round(sim_d, 6),
            "sim_color"   : round(sim_c, 6),
            "sim_texture" : round(sim_t, 6),
            "sim_fused"   : round(sim_f, 6),
        })




# ── Results table ──
# Columns are given explicitly so the frame has the expected schema even when
# `records` is empty (fewer than two usable images); without them the sort
# below raises KeyError on a column-less DataFrame.
df = pd.DataFrame(
    records,
    columns=["image1", "image2",
             "sim_deep", "sim_color", "sim_texture", "sim_fused"],
)
df = df.sort_values("sim_fused", ascending=False).reset_index(drop=True)

print("\n── Top 10 most similar pairs ──")
print(df.head(10).to_string(index=False))
print(f"\nTotal pairs: {len(df):,}")
print(f"Columns    : {list(df.columns)}")


# ── Persist ──
# Written even when empty, so downstream steps always find a CSV with the
# expected header.
df.to_csv(OUTPUT_CSV, index=False)
print(f"\n✅  Saved → '{OUTPUT_CSV}'")


print("\n── Statistics ──")
if df.empty:
    # mean/max/min of an empty column are undefined; say so explicitly.
    print("No image pairs were compared — statistics unavailable.")
else:
    fused = df["sim_fused"]
    print(f"Mean fused similarity: {fused.mean():.4f}")
    print(f"Max fused similarity:  {fused.max():.4f}")
    print(f"Min fused similarity:  {fused.min():.4f}")

Device: cuda

── Found 842 images ──

── Extracting features for all images ──


Feature extraction:  61%|██████    | 511/842 [02:01<01:18,  4.20it/s]