In [None]:
# Cell 1: imports & corrected skimage imports
import os, json, time
from pathlib import Path
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

# FIXED imports (greycomatrix → graycomatrix, greycoprops → graycoprops)
from skimage.feature import local_binary_pattern, graycomatrix, graycoprops
from skimage.measure import shannon_entropy

import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array

import warnings
warnings.filterwarnings("ignore")

# ---------------------- IMPORTANT ----------------------
# Root of the preprocessing outputs. Point this at YOUR dataset folder, i.e.
# one of the entries printed by "os.listdir('/kaggle/input')".
OUT_ROOT = Path("/kaggle/input/2-preprocessing/artifacts/preprocessed_fast")
# -------------------------------------------------------

# Input sub-folders located directly under the preprocessing root.
STRICT_DIR, CNN_DIR = OUT_ROOT / "strict", OUT_ROOT / "cnn"

# Dataset partitions; later cells iterate over them in this order.
splits = ["train", "validation", "test"]

# Output directory for the extracted feature files (created if missing).
DSV_OUT = Path("artifacts/dsv_features")
DSV_OUT.mkdir(parents=True, exist_ok=True)

# Quick sanity check that the configured input folders are present.
print("Using OUT_ROOT:", OUT_ROOT)
print("Strict exists:", STRICT_DIR.exists())
print("CNN exists:", CNN_DIR.exists())

In [None]:
# Cell 2: helpers
def read_image(p):
    """Load the image stored at path ``p`` with OpenCV.

    Returns:
        Whatever ``cv2.imread`` produces for ``str(p)``: the decoded image
        array, or ``None`` when OpenCV could not read the file. Callers are
        expected to check for ``None``.
    """
    return cv2.imread(str(p))

def to_gray(img):
    """Convert a BGR colour image into a single-channel grayscale image."""
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return gray

def safe_resize(img, size=(224,224)):
    """Resize ``img`` with area interpolation.

    ``size`` is handed to ``cv2.resize`` unchanged as the target size.
    """
    resized = cv2.resize(img, size, interpolation=cv2.INTER_AREA)
    return resized

def extract_grid_patches(img, num_patches=4, patch_size=(160,160), rng=None):
    """Cut ``num_patches`` crops of ``patch_size`` out of ``img``.

    Sampling strategy:
      * ``num_patches == 1``: a single crop centred on the image.
      * otherwise: crops whose top-left corners lie on an evenly spaced
        ``rows x cols`` grid (2x2 for the default of 4). If the grid yields
        fewer than ``num_patches`` full-size crops, the remainder is taken at
        random positions.

    Args:
        img: array indexed as ``img[y, x, ...]``.
        num_patches: number of crops to return; ``0`` yields an empty list.
        patch_size: ``(width, height)`` of each crop.
        rng: optional random source used only for the random fallback, e.g.
            ``np.random.default_rng(seed)`` or ``np.random.RandomState(seed)``.
            Pass a seeded generator to make the fallback reproducible. When
            ``None`` the global ``np.random`` state is used.

    Returns:
        list of ``num_patches`` slices of ``img``. A crop is smaller than
        ``patch_size`` along any dimension in which the image itself is
        smaller than the patch.

    Raises:
        ValueError: if ``num_patches`` is negative.
    """
    if num_patches < 0:
        raise ValueError(f"num_patches must be non-negative, got {num_patches}")

    h, w = img.shape[:2]
    pw, ph = patch_size

    if num_patches == 1:
        x1 = max(0, w // 2 - pw // 2)
        y1 = max(0, h // 2 - ph // 2)
        return [img[y1:y1 + ph, x1:x1 + pw]]

    # Largest valid top-left corner. Clamped at 0 so an image smaller than the
    # patch never produces negative (wrap-around) slice starts.
    max_x = max(0, w - pw)
    max_y = max(0, h - ph)

    # Near-square grid: rows = floor(sqrt(n)), plus one extra column when the
    # square alone is too small. The grid may still hold fewer than n cells.
    rows = int(np.sqrt(num_patches))
    cols = rows + 1 if rows * rows < num_patches else rows
    x_steps = np.linspace(0, max_x, cols, dtype=int)
    y_steps = np.linspace(0, max_y, rows, dtype=int)

    patches = []
    for y, x in ((gy, gx) for gy in y_steps for gx in x_steps):
        if len(patches) >= num_patches:
            break
        p = img[y:y + ph, x:x + pw]
        # Only full-size crops are accepted from the grid.
        if p.shape[0] == ph and p.shape[1] == pw:
            patches.append(p)

    # Random fallback for whatever the grid could not provide.
    if rng is None:
        draw = np.random.randint
    else:
        # Generator exposes ``integers``; legacy RandomState exposes ``randint``.
        draw = getattr(rng, "integers", None) or rng.randint
    while len(patches) < num_patches:
        rx = draw(0, max_x + 1)
        ry = draw(0, max_y + 1)
        patches.append(img[ry:ry + ph, rx:rx + pw])
    return patches

In [None]:
# Cell 3: classical features — FULL, CLEAN, CORRECTED VERSION

def lbp_hist_features(gray, P=8, R=1):
    """Normalised histogram of uniform local-binary-pattern codes.

    Args:
        gray: 2-D grayscale image.
        P: number of circularly symmetric neighbour points.
        R: radius of the neighbourhood circle.

    Returns:
        1-D array of length ``P + 2`` whose entries sum to 1 (unit-width bins
        with ``density=True``).
    """
    lbp = local_binary_pattern(gray, P, R, method="uniform")
    # The "uniform" method emits integer codes 0 .. P+1. Using a fixed bin
    # layout keeps the feature length identical for every image instead of
    # depending on the largest code that happens to occur in this one.
    n_codes = P + 2
    hist, _ = np.histogram(lbp.ravel(), bins=n_codes, range=(0, n_codes), density=True)
    return hist

def glcm_features(gray, distances=(1,), angles=(0,), levels=8):
    """Gray-level co-occurrence matrix (GLCM) texture statistics.

    The image is first quantised to ``levels`` gray levels by dividing by
    ``256 / levels`` and flooring (assumes ``gray`` holds values in 0..255 —
    TODO confirm for non-uint8 inputs). A symmetric, normalised GLCM is then
    built and each property is averaged over all distance/angle combinations.

    Args:
        gray: 2-D grayscale image.
        distances: pixel-pair distance offsets passed to ``graycomatrix``.
        angles: pixel-pair angles (radians) passed to ``graycomatrix``.
        levels: number of gray levels after quantisation.

    Returns:
        dict with float entries ``contrast``, ``dissimilarity``,
        ``homogeneity``, ``energy`` and ``correlation``. A property whose
        computation fails is reported as ``0.0``.
    """
    imgq = np.floor(gray / (256/levels)).astype('uint8')

    glcm = graycomatrix(
        imgq,
        distances=distances,
        angles=angles,
        levels=levels,
        symmetric=True,
        normed=True
    )

    feats = {}
    for prop in ('contrast','dissimilarity','homogeneity','energy','correlation'):
        try:
            feats[prop] = float(graycoprops(glcm, prop).mean())
        except Exception:
            # Best-effort fallback, but let KeyboardInterrupt / SystemExit
            # propagate so a long batch run can still be stopped.
            feats[prop] = 0.0
    return feats

def fft_bandpower_features(gray):
    """Relative spectral magnitude in three radial frequency bands.

    The centred 2-D FFT magnitude of ``gray`` is averaged inside three rings
    of normalised radius (distance from the spectrum centre divided by the
    largest such distance): ``<= 0.1`` (low), ``(0.1, 0.4]`` (mid) and
    ``> 0.4`` (high). Each band mean is then divided by the sum of the three
    means; if that sum is not positive the divisor is 1.0.

    Args:
        gray: 2-D array.

    Returns:
        dict with float entries ``fft_low``, ``fft_mid`` and ``fft_high``.
    """
    spectrum = np.abs(np.fft.fftshift(np.fft.fft2(gray.astype(float))))

    n_rows, n_cols = gray.shape
    row_offset = np.arange(n_rows)[:, None] - n_rows // 2
    col_offset = np.arange(n_cols)[None, :] - n_cols // 2
    radius = np.sqrt(row_offset**2 + col_offset**2)
    radius = radius / radius.max()

    def band_mean(mask):
        # Mean magnitude inside the band, or 0.0 for an empty band.
        return spectrum[mask].mean() if mask.any() else 0.0

    low = band_mean(radius <= 0.1)
    mid = band_mean((radius > 0.1) & (radius <= 0.4))
    high = band_mean(radius > 0.4)

    band_sum = low + mid + high
    total = band_sum if band_sum > 0 else 1.0

    return {
        'fft_low': float(low / total),
        'fft_mid': float(mid / total),
        'fft_high': float(high / total),
    }

def edge_features(gray):
    """Summarise edge strength and edge density of a grayscale image.

    Returns:
        dict with
          * ``sobel_mean``: mean magnitude of the first-order Sobel gradients
            (x and y, computed in float64);
          * ``canny_ratio``: mean of the Canny edge map (thresholds 50 / 150)
            divided by 255.
    """
    grad_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0)
    grad_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1)
    gradient_magnitude = np.sqrt(grad_x * grad_x + grad_y * grad_y)

    edge_map = cv2.Canny(gray, 50, 150)

    return {
        'sobel_mean': float(np.mean(gradient_magnitude)),
        'canny_ratio': float(edge_map.mean() / 255.0),
    }

def keypoint_counts(gray):
    """Count ORB and AKAZE keypoints detected in a grayscale image.

    Returns:
        dict with integer entries ``orb_kp`` (ORB detector limited to 1000
        features) and ``akaze_kp``. AKAZE is best-effort: if creating or
        running the detector fails, ``akaze_kp`` is reported as 0.
    """
    orb = cv2.ORB_create(nfeatures=1000)
    orb_count = len(orb.detect(gray, None))

    try:
        akaze = cv2.AKAZE_create()
        ak_count = len(akaze.detect(gray, None))
    except Exception:
        # Keep the best-effort fallback, but do not swallow
        # KeyboardInterrupt / SystemExit like a bare ``except:`` would.
        ak_count = 0

    return {
        'orb_kp': orb_count,
        'akaze_kp': ak_count
    }

def entropy_feature(gray):
    """Shannon entropy of ``gray`` as a plain Python float."""
    entropy = shannon_entropy(gray)
    return float(entropy)

In [None]:
# Cell 4: CNN patch embedding extractor
# MobileNetV2 with ImageNet weights serves as a frozen-in-practice feature
# extractor: it is only used for inference here, never trained.
IMG_SIZE = (160,160)   # (width, height) of every patch fed to the network

# No classification head; global average pooling yields one vector per patch.
base_model = MobileNetV2(
    input_shape=(IMG_SIZE[1], IMG_SIZE[0], 3),
    include_top=False,
    weights='imagenet',
    pooling='avg',
)
print("Loaded MobileNetV2 base for patch embeddings.")

def patch_embeddings(img, num_patches=4):
    """Embed patches of ``img`` with the MobileNetV2 base and aggregate them.

    Patches are cut with ``extract_grid_patches``, converted to 3-channel RGB,
    resized to ``IMG_SIZE``, preprocessed for MobileNetV2 and embedded in a
    single batched forward pass.

    Args:
        img: image as loaded by OpenCV (BGR channel order), or a grayscale
            image with shape ``(H, W)`` or ``(H, W, 1)``.
        num_patches: number of patches to embed.

    Returns:
        dict with
          * ``patch_emb_mean``: per-dimension mean over the patch embeddings;
          * ``patch_emb_std``: per-dimension standard deviation;
          * ``patch_count``: number of patches embedded.

    Raises:
        ValueError: if no patches were produced (e.g. ``num_patches == 0``).
    """
    patches = extract_grid_patches(img, num_patches=num_patches, patch_size=IMG_SIZE)
    if not patches:
        raise ValueError("patch_embeddings needs at least one patch")

    batch = []
    for p in patches:
        # The ImageNet weights expect RGB input, while OpenCV images are BGR,
        # so the channel order is swapped here. Grayscale patches (2-D or
        # single-channel) are expanded to three channels instead.
        if p.ndim == 2 or p.shape[2] == 1:
            p = cv2.cvtColor(p, cv2.COLOR_GRAY2RGB)
        else:
            p = cv2.cvtColor(p, cv2.COLOR_BGR2RGB)
        batch.append(img_to_array(cv2.resize(p, IMG_SIZE)))

    # One forward pass for all patches instead of one predict() call each.
    x = preprocess_input(np.stack(batch))
    emb_arr = base_model.predict(x, verbose=0)
    emb_arr = emb_arr.reshape(len(batch), -1)  # shape (num_patches, feat_dim)

    # return aggregated stats
    return {
        'patch_emb_mean': emb_arr.mean(axis=0),
        'patch_emb_std': emb_arr.std(axis=0),
        'patch_count': emb_arr.shape[0]
    }

In [None]:
# Cell 5: driver to extract features for a single image (returns dict)
def extract_image_features(img_path, num_patches=4):
    """Build the flat feature record for a single image.

    Args:
        img_path: path of the image to load.
        num_patches: number of patches used for the CNN embedding statistics.

    Returns:
        ``None`` when the image cannot be read; otherwise a dict of scalar
        features. Keys are inserted in a fixed order (metadata, FFT bands,
        edges, keypoints, entropy, LBP bins, GLCM properties, embedding
        statistics), which determines the column order of the resulting CSV.
    """
    img = read_image(img_path)
    if img is None:
        return None
    gray = to_gray(img)

    # basic metadata
    height, width = img.shape[:2]
    feats = {'image': str(img_path), 'height': height, 'width': width}

    # classical descriptors, merged in a fixed order
    for descriptor in (
        fft_bandpower_features(gray),
        edge_features(gray),
        keypoint_counts(gray),
    ):
        feats.update(descriptor)
    feats['entropy'] = entropy_feature(gray)

    # LBP summary: always 16 columns (lbp_0 .. lbp_15); bins missing from the
    # histogram are zero-padded.
    lbp_hist = lbp_hist_features(gray, P=8, R=1)
    feats.update({
        f'lbp_{i}': (float(lbp_hist[i]) if i < len(lbp_hist) else 0.0)
        for i in range(16)
    })

    # GLCM texture properties, prefixed to avoid name clashes
    glcm = glcm_features(gray, distances=[1], angles=[0], levels=8)
    feats.update({f'glcm_{k}': v for k, v in glcm.items()})

    # Patch embeddings are vectors; to keep the CSV small only the mean and
    # standard deviation of each aggregated vector are stored.
    emb = patch_embeddings(img, num_patches=num_patches)
    mean_vec, std_vec = emb['patch_emb_mean'], emb['patch_emb_std']
    feats.update({
        'patch_emb_mean_mean': float(np.mean(mean_vec)),
        'patch_emb_mean_std': float(np.std(mean_vec)),
        'patch_emb_std_mean': float(np.mean(std_vec)),
        'patch_emb_std_std': float(np.std(std_vec)),
        'patch_count': int(emb['patch_count']),
    })
    return feats

In [None]:
# Cell 6: batch run and CSV output (label-preserving version)

OUT_DIR = DSV_OUT
LOG_JSON = OUT_DIR / "dsv_extract_log.json"
FLUSH_EVERY = 200   # rows buffered in memory between CSV/log checkpoints


def _append_rows(rows, csv_path):
    """Append feature dicts to ``csv_path``; the header is written only when the file is created."""
    pd.DataFrame(rows).to_csv(
        csv_path, mode='a', header=not csv_path.exists(), index=False
    )


def _save_log(log, log_path):
    """Persist the processed-file log as indented JSON."""
    with open(log_path, "w") as f:
        json.dump(log, f, indent=2)


# Resume support: images already recorded in the log are skipped.
if LOG_JSON.exists():
    with open(LOG_JSON,"r") as f:
        processed_log = json.load(f)
else:
    processed_log = {}

for split in splits:
    print(f"\nProcessing split: {split}")

    input_dir = CNN_DIR / split

    if not input_dir.exists():
        print("Missing directory:", input_dir)
        continue

    print("Input dir:", input_dir)
    files = sorted(input_dir.rglob("*.jpg"))
    print("Files:", len(files))

    rows = []
    csv_path = OUT_DIR / f"{split}.csv"

    for p in tqdm(files):
        # Key on the path relative to the split folder, not just the file
        # name: files are collected recursively, and two label folders may
        # contain identically named images. With a name-only key the second
        # one would be treated as already processed and silently skipped.
        # NOTE: logs written with the older name-only keys will not match
        # files inside sub-folders; remove the old log and CSVs before
        # resuming such a run.
        key = f"{split}/{p.relative_to(input_dir).as_posix()}"
        if key in processed_log:
            continue

        feats = extract_image_features(p, num_patches=4)
        if feats is None:
            processed_log[key] = {"status":"read_fail"}
            continue

        # The label is the name of the containing folder,
        # e.g. denomination_10, denomination_fake
        feats['source'] = p.parent.name
        feats['split'] = split
        rows.append(feats)
        processed_log[key] = {"status":"done"}

        # Periodic checkpoint: write the buffered rows, then the log.
        if len(rows) >= FLUSH_EVERY:
            _append_rows(rows, csv_path)
            rows = []
            _save_log(processed_log, LOG_JSON)

    # final flush
    if rows:
        _append_rows(rows, csv_path)

    _save_log(processed_log, LOG_JSON)

    print("Saved:", csv_path)

print("DSV feature extraction complete.")

In [None]:
# Cell 7: combine and summary
# Read every per-split CSV that exists, in the order given by `splits`.
split_csv_paths = [DSV_OUT / f"{split}.csv" for split in splits]
dfs = [pd.read_csv(csv_file) for csv_file in split_csv_paths if csv_file.exists()]

if not dfs:
    print("No per-split CSVs found.")
else:
    # Stack the splits into one table and save it next to the per-split files.
    all_df = pd.concat(dfs, ignore_index=True)
    all_csv = DSV_OUT / "all_dsv_features.csv"
    all_df.to_csv(all_csv, index=False)
    print("Combined CSV saved to:", all_csv)
    print("Total rows:", len(all_df))