In [1]:
import cv2
import numpy as np
import os
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt
from collections import defaultdict
import json

class CBIRSystem:
    def __init__(self, db_path="db", query_path="query"):
        self.db_path = db_path
        self.query_path = query_path
        self.db_features = {}
        self.feature_names = []
        self.current_feature_type = "all"
        self.feature_scaler = None  # For normalization

    def extract_color_features(self, image):
        """Ekstraksi fitur warna: Histogram HSV + Statistik"""
        # Konversi ke HSV
        hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

        # Histogram HSV (16-8-8 bin)
        hist = cv2.calcHist([hsv], [0, 1, 2], None, [16, 8, 8],
                           [0, 180, 0, 256, 0, 256])
        hist = cv2.normalize(hist, hist).flatten()

        # Statistik per channel
        stats = []
        for i in range(3):
            channel = hsv[:, :, i].flatten()
            stats.extend([
                np.mean(channel),
                np.std(channel),
                skew(channel),
                kurtosis(channel)
            ])

        return np.concatenate([hist, stats])

    def extract_glcm_features(self, image):
        """Ekstraksi fitur tekstur menggunakan GLCM"""
        # Konversi ke grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # GLCM untuk 4 arah: 0°, 45°, 90°, 135°
        angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]
        distances = [1]

        features = []

        for angle in angles:
            # Hitung GLCM manual (sederhana)
            glcm = self.calculate_glcm(gray, distance=1, angle=angle)

            # Ekstrak fitur dari GLCM
            contrast = self.glcm_contrast(glcm)
            correlation = self.glcm_correlation(glcm)
            energy = self.glcm_energy(glcm)
            homogeneity = self.glcm_homogeneity(glcm)

            features.extend([contrast, correlation, energy, homogeneity])

        # Rata-rata dari semua arah
        features = np.array(features).reshape(4, 4)
        avg_features = np.mean(features, axis=0)

        return avg_features

    def calculate_glcm(self, image, distance=1, angle=0):
        """Hitung GLCM matrix"""
        # Kuantisasi ke 8 level untuk efisiensi
        image = (image // 32).astype(np.uint8)
        levels = 8

        glcm = np.zeros((levels, levels), dtype=np.float32)

        # Hitung offset berdasarkan angle
        dy = int(distance * np.sin(angle))
        dx = int(distance * np.cos(angle))

        rows, cols = image.shape

        for i in range(rows - abs(dy)):
            for j in range(cols - abs(dx)):
                i_new = i + dy
                j_new = j + dx

                if 0 <= i_new < rows and 0 <= j_new < cols:
                    glcm[image[i, j], image[i_new, j_new]] += 1

        # Normalisasi
        glcm = glcm / (glcm.sum() + 1e-6)

        return glcm

    def glcm_contrast(self, glcm):
        """Contrast dari GLCM"""
        contrast = 0
        for i in range(glcm.shape[0]):
            for j in range(glcm.shape[1]):
                contrast += glcm[i, j] * (i - j) ** 2
        return contrast

    def glcm_correlation(self, glcm):
        """Correlation dari GLCM"""
        mean_i = np.sum(np.arange(glcm.shape[0]).reshape(-1, 1) * glcm)
        mean_j = np.sum(np.arange(glcm.shape[1]).reshape(1, -1) * glcm)

        std_i = np.sqrt(np.sum(((np.arange(glcm.shape[0]).reshape(-1, 1) - mean_i) ** 2) * glcm))
        std_j = np.sqrt(np.sum(((np.arange(glcm.shape[1]).reshape(1, -1) - mean_j) ** 2) * glcm))

        correlation = 0
        for i in range(glcm.shape[0]):
            for j in range(glcm.shape[1]):
                correlation += ((i - mean_i) * (j - mean_j) * glcm[i, j])

        return correlation / (std_i * std_j + 1e-6)

    def glcm_energy(self, glcm):
        """Energy (ASM) dari GLCM"""
        return np.sum(glcm ** 2)

    def glcm_homogeneity(self, glcm):
        """Homogeneity dari GLCM"""
        homogeneity = 0
        for i in range(glcm.shape[0]):
            for j in range(glcm.shape[1]):
                homogeneity += glcm[i, j] / (1 + abs(i - j))
        return homogeneity

    def extract_shape_features(self, image):
        """Ekstraksi fitur bentuk"""
        # Konversi ke grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Threshold Otsu
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # Cari contours
        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        # Jika tidak ada contour, gunakan keseluruhan image sebagai contour
        if len(contours) == 0:
            # Return default shape features
            h, w = gray.shape
            return np.array([
                w * h * 0.5,  # area (estimasi)
                2 * (w + h),  # perimeter
                w / (h + 1e-6),  # aspect_ratio
                0.5,  # extent
                0.8,  # solidity
                0.5   # circularity
            ])

        # Ambil contour terbesar
        contour = max(contours, key=cv2.contourArea)

        # Hitung fitur bentuk
        area = cv2.contourArea(contour)
        perimeter = cv2.arcLength(contour, True)

        # Jika area atau perimeter terlalu kecil, gunakan nilai default
        if area < 10 or perimeter < 10:
            h, w = gray.shape
            return np.array([
                w * h * 0.5,
                2 * (w + h),
                w / (h + 1e-6),
                0.5,
                0.8,
                0.5
            ])

        # Bounding rectangle
        x, y, w, h = cv2.boundingRect(contour)
        aspect_ratio = w / (h + 1e-6)
        rect_area = w * h
        extent = area / (rect_area + 1e-6)

        # Convex hull
        hull = cv2.convexHull(contour)
        hull_area = cv2.contourArea(hull)
        solidity = area / (hull_area + 1e-6)

        # Circularity
        circularity = (4 * np.pi * area) / (perimeter ** 2 + 1e-6)

        # Clip values to reasonable ranges
        aspect_ratio = np.clip(aspect_ratio, 0.01, 100)
        extent = np.clip(extent, 0, 1)
        solidity = np.clip(solidity, 0, 1)
        circularity = np.clip(circularity, 0, 1)

        return np.array([area, perimeter, aspect_ratio, extent, solidity, circularity])

    def extract_features(self, image_path, feature_type="all"):
        """Ekstrak fitur berdasarkan tipe"""
        image = cv2.imread(str(image_path))
        if image is None:
            return None

        # Resize untuk konsistensi
        image = cv2.resize(image, (256, 256))

        features = []
        feature_dims = []

        if feature_type in ["all", "color"]:
            color_feat = self.extract_color_features(image)
            features.append(color_feat)
            feature_dims.append(len(color_feat))

        if feature_type in ["all", "glcm"]:
            glcm_feat = self.extract_glcm_features(image)
            features.append(glcm_feat)
            feature_dims.append(len(glcm_feat))

        if feature_type in ["all", "shape"]:
            shape_feat = self.extract_shape_features(image)
            features.append(shape_feat)
            feature_dims.append(len(shape_feat))

        if len(features) == 0:
            return None

        combined = np.concatenate(features)

        # Store expected dimensions for this feature type
        if not hasattr(self, 'expected_dims'):
            self.expected_dims = {}
        self.expected_dims[feature_type] = len(combined)

        return combined

    def build_database(self, feature_type="all", normalize=True):
        """Build feature database dari semua gambar di db/"""
        print(f"Building database with feature type: {feature_type}")
        self.db_features = {}
        self.current_feature_type = feature_type

        db_path = Path(self.db_path)

        # Iterasi semua kategori
        failed_images = []
        all_features = []
        temp_data = []

        for category in db_path.iterdir():
            if category.is_dir():
                print(f"Processing category: {category.name}")

                # Iterasi semua gambar dalam kategori
                for img_path in category.glob("*.*"):
                    if img_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp']:
                        try:
                            features = self.extract_features(img_path, feature_type)

                            if features is not None:
                                rel_path = str(img_path.relative_to(db_path))
                                temp_data.append({
                                    'rel_path': rel_path,
                                    'features': features,
                                    'category': category.name,
                                    'path': str(img_path)
                                })
                                all_features.append(features)
                        except Exception as e:
                            failed_images.append((str(img_path), str(e)))
                            print(f"  ⚠ Failed to process {img_path.name}: {e}")

        # Normalize features (optional but recommended)
        if normalize and len(all_features) > 0:
            print("Normalizing features...")
            all_features = np.array(all_features)

            # Store mean and std for normalization
            self.feature_mean = np.mean(all_features, axis=0)
            self.feature_std = np.std(all_features, axis=0) + 1e-8

            # Apply normalization
            for i, data in enumerate(temp_data):
                normalized_feat = (data['features'] - self.feature_mean) / self.feature_std
                self.db_features[data['rel_path']] = {
                    'features': normalized_feat,
                    'category': data['category'],
                    'path': data['path']
                }
        else:
            # No normalization
            for data in temp_data:
                self.db_features[data['rel_path']] = {
                    'features': data['features'],
                    'category': data['category'],
                    'path': data['path']
                }

        if failed_images:
            print(f"\n⚠ Warning: {len(failed_images)} images failed to process")

        print(f"Database built with {len(self.db_features)} images")
        return len(self.db_features)

    def search(self, query_path, top_k=5, distance_metric="cosine"):
        """Cari gambar mirip"""
        # Ekstrak fitur query dengan feature type yang sama
        feature_type = getattr(self, 'current_feature_type', 'all')
        query_features = self.extract_features(query_path, feature_type)

        if query_features is None:
            return []

        # Hitung jarak ke semua gambar di database
        distances = []

        for img_rel_path, data in self.db_features.items():
            db_feat = data['features']

            # Validasi dimensi fitur
            if len(query_features) != len(db_feat):
                print(f"⚠ Warning: Feature dimension mismatch for {img_rel_path}")
                print(f"  Query: {len(query_features)}, DB: {len(db_feat)}")
                continue

            try:
                # Hitung jarak
                if distance_metric == "cosine":
                    # Cosine similarity (higher is better, so we negate for sorting)
                    sim = cosine_similarity([query_features], [db_feat])[0][0]
                    dist = 1 - sim  # Convert to distance
                else:  # euclidean
                    dist = np.linalg.norm(query_features - db_feat)

                distances.append({
                    'path': data['path'],
                    'rel_path': img_rel_path,
                    'category': data['category'],
                    'distance': dist
                })
            except Exception as e:
                print(f"⚠ Error computing distance for {img_rel_path}: {e}")
                continue

        # Sort berdasarkan jarak (ascending)
        distances.sort(key=lambda x: x['distance'])

        return distances[:top_k]

    def evaluate_precision(self, query_category, results):
        """Hitung Precision@K"""
        relevant = sum(1 for r in results if r['category'] == query_category)
        return relevant / len(results) if len(results) > 0 else 0

    def run_evaluation(self, feature_configs, top_k=5):
        """Jalankan evaluasi untuk berbagai konfigurasi fitur"""
        results = {}

        for config_name, feature_type in feature_configs.items():
            print(f"\n{'='*50}")
            print(f"Evaluating: {config_name}")
            print(f"{'='*50}")

            # Build database dengan fitur tertentu
            self.build_database(feature_type)

            # Evaluasi semua query
            query_path = Path(self.query_path)
            precisions = []

            for category in query_path.iterdir():
                if category.is_dir():
                    for query_img in category.glob("*.*"):
                        if query_img.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp']:
                            # Search
                            search_results = self.search(query_img, top_k)

                            # Hitung precision
                            precision = self.evaluate_precision(category.name, search_results)
                            precisions.append(precision)

            avg_precision = np.mean(precisions) if precisions else 0
            results[config_name] = {
                'avg_precision': avg_precision,
                'all_precisions': precisions
            }

            print(f"Average Precision@{top_k}: {avg_precision:.4f}")

        return results

    def visualize_results(self, query_path, results, save_path=None):
        """Visualisasi hasil pencarian"""
        # Load query image
        query_img = cv2.imread(str(query_path))
        query_img = cv2.cvtColor(query_img, cv2.COLOR_BGR2RGB)

        # Create figure
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        fig.suptitle(f'Query: {Path(query_path).name}', fontsize=16)

        # Display query
        axes[0, 0].imshow(query_img)
        axes[0, 0].set_title('QUERY IMAGE', fontweight='bold')
        axes[0, 0].axis('off')

        # Display top 5 results
        for idx, result in enumerate(results[:5]):
            row = (idx + 1) // 3
            col = (idx + 1) % 3

            img = cv2.imread(result['path'])
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            axes[row, col].imshow(img)
            title = f"Rank {idx+1}: {result['category']}\nDist: {result['distance']:.4f}"
            axes[row, col].set_title(title)
            axes[row, col].axis('off')

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=150, bbox_inches='tight')
            print(f"Results saved to {save_path}")

        plt.show()


# ==================== MAIN EXECUTION ====================
if __name__ == "__main__":
    # Inisialisasi sistem
    cbir = CBIRSystem(db_path="db", query_path="query")

    # Konfigurasi fitur untuk evaluasi
    feature_configs = {
        "A. Color Only": "color",
        "B. GLCM Only": "glcm",
        "C. Combined (Color+GLCM+Shape)": "all"
    }

    # Jalankan evaluasi
    print("Starting CBIR Evaluation...")
    evaluation_results = cbir.run_evaluation(feature_configs, top_k=5)

    # Tampilkan hasil evaluasi
    print("\n" + "="*60)
    print("EVALUATION SUMMARY")
    print("="*60)

    for config_name, result in evaluation_results.items():
        print(f"\n{config_name}")
        print(f"  Average Precision@5: {result['avg_precision']:.4f}")

    # Contoh pencarian untuk beberapa query
    print("\n" + "="*60)
    print("EXAMPLE SEARCH RESULTS")
    print("="*60)

    # Build database dengan fitur gabungan
    cbir.build_database("all")

    # Cari beberapa contoh query
    query_examples = [
        "query/street/2.jpg",
        # Tambahkan query lain jika ada
    ]

    for query_path in query_examples:
        if Path(query_path).exists():
            print(f"\nQuery: {query_path}")
            results = cbir.search(query_path, top_k=5)

            # Visualisasi
            save_path = f"results_{Path(query_path).stem}.png"
            cbir.visualize_results(query_path, results, save_path)

            # Print hasil
            for idx, r in enumerate(results, 1):
                print(f"  {idx}. {r['rel_path']} (category: {r['category']}, distance: {r['distance']:.4f})")

    # Simpan hasil evaluasi ke JSON
    output = {
        config: {
            'avg_precision': result['avg_precision'],
            'num_queries': len(result['all_precisions'])
        }
        for config, result in evaluation_results.items()
    }

    with open('evaluation_results.json', 'w') as f:
        json.dump(output, f, indent=2)

    print("\n✓ Evaluation complete! Results saved to evaluation_results.json")

Starting CBIR Evaluation...

Evaluating: A. Color Only
Building database with feature type: color
Processing category: buildings
Processing category: forest
Processing category: glacier
Processing category: mountain
Processing category: sea
Processing category: street
Normalizing features...
Database built with 14034 images
⚠ Error computing distance for buildings\0.jpg: Input contains NaN.
⚠ Error computing distance for buildings\10006.jpg: Input contains NaN.
⚠ Error computing distance for buildings\1001.jpg: Input contains NaN.
⚠ Error computing distance for buildings\10014.jpg: Input contains NaN.
⚠ Error computing distance for buildings\10018.jpg: Input contains NaN.
⚠ Error computing distance for buildings\10029.jpg: Input contains NaN.
⚠ Error computing distance for buildings\10032.jpg: Input contains NaN.
⚠ Error computing distance for buildings\10056.jpg: Input contains NaN.
⚠ Error computing distance for buildings\1009.jpg: Input contains NaN.
⚠ Error computing distance for b

KeyboardInterrupt: 