## Step 1: Full Handcrafted Features Pipeline (Lab* + GLCM + Stats)

In [1]:
import os

data_dir = "/kaggle/input/processed-images-224x224"
for grade in ["Grade 1", "Grade 2", "Grade 3", "Grade 4"]:
    folder = os.path.join(data_dir, grade)
    count = len([f for f in os.listdir(folder) if f.lower().endswith(('.jpg', '.png'))]) if os.path.exists(folder) else 0
    print(f"{folder}: {count} images")

/kaggle/input/processed-images-224x224/Grade 1: 1084 images
/kaggle/input/processed-images-224x224/Grade 2: 1050 images
/kaggle/input/processed-images-224x224/Grade 3: 1503 images
/kaggle/input/processed-images-224x224/Grade 4: 966 images


In [2]:
import os
import cv2
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from skimage.feature import graycomatrix, graycoprops 
from skimage import img_as_ubyte
import matplotlib.pyplot as plt
import seaborn as sns

# ------------------ FEATURE EXTRACTION ------------------

def load_and_convert_to_lab(image_path):
    """Load image and convert directly to L*a*b* (no background removal)."""
    img = cv2.imread(image_path)
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_lab = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2LAB)
    return img_lab

def extract_color_histograms(lab_img, bins=32):
    """Extract normalized histograms for a* and b* channels."""
    a_channel = lab_img[:, :, 1]
    b_channel = lab_img[:, :, 2]
    hist_a, _ = np.histogram(a_channel.ravel(), bins=bins, range=(0, 256), density=True)
    hist_b, _ = np.histogram(b_channel.ravel(), bins=bins, density=True)
    return np.concatenate([hist_a, hist_b])

def extract_statistical_features(lab_img):
    """Mean, std, skew for L*, a*, b*."""
    features = []
    for i in range(3):  # L=0, a=1, b=2
        channel = lab_img[:, :, i].astype(np.float64)
        mean = np.mean(channel)
        std = np.std(channel)
        skew = (np.mean((channel - mean) ** 3)) / (std ** 3 + 1e-6)
        features.extend([mean, std, skew])
    return np.array(features)

def extract_glcm_features(lab_img, distances=[1], angles=[0, np.pi/4, np.pi/2, 3*np.pi/4]):
    """GLCM on L* channel (luminance = texture)."""
    l_channel = lab_img[:, :, 0]
    l_ubyte = img_as_ubyte(l_channel / 255.0)  # Normalize to [0,1] then to 8-bit
    
    features = []
    for dist in distances:
        for angle in angles:
            glcm = graycomatrix(l_ubyte, distances=[dist], angles=[angle], levels=256, symmetric=True, normed=True)
            for prop in ['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation']:
                features.append(graycoprops(glcm, prop)[0, 0])
    return np.array(features)

def extract_features(image_path):
    """Main feature extractor ‚Äî no background removal."""
    try:
        lab_img = load_and_convert_to_lab(image_path)
        hist_feat = extract_color_histograms(lab_img)
        stat_feat = extract_statistical_features(lab_img)
        glcm_feat = extract_glcm_features(lab_img)
        return np.concatenate([hist_feat, stat_feat, glcm_feat])
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

# ------------------ DATA LOADING ------------------

def load_dataset(data_dir="/kaggle/input/processed-images-224x224"):
    """
    Load images from your actual folder structure:
    /kaggle/input/processed-images-224x224/Grade 1/
    /kaggle/input/processed-images-224x224/Grade 2/
    etc.
    """
    grade_folders = {
        'Grade 1': 0,
        'Grade 2': 1,
        'Grade 3': 2,
        'Grade 4': 3
    }
    
    image_paths = []
    labels = []
    
    for grade_name, label in grade_folders.items():
        folder_path = os.path.join(data_dir, grade_name)
        if not os.path.exists(folder_path):
            print(f"‚ö†Ô∏è Warning: Folder not found: {folder_path}")
            continue
        
        for f in os.listdir(folder_path):
            if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp')):
                image_paths.append(os.path.join(folder_path, f))
                labels.append(label)
    
    print(f"üìÅ Loaded {len(image_paths)} images from {data_dir}")
    return image_paths, labels

# ------------------ FEATURE EXTRACTION RUNNER ------------------

def extract_dataset_features():
    """No resolution argument ‚Äî uses fixed input path."""
    print("üîÑ Extracting handcrafted features from /kaggle/input/processed-images-224x224/...")
    image_paths, labels = load_dataset()  # Uses default path
    
    if len(image_paths) == 0:
        raise ValueError("‚ùå No images found! Check your input folder structure.")
    
    features = []
    valid_paths, valid_labels = [], []
    
    for path, label in tqdm(zip(image_paths, labels), total=len(image_paths)):
        feat = extract_features(path)
        if feat is not None:
            features.append(feat)
            valid_paths.append(path)
            valid_labels.append(label)
    
    if len(features) == 0:
        raise RuntimeError("‚ùå All feature extractions failed.")
    
    X = np.array(features)
    y = np.array(valid_labels)
    print(f"‚úÖ Extracted {X.shape[0]} samples with {X.shape[1]} features")
    return X, y, valid_paths

## Step 2: Anti-Overfitting Experimental Design

### To find the best model without overfitting, use this 3-stage validation strategy:

### Experiment 1: Model Selection with Nested Cross-Validation

Goal: Unbiased comparison of SVM vs Random Forest.

In [3]:
def model_selection_nested_cv(X, y, n_splits=5):
    """Nested CV: outer loop for evaluation, inner loop for hyperparameter tuning."""
    outer_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    models = {
        'SVM': {
            'model': SVC(random_state=42),
            'params': {
                'C': [0.1, 1, 10, 100],
                'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
            }
        },
        'RandomForest': {
            'model': RandomForestClassifier(random_state=42),
            'params': {
                'n_estimators': [100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10]
            }
        }
    }
    
    results = {}
    
    for name, cfg in models.items():
        print(f"\nüîç Evaluating {name} with Nested CV...")
        outer_scores = []
        
        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            
            # Standardize features
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)
            
            # Inner CV for hyperparameter tuning
            inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
            grid = GridSearchCV(cfg['model'], cfg['params'], cv=inner_cv, scoring='f1_weighted', n_jobs=-1)
            grid.fit(X_train_scaled, y_train)
            
            # Evaluate on outer test set
            y_pred = grid.predict(X_test_scaled)
            score = f1_score(y_test, y_pred, average='weighted')
            outer_scores.append(score)
        
        results[name] = {
            'mean_f1': np.mean(outer_scores),
            'std_f1': np.std(outer_scores),
            'scores': outer_scores
        }
        print(f"  ‚Üí Mean F1: {np.mean(outer_scores):.4f} ¬± {np.std(outer_scores):.4f}")
    
    return results

### Experiment 2: Feature Ablation Study

Goal: Understand which feature groups (color, stats, texture) matter most.

In [4]:
def feature_ablation_study(X, y, feature_lengths):
    """Test performance with subsets of features."""
    hist_len = feature_lengths['hist']
    stat_len = feature_lengths['stat']
    glcm_len = feature_lengths['glcm']
    
    feature_sets = {
        'Color Histograms': (0, hist_len),
        'Statistical Features': (hist_len, hist_len + stat_len),
        'Texture (GLCM)': (hist_len + stat_len, hist_len + stat_len + glcm_len),
        'Color + Stats': (0, hist_len + stat_len),
        'Color + Texture': (0, hist_len) + (hist_len + stat_len, hist_len + stat_len + glcm_len),  # tuple hack
        'Stats + Texture': (hist_len, hist_len + stat_len + glcm_len),
        'All Features': (0, X.shape[1])
    }
    
    best_model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
    results = {}
    
    for name, idx in feature_sets.items():
        if isinstance(idx, tuple) and len(idx) == 4:  # Handle combined non-contiguous
            X_subset = np.concatenate([X[:, idx[0]:idx[1]], X[:, idx[2]:idx[3]]], axis=1)
        else:
            X_subset = X[:, idx[0]:idx[1]]
        
        # Simple train-test split for speed
        X_train, X_test, y_train, y_test = train_test_split(X_subset, y, test_size=0.2, stratify=y, random_state=42)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        best_model.fit(X_train_scaled, y_train)
        y_pred = best_model.predict(X_test_scaled)
        f1 = f1_score(y_test, y_pred, average='weighted')
        results[name] = f1
        print(f"  {name}: F1 = {f1:.4f}")
    
    return results

### Experiment 3: Final Model Training with Hold-Out Test Set

Goal: Report final performance on a never-seen test set.

In [5]:
def train_final_model(X, y, best_model_name, best_params):
    """Train final model on full train+val, evaluate on held-out test."""
    # Split: 70% train, 15% val, 15% test
    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.176, stratify=y_temp, random_state=42)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    
    if best_model_name == 'SVM':
        model = SVC(**best_params, random_state=42)
    else:
        model = RandomForestClassifier(**best_params, random_state=42)
    
    model.fit(X_train_scaled, y_train)
    
    # Evaluate on test set
    y_pred = model.predict(X_test_scaled)
    test_f1 = f1_score(y_test, y_pred, average='weighted')
    test_acc = accuracy_score(y_test, y_pred)
    
    print("\nüéØ FINAL MODEL PERFORMANCE (Hold-Out Test Set)")
    print(f"Accuracy: {test_acc:.4f}")
    print(f"F1-Score: {test_f1:.4f}")
    print("\nüìã Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['Grade 1', 'Grade 2', 'Grade 3', 'Grade 4']))
    
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['Grade 1', 'Grade 2', 'Grade 3', 'Grade 4'],
                yticklabels=['Grade 1', 'Grade 2', 'Grade 3', 'Grade 4'])
    plt.title('Handcrafted Features - Confusion Matrix')
    plt.savefig('/kaggle/working/handcrafted_cm.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    return model, scaler

## Step 3: Full Experimental Run

In [6]:
def run_handcrafted_features_experiment():
    print("üöÄ Starting Handcrafted Features Pipeline")
    print("=" * 50)
    
    # 1. Extract features
    X, y, paths = extract_dataset_features()
    
    # Save feature lengths for ablation
    sample_feat = extract_features(paths[0])
    hist_len = 64  # 32 bins * 2 channels
    stat_len = 9   # 3 stats * 3 channels
    glcm_len = 20  # 5 props * 4 angles
    
    # 2. Avoid data leakage: standardize AFTER split
    # But for now, just record lengths
    feature_lengths = {'hist': hist_len, 'stat': stat_len, 'glcm': glcm_len}
    
    # 3. Experiment 1: Model selection
    model_results = model_selection_nested_cv(X, y)
    best_model_name = max(model_results, key=lambda k: model_results[k]['mean_f1'])
    print(f"\nüèÜ Best Model: {best_model_name}")
    
    # 4. Experiment 2: Feature ablation
    print("\nüî¨ Feature Ablation Study:")
    ablation_results = feature_ablation_study(X, y, feature_lengths)
    
    # 5. Train final model (using best config from nested CV)
    # For simplicity, we'll re-tune on full data (in practice, use best_params from nested CV)
    if best_model_name == 'SVM':
        final_model = SVC(C=10, gamma='scale', random_state=42)
    else:
        final_model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
    
    # Train and evaluate on hold-out test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    final_model.fit(X_train_scaled, y_train)
    y_pred = final_model.predict(X_test_scaled)
    final_f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f"\n‚úÖ Final Test F1: {final_f1:.4f}")
    
    # Save results
    results_df = pd.DataFrame({
        'Model': [best_model_name],
        'Test F1': [final_f1],
        'Feature_Ablation': [ablation_results]
    })
    results_df.to_csv('/kaggle/working/handcrafted_results.csv', index=False)
    
    return final_model, scaler

# Run it
if __name__ == "__main__":
    model, scaler = run_handcrafted_features_experiment()

üöÄ Starting Handcrafted Features Pipeline
üîÑ Extracting handcrafted features from /kaggle/input/processed-images-224x224/...
üìÅ Loaded 4603 images from /kaggle/input/processed-images-224x224


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4603/4603 [02:46<00:00, 27.59it/s]


‚úÖ Extracted 4603 samples with 93 features

üîç Evaluating SVM with Nested CV...
  ‚Üí Mean F1: 0.9578 ¬± 0.0047

üîç Evaluating RandomForest with Nested CV...
  ‚Üí Mean F1: 0.9211 ¬± 0.0060

üèÜ Best Model: SVM

üî¨ Feature Ablation Study:
  Color Histograms: F1 = 0.8807
  Statistical Features: F1 = 0.9159
  Texture (GLCM): F1 = 0.8213
  Color + Stats: F1 = 0.9188
  Color + Texture: F1 = 0.9123
  Stats + Texture: F1 = 0.9324
  All Features: F1 = 0.9345

‚úÖ Final Test F1: 0.9623
