Feature Extraction (Step 3)

In [None]:
import os
import gc
import pickle
import numpy as np
import logging
from pathlib import Path
from datetime import datetime
from skimage.feature import hog, local_binary_pattern
import cv2

class FeatureExtractor:
    """Step 3 of the pipeline: extract LBP, HOG and SIFT features from image pickles.

    Attributes:
        processed_dir: Root directory holding the ``<dataset>_train`` and
            ``<dataset>_test`` folders with preprocessed pickles.
        feature_dir: Root directory the feature pickles are written to.
        log_dir: Directory that receives the timestamped log file.
        logger: Logger used for all progress and error messages.
    """

    def __init__(self, processed_dir='./data/processed', feature_dir='./data/features', log_dir='./data/logs'):
        """
        Step 3: Feature Extraction Only
        Extract LBP, HOG, and SIFT features from preprocessed train and test files

        Creates ``feature_dir`` and ``log_dir`` when missing and configures logging.
        """
        self.processed_dir = processed_dir
        self.feature_dir = feature_dir
        self.log_dir = log_dir

        # The log directory must exist before the FileHandler below opens its file.
        for directory in [feature_dir, log_dir]:
            os.makedirs(directory, exist_ok=True)

        # Setup logging
        self._setup_logging()

        self.logger.info("Feature extractor initialized successfully")

    def _setup_logging(self):
        """Configure root logging (file + console) and bind ``self.logger``.

        NOTE: ``logging.basicConfig`` does nothing when the root logger already
        has handlers (e.g. a second instantiation in the same process); in that
        case messages go to the handlers that were installed first.
        """
        log_filename = f"feature_extraction_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
        log_path = os.path.join(self.log_dir, log_filename)

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_path),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def _load_data(self, pickle_path, sample_size=None):
        """Load images and labels from a pickle file and optionally subsample.

        Accepted pickle layouts: a dict with ``'images'``/``'labels'`` or
        ``'X'``/``'y'`` keys, or any 2-item sequence ``(X, y)``.

        Args:
            pickle_path: Path of the pickle file to read.
            sample_size: When truthy and smaller than the number of examples,
                that many examples are drawn without replacement using the
                global NumPy random state.

        Returns:
            Tuple ``(X, y)`` of NumPy arrays.

        Raises:
            ValueError: If the pickle is a dict without a supported key pair.
        """
        self.logger.info(f"Loading data from: {pickle_path}")

        # SECURITY: pickle.load can execute arbitrary code; only open files
        # produced by the trusted preprocessing step.
        with open(pickle_path, "rb") as f:
            data = pickle.load(f)

        if isinstance(data, dict):
            if 'images' in data and 'labels' in data:
                X, y = data['images'], data['labels']
            elif 'X' in data and 'y' in data:
                X, y = data['X'], data['y']
            else:
                raise ValueError("Unsupported pickle structure")
        else:
            X, y = data

        X, y = np.asarray(X), np.asarray(y)

        # Sample data if specified (10k for train, 2k for test).
        # The total is captured first so the log reports the pre-sampling size.
        total = len(X)
        if sample_size and total > sample_size:
            indices = np.random.choice(total, sample_size, replace=False)
            X, y = X[indices], y[indices]
            self.logger.info(f"Sampled {sample_size} examples from {total} total")

        self.logger.info(f"Loaded {len(X)} images with {len(np.unique(y))} classes")
        return X, y

    def _ensure_gray(self, img):
        """Return ``img`` as a single-channel ``uint8`` image.

        * 2-D input is treated as already grayscale.
        * A trailing axis of size 3 is converted with ``cv2.COLOR_RGB2GRAY``.
        * Any other trailing axis contributes only its first channel.

        Images whose maximum is <= 1.0 are scaled by 255; all values are then
        clipped to [0, 255] before the cast so out-of-range values saturate
        instead of wrapping around.
        """
        img = np.asarray(img)

        if img.ndim == 2:
            gray = img
        elif img.shape[-1] == 3:
            # cvtColor rejects float64 and most integer depths, so anything
            # other than uint8/uint16/float32 is converted to float32 first.
            if img.dtype.name not in ('uint8', 'uint16', 'float32'):
                img = img.astype(np.float32)
            gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        else:
            gray = img[..., 0]

        # Normalize to 0-255 if needed (also for 2-D input, which SIFT
        # requires to be 8-bit).
        if gray.max() <= 1.0:
            gray = gray * 255

        return np.clip(gray, 0, 255).astype(np.uint8)

    def extract_lbp_features(self, images, P=8, R=1):
        """Extract normalized uniform-LBP histograms.

        Args:
            images: Iterable of images accepted by ``_ensure_gray``.
            P: Number of circularly symmetric neighbour points.
            R: Radius of the neighbourhood circle.

        Returns:
            Array of ``float32`` histograms with ``P + 2`` bins per image.
        """
        self.logger.info("Starting LBP feature extraction")

        # 'uniform' LBP yields integer codes 0 .. P+1, i.e. P + 2 bins.
        bin_edges = np.arange(0, P + 3)

        features = []
        for i, img in enumerate(images):
            gray = self._ensure_gray(img)
            lbp = local_binary_pattern(gray, P, R, method='uniform')
            hist, _ = np.histogram(lbp.ravel(), bins=bin_edges, range=(0, P + 2))
            hist = hist.astype(np.float32)
            hist /= (hist.sum() + 1e-6)  # Normalize; epsilon avoids 0/0
            features.append(hist)

            if (i + 1) % 1000 == 0:
                self.logger.info(f"LBP: Processed {i + 1}/{len(images)} images")

        self.logger.info("LBP feature extraction completed")
        return np.array(features)

    def extract_hog_features(self, images):
        """Extract HOG descriptors (9 orientations, 2x2 cell blocks, L2-Hys).

        Cells are 8x8 pixels, or 4x4 when either image side is below 16 pixels.

        Returns:
            Array with one ``float32`` HOG vector per image.
        """
        self.logger.info("Starting HOG feature extraction")

        features = []
        for i, img in enumerate(images):
            gray = self._ensure_gray(img)
            # Adjust parameters based on image size
            pixels_per_cell = (8, 8)
            if gray.shape[0] < 16 or gray.shape[1] < 16:
                pixels_per_cell = (4, 4)

            hog_feat = hog(gray, orientations=9, pixels_per_cell=pixels_per_cell,
                          cells_per_block=(2, 2), block_norm='L2-Hys', feature_vector=True)
            features.append(hog_feat.astype(np.float32))

            if (i + 1) % 1000 == 0:
                self.logger.info(f"HOG: Processed {i + 1}/{len(images)} images")

        self.logger.info("HOG feature extraction completed")
        return np.array(features)

    def extract_sift_features(self, images):
        """Extract SIFT descriptors for every image.

        Returns:
            List with one ``(n_keypoints, 128)`` array per image; images
            without keypoints contribute an empty ``(0, 128)`` array.
        """
        self.logger.info("Starting SIFT feature extraction")

        sift = cv2.SIFT_create()
        all_descriptors = []

        for i, img in enumerate(images):
            gray = self._ensure_gray(img)
            _, descriptors = sift.detectAndCompute(gray, None)

            if descriptors is None:
                # Keep one entry per image; float32 matches real descriptors.
                descriptors = np.empty((0, 128), dtype=np.float32)
            all_descriptors.append(descriptors)

            if (i + 1) % 1000 == 0:
                self.logger.info(f"SIFT: Processed {i + 1}/{len(images)} images")

        self.logger.info("SIFT feature extraction completed")
        return all_descriptors

    def save_features(self, features, labels, feature_type, dataset_name, set_type, augmentation_type):
        """Pickle features, labels and metadata.

        The file is written to
        ``<feature_dir>/<dataset_name>/<feature_type>/<set_type>/<augmentation_type>.pkl``.

        Returns:
            Path of the written file.
        """
        save_dir = os.path.join(self.feature_dir, dataset_name, feature_type, set_type)
        os.makedirs(save_dir, exist_ok=True)

        filename = f"{augmentation_type}.pkl"
        filepath = os.path.join(save_dir, filename)

        # Save as dictionary for easy loading later
        feature_data = {
            'features': features,
            'labels': labels,
            'feature_type': feature_type,
            'dataset_name': dataset_name,
            'set_type': set_type,
            'augmentation_type': augmentation_type,
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }

        with open(filepath, 'wb') as f:
            pickle.dump(feature_data, f)

        self.logger.info(f"Saved {feature_type} features to: {filepath}")
        return filepath

    def extract_features_for_file(self, pickle_path, dataset_name, set_type, augmentation_type,
                                 sample_size=None, feature_types=('LBP', 'HOG', 'SIFT')):
        """Extract and save every requested feature type for one pickle file.

        Args:
            pickle_path: Preprocessed pickle to read.
            dataset_name: Dataset folder name used in the output path.
            set_type: Split folder name used in the output path.
            augmentation_type: Base name of the output pickle.
            sample_size: Optional subsample size forwarded to ``_load_data``.
            feature_types: Iterable drawn from ``'LBP'``, ``'HOG'``, ``'SIFT'``.

        Returns:
            Dict keyed by feature type. Successful entries hold ``save_path``
            plus shape information; failed ones (including unknown feature
            types) hold ``{'error': message}``.
        """
        self.logger.info(f"Processing: {dataset_name} - {set_type} - {augmentation_type}")

        # Load data (with sampling if specified)
        X, y = self._load_data(pickle_path, sample_size)

        extractors = {
            'LBP': self.extract_lbp_features,
            'HOG': self.extract_hog_features,
            'SIFT': self.extract_sift_features,
        }

        results = {}

        for feature_type in feature_types:
            self.logger.info(f"Extracting {feature_type} features...")

            features = None
            try:
                extract = extractors.get(feature_type)
                if extract is None:
                    # Explicit error instead of an unbound 'features' variable.
                    raise ValueError(f"Unknown feature type: {feature_type!r}")
                features = extract(X)

                # Save features
                save_path = self.save_features(features, y, feature_type, dataset_name,
                                               set_type, augmentation_type)

                # Store feature info
                if feature_type == 'SIFT':
                    # For SIFT, we have list of variable-length descriptors
                    shapes = [f.shape if len(f) > 0 else (0, 128) for f in features]
                    results[feature_type] = {
                        'num_descriptors': sum(len(f) for f in features),
                        'shapes': shapes,
                        'save_path': save_path
                    }
                else:
                    # For LBP and HOG, we have fixed-length features
                    results[feature_type] = {
                        'features_shape': features.shape,
                        'save_path': save_path
                    }

                self.logger.info(f"{feature_type} features extracted successfully")

            except Exception as e:
                # Best effort: one failing feature type must not abort the rest.
                self.logger.error(f"Error extracting {feature_type} features: {str(e)}")
                results[feature_type] = {'error': str(e)}
            finally:
                # Release the feature array on both the success and error path.
                features = None
                gc.collect()

        # Cleanup
        del X, y
        gc.collect()

        return results

    def run_feature_extraction(self):
        """Run feature extraction for every dataset, split and augmentation file.

        Train files are subsampled to 10,000 examples and test files to 2,000.
        Missing input files are logged as warnings and skipped.

        Returns:
            Nested dict ``{dataset: {'train'|'test': {augmentation: results}}}``,
            also pickled to ``<feature_dir>/extraction_summary.pkl``.
        """
        self.logger.info("Starting feature extraction pipeline")

        datasets = ['cifar10', 'mnist']

        # Train files - sample 10,000 from each
        train_files = [
            ('original.pkl', 'original'),
            ('mixed_augmented.pkl', 'mixed_augmented'),
            ('combined_augmented.pkl', 'combined_augmented')
        ]

        # Test files - sample 2,000 from each
        test_files = [
            ('original.pkl', 'original'),
            ('rotation_15.pkl', 'rotation_15'),
            ('noise.pkl', 'noise'),
            ('scaling_0.8.pkl', 'scaling_08'),
            ('occlusion_25.pkl', 'occlusion_25'),
            ('all_combined.pkl', 'all_combined')
        ]

        # (split name, file list, sample size) processed in this order.
        splits = (
            ('train', train_files, 10000),
            ('test', test_files, 2000),
        )

        all_results = {}

        for dataset in datasets:
            self.logger.info(f"Processing dataset: {dataset.upper()}")
            all_results[dataset] = {'train': {}, 'test': {}}

            for set_type, files, sample_size in splits:
                tag = set_type.upper()
                self.logger.info(f"Processing {tag} files...")

                for file_name, aug_name in files:
                    path = os.path.join(self.processed_dir, f"{dataset}_{set_type}", file_name)

                    if not os.path.exists(path):
                        self.logger.warning(f"{set_type.capitalize()} file not found: {path}")
                        continue

                    self.logger.info(f"Extracting features from {tag}: {file_name}")
                    results = self.extract_features_for_file(
                        path, dataset, set_type, aug_name, sample_size=sample_size
                    )
                    all_results[dataset][set_type][aug_name] = results
                    self.logger.info(f"Completed {tag}: {file_name}")

        self.logger.info("Feature extraction pipeline completed successfully")

        # Save summary
        summary_path = os.path.join(self.feature_dir, "extraction_summary.pkl")
        with open(summary_path, 'wb') as f:
            pickle.dump(all_results, f)

        self.logger.info(f"Extraction summary saved to: {summary_path}")
        return all_results

# Run the feature extraction
if __name__ == "__main__":
    # Announce what Step 3 is about to do; each argument prints on its own line.
    print(
        "Starting Step 3: Feature Extraction",
        "This will extract LBP, HOG, and SIFT features from:",
        "- TRAIN files: original, mixed_augmented, combined_augmented (10k samples each)",
        "- TEST files: original, rotation_15, noise, scaling_0.8, occlusion_25, all_combined (2k samples each)",
        "Features will be saved in: ./data/features/",
        "=" * 60,
        sep="\n",
    )

    # Run the whole pipeline with the default directories.
    extractor = FeatureExtractor()
    results = extractor.run_feature_extraction()

    # Closing summary for the user.
    print(
        "Feature extraction completed!",
        "All features saved in: ./data/features/",
        "Check the logs for detailed information",
        sep="\n",
    )

Step 4 (Clustering and BoW using KMeans)

In [None]:
import os
import gc
import pickle
import numpy as np
import logging
from datetime import datetime
from pathlib import Path
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

class ProfessionalFeatureProcessor:
    """Step 4: cluster descriptors with MiniBatchKMeans and build histogram features.

    Attributes:
        feature_dir: Root directory of the feature pickles to load.
        output_dir: Directory receiving the fitted models and transformed data.
        log_dir: Directory receiving the timestamped log file.
        config: Clustering settings (vocabulary sizes, descriptor cap,
            mini-batch size, random seed).
        logger: Root logger used for progress messages.
    """

    def __init__(self, feature_dir='./data/features', output_dir='./data/processed', log_dir='./data/logs'):
        """Create the output/log directories, configure logging and set defaults."""
        self.feature_dir = feature_dir
        self.output_dir = output_dir
        self.log_dir = log_dir
        # Directories first: the file log handler opens a file inside log_dir
        # and would fail if that directory did not exist yet.
        self._create_directories()
        self._setup_logging()
        self.config = {
            'vocab_sizes': [50, 100, 200],
            'max_descriptors': 100000,
            'batch_size': 1000,
            'random_state': 42
        }
        self.logger.info("Feature processor initialized successfully")

    def _create_directories(self):
        """Create ``output_dir`` and ``log_dir`` if they do not exist."""
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.log_dir, exist_ok=True)

    def _setup_logging(self):
        """Send root-logger output to a timestamped file in ``log_dir``.

        NOTE: ``logging.basicConfig`` is a no-op when the root logger already
        has handlers, in which case no new log file is created.
        """
        log_file = os.path.join(self.log_dir, f"processing_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
        logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger()

    def load_features(self, dataset='mnist', method='SIFT_BoW', subset='train'):
        """Load and stack every ``*.pkl`` under ``<feature_dir>/<dataset>/<method>/<subset>``.

        Each pickle must be a dict with ``'features'`` and ``'labels'`` entries.
        Files are read in sorted name order so the row order is reproducible.

        Returns:
            Tuple ``(X, y)``: features stacked with ``np.vstack`` and labels
            concatenated with ``np.hstack``.

        Raises:
            ValueError: If the directory contains no ``*.pkl`` file.
        """
        path = Path(self.feature_dir) / dataset / method / subset
        files = sorted(path.glob('*.pkl'))
        if not files:
            # np.vstack([]) would raise an opaque ValueError; say what is wrong.
            raise ValueError(f"No feature files (*.pkl) found in {path}")

        all_features, all_labels = [], []
        for file in files:
            # SECURITY: pickle.load can execute arbitrary code; only read
            # files written by the trusted feature-extraction step.
            with open(file, 'rb') as f:
                data = pickle.load(f)
            all_features.append(data['features'])
            all_labels.append(data['labels'])
        X = np.vstack(all_features)
        y = np.hstack(all_labels)
        self.logger.info(f"Loaded features from {path} with shape {X.shape}")
        return X, y

    def perform_clustering(self, X):
        """Fit one MiniBatchKMeans model per configured vocabulary size.

        Only the first ``config['max_descriptors']`` rows of ``X`` are used
        for fitting. Each model is pickled to ``<output_dir>/kmeans_<size>.pkl``.

        Returns:
            Dict mapping vocabulary size to the fitted model.
        """
        clustered_data = {}
        training_rows = X[:self.config['max_descriptors']]
        for vocab_size in self.config['vocab_sizes']:
            self.logger.info(f"Clustering with vocab size {vocab_size}")
            kmeans = MiniBatchKMeans(n_clusters=vocab_size, batch_size=self.config['batch_size'], random_state=self.config['random_state'])
            kmeans.fit(training_rows)
            clustered_data[vocab_size] = kmeans
            output_file = os.path.join(self.output_dir, f"kmeans_{vocab_size}.pkl")
            with open(output_file, 'wb') as f:
                pickle.dump(kmeans, f)
            self.logger.info(f"Saved KMeans model for vocab {vocab_size} at {output_file}")
        return clustered_data

    def transform_features(self, X, clustered_data):
        """Encode each row of ``X`` by its nearest cluster and standardize.

        For every vocabulary size a ``(len(X), vocab_size)`` matrix is built
        with a single 1 per row at the predicted cluster index, then passed
        through a freshly fitted ``StandardScaler``. The result is pickled to
        ``<output_dir>/transformed_<size>.pkl``.

        NOTE(review): the scaler is fitted on whatever ``X`` is passed in and
        is not saved; confirm that is intended when transforming test data.

        Returns:
            Dict mapping vocabulary size to the standardized matrix.
        """
        transformed_data = {}
        row_index = np.arange(X.shape[0])
        for vocab_size, kmeans in clustered_data.items():
            labels = kmeans.predict(X)
            hist = np.zeros((X.shape[0], vocab_size), dtype=np.float32)
            # One assignment per row, done in a single vectorized step.
            hist[row_index, labels] = 1
            scaler = StandardScaler()
            hist = scaler.fit_transform(hist)
            transformed_data[vocab_size] = hist
            output_file = os.path.join(self.output_dir, f"transformed_{vocab_size}.pkl")
            with open(output_file, 'wb') as f:
                pickle.dump(hist, f)
            self.logger.info(f"Saved transformed features for vocab {vocab_size} at {output_file}")
        return transformed_data

if __name__ == '__main__':
    # Script entry point for Step 4: load MNIST training features, fit one
    # KMeans model per configured vocabulary size, then transform the same
    # features with every fitted model.
    processor = ProfessionalFeatureProcessor()
    # NOTE(review): this reads from a 'SIFT_BoW' folder, whereas
    # FeatureExtractor.save_features in this file writes its SIFT output
    # under a 'SIFT' folder -- confirm which step produces 'SIFT_BoW'.
    X, y = processor.load_features(dataset='mnist', method='SIFT_BoW', subset='train')
    clustered = processor.perform_clustering(X)
    # The labels `y` are not used below; only the feature matrix is clustered.
    transformed = processor.transform_features(X, clustered)
    gc.collect()


Model Training and Classification 

In [None]:
import os
import pickle
import numpy as np
import logging
from datetime import datetime
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedShuffleSplit
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')


class SIFTBowClassification:
    def __init__(self):
        """Set up paths, input file lists, the classifier zoo and logging.

        Creates the results directory as a side effect and configures the
        root logger to write to ``sift_bow_classification.log`` and the console.
        """
        # Where features are read from and where reports are written to.
        self.base_path = "data/features/mnist/SIFT_BoW"
        self.results_path = "mnist_results/SIFT_BoW"
        os.makedirs(self.results_path, exist_ok=True)

        # Every training file is paired with every test file.
        self.train_files = ['original.pkl', 'combined_augmented.pkl', 'mixed_augmented.pkl']
        self.test_files = [
            'original.pkl', 'noise.pkl', 'occlusion_25.pkl',
            'rotation_15.pkl', 'scaling_08.pkl', 'all_combined.pkl',
        ]

        # One shared seed keeps all models reproducible.
        seed = 42
        svm = SVC(kernel='rbf', probability=True, random_state=seed)
        logreg = LogisticRegression(random_state=seed, max_iter=1000)
        forest = RandomForestClassifier(n_estimators=100, random_state=seed)
        xgboost_clf = xgb.XGBClassifier(random_state=seed, eval_metric='mlogloss')
        mlp = MLPClassifier(hidden_layer_sizes=(100, 50), random_state=seed, max_iter=1000)
        lightgbm_clf = lgb.LGBMClassifier(random_state=seed, verbose=-1)

        # Insertion order is the order in which models are trained and reported.
        self.classifiers = {
            'SVM_RBF': svm,
            'Logistic_Regression': logreg,
            'Random_Forest': forest,
            'XGBoost': xgboost_clf,
            'MLP': mlp,
            'LightGBM': lightgbm_clf,
        }

        # Flat list of per-experiment scores, used later to rank models.
        self.all_results = []

        # Root-logger configuration: log file plus console.
        log_handlers = [
            logging.FileHandler('sift_bow_classification.log'),
            logging.StreamHandler(),
        ]
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=log_handlers,
        )

        
    def debug_data_structure(self, data, file_path):
        """Debug function to understand data structure"""
        logging.info(f"DEBUG: Data type: {type(data)}")
        
        if isinstance(data, dict):
            logging.info(f"DEBUG: Dictionary keys: {list(data.keys())}")
            for key, value in data.items():
                if hasattr(value, 'shape'):
                    logging.info(f"DEBUG: Key '{key}' shape: {value.shape}")
                elif hasattr(value, '__len__'):
                    logging.info(f"DEBUG: Key '{key}' length: {len(value)}")
                else:
                    logging.info(f"DEBUG: Key '{key}' type: {type(value)}")
        
        elif isinstance(data, (list, tuple)):
            logging.info(f"DEBUG: List/tuple length: {len(data)}")
            for i, item in enumerate(data):
                if hasattr(item, 'shape'):
                    logging.info(f"DEBUG: Item {i} shape: {item.shape}")
                elif hasattr(item, '__len__'):
                    logging.info(f"DEBUG: Item {i} length: {len(item)}")
                else:
                    logging.info(f"DEBUG: Item {i} type: {type(item)}")
        
        else:
            logging.info(f"DEBUG: Unknown data structure in {file_path}")

    def extract_features_and_labels(self, data, file_path):
        """Extract features and labels from various data formats"""
        features = None
        labels = None
        
        # Try different data structures
        if isinstance(data, dict):
            # Try common keys for features
            feature_keys = ['features', 'descriptors', 'bow_features', 'histograms', 
                          'sift_bow_features', 'sift_features', 'bow_vectors']
            for key in feature_keys:
                if key in data:
                    features = data[key]
                    logging.info(f"Found features with key: '{key}'")
                    break
            
            # Try common keys for labels
            label_keys = ['labels', 'targets', 'target', 'y']
            for key in label_keys:
                if key in data:
                    labels = data[key]
                    logging.info(f"Found labels with key: '{key}'")
                    break
            
            # If still not found, try to extract first two elements
            if features is None and len(data) >= 2:
                keys = list(data.keys())
                features = data.get(keys[0])
                labels = data.get(keys[1])
                logging.info(f"Using first two keys: '{keys[0]}' and '{keys[1]}'")
                
        elif isinstance(data, (list, tuple)):
            if len(data) >= 2:
                features = data[0]
                labels = data[1]
                logging.info("Using first two elements of list/tuple")
        
        # Handle sparse matrices
        if features is not None and hasattr(features, 'toarray'):
            features = features.toarray()
            logging.info("Converted sparse matrix to dense array")
        
        # Convert to numpy arrays
        if features is not None:
            features = np.array(features)
        if labels is not None:
            labels = np.array(labels)
        
        return features, labels
    
    def load_features(self, file_path, n_samples=None):
        """Load features from pickle file with extensive error handling"""
        if not os.path.exists(file_path):
            logging.error(f"File not found: {file_path}")
            return None, None
            
        try:
            logging.info(f"Loading features from: {file_path}")
            
            with open(file_path, 'rb') as f:
                data = pickle.load(f)
            
            # Debug the data structure
            self.debug_data_structure(data, file_path)
            
            # Extract features and labels
            features, labels = self.extract_features_and_labels(data, file_path)
            
            if features is None or labels is None:
                logging.error(f"Could not extract features and labels from {file_path}")
                return None, None
            
            logging.info(f"Raw features shape: {features.shape}, labels shape: {labels.shape}")
            
            # Clean features (handle NaN/inf)
            features = np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)
            
            # Reshape if needed (for 2D+ features)
            if len(features.shape) > 2:
                original_shape = features.shape
                features = features.reshape(features.shape[0], -1)
                logging.info(f"Reshaped features from {original_shape} to {features.shape}")
            
            # Ensure we have at least 2D features
            if len(features.shape) == 1:
                features = features.reshape(-1, 1)
                logging.info(f"Reshaped 1D features to 2D: {features.shape}")
            
            # Sample if requested
            if n_samples and len(features) > n_samples:
                logging.info(f"Sampling {n_samples} from {len(features)} total samples")
                # Use stratified sampling to maintain class distribution
                splitter = StratifiedShuffleSplit(n_splits=1, train_size=n_samples, random_state=42)
                for train_idx, _ in splitter.split(features, labels):
                    features = features[train_idx]
                    labels = labels[train_idx]
                    break
            
            logging.info(f"Final features shape: {features.shape}, labels shape: {labels.shape}")
            return features, labels
            
        except Exception as e:
            logging.error(f"Error loading {file_path}: {str(e)}")
            import traceback
            logging.error(traceback.format_exc())
            return None, None
    
    def validate_data(self, features, labels, file_path):
        """Validate the loaded data"""
        if features is None or labels is None:
            return False
        
        if len(features) != len(labels):
            logging.error(f"Feature count ({len(features)}) != label count ({len(labels)}) in {file_path}")
            return False
        
        if len(features) == 0:
            logging.error(f"No features loaded from {file_path}")
            return False
        
        # Check for constant features
        if features.shape[1] > 1:
            std_dev = np.std(features, axis=0)
            constant_features = np.sum(std_dev == 0)
            if constant_features > 0:
                logging.warning(f"Found {constant_features} constant features in {file_path}")
        
        logging.info(f"Data validation passed for {file_path}")
        return True
    
    def train_and_evaluate(self, X_train, y_train, X_test, y_test, train_file, test_file):
        """Train every configured classifier and score it on the test data.

        Each experiment fits a fresh clone of the configured estimator. The
        entries in ``self.classifiers`` therefore stay unfitted, and the
        ``'model'`` stored for one experiment is not overwritten when the same
        classifier type is trained again for a later train/test combination.

        Args:
            X_train, y_train: Training features and labels.
            X_test, y_test: Test features and labels.
            train_file, test_file: File names recorded with the results.

        Returns:
            Dict keyed by classifier name. Each value holds ``accuracy``,
            ``f1_macro``, ``classification_report`` (dict), ``predictions``
            and the fitted ``model``, or is ``None`` if training or scoring
            raised. Successful runs are also appended to ``self.all_results``.
        """
        # Local import: sklearn is already a dependency of this module.
        from sklearn.base import clone

        results = {}

        for clf_name, prototype in self.classifiers.items():
            try:
                logging.info(f"Training {clf_name} on {train_file} -> {test_file}")

                # Unfitted copy with identical hyper-parameters.
                classifier = clone(prototype)

                # Train classifier
                classifier.fit(X_train, y_train)

                # Predictions
                y_pred = classifier.predict(X_test)

                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred)
                f1_macro = f1_score(y_test, y_pred, average='macro')

                # Generate classification report
                report = classification_report(y_test, y_pred, digits=4, output_dict=True)

                results[clf_name] = {
                    'accuracy': accuracy,
                    'f1_macro': f1_macro,
                    'classification_report': report,
                    'predictions': y_pred,
                    'model': classifier
                }

                # Store for top model selection
                self.all_results.append({
                    'train_file': train_file,
                    'test_file': test_file,
                    'classifier': clf_name,
                    'f1_macro': f1_macro,
                    'accuracy': accuracy,
                    'model': classifier
                })

                logging.info(f"{clf_name} - Accuracy: {accuracy:.4f}, F1 Macro: {f1_macro:.4f}")

            except Exception as e:
                # Best effort: one failing classifier must not abort the others.
                logging.error(f"Error training {clf_name}: {str(e)}")
                import traceback
                logging.error(traceback.format_exc())
                results[clf_name] = None

        return results

    
    def run_all_experiments(self):
        """Run all 18 experiments for SIFT_BoW"""
        logging.info("Starting SIFT_BoW Classification Experiments")
        logging.info("18 experiments: 3 train files × 6 test files")
        
        # Check if base directory exists
        if not os.path.exists(self.base_path):
            logging.error(f"SIFT_BoW base directory not found: {self.base_path}")
            return
        
        # Check train and test directories
        train_dir = os.path.join(self.base_path, "train")
        test_dir = os.path.join(self.base_path, "test")
        
        if not os.path.exists(train_dir):
            logging.error(f"Train directory not found: {train_dir}")
            return
        
        if not os.path.exists(test_dir):
            logging.error(f"Test directory not found: {test_dir}")
            return
        
        # Iterate through all train-test combinations
        for train_file in self.train_files:
            for test_file in self.test_files:
                combo_name = f"{train_file.replace('.pkl', '')}_{test_file.replace('.pkl', '')}"
                logging.info(f"\n{'='*50}")
                logging.info(f"Processing combination: {combo_name}")
                logging.info(f"{'='*50}")
                
                # Load training data
                train_path = os.path.join(train_dir, train_file)
                X_train, y_train = self.load_features(train_path, 1000)
                
                if not self.validate_data(X_train, y_train, train_path):
                    logging.error(f"Failed to load/validate training data: {train_path}")
                    continue
                
                # Load test data
                test_path = os.path.join(test_dir, test_file)
                X_test, y_test = self.load_features(test_path, 200)
                
                if not self.validate_data(X_test, y_test, test_path):
                    logging.error(f"Failed to load/validate test data: {test_path}")
                    continue
                
                # Train and evaluate
                results = self.train_and_evaluate(X_train, y_train, X_test, y_test, train_file, test_file)
                
                # Save individual result file
                self.save_individual_result(combo_name, results, train_file, test_file, y_test)
        
        # Save summary and select top models
        if self.all_results:
            self.save_summary_and_top_models()
            logging.info("SIFT_BoW experiments completed successfully!")
        else:
            logging.error("No experiments were completed successfully!")
    
    def save_individual_result(self, combo_name, results, train_file, test_file, y_test):
        """Write the text report for a single train/test file combination.

        The report goes to ``<self.results_path>/<combo_name>.txt`` and holds a
        header, one section per classifier, a count of classifiers that
        produced a result, and a ranking of the best three by F1 macro.

        Args:
            combo_name: Base name (without extension) of the report file.
            results: Mapping of classifier name to its result dict, or to
                ``None`` when that classifier failed. Result dicts are read
                for ``'accuracy'``, ``'f1_macro'`` and ``'predictions'``.
            train_file: Training file name echoed in the header.
            test_file: Test file name echoed in the header.
            y_test: True test labels; each classification report is rebuilt
                from these and the stored predictions.
        """
        report_name = f"{combo_name}.txt"
        report_path = os.path.join(self.results_path, report_name)
        banner = "=" * 50 + "\n\n"

        with open(report_path, 'w') as out:
            out.write("SIFT_BoW MNIST Classification Result\n")
            out.write(banner)
            out.write(f"Training File: {train_file}\n")
            out.write(f"Test File: {test_file}\n")
            out.write("Training Samples: 1000\n")
            out.write("Test Samples: 200\n")
            stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            out.write(f"Generated on: {stamp}\n\n")
            out.write(banner)

            trained_count = 0
            for name, outcome in results.items():
                out.write(f"Classifier: {name}\n")
                if outcome is None:
                    # Failed classifiers still get a section so the gap is visible.
                    out.write("Status: Failed to train\n\n")
                    out.write("-" * 40 + "\n\n")
                else:
                    trained_count += 1
                    out.write(f"Accuracy: {outcome['accuracy']:.4f}\n")
                    out.write(f"F1 Macro: {outcome['f1_macro']:.4f}\n\n")

                    # The textual report is regenerated from the predictions.
                    report_text = classification_report(y_test, outcome['predictions'], digits=4)
                    out.write("Classification Report:\n")
                    out.write(report_text)
                    out.write("\n" + "-" * 40 + "\n\n")

            out.write(f"Successful models: {trained_count}/{len(self.classifiers)}\n")

            # Ranking is best-effort: a failure here is logged, not raised.
            try:
                ranked = sorted(
                    [
                        (res.get('f1_macro', 0.0), res.get('accuracy', 0.0), name)
                        for name, res in results.items()
                        if res is not None
                    ],
                    key=lambda entry: entry[0],
                    reverse=True,
                )
                out.write("\nTOP 3 MODELS (this iteration)\n")
                out.write("-" * 50 + "\n")
                for position, (f1, acc, name) in enumerate(ranked[:3], start=1):
                    out.write(f"{position}. {name:<20}  F1: {f1:.4f}  Acc: {acc:.4f}\n")
            except Exception as e:
                logging.error(f"Top-3 write failed for {combo_name}: {str(e)}")

        logging.info(f"Saved result file: {report_name}")
    
    def save_summary_and_top_models(self):
        """Write the overall summary report and pickle the three best models.

        Reads ``self.all_results`` (a list of per-classifier result dicts with
        ``'train_file'``, ``'test_file'``, ``'classifier'``, ``'f1_macro'``,
        ``'accuracy'`` and ``'model'`` entries) and writes
        ``SIFT_BoW_Summary.txt`` under ``self.results_path``. The report holds
        the three highest F1-macro results, one F1-macro matrix per classifier
        (train files as rows, test files as columns), and a list of the model
        files saved under ``<self.results_path>/top_models``.
        """
        summary_file = os.path.join(self.results_path, "SIFT_BoW_Summary.txt")
        wide_rule = "=" * 70
        long_dash = "-" * 130 + "\n"

        def find_result(train_name, test_name, clf_name):
            """Return the first recorded result matching the triple, else None."""
            for record in self.all_results:
                if (record['train_file'] == train_name
                        and record['test_file'] == test_name
                        and record['classifier'] == clf_name):
                    return record
            return None

        with open(summary_file, 'w') as out:
            out.write("SIFT_BoW MNIST Classification - Complete Summary\n")
            out.write(wide_rule + "\n\n")
            out.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            out.write(f"Total Experiments: {len(self.all_results)}\n")
            out.write("Training samples per file: 1000\n")
            out.write("Test samples per file: 200\n")
            out.write("Primary metric: F1 Macro\n\n")

            # Best F1 macro first; the leading three are reported and saved.
            ranked = sorted(self.all_results, key=lambda rec: rec['f1_macro'], reverse=True)
            podium = ranked[:3]

            out.write("TOP 3 MODELS (by F1 Macro)\n")
            out.write(wide_rule + "\n\n")
            for rank, entry in enumerate(podium, start=1):
                out.write(f"Rank {rank}:\n")
                out.write(f"  Training File: {entry['train_file']}\n")
                out.write(f"  Test File: {entry['test_file']}\n")
                out.write(f"  Classifier: {entry['classifier']}\n")
                out.write(f"  F1 Macro: {entry['f1_macro']:.4f}\n")
                out.write(f"  Accuracy: {entry['accuracy']:.4f}\n\n")

            out.write("\nF1 MACRO SCORES MATRIX\n")
            out.write(wide_rule + "\n\n")

            # Column headings: one per test file, extension stripped.
            out.write(f"{'Train/Test':<25}")
            for test_name in self.test_files:
                out.write(f"{test_name.replace('.pkl', ''):<15}")
            out.write("\n")
            out.write(long_dash)

            # One matrix per classifier.
            for clf_name in self.classifiers:
                out.write(f"\nClassifier: {clf_name}\n")
                out.write(long_dash)

                for train_name in self.train_files:
                    out.write(f"{train_name.replace('.pkl', ''):<25}")

                    for test_name in self.test_files:
                        match = find_result(train_name, test_name, clf_name)
                        if match:
                            out.write(f"{match['f1_macro']:<15.4f}")
                        else:
                            out.write(f"{'N/A':<15}")

                    out.write("\n")

                out.write("\n")

            out.write("\n" + wide_rule + "\n")
            out.write("TOP 3 MODELS SAVED\n")
            out.write(wide_rule + "\n\n")

            top_models_dir = os.path.join(self.results_path, "top_models")
            os.makedirs(top_models_dir, exist_ok=True)

            for rank, entry in enumerate(podium, start=1):
                model_filename = f"top_model_{rank}_{entry['classifier']}.pkl"
                model_path = os.path.join(top_models_dir, model_filename)

                # Each pickle bundles the model with the scores that ranked it.
                with open(model_path, 'wb') as model_file:
                    pickle.dump({
                        'model': entry['model'],
                        'train_file': entry['train_file'],
                        'test_file': entry['test_file'],
                        'classifier': entry['classifier'],
                        'f1_macro': entry['f1_macro'],
                        'accuracy': entry['accuracy'],
                        'rank': rank
                    }, model_file)

                out.write(f"Model {rank}: {model_filename}\n")
                out.write(f"  - Training: {entry['train_file']}\n")
                out.write(f"  - Testing: {entry['test_file']}\n")
                out.write(f"  - Classifier: {entry['classifier']}\n")
                out.write(f"  - F1 Macro: {entry['f1_macro']:.4f}\n")
                out.write(f"  - Saved to: {model_path}\n\n")

        logging.info(f"Summary saved to: {summary_file}")
        logging.info("Top 3 models selected and saved!")

class MNISTClassificationExperiment:
    def __init__(self):
        self.base_path = "data/features/mnist"
        self.results_path = "mnist_results"
        os.makedirs(self.results_path, exist_ok=True)
        
        # Define training and test files
        self.train_files = [
            'original.pkl',
            'combined_augmented.pkl', 
            'mixed_augmented.pkl'
        ]
        
        self.test_files = [
            'original.pkl',
            'noise.pkl',
            'occlusion_25.pkl',
            'rotation_15.pkl', 
            'scaling_08.pkl',
            'all_combined.pkl'
        ]
        
        # Initialize classifiers
        self.classifiers = {
            'SVM_RBF': SVC(kernel='rbf', probability=True, random_state=42),
            'Logistic_Regression': LogisticRegression(random_state=42, max_iter=1000),
            'Random_Forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='mlogloss'),
            'MLP': MLPClassifier(hidden_layer_sizes=(100, 50), random_state=42, max_iter=1000),
            'LightGBM': lgb.LGBMClassifier(random_state=42, verbose=-1)
        }
        
        # Setup logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('mnist_classification.log'),
                logging.StreamHandler()
            ]
        )
        
    def load_features(self, file_path, n_samples=None):
        """Load features from pickle file with robust error handling"""
        if not os.path.exists(file_path):
            logging.warning(f"File not found: {file_path}")
            return None, None
            
        try:
            with open(file_path, 'rb') as f:
                data = pickle.load(f)
            
            # Handle different data formats
            if isinstance(data, dict):
                features = data.get('features', data.get('descriptors', None))
                labels = data.get('labels', None)
            elif isinstance(data, (list, tuple)) and len(data) >= 2:
                features, labels = data[0], data[1]
            else:
                logging.error(f"Unexpected data format in {file_path}")
                return None, None
            
            if features is None or labels is None:
                logging.error(f"Could not extract features and labels from {file_path}")
                return None, None
            
            features = np.array(features)
            labels = np.array(labels)
            
            # Clean features
            features = np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)
            
            # Sample if requested
            if n_samples and len(features) > n_samples:
                # Use stratified sampling to maintain class distribution
                splitter = StratifiedShuffleSplit(n_splits=1, train_size=n_samples, random_state=42)
                for train_idx, _ in splitter.split(features, labels):
                    features = features[train_idx]
                    labels = labels[train_idx]
                    break
            
            return features, labels
            
        except Exception as e:
            logging.error(f"Error loading {file_path}: {str(e)}")
            return None, None
    
    def train_and_evaluate(self, X_train, y_train, X_test, y_test, train_file, test_file, feature_type):
        """Train all classifiers and evaluate on test data"""
        results = {}
        
        for clf_name, classifier in self.classifiers.items():
            try:
                logging.info(f"Training {clf_name} on {train_file} -> {test_file}")
                
                # Train classifier
                classifier.fit(X_train, y_train)
                
                # Predictions
                y_pred = classifier.predict(X_test)
                
                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred)
                f1_macro = f1_score(y_test, y_pred, average='macro')
                
                # Generate classification report
                report = classification_report(y_test, y_pred, digits=4)
                
                results[clf_name] = {
                    'accuracy': accuracy,
                    'f1_macro': f1_macro,
                    'classification_report': report,
                    'predictions': y_pred
                }
                
            except Exception as e:
                logging.error(f"Error training {clf_name}: {str(e)}")
                results[clf_name] = None
        
        return results
    
    def run_experiment_for_feature_type(self, feature_type):
        """Run all 18 experiments for a specific feature type"""
        logging.info(f"Starting experiments for {feature_type}")
        
        # Create feature type directory
        feature_dir = os.path.join(self.results_path, feature_type)
        os.makedirs(feature_dir, exist_ok=True)
        
        all_results = {}
        
        # Iterate through all train-test combinations
        for train_file in self.train_files:
            for test_file in self.test_files:
                combo_name = f"{train_file.replace('.pkl', '')}_{test_file.replace('.pkl', '')}"
                logging.info(f"Processing combination: {combo_name}")
                
                # Load training data
                train_path = os.path.join(self.base_path, feature_type, "train", train_file)
                X_train, y_train = self.load_features(train_path, 1000)
                
                if X_train is None:
                    logging.error(f"Failed to load training data: {train_path}")
                    continue
                
                # Load test data
                test_path = os.path.join(self.base_path, feature_type, "test", test_file)
                X_test, y_test = self.load_features(test_path, 200)
                
                if X_test is None:
                    logging.error(f"Failed to load test data: {test_path}")
                    continue
                
                # Reshape if needed
                if len(X_train.shape) > 2:
                    X_train = X_train.reshape(X_train.shape[0], -1)
                    X_test = X_test.reshape(X_test.shape[0], -1)
                
                # Train and evaluate
                results = self.train_and_evaluate(X_train, y_train, X_test, y_test, train_file, test_file, feature_type)
                all_results[combo_name] = results
                
                # Save individual result file
                self.save_individual_result(feature_dir, combo_name, results, train_file, test_file, feature_type)
        
        # Save summary for this feature type
        self.save_feature_summary(feature_dir, feature_type, all_results)
        
        return all_results
    
    def save_individual_result(self, feature_dir, combo_name, results, train_file, test_file, feature_type):
        """Save individual experiment result to text file"""
        filename = f"{combo_name}.txt"
        filepath = os.path.join(feature_dir, filename)
        
        with open(filepath, 'w') as f:
            f.write("MNIST Classification Result\n")
            f.write("=" * 50 + "\n\n")
            f.write(f"Feature Type: {feature_type}\n")
            f.write(f"Training File: {train_file}\n")
            f.write(f"Test File: {test_file}\n")
            f.write(f"Training Samples: 1000\n")
            f.write(f"Test Samples: 200\n")
            f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            f.write("=" * 50 + "\n\n")
            
            for clf_name, result in results.items():
                if result is None:
                    f.write(f"Classifier: {clf_name}\n")
                    f.write("Status: Failed to train\n\n")
                    f.write("-" * 40 + "\n\n")
                    continue
                
                f.write(f"Classifier: {clf_name}\n")
                f.write(f"Accuracy: {result['accuracy']:.4f}\n")
                f.write(f"F1 Macro: {result['f1_macro']:.4f}\n\n")
                f.write("Classification Report:\n")
                f.write(result['classification_report'])
                f.write("\n" + "-" * 40 + "\n\n")
            
            # --- Top 3 models for this iteration (by F1 Macro) ---
            try:
                entries = []
                for clf_name, res in results.items():
                    if res is None: 
                        continue
                    entries.append((res.get('f1_macro', 0.0), res.get('accuracy', 0.0), clf_name))
                entries.sort(key=lambda x: x[0], reverse=True)
                f.write("\nTOP 3 MODELS (this iteration)\n")
                f.write("-" * 50 + "\n")
                for rank, (f1, acc, clf) in enumerate(entries[:3], start=1):
                    f.write(f"{rank}. {clf:<20}  F1: {f1:.4f}  Acc: {acc:.4f}\n")
            except Exception as e:
                logging.error(f"Top-3 write failed for {combo_name}: {str(e)}")
    
    def save_feature_summary(self, feature_dir, feature_type, all_results):
        """Save summary of all experiments for a feature type"""
        summary_file = os.path.join(feature_dir, f"summary_{feature_type}.txt")
        
        with open(summary_file, 'w') as f:
            f.write(f"MNIST {feature_type} Classification - Complete Summary\n")
            f.write("=" * 70 + "\n\n")
            f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Total Experiments: {len(all_results)} (3 train files × 6 test files)\n")
            f.write(f"Training samples per file: 1000\n")
            f.write(f"Test samples per file: 200\n")
            f.write(f"Primary metric: F1 Macro\n\n")
            
            # Create a matrix of F1 scores
            f.write("F1 Macro Scores Matrix\n")
            f.write("=" * 70 + "\n\n")
            
            # Header row
            f.write(f"{'Train/Test':<25}")
            for test_file in self.test_files:
                f.write(f"{test_file.replace('.pkl', ''):<15}")
            f.write("\n")
            f.write("-" * 130 + "\n")
            
            # For each classifier, create a matrix
            for clf_name in self.classifiers.keys():
                f.write(f"\nClassifier: {clf_name}\n")
                f.write("-" * 130 + "\n")
                
                for train_file in self.train_files:
                    f.write(f"{train_file.replace('.pkl', ''):<25}")
                    
                    for test_file in self.test_files:
                        combo_name = f"{train_file.replace('.pkl', '')}_{test_file.replace('.pkl', '')}"
                        
                        if combo_name in all_results and all_results[combo_name][clf_name] is not None:
                            f1_score = all_results[combo_name][clf_name]['f1_macro']
                            f.write(f"{f1_score:<15.4f}")
                        else:
                            f.write(f"{'FAILED':<15}")
                    
                    f.write("\n")
                
                f.write("\n")
            
            # Best performing combinations
            f.write("\n" + "=" * 70 + "\n")
            f.write("TOP PERFORMING COMBINATIONS (by F1 Macro)\n")
            f.write("=" * 70 + "\n\n")
            
            all_scores = []
            for combo_name, combo_results in all_results.items():
                for clf_name, result in combo_results.items():
                    if result is not None:
                        all_scores.append({
                            'combination': combo_name,
                            'classifier': clf_name,
                            'f1_macro': result['f1_macro'],
                            'accuracy': result['accuracy']
                        })
            
            # Sort by F1 macro descending
            all_scores.sort(key=lambda x: x['f1_macro'], reverse=True)
            
            f.write(f"{'Rank':<5} {'Combination':<30} {'Classifier':<20} {'F1 Macro':<12} {'Accuracy':<12}\n")
            f.write("-" * 80 + "\n")
            
            for i, score in enumerate(all_scores[:20]):  # Top 20
                f.write(f"{i+1:<5} {score['combination']:<30} {score['classifier']:<20} "
                       f"{score['f1_macro']:<12.4f} {score['accuracy']:<12.4f}\n")
    
    def run_complete_experiment(self):
        """Run complete MNIST classification experiment for all feature types"""
        logging.info("Starting Complete MNIST Classification Experiment")
        logging.info("This will generate 18 classification reports for each feature type")
        logging.info("Total experiments: 3 feature types × 3 train files × 6 test files = 54 combinations")
        
        feature_types = ['HOG', 'LBP', 'SIFT_BoW']
        
        for feature_type in feature_types:
            logging.info(f"\n{'='*60}")
            logging.info(f"Processing Feature Type: {feature_type}")
            logging.info(f"{'='*60}")

            # If SIFT_BoW, use the specialized SIFTBowClassification integrated above
            if feature_type == 'SIFT_BoW':
                logging.info("Detected SIFT_BoW - delegating to integrated SIFTBowClassification module")
                sift_runner = SIFTBowClassification()
                sift_runner.run_all_experiments()
                continue

            # Check if feature directory exists
            feature_path = os.path.join(self.base_path, feature_type)
            if not os.path.exists(feature_path):
                logging.error(f"Feature directory not found: {feature_path}")
                continue

            # Run experiments using existing flow (HOG, LBP)
            self.run_experiment_for_feature_type(feature_type)

        logging.info("EXPERIMENT COMPLETED SUCCESSFULLY!")
        logging.info(f"Results saved to: {self.results_path}")
        logging.info("Each feature type folder contains 18 individual result files and 1 summary file")
        logging.info(f"{'='*60}")

def main():
    """Entry point: build the MNIST experiment and run all feature types."""
    MNISTClassificationExperiment().run_complete_experiment()


# Run the experiment only when executed as a script, not on import.
if __name__ == "__main__":
    main()