# Advanced Unsupervised Learning

This notebook demonstrates the comprehensive unsupervised learning capabilities of the sklearn-mastery project, including sophisticated clustering algorithms, dimensionality reduction techniques, anomaly detection, association rule mining, manifold learning, and unsupervised feature learning methods.

## Table of Contents
1. [Setup and Imports](#setup)
2. [Results Saving Infrastructure](#results-setup)
3. [Advanced Clustering Algorithms](#clustering)
4. [Dimensionality Reduction Techniques](#dimensionality)
5. [Anomaly Detection Methods](#anomaly)
6. [Association Rule Mining](#association)
7. [Manifold Learning](#manifold)
8. [Cluster Validation and Evaluation](#validation)
9. [Unsupervised Feature Learning](#feature-learning)
10. [Save All Results](#save-results)

## 1. Setup and Imports {#setup}

In [None]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import adjusted_rand_score, silhouette_score, calinski_harabasz_score
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering
from sklearn.decomposition import PCA, FactorAnalysis, FastICA, NMF, DictionaryLearning
from sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.inspection import permutation_importance
import warnings
warnings.filterwarnings('ignore')

# Enhanced imports for results saving
import os
from pathlib import Path
import joblib
import datetime
import pickle
import json
import time
import sklearn

In [None]:
# Project imports
import sys
sys.path.append('../src')

from data.generators import SyntheticDataGenerator
from models.unsupervised.clustering import *
from models.unsupervised.dimensionality_reduction import *
from evaluation.metrics import ModelEvaluator
from evaluation.visualization import ModelVisualizationSuite

# Configure plotting
# The seaborn style sheets were renamed to 'seaborn-v0_8*' in matplotlib 3.6;
# older releases only know the plain 'seaborn' name. Use whichever exists so
# the notebook does not fail on either side of that rename.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
plt.rcParams['figure.figsize'] = (12, 8)
sns.set_palette('husl')

print("✅ All imports successful!")

## 2. Results Saving Infrastructure {#results-setup}

In [None]:
# Setup results directories and saving functions
def setup_results_directories():
    """Ensure the ``results`` directory tree exists and return its root.

    The root is ``<base>/results``. ``<base>`` is the parent of the directory
    containing this file when ``__file__`` is defined; otherwise it is the
    parent of the current working directory.
    """
    if '__file__' in globals():
        project_root = Path(__file__).parent.parent
    else:
        project_root = Path.cwd().parent
    results_root = project_root / 'results'

    # One sub-folder per artefact category; already-existing folders are fine
    for subfolder in ('figures', 'models', 'reports', 'experiments',
                      'clustering', 'features', 'manifold', 'anomaly'):
        results_root.joinpath(subfolder).mkdir(parents=True, exist_ok=True)

    return results_root

def get_timestamp():
    """Return the current local time as ``YYYYMMDD_HHMMSS`` for use in file names."""
    now = datetime.datetime.now()
    return f"{now:%Y%m%d_%H%M%S}"

# Build the results tree once and expose every sub-directory as a module-level path
results_dir = setup_results_directories()
(figures_dir, models_dir, clustering_dir, features_dir,
 manifold_dir, anomaly_dir, experiments_dir, reports_dir) = (
    results_dir / subdir
    for subdir in ('figures', 'models', 'clustering', 'features',
                   'manifold', 'anomaly', 'experiments', 'reports')
)

print(f"Unsupervised learning results will be saved to: {results_dir}")

# Enhanced figure saving for unsupervised learning
def save_unsupervised_figure(fig, name, description="", category="general", dpi=300):
    """Write *fig* as a PNG under ``figures_dir`` together with a JSON metadata sidecar.

    Args:
        fig: Object exposing ``savefig`` (a matplotlib figure in this notebook).
        name: Short identifier embedded in the file name.
        description: Free-text description stored in the metadata file.
        category: Label embedded in the file name and in the metadata category.
        dpi: Resolution forwarded to ``fig.savefig``.

    Returns:
        Path of the saved PNG file.
    """
    stamp = get_timestamp()
    stem = f"{stamp}_unsupervised_{category}_{name}"
    png_name = f"{stem}.png"
    png_path = figures_dir / png_name

    fig.savefig(png_path, dpi=dpi, bbox_inches='tight', facecolor='white')
    print(f"📊 Unsupervised learning figure saved: {png_path}")

    # Sidecar file describing the figure, stored next to the PNG
    sidecar_path = figures_dir / f"{stem}_metadata.json"
    sidecar = dict(
        filename=png_name,
        description=description,
        timestamp=stamp,
        notebook='04_unsupervised_learning',
        category=f'unsupervised_{category}',
    )
    with open(sidecar_path, 'w') as handle:
        json.dump(sidecar, handle, indent=2)

    return png_path

# Enhanced clustering model saving
def save_clustering_model(model, name, description="", metadata=None):
    """Persist a clustering model with joblib and write a JSON metadata sidecar.

    Args:
        model: Estimator to serialise into ``clustering_dir``.
        name: Short identifier embedded in the file names.
        description: Free-text description stored in the metadata.
        metadata: Optional extra entries merged over the default metadata.

    Returns:
        Path of the saved ``.joblib`` file.
    """
    stamp = get_timestamp()
    stem = f"{stamp}_clustering_{name}"
    model_file = f"{stem}.joblib"
    model_path = clustering_dir / model_file

    joblib.dump(model, model_path)
    print(f"🔄 Clustering model saved: {model_path}")

    # Record the library version only when sklearn is importable by name here
    if 'sklearn' in globals():
        sklearn_version = sklearn.__version__
    else:
        sklearn_version = 'unknown'

    info = dict(
        filename=model_file,
        description=description,
        timestamp=stamp,
        notebook='04_unsupervised_learning',
        model_type='clustering',
        algorithm=type(model).__name__,
        sklearn_version=sklearn_version,
    )
    if metadata:
        info.update(metadata)

    # default=str keeps non-JSON values (e.g. numpy scalars) from aborting the dump
    with open(clustering_dir / f"{stem}_metadata.json", 'w') as handle:
        json.dump(info, handle, indent=2, default=str)

    return model_path

# Enhanced dimensionality reduction model saving
def save_reduction_model(model, name, description="", metadata=None):
    """Persist a dimensionality-reduction model with joblib plus a JSON metadata sidecar.

    Args:
        model: Estimator to serialise into ``models_dir``.
        name: Short identifier embedded in the file names.
        description: Free-text description stored in the metadata.
        metadata: Optional extra entries merged over the default metadata.

    Returns:
        Path of the saved ``.joblib`` file.
    """
    stamp = get_timestamp()
    base_name = f"{stamp}_reduction_{name}"
    artifact_name = base_name + ".joblib"
    artifact_path = models_dir / artifact_name

    joblib.dump(model, artifact_path)
    print(f"📉 Dimensionality reduction model saved: {artifact_path}")

    details = {
        'filename': artifact_name,
        'description': description,
        'timestamp': stamp,
        'notebook': '04_unsupervised_learning',
        'model_type': 'dimensionality_reduction',
        'algorithm': type(model).__name__,
    }
    if metadata:
        # Caller-supplied entries win over the defaults above
        details.update(metadata)

    sidecar_path = models_dir / (base_name + "_metadata.json")
    with open(sidecar_path, 'w') as out:
        json.dump(details, out, indent=2, default=str)

    return artifact_path

# Enhanced feature learning model saving
def save_feature_learning_model(model, name, description="", metadata=None):
    """Persist a feature-learning model with joblib plus a JSON metadata sidecar.

    Args:
        model: Estimator to serialise into ``features_dir``.
        name: Short identifier embedded in the file names.
        description: Free-text description stored in the metadata.
        metadata: Optional extra entries merged over the default metadata.

    Returns:
        Path of the saved ``.joblib`` file.
    """
    stamp = get_timestamp()
    prefix = f"{stamp}_features_{name}"
    dump_name = f"{prefix}.joblib"
    dump_path = features_dir / dump_name

    joblib.dump(model, dump_path)
    print(f"🎓 Feature learning model saved: {dump_path}")

    record = dict(
        filename=dump_name,
        description=description,
        timestamp=stamp,
        notebook='04_unsupervised_learning',
        model_type='feature_learning',
        algorithm=type(model).__name__,
    )
    if metadata:
        record.update(metadata)

    with open(features_dir / f"{prefix}_metadata.json", 'w') as sink:
        json.dump(record, sink, indent=2, default=str)

    return dump_path

# Enhanced save report function
def save_report(content, name, description="", format='txt'):
    """Save a report under ``reports_dir`` and return its path.

    Args:
        content: Report payload. With ``format='json'`` it is serialised via
            ``json.dump`` (values JSON cannot encode are stringified); with any
            other format it is written as text.
        name: Short identifier embedded in the file name.
        description: Currently unused; kept so the signature matches the other
            ``save_*`` helpers.
        format: File extension. ``'json'`` produces JSON; every other value
            (``'txt'``, ``'md'``, ...) produces a plain-text file.

    Returns:
        Path of the written report file.
    """
    timestamp = get_timestamp()
    filename = f"{timestamp}_unsupervised_{name}.{format}"
    filepath = reports_dir / filename

    # Explicit UTF-8: report text may contain non-ASCII characters (this notebook
    # prints emoji), which the platform default encoding cannot always encode.
    with open(filepath, 'w', encoding='utf-8') as f:
        if format == 'json':
            json.dump(content, f, indent=2, default=str)
        else:
            # Formats other than 'txt'/'json' used to write nothing while still
            # reporting success; treat them as plain text so a file always exists.
            f.write(content if isinstance(content, str) else str(content))

    print(f"📄 Unsupervised learning report saved: {filepath}")
    return filepath

def save_experiment_results(experiment_name, results, description="", technique_type="general"):
    """Dump experiment results, wrapped in a small descriptive envelope, as JSON.

    Args:
        experiment_name: Identifier embedded in the file name and the payload.
        results: Result object stored under the ``'results'`` key.
        description: Free-text description stored in the payload.
        technique_type: Label embedded in the file name and the payload.

    Returns:
        Path of the JSON file written under ``experiments_dir``.
    """
    stamp = get_timestamp()
    out_path = experiments_dir / f"{stamp}_{technique_type}_{experiment_name}.json"

    payload = dict(
        experiment_name=experiment_name,
        description=description,
        technique_type=technique_type,
        timestamp=stamp,
        notebook='04_unsupervised_learning',
        results=results,
    )

    # default=str stringifies anything json cannot encode natively
    with open(out_path, 'w') as sink:
        json.dump(payload, sink, indent=2, default=str)

    print(f"💾 Saved experiment results: {out_path}")
    return out_path


print("✅ Results saving infrastructure ready!")

## 3. Advanced Clustering Algorithms {#clustering}

Let's explore sophisticated clustering algorithms that can handle complex data patterns.

In [None]:
# Build four synthetic datasets and plot them side by side
print("🎯 Generating Clustering Datasets...")

generator = SyntheticDataGenerator(random_state=42)

# Dataset 1: blob-style clusters in two features
X_blobs, y_blobs = generator.clustering_dataset(
    n_samples=800, n_features=2, n_clusters=4, cluster_std=1.5
)
print(f"Blob clustering dataset: {X_blobs.shape}, {len(np.unique(y_blobs))} clusters")

# Dataset 2: "moons" shapes with a little noise
X_moons, y_moons = generator.complex_shapes(n_samples=600, shape_type='moons', noise=0.1)
print(f"Moons dataset: {X_moons.shape}")

# Dataset 3: clusters in 50 features
X_highdim, y_highdim = generator.clustering_dataset(
    n_samples=1000, n_features=50, n_clusters=5, cluster_std=2.0
)
print(f"High-dimensional clustering dataset: {X_highdim.shape}, {len(np.unique(y_highdim))} clusters")

# Dataset 4: clusters of varied density
X_varied, y_varied = generator.varied_density_clusters(n_samples=800, n_features=2)
print(f"Varied density clustering dataset: {X_varied.shape}")

# One panel per dataset; only the first two feature columns are drawn
# (for the 50-feature data that means the first two features, not a fitted projection)
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
overview_panels = [
    (axes[0, 0], X_blobs, y_blobs, 'Standard Blob Clusters'),
    (axes[0, 1], X_moons, y_moons, 'Two Moons'),
    (axes[1, 0], X_highdim, y_highdim, 'High-Dimensional Clusters (2D projection)'),
    (axes[1, 1], X_varied, y_varied, 'Varied Density Clusters'),
]

scatter_handles = []
for panel_ax, panel_X, panel_y, panel_title in overview_panels:
    scatter_handles.append(
        panel_ax.scatter(panel_X[:, 0], panel_X[:, 1], c=panel_y, cmap='viridis', alpha=0.7)
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Feature 1')
    panel_ax.set_ylabel('Feature 2')
    panel_ax.grid(True, alpha=0.3)
scatter1, scatter2, scatter3, scatter4 = scatter_handles

plt.tight_layout()
save_unsupervised_figure(fig, "clustering_datasets_overview",
                         "Overview of different clustering datasets generated", "clustering")
plt.show()

print("\n✨ Clustering datasets generated and visualized!")

### 3.1 Advanced Clustering Models

In [None]:
# Test advanced clustering algorithms
print("🚀 Testing Advanced Clustering Algorithms...")

# Each entry: (custom name, custom factory, fallback name, fallback factory).
# Factories are lambdas so that a custom class that was not imported only
# raises (NameError) when we try to build it, which lets us fall back to the
# plain sklearn estimator instead of aborting the cell.
clustering_candidates = [
    ('Adaptive KMeans', lambda: AdaptiveKMeans(random_state=42),
     'K-Means', lambda: KMeans(n_clusters=4, random_state=42)),
    ('Hierarchical Enhanced', lambda: HierarchicalEnhanced(),
     'Agglomerative', lambda: AgglomerativeClustering(n_clusters=4)),
    ('DBSCAN Enhanced', lambda: DBSCANEnhanced(),
     'DBSCAN', lambda: DBSCAN(eps=0.5, min_samples=5)),
    ('Gaussian Mixture Enhanced', lambda: GaussianMixtureEnhanced(random_state=42),
     'Gaussian Mixture', lambda: GaussianMixture(n_components=4, random_state=42)),
    ('Spectral Enhanced', lambda: SpectralEnhanced(random_state=42),
     'Spectral Clustering', lambda: SpectralClustering(n_clusters=4, random_state=42)),
]

clustering_algorithms = {}
for custom_name, make_custom, fallback_name, make_fallback in clustering_candidates:
    try:
        clustering_algorithms[custom_name] = make_custom()
    except Exception:
        # `except Exception` rather than a bare `except` so Ctrl-C / SystemExit
        # still interrupt the notebook.
        clustering_algorithms[fallback_name] = make_fallback()

# Test datasets for different algorithms
test_datasets = {
    'Blob Clusters': (X_blobs, y_blobs),
    'Complex Shapes': (X_moons, y_moons),
    'Varied Density': (X_varied, y_varied)
}

# clustering_results[dataset][algorithm] -> metrics dict, or {'error': message}
clustering_results = {}

for dataset_name, (X, y_true) in test_datasets.items():
    print(f"\n--- Testing on {dataset_name} ---")
    clustering_results[dataset_name] = {}

    for algo_name, algorithm in clustering_algorithms.items():
        try:
            # Prefer fit_predict; otherwise fit and read predictions or labels_
            if hasattr(algorithm, 'fit_predict'):
                y_pred = algorithm.fit_predict(X)
            else:
                algorithm.fit(X)
                y_pred = algorithm.predict(X) if hasattr(algorithm, 'predict') else algorithm.labels_
            y_pred = np.asarray(y_pred)

            # Label -1 marks noise; it does not count as a cluster
            clustered_labels = y_pred[y_pred >= 0]
            n_clusters = len(np.unique(clustered_labels))
            # Plain int so the value stays numeric if these results are dumped to JSON
            n_noise = int(np.sum(y_pred == -1))

            # Internal indices need at least two clusters.
            # NOTE: the scores are computed on all points, so noise (-1) is
            # treated as one more label by these metrics.
            if n_clusters > 1 and len(clustered_labels) > 1:
                sil_score = silhouette_score(X, y_pred)
                ch_score = calinski_harabasz_score(X, y_pred)
            else:
                # -1 is the "not available" sentinel that later cells filter on
                sil_score = -1
                ch_score = -1

            # Agreement with the generator's ground-truth labels
            ari_score = adjusted_rand_score(y_true, y_pred)

            clustering_results[dataset_name][algo_name] = {
                'y_pred': y_pred,
                'n_clusters': n_clusters,
                'silhouette_score': sil_score,
                'calinski_harabasz_score': ch_score,
                'adjusted_rand_index': ari_score,
                'noise_points': n_noise
            }

            print(f"  {algo_name}:")
            print(f"    Clusters found: {n_clusters}")
            print(f"    Silhouette Score: {sil_score:.3f}" if sil_score > -1 else "    Silhouette Score: N/A")
            print(f"    Adjusted Rand Index: {ari_score:.3f}")
            print(f"    Noise points: {n_noise}")

        except Exception as e:
            # Record the failure and continue with the remaining algorithms
            print(f"  {algo_name}: ❌ Failed - {str(e)}")
            clustering_results[dataset_name][algo_name] = {'error': str(e)}

print("\n✨ Advanced clustering algorithms tested!")

### 3.2 Clustering Visualization

In [None]:
# Plot every successful clustering, one figure per 2-feature dataset
print("📊 Visualizing Clustering Results...")

for dataset_name, (X, y_true) in test_datasets.items():
    # The scatter plots below need exactly two feature columns
    if X.shape[1] != 2:
        continue
    print(f"\nVisualizing results for {dataset_name}...")

    dataset_results = clustering_results[dataset_name]
    successful_algos = [algo for algo, result in dataset_results.items()
                        if 'error' not in result]
    if not successful_algos:
        continue

    # Grid of at most three columns; squeeze=False always yields a 2-D axes array
    n_algos = len(successful_algos)
    n_cols = min(3, n_algos)
    n_rows = (n_algos + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.ravel()

    for i, algo_name in enumerate(successful_algos):
        panel = axes[i]
        result = dataset_results[algo_name]
        y_pred = result['y_pred']

        scatter = panel.scatter(X[:, 0], X[:, 1], c=y_pred, cmap='viridis', alpha=0.7)

        # Overlay points labelled -1 (noise) with red crosses
        noise_mask = y_pred == -1
        if np.any(noise_mask):
            panel.scatter(X[noise_mask, 0], X[noise_mask, 1],
                          c='red', marker='x', s=50, label='Noise')
            panel.legend()

        plt.colorbar(scatter, ax=panel)

        # Panel title carries the headline metrics; silhouette only when available
        sil_score = result.get('silhouette_score', -1)
        ari_score = result.get('adjusted_rand_index', -1)
        n_clusters = result.get('n_clusters', 0)
        title = f"{algo_name}\nClusters: {n_clusters}, ARI: {ari_score:.3f}"
        if sil_score > -1:
            title += f", Sil: {sil_score:.3f}"

        panel.set_title(title)
        panel.set_xlabel('Feature 1')
        panel.set_ylabel('Feature 2')
        panel.grid(True, alpha=0.3)

    # Blank out grid cells that have no algorithm assigned
    for spare_panel in axes[n_algos:]:
        spare_panel.set_visible(False)

    plt.suptitle(f'Clustering Results - {dataset_name}', fontsize=16)
    plt.tight_layout()
    dataset_slug = dataset_name.lower().replace(' ', '_')
    save_unsupervised_figure(fig, f"clustering_results_{dataset_slug}",
                             f"Clustering algorithm comparison on {dataset_name} dataset", "clustering")
    plt.show()

print("\n✨ Clustering visualization complete!")

### 3.3 Clustering Performance Analysis

In [None]:
# Tabulate and chart the clustering metrics gathered above
print("📈 Clustering Performance Analysis...")

# One row per (dataset, algorithm) pair that ran without error
performance_data = [
    {
        'Dataset': dataset_name,
        'Algorithm': algo_name,
        'Clusters_Found': result.get('n_clusters', 0),
        'Silhouette_Score': result.get('silhouette_score', -1),
        'Calinski_Harabasz_Score': result.get('calinski_harabasz_score', -1),
        'Adjusted_Rand_Index': result.get('adjusted_rand_index', -1),
        'Noise_Points': result.get('noise_points', 0)
    }
    for dataset_name, per_algo in clustering_results.items()
    for algo_name, result in per_algo.items()
    if 'error' not in result
]

if performance_data:
    performance_df = pd.DataFrame(performance_data)

    print("\n📊 Performance Summary:")
    print(performance_df.round(3).to_string(index=False))

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # Rows still holding the -1 "not available" sentinel are left out of the score panels
    valid_sil = performance_df[performance_df['Silhouette_Score'] > -1]
    valid_ari = performance_df[performance_df['Adjusted_Rand_Index'] > -1]

    # (data, metric column, panel title, target axes)
    metric_panels = [
        (valid_sil, 'Silhouette_Score', 'Silhouette Score by Algorithm and Dataset', axes[0, 0]),
        (valid_ari, 'Adjusted_Rand_Index', 'Adjusted Rand Index by Algorithm and Dataset', axes[0, 1]),
        (performance_df, 'Clusters_Found', 'Number of Clusters Found', axes[1, 0]),
        (performance_df, 'Noise_Points', 'Noise Points Detected', axes[1, 1]),
    ]
    for panel_df, metric_col, panel_title, panel_ax in metric_panels:
        # A panel with nothing to show is left empty
        if panel_df.empty:
            continue
        sns.barplot(data=panel_df, x='Algorithm', y=metric_col,
                    hue='Dataset', ax=panel_ax)
        panel_ax.set_title(panel_title)
        panel_ax.tick_params(axis='x', rotation=45)
        panel_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    save_unsupervised_figure(fig, "clustering_performance_analysis",
                             "Comprehensive clustering performance metrics across algorithms and datasets", "clustering")
    plt.show()

    # Winner per dataset = row with the highest Adjusted Rand Index
    print("\n🏆 Best Algorithm per Dataset (by ARI):")
    for dataset in performance_df['Dataset'].unique():
        dataset_data = performance_df[performance_df['Dataset'] == dataset]
        if dataset_data.empty:
            continue
        best_row_label = dataset_data['Adjusted_Rand_Index'].idxmax()
        best_algo = dataset_data.loc[best_row_label]
        print(f"  {dataset}: {best_algo['Algorithm']} (ARI: {best_algo['Adjusted_Rand_Index']:.3f})")

print("\n✨ Clustering performance analysis complete!")

## 4. Dimensionality Reduction Techniques {#dimensionality}

Let's explore advanced dimensionality reduction methods for visualization and feature extraction.

In [None]:
# Test dimensionality reduction techniques
print("🔍 Testing Dimensionality Reduction Techniques...")

# Standardise the high-dimensional dataset before reducing it
X_highdim_scaled = StandardScaler().fit_transform(X_highdim)


def _make_umap():
    """Build a UMAP reducer; the import is lazy because the package is optional."""
    import umap
    return umap.UMAP(n_components=2, random_state=42)


# Each entry: (custom name, custom factory, fallback name, fallback factory).
# Factories are called lazily so a custom class that was not imported raises
# only when built, at which point the fallback is tried instead.
reduction_candidates = [
    ('Enhanced PCA', lambda: EnhancedPCA(n_components=2, random_state=42),
     'PCA', lambda: PCA(n_components=2, random_state=42)),
    ('Adaptive t-SNE', lambda: AdaptiveTSNE(n_components=2, random_state=42),
     't-SNE', lambda: TSNE(n_components=2, random_state=42, perplexity=30)),
    ('UMAP Enhanced', lambda: UMAPEnhanced(n_components=2, random_state=42),
     'UMAP', _make_umap),
    ('Autoencoder Reduction', lambda: AutoencoderReduction(encoding_dim=2, random_state=42),
     'Factor Analysis', lambda: FactorAnalysis(n_components=2, random_state=42)),
    ('Manifold Learning', lambda: ManifoldLearning(method='isomap', n_components=2),
     'Isomap', lambda: Isomap(n_components=2)),
]

reduction_algorithms = {}
for custom_name, make_custom, fallback_name, make_fallback in reduction_candidates:
    try:
        reduction_algorithms[custom_name] = make_custom()
    except Exception:
        # `except Exception` (not bare `except`) so Ctrl-C still interrupts
        try:
            reduction_algorithms[fallback_name] = make_fallback()
        except Exception:
            # In practice only the optional UMAP fallback can fail to build
            print(f"{fallback_name} not available, skipping...")

# reduction_results[name] -> embedding (or None on failure); reduction_times[name] -> seconds
reduction_results = {}
reduction_times = {}

for algo_name, algorithm in reduction_algorithms.items():
    print(f"\n--- Testing {algo_name} ---")

    try:
        # perf_counter is the monotonic clock intended for measuring durations
        start_time = time.perf_counter()
        X_reduced = algorithm.fit_transform(X_highdim_scaled)
        reduction_time = time.perf_counter() - start_time

        reduction_results[algo_name] = X_reduced
        reduction_times[algo_name] = reduction_time

        print(f"  Reduction time: {reduction_time:.3f}s")
        print(f"  Output shape: {X_reduced.shape}")
        # np.var here is the variance of the embedding values themselves, not a
        # share of the input variance, so it is labelled accordingly.
        print(f"  Embedding variance: {np.var(X_reduced):.3f}")

        # Reconstruction error is only defined for invertible transforms
        if hasattr(algorithm, 'inverse_transform'):
            try:
                X_reconstructed = algorithm.inverse_transform(X_reduced)
                reconstruction_error = np.mean((X_highdim_scaled - X_reconstructed) ** 2)
                print(f"  Reconstruction error: {reconstruction_error:.3f}")
            except Exception:
                print(f"  Reconstruction error: Not available")

        # Reported only by estimators exposing explained_variance_ratio_
        if hasattr(algorithm, 'explained_variance_ratio_'):
            explained_var = np.sum(algorithm.explained_variance_ratio_)
            print(f"  Explained variance ratio: {explained_var:.3f}")

    except Exception as e:
        # Mark the method as failed; the visualisation cell filters out None
        print(f"  ❌ Failed: {str(e)}")
        reduction_results[algo_name] = None

print("\n✨ Dimensionality reduction techniques tested!")

### 4.1 Dimensionality Reduction Visualization

In [None]:
# Visualize dimensionality reduction results
print("📊 Visualizing Dimensionality Reduction Results...")

# Keep only the methods that produced an embedding (failures are stored as None)
successful_reductions = {k: v for k, v in reduction_results.items() if v is not None}

if successful_reductions:
    n_methods = len(successful_reductions)
    n_cols = min(3, n_methods)
    n_rows = (n_methods + n_cols - 1) // n_cols

    # squeeze=False always returns a 2-D axes array, so one flatten covers every grid shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.ravel()

    for i, (method_name, X_reduced) in enumerate(successful_reductions.items()):
        # Scatter of the first two embedding columns, coloured by the true cluster labels
        scatter = axes[i].scatter(X_reduced[:, 0], X_reduced[:, 1],
                                  c=y_highdim, cmap='viridis', alpha=0.7)

        axes[i].set_title(f'{method_name}\n(Time: {reduction_times.get(method_name, 0):.2f}s)')
        axes[i].set_xlabel('Component 1')
        axes[i].set_ylabel('Component 2')
        axes[i].grid(True, alpha=0.3)

        plt.colorbar(scatter, ax=axes[i])

    # Hide grid cells without a method
    for spare_ax in axes[n_methods:]:
        spare_ax.set_visible(False)

    # Take the input dimensionality from the data instead of hard-coding "50D"
    n_input_dims = X_highdim.shape[1]
    plt.suptitle(f'Dimensionality Reduction Results ({n_input_dims}D → 2D)', fontsize=16)
    plt.tight_layout()
    save_unsupervised_figure(fig, "dimensionality_reduction_comparison",
                             "Comparison of dimensionality reduction techniques", "reduction")
    plt.show()

    # Performance comparison
    print("\n📈 Dimensionality Reduction Performance:")
    print("=" * 60)
    print(f"{'Method':<20} {'Time (s)':<10} {'Variance':<12} {'Separation':<12}")
    print("=" * 60)

    for method_name, X_reduced in successful_reductions.items():
        time_taken = reduction_times.get(method_name, 0)
        variance = np.var(X_reduced)

        # Separation = silhouette of the true labels in the embedded space
        try:
            separation = silhouette_score(X_reduced, y_highdim)
        except Exception:
            # `except Exception` (not bare `except`) so Ctrl-C still interrupts;
            # 0 is kept as the fallback value shown in the table.
            separation = 0

        print(f"{method_name:<20} {time_taken:<10.3f} {variance:<12.3f} {separation:<12.3f}")

    print("=" * 60)

else:
    print("❌ No successful dimensionality reduction results to visualize.")

print("\n✨ Dimensionality reduction visualization complete!")

## 5. Anomaly Detection Methods {#anomaly}

Let's explore advanced anomaly detection techniques for identifying outliers and unusual patterns.

In [None]:
# Build a 2-D dataset of Gaussian inliers plus uniformly scattered outliers
print("🔍 Generating Anomaly Detection Datasets...")

# Seed NumPy's global generator so the draws below are reproducible
np.random.seed(42)
n_samples = 1000
n_outliers = 50
n_inliers = n_samples - n_outliers

# Inliers: standard normal in two features (2-D so the data can be plotted)
X_normal = np.random.randn(n_inliers, 2)

# Outliers: uniform over [-4, 4) in each feature (this range overlaps the inlier cloud)
X_outliers = np.random.uniform(low=-4, high=4, size=(n_outliers, 2))

# Stack inliers first, then outliers; labels use +1 for normal and -1 for anomaly
X_anomaly = np.vstack((X_normal, X_outliers))
y_true_anomaly = np.hstack((np.ones(n_inliers), -np.ones(n_outliers)))

# Standardise the features
scaler_anomaly = StandardScaler()
X_anomaly_scaled = scaler_anomaly.fit_transform(X_anomaly)

normal_mask = y_true_anomaly == 1
anomaly_mask = y_true_anomaly == -1

print(f"Anomaly detection dataset: {X_anomaly.shape}")
print(f"Normal points: {np.sum(normal_mask)}")
print(f"Anomalous points: {np.sum(anomaly_mask)}")

# Scatter plot of the unscaled data, one marker style per class
plt.figure(figsize=(10, 8))
class_styles = [
    (normal_mask, dict(c='blue', alpha=0.6, label='Normal', s=30)),
    (anomaly_mask, dict(c='red', alpha=0.8, label='Anomalies', s=50, marker='x')),
]
for class_mask, scatter_kwargs in class_styles:
    plt.scatter(X_anomaly[class_mask, 0], X_anomaly[class_mask, 1], **scatter_kwargs)

plt.title('Anomaly Detection Dataset')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.grid(True, alpha=0.3)
save_unsupervised_figure(plt.gcf(), "anomaly_detection_dataset",
                         "Generated dataset with normal points and anomalies", "anomaly")
plt.show()

print("\n✨ Anomaly detection dataset generated!")

### 5.1 Anomaly Detection Algorithms

In [None]:
# Test anomaly detection algorithms
print("🚀 Testing Anomaly Detection Algorithms...")

# Imported once here instead of on every loop iteration
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Every detector is configured for 5% outliers (contamination / nu = 0.05)
anomaly_algorithms = {
    'Isolation Forest': IsolationForest(contamination=0.05, random_state=42),
    'One-Class SVM': OneClassSVM(nu=0.05),
    'Local Outlier Factor': LocalOutlierFactor(n_neighbors=20, contamination=0.05),
}

# Add Elliptic Envelope
try:
    from sklearn.covariance import EllipticEnvelope
    anomaly_algorithms['Elliptic Envelope'] = EllipticEnvelope(contamination=0.05, random_state=42)
except ImportError:
    # Only a missing module should be skipped; anything else is a real error
    print("Elliptic Envelope not available, skipping...")

# anomaly_results[name] -> metrics dict, or {'error': message}
anomaly_results = {}

for algo_name, algorithm in anomaly_algorithms.items():
    print(f"\n--- Testing {algo_name} ---")

    try:
        # All four detectors implement fit_predict (LOF in its default mode has
        # no separate predict), so no special case on the display name is needed.
        y_pred_anomaly = algorithm.fit_predict(X_anomaly_scaled)

        # Normalise predictions to the label convention: 1 = normal, -1 = anomaly
        y_pred_binary = np.where(y_pred_anomaly == 1, 1, -1)

        # The anomaly class (-1) is the positive class for precision/recall/F1
        accuracy = accuracy_score(y_true_anomaly, y_pred_binary)
        precision = precision_score(y_true_anomaly, y_pred_binary, pos_label=-1)
        recall = recall_score(y_true_anomaly, y_pred_binary, pos_label=-1)
        f1 = f1_score(y_true_anomaly, y_pred_binary, pos_label=-1)

        # Plain ints so the counts stay numeric if these results are dumped to JSON
        n_predicted_anomalies = int(np.sum(y_pred_binary == -1))
        n_true_anomalies = int(np.sum(y_true_anomaly == -1))

        anomaly_results[algo_name] = {
            'y_pred': y_pred_binary,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'n_predicted_anomalies': n_predicted_anomalies,
            'n_true_anomalies': n_true_anomalies
        }

        print(f"  Accuracy: {accuracy:.3f}")
        print(f"  Precision: {precision:.3f}")
        print(f"  Recall: {recall:.3f}")
        print(f"  F1-Score: {f1:.3f}")
        print(f"  Predicted anomalies: {n_predicted_anomalies}/{n_true_anomalies}")

    except Exception as e:
        # Record the failure and continue with the remaining detectors
        print(f"  ❌ Failed: {str(e)}")
        anomaly_results[algo_name] = {'error': str(e)}

print("\n✨ Anomaly detection algorithms tested!")

### 5.2 Anomaly Detection Visualization

In [None]:
# Visualize anomaly detection results: one confusion-coloured scatter per
# detector, then a metrics table and the detector with the highest F1.
print("📊 Visualizing Anomaly Detection Results...")

# Failed runs were stored as {'error': ...}; keep only the runs with metrics
successful_anomaly_algos = {
    name: outcome
    for name, outcome in anomaly_results.items()
    if 'error' not in outcome
}

if not successful_anomaly_algos:
    print("❌ No successful anomaly detection results to visualize.")
else:
    n_algos = len(successful_anomaly_algos)
    n_cols = min(2, n_algos)
    n_rows = (n_algos + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always returns a 2-D grid, so one ravel() covers the
    # single-panel, single-row and multi-row layouts alike
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 6*n_rows), squeeze=False)
    axes = axes.ravel()

    # Ground-truth half of the confusion masks (-1 = anomaly, 1 = normal)
    truly_anomalous = (y_true_anomaly == -1)
    truly_normal = (y_true_anomaly == 1)

    for ax, (algo_name, result) in zip(axes, successful_anomaly_algos.items()):
        y_pred = result['y_pred']
        flagged = (y_pred == -1)
        cleared = (y_pred == 1)

        tn_mask = truly_normal & cleared      # Normal points correctly identified
        tp_mask = truly_anomalous & flagged   # True anomalies correctly identified
        fp_mask = truly_normal & flagged      # Normal points incorrectly flagged
        fn_mask = truly_anomalous & cleared   # Anomalies missed

        # One scatter layer per confusion category, drawn in this order
        layers = (
            (tn_mask, dict(c='lightblue', alpha=0.6, label='True Normal', s=30)),
            (tp_mask, dict(c='red', alpha=0.8, label='True Anomaly (Detected)', s=50, marker='x')),
            (fp_mask, dict(c='orange', alpha=0.8, label='False Positive', s=40, marker='s')),
            (fn_mask, dict(c='purple', alpha=0.8, label='False Negative', s=40, marker='^')),
        )
        for mask, style in layers:
            ax.scatter(X_anomaly[mask, 0], X_anomaly[mask, 1], **style)

        # Title with metrics
        f1, precision, recall = (result[key] for key in ('f1_score', 'precision', 'recall'))
        ax.set_title(f'{algo_name}\nF1: {f1:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}')
        ax.set_xlabel('Feature 1')
        ax.set_ylabel('Feature 2')
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)

    # Hide the grid cells that received no detector
    for spare_ax in axes[n_algos:]:
        spare_ax.set_visible(False)

    plt.suptitle('Anomaly Detection Results', fontsize=16)
    plt.tight_layout()
    save_unsupervised_figure(fig, "anomaly_detection_results",
                             "Comparison of anomaly detection algorithm performance", "anomaly")
    plt.show()

    # Performance comparison table
    divider = "=" * 80
    print("\n📈 Anomaly Detection Performance:")
    print(divider)
    print(f"{'Algorithm':<20} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1-Score':<10}")
    print(divider)

    for algo_name, result in successful_anomaly_algos.items():
        print(f"{algo_name:<20} {result['accuracy']:<10.3f} {result['precision']:<10.3f} "
              f"{result['recall']:<10.3f} {result['f1_score']:<10.3f}")

    print(divider)

    # Best algorithm = highest F1 (first one wins on ties)
    best_algo = max(successful_anomaly_algos,
                    key=lambda name: successful_anomaly_algos[name]['f1_score'])
    print(f"\n🏆 Best Algorithm: {best_algo} (F1-Score: {successful_anomaly_algos[best_algo]['f1_score']:.3f})")

print("\n✨ Anomaly detection visualization complete!")

## 6. Association Rule Mining {#association}

Let's explore pattern discovery through association rule mining.

In [None]:
# Build a synthetic market-basket dataset containing a few planted
# co-purchase patterns for the association rule miner to rediscover.
print("🛒 Generating Transaction Data for Association Rule Mining...")

np.random.seed(42)  # fixed seed: the baskets are identical on every run
n_transactions = 1000
n_items = 20

# Display labels Item_00 ... Item_19, indexed by item id
item_names = [f'Item_{i:02d}' for i in range(n_items)]

transactions = []

while len(transactions) < n_transactions:
    # Basket size is 1 + Poisson(5), capped at the number of distinct items
    n_items_in_transaction = min(np.random.poisson(5) + 1, n_items)

    # Distinct starting items for this basket
    base_items = np.random.choice(n_items, size=n_items_in_transaction, replace=False)

    transaction_items = set()
    for item in base_items:
        transaction_items.add(item)

        # Planted correlations. A random number is drawn only when the
        # triggering item is present, so the RNG stream stays in step.
        if item == 0:
            if np.random.random() < 0.7:       # Item_00 often with Item_01
                transaction_items.add(1)
        elif item == 2:
            if np.random.random() < 0.6:       # Item_02 often with Item_03 ...
                transaction_items.add(3)
                if np.random.random() < 0.4:   # ... and sometimes Item_04 too
                    transaction_items.add(4)
        elif item == 5:
            if np.random.random() < 0.5:       # Item_05 often with Item_06
                transaction_items.add(6)

    transactions.append(list(transaction_items))

basket_sizes = [len(t) for t in transactions]
print(f"Generated {len(transactions)} transactions")
print(f"Average items per transaction: {np.mean(basket_sizes):.2f}")

# One-hot encode: row = transaction, column = item id, 1 where the item occurs
transaction_matrix = np.zeros((len(transactions), n_items))
for row, basket in enumerate(transactions):
    transaction_matrix[row, basket] = 1

print(f"Transaction matrix shape: {transaction_matrix.shape}")
print("\n✨ Transaction data generated!")

### 6.1 Association Rule Mining Implementation

In [None]:
# Implement Apriori algorithm for association rule mining
print("⚡ Implementing Association Rule Mining...")

class AssociationRuleMiner:
    """Small Apriori-style association rule miner.

    Transactions are iterables of non-negative integer item ids. Call
    ``find_frequent_itemsets`` first, then ``generate_rules``; results are
    stored on ``frequent_itemsets`` and ``rules``.

    Attributes:
        min_support: Minimum fraction of transactions an itemset must occur in.
        min_confidence: Minimum confidence a rule must reach to be kept.
        max_itemset_size: Largest itemset size that is searched for.
        frequent_itemsets: Maps size ``k`` to a list of ``(itemset, support)``
            pairs, where ``itemset`` is a sorted list of item ids.
        rules: List of rule dicts with keys ``antecedent``, ``consequent``,
            ``support``, ``confidence`` and ``lift``, sorted by confidence.
    """

    def __init__(self, min_support=0.1, min_confidence=0.5, max_itemset_size=4):
        self.min_support = min_support
        self.min_confidence = min_confidence
        # Cap on itemset size to prevent excessive computation
        self.max_itemset_size = max_itemset_size
        self.frequent_itemsets = {}
        self.rules = []

    def calculate_support(self, itemset, transactions):
        """Return the fraction of transactions that contain every item of itemset.

        Returns 0.0 when there are no transactions at all.
        """
        total = len(transactions)
        if total == 0:
            return 0.0
        wanted = set(itemset)
        matches = sum(1 for transaction in transactions if wanted.issubset(transaction))
        return matches / total

    def find_frequent_itemsets(self, transactions):
        """Find frequent itemsets level by level and store them on the instance.

        Results of any earlier run are discarded first, so no stale levels
        survive when the miner is reused on different data.
        """
        self.frequent_itemsets = {}

        # Convert once so every support computation tests against ready-made sets
        baskets = [set(transaction) for transaction in transactions]
        non_empty = [basket for basket in baskets if basket]

        # Item ids are assumed to be 0..max_id, as in the original search
        n_items = max(max(basket) for basket in non_empty) + 1 if non_empty else 0

        # Find frequent 1-itemsets
        frequent_1_itemsets = []
        for item in range(n_items):
            support = self.calculate_support([item], baskets)
            if support >= self.min_support:
                frequent_1_itemsets.append(([item], support))

        self.frequent_itemsets[1] = frequent_1_itemsets
        print(f"Found {len(frequent_1_itemsets)} frequent 1-itemsets")

        # Find frequent k-itemsets for k > 1, stopping at the first empty level
        for k in range(2, self.max_itemset_size + 1):
            frequent_k_itemsets = []
            for candidate in self.generate_candidates(k):
                support = self.calculate_support(candidate, baskets)
                if support >= self.min_support:
                    frequent_k_itemsets.append((candidate, support))

            if not frequent_k_itemsets:
                break

            self.frequent_itemsets[k] = frequent_k_itemsets
            print(f"Found {len(frequent_k_itemsets)} frequent {k}-itemsets")

    def generate_candidates(self, k):
        """Generate candidate itemsets of size k from the frequent (k-1)-itemsets.

        Only level ``k - 1`` has to exist already; level ``k`` is what this
        call helps to build. Returns a list of sorted item-id lists.
        """
        previous_level = self.frequent_itemsets.get(k - 1)
        if not previous_level:
            return []

        prev_itemsets = [sorted(itemset) for itemset, _ in previous_level]
        candidates = []

        for i, itemset1 in enumerate(prev_itemsets):
            for itemset2 in prev_itemsets[i + 1:]:
                # Join if first k-2 items are the same
                if itemset1[:-1] == itemset2[:-1]:
                    candidate = sorted(set(itemset1 + itemset2))
                    if len(candidate) == k:
                        candidates.append(candidate)

        return candidates

    def generate_rules(self, transactions):
        """Generate association rules from the stored frequent itemsets.

        Every frequent itemset of size >= 2 is split into each possible
        antecedent/consequent pair; pairs reaching ``min_confidence`` are
        kept. ``self.rules`` is replaced and sorted by descending confidence.
        """
        self.rules = []
        baskets = [set(transaction) for transaction in transactions]

        # default=1 makes the loop a no-op when nothing has been mined yet
        largest_size = max(self.frequent_itemsets, default=1)

        for k in range(2, largest_size + 1):
            for itemset, support in self.frequent_itemsets.get(k, []):
                # Generate all possible rules from this itemset
                for antecedent_size in range(1, len(itemset)):
                    for antecedent in self.combinations(itemset, antecedent_size):
                        consequent = [item for item in itemset if item not in antecedent]

                        antecedent_support = self.calculate_support(antecedent, baskets)
                        if antecedent_support <= 0:
                            continue

                        confidence = support / antecedent_support
                        if confidence < self.min_confidence:
                            continue

                        # Lift compares the confidence with the consequent's base rate
                        consequent_support = self.calculate_support(consequent, baskets)
                        lift = confidence / consequent_support if consequent_support > 0 else 0

                        self.rules.append({
                            'antecedent': antecedent,
                            'consequent': consequent,
                            'support': support,
                            'confidence': confidence,
                            'lift': lift
                        })

        # Sort rules by confidence
        self.rules.sort(key=lambda rule: rule['confidence'], reverse=True)
        print(f"Generated {len(self.rules)} association rules")

    def combinations(self, items, r):
        """Return all r-item combinations of items as a list of tuples."""
        from itertools import combinations
        return list(combinations(items, r))

    def print_rules(self, n_rules=10, names=None):
        """Print the top n_rules rules as a table.

        Args:
            n_rules: Maximum number of rules to print.
            names: Optional sequence or mapping from item id to display name.
                When omitted, a module-level ``item_names`` is used if one
                exists; otherwise the raw item ids are printed.
        """
        if names is None:
            names = globals().get('item_names')

        def label(item):
            return names[item] if names is not None else str(item)

        print(f"\nTop {min(n_rules, len(self.rules))} Association Rules:")
        print("=" * 80)
        print(f"{'Antecedent':<20} {'Consequent':<15} {'Support':<10} {'Confidence':<12} {'Lift':<8}")
        print("=" * 80)

        for rule in self.rules[:n_rules]:
            antecedent_str = ', '.join(label(item) for item in rule['antecedent'])
            consequent_str = ', '.join(label(item) for item in rule['consequent'])

            print(f"{antecedent_str:<20} {consequent_str:<15} {rule['support']:<10.3f} "
                  f"{rule['confidence']:<12.3f} {rule['lift']:<8.3f}")

# Mine the generated transactions with a 5% support floor and a 40% confidence floor
miner = AssociationRuleMiner(min_confidence=0.4, min_support=0.05)

# The two mining stages run in order; each prints its own progress lines
for banner, run_step in (
    ("\n--- Finding Frequent Itemsets ---", miner.find_frequent_itemsets),
    ("\n--- Generating Association Rules ---", miner.generate_rules),
):
    print(banner)
    run_step(transactions)

# Show at most 15 rules from the top of the confidence-sorted list
miner.print_rules(15)

print("\n✨ Association rule mining complete!")

### 6.2 Association Rules Visualization

In [None]:
# Four-panel overview of the mined rules, followed by a printed summary.
# Everything is skipped when no rule was generated.
print("📊 Visualizing Association Rule Mining Results...")

if miner.rules:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    (ax_scatter, ax_lift), (ax_top, ax_items) = axes

    # Parallel metric lists, one entry per rule
    supports, confidences, lifts = (
        [rule[metric] for rule in miner.rules]
        for metric in ('support', 'confidence', 'lift')
    )

    # 1. Support vs Confidence scatter plot, coloured by lift
    scatter = ax_scatter.scatter(supports, confidences, c=lifts, cmap='viridis', alpha=0.7, s=50)
    ax_scatter.set(xlabel='Support', ylabel='Confidence',
                   title='Association Rules: Support vs Confidence')
    ax_scatter.grid(True, alpha=0.3)

    cbar = plt.colorbar(scatter, ax=ax_scatter)
    cbar.set_label('Lift')

    # 2. Lift distribution with the mean and the independence line (lift = 1)
    ax_lift.hist(lifts, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
    ax_lift.axvline(np.mean(lifts), color='red', linestyle='--',
                    label=f'Mean: {np.mean(lifts):.2f}')
    ax_lift.axvline(1.0, color='green', linestyle='--',
                    label='Lift = 1 (Independence)')
    ax_lift.set(xlabel='Lift', ylabel='Frequency', title='Distribution of Lift Values')
    ax_lift.legend()
    ax_lift.grid(True, alpha=0.3)

    # 3. Top rules (the list is already sorted by confidence)
    top_rules = miner.rules[:10]
    rule_confidences = [rule['confidence'] for rule in top_rules]
    rule_labels = []

    for rule in top_rules:
        antecedent_str = ', '.join(item_names[item] for item in rule['antecedent'])
        consequent_str = ', '.join(item_names[item] for item in rule['consequent'])
        rule_label = f"{antecedent_str} → {consequent_str}"

        # Keep tick labels to 25 characters at most
        rule_labels.append(rule_label if len(rule_label) <= 25 else rule_label[:22] + "...")

    rule_positions = range(len(rule_labels))
    bars = ax_top.barh(rule_positions, rule_confidences, alpha=0.7, color='lightgreen')
    ax_top.set_yticks(rule_positions)
    ax_top.set_yticklabels(rule_labels)
    ax_top.set(xlabel='Confidence', title='Top 10 Association Rules by Confidence')
    ax_top.grid(True, alpha=0.3)

    # 4. Item frequency analysis: column sums of the one-hot matrix
    item_frequencies = transaction_matrix.sum(axis=0)
    item_indices = np.argsort(item_frequencies)[-15:]  # Top 15 items

    top_item_names = [item_names[idx] for idx in item_indices]
    top_item_freqs = item_frequencies[item_indices]

    item_positions = range(len(top_item_names))
    bars = ax_items.bar(item_positions, top_item_freqs, alpha=0.7, color='lightcoral')
    ax_items.set_xticks(item_positions)
    ax_items.set_xticklabels(top_item_names, rotation=45, ha='right')
    ax_items.set(ylabel='Frequency', title='Top 15 Most Frequent Items')
    ax_items.grid(True, alpha=0.3)

    plt.tight_layout()
    save_unsupervised_figure(fig, "association_rule_mining_analysis",
                             "Comprehensive analysis of association rule mining results", "association")
    plt.show()

    # Summary statistics
    summary_rule = "=" * 50
    print(f"\n📊 Association Rule Mining Summary:")
    print(summary_rule)
    print(f"Total transactions: {len(transactions)}")
    print(f"Total items: {n_items}")
    print(f"Average transaction size: {np.mean([len(t) for t in transactions]):.2f}")
    print(f"Frequent itemsets found:")
    for k, itemsets in miner.frequent_itemsets.items():
        print(f"  {k}-itemsets: {len(itemsets)}")
    print(f"Association rules generated: {len(miner.rules)}")
    print(f"Average confidence: {np.mean(confidences):.3f}")
    print(f"Average lift: {np.mean(lifts):.3f}")
    print(summary_rule)

print("\n✨ Association rule mining visualization complete!")

## 7. Manifold Learning {#manifold}

Explore advanced manifold learning techniques for non-linear dimensionality reduction.

In [None]:
# Generate manifold datasets
print("🌀 Generating Manifold Learning Datasets...")

def generate_swiss_roll_3d(n_samples=1000, noise=0.1, random_state=None):
    """Generate a noisy 3D Swiss Roll point cloud.

    Args:
        n_samples: Number of points to generate.
        noise: Scale of the Gaussian noise added to every coordinate.
        random_state: Optional integer seed for a private generator. When
            None (the default) the global NumPy random state is used, so a
            prior ``np.random.seed`` call governs the output as before.

    Returns:
        Tuple ``(X, t)``. ``X`` has shape ``(n_samples, 3)`` with columns
        ``t*cos(t)``, height and ``t*sin(t)`` plus noise; ``t`` is the roll
        parameter of each point, drawn from ``[1.5*pi, 4.5*pi)``.
    """
    # The np.random module and RandomState expose the same rand/randn calls
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Draw order (t, height, noise) is fixed so seeded output is stable
    t = 1.5 * np.pi * (1 + 2 * rng.rand(n_samples))
    height = 21 * rng.rand(n_samples)

    X = np.column_stack((t * np.cos(t), height, t * np.sin(t)))
    X += noise * rng.randn(n_samples, 3)

    return X, t

def generate_s_curve_3d(n_samples=1000, noise=0.1, random_state=None):
    """Generate a noisy 3D S-curve point cloud.

    Args:
        n_samples: Number of points to generate.
        noise: Scale of the Gaussian noise added to every coordinate.
        random_state: Optional integer seed for a private generator. When
            None (the default) the global NumPy random state is used, so a
            prior ``np.random.seed`` call governs the output as before.

    Returns:
        Tuple ``(X, t)``. ``X`` has shape ``(n_samples, 3)`` with columns
        ``sin(t)``, height and ``sign(t)*(cos(t) - 1)`` plus noise; ``t`` is
        the curve parameter of each point, drawn from ``[-1.5*pi, 1.5*pi)``.
    """
    # The np.random module and RandomState expose the same rand/randn calls
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Draw order (t, height, noise) is fixed so seeded output is stable
    t = 3 * np.pi * (rng.rand(n_samples) - 0.5)
    height = 2 * rng.rand(n_samples)

    X = np.column_stack((np.sin(t), height, np.sign(t) * (np.cos(t) - 1)))
    X += noise * rng.randn(n_samples, 3)

    return X, t

# Generate manifold datasets with the hand-written generators
X_swiss, t_swiss = generate_swiss_roll_3d(n_samples=800, noise=0.1)
X_scurve, t_scurve = generate_s_curve_3d(n_samples=800, noise=0.1)

# Add scikit-learn's own Swiss roll (seeded with random_state=42) as a third dataset
from sklearn.datasets import make_swiss_roll
X_sklearn_swiss, color_sklearn = make_swiss_roll(n_samples=800, noise=0.1, random_state=42)

# name -> (points, per-point value used for colouring)
manifold_datasets = {
    'Swiss Roll': (X_swiss, t_swiss),
    'S-Curve': (X_scurve, t_scurve),
    'sklearn Swiss Roll': (X_sklearn_swiss, color_sklearn)
}

print(f"Generated {len(manifold_datasets)} manifold datasets")

# Visualize 3D manifold datasets: one panel per dataset, so the grid follows
# the dictionary instead of assuming exactly three entries
n_panels = len(manifold_datasets)
fig = plt.figure(figsize=(6 * n_panels, 6))

for i, (name, (X, color)) in enumerate(manifold_datasets.items()):
    ax = fig.add_subplot(1, n_panels, i+1, projection='3d')

    scatter = ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap='viridis', alpha=0.7)
    ax.set_title(f'{name} (3D)')
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')

    plt.colorbar(scatter, ax=ax, shrink=0.8)

plt.suptitle('3D Manifold Datasets', fontsize=16)
plt.tight_layout()
save_unsupervised_figure(fig, "manifold_datasets_3d", 
                       "3D visualization of manifold learning datasets", "manifold")
plt.show()

print("\n✨ Manifold datasets generated and visualized!")

### 7.1 Manifold Learning Algorithms

In [None]:
# Test manifold learning algorithms
print("🔄 Testing Manifold Learning Algorithms...")

# Embedding methods to compare, all reducing to 2 components.
# PCA is linear; the other entries are non-linear embedding methods.
manifold_algorithms = {
    'Isomap': Isomap(n_components=2, n_neighbors=10),
    'Locally Linear Embedding': LocallyLinearEmbedding(n_components=2, n_neighbors=10, random_state=42),
    't-SNE': TSNE(n_components=2, random_state=42, perplexity=30),
    'PCA': PCA(n_components=2, random_state=42)
}

# Optional methods: only a failed import means "not available". Any other
# error (for example a bad constructor argument) is a real bug and is no
# longer swallowed by a bare except.
try:
    from sklearn.manifold import SpectralEmbedding
except ImportError:
    print("  Spectral Embedding is not available in this scikit-learn build; skipping")
else:
    manifold_algorithms['Spectral Embedding'] = SpectralEmbedding(n_components=2, random_state=42)

try:
    from sklearn.manifold import MDS
except ImportError:
    print("  MDS is not available in this scikit-learn build; skipping")
else:
    manifold_algorithms['MDS'] = MDS(n_components=2, random_state=42)

def calculate_trustworthiness(X_original, X_embedded, n_neighbors=5):
    """Calculate the trustworthiness score of an embedding.

    Delegates to ``sklearn.manifold.trustworthiness``, the rank-weighted
    metric: points that appear among a sample's ``n_neighbors`` nearest
    neighbours in the embedding but not in the original space are penalised
    in proportion to how far away they rank in the original space. This is
    stricter than counting how many neighbours the two spaces share.

    Args:
        X_original: Samples in the original feature space, one row per sample.
        X_embedded: The same samples in the embedded space, in the same order.
        n_neighbors: Neighbourhood size used by the metric.

    Returns:
        The score as a Python float; 1.0 means every embedded neighbourhood
        is fully trustworthy.

    Raises:
        ValueError: Propagated from scikit-learn, e.g. recent versions reject
            ``n_neighbors >= n_samples / 2``.
    """
    # Local import, matching the function-scope imports used elsewhere in this notebook
    from sklearn.manifold import trustworthiness

    return float(trustworthiness(X_original, X_embedded, n_neighbors=n_neighbors))

# dataset name -> algorithm name -> result dict (or {'error': message} on failure)
manifold_results = {}

for dataset_name, (X, color) in manifold_datasets.items():
    print(f"\n--- Testing on {dataset_name} ---")
    manifold_results[dataset_name] = {}
    
    # Standardize the data once per dataset; every algorithm sees the same input
    X_scaled = StandardScaler().fit_transform(X)
    
    for algo_name, algorithm in manifold_algorithms.items():
        try:
            print(f"  Running {algo_name}...")
            
            # perf_counter is monotonic and high-resolution, so the measured
            # duration cannot be skewed by system clock adjustments
            start_time = time.perf_counter()
            X_embedded = algorithm.fit_transform(X_scaled)
            embedding_time = time.perf_counter() - start_time
            
            # Score the embedding against the standardized input
            trustworthiness = calculate_trustworthiness(X_scaled, X_embedded, n_neighbors=5)
            
            manifold_results[dataset_name][algo_name] = {
                'X_embedded': X_embedded,
                'embedding_time': embedding_time,
                'trustworthiness': trustworthiness,
                'color': color
            }
            
            print(f"    Time: {embedding_time:.3f}s, Trustworthiness: {trustworthiness:.3f}")
            
        except Exception as e:
            # Best-effort benchmark: record the failure and carry on with the next algorithm
            print(f"    ❌ Failed: {str(e)}")
            manifold_results[dataset_name][algo_name] = {'error': str(e)}

print("\n✨ Manifold learning algorithms tested!")

### 7.2 Manifold Learning Visualization

In [None]:
# Visualize manifold learning results: one figure per dataset, one panel per
# algorithm that produced an embedding.
print("📊 Visualizing Manifold Learning Results...")

for dataset_name, results in manifold_results.items():
    print(f"\nVisualizing results for {dataset_name}...")

    # (algorithm name, result) pairs for the runs that did not record an error
    successful_algos = [entry for entry in results.items() if 'error' not in entry[1]]
    if not successful_algos:
        continue

    n_algos = len(successful_algos)
    n_cols = min(3, n_algos)
    n_rows = (n_algos + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always returns a 2-D grid, so one ravel() covers every layout
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, (algo_name, result) in zip(axes, successful_algos):
        X_embedded = result['X_embedded']
        color = result['color']

        scatter = ax.scatter(X_embedded[:, 0], X_embedded[:, 1],
                             c=color, cmap='viridis', alpha=0.7)

        # Title with metrics
        trustworthiness = result['trustworthiness']
        embedding_time = result['embedding_time']
        ax.set_title(f'{algo_name}\nTrustworthiness: {trustworthiness:.3f}\nTime: {embedding_time:.2f}s')
        ax.set_xlabel('Component 1')
        ax.set_ylabel('Component 2')
        ax.grid(True, alpha=0.3)

        plt.colorbar(scatter, ax=ax)

    # Hide the grid cells that received no algorithm
    for spare_ax in axes[n_algos:]:
        spare_ax.set_visible(False)

    plt.suptitle(f'Manifold Learning Results - {dataset_name}', fontsize=16)
    plt.tight_layout()
    save_unsupervised_figure(fig, f"manifold_learning_results_{dataset_name.lower().replace(' ', '_')}",
                             f"Manifold learning algorithm comparison on {dataset_name} dataset", "manifold")
    plt.show()

print("\n✨ Manifold learning visualization complete!")

### 7.3 Manifold Learning Performance Analysis

In [None]:
# Comprehensive manifold learning performance analysis
print("📈 Manifold Learning Performance Analysis...")


def _normalized_score(values, higher_is_better=True):
    """Min-max scale a Series to [0, 1] so that 1 always means "best".

    When all values are identical (for example only one algorithm succeeded)
    the range is zero; every entry is then scored 1.0 instead of the NaN a
    plain min-max division would produce.
    """
    lowest = values.min()
    span = values.max() - lowest
    if not span > 0:
        return pd.Series(1.0, index=values.index)
    scaled = (values - lowest) / span
    return scaled if higher_is_better else 1 - scaled


# One row per successful (dataset, algorithm) run
performance_data = [
    {
        'Dataset': dataset_name,
        'Algorithm': algo_name,
        'Trustworthiness': result['trustworthiness'],
        'Embedding_Time': result['embedding_time']
    }
    for dataset_name, dataset_results in manifold_results.items()
    for algo_name, result in dataset_results.items()
    if 'error' not in result
]

if performance_data:
    performance_df = pd.DataFrame(performance_data)
    
    print("\n📊 Manifold Learning Performance Summary:")
    print(performance_df.round(3).to_string(index=False))
    
    # Create performance visualization
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # 1. Trustworthiness comparison
    sns.barplot(data=performance_df, x='Algorithm', y='Trustworthiness', 
               hue='Dataset', ax=axes[0, 0])
    axes[0, 0].set_title('Trustworthiness by Algorithm and Dataset')
    axes[0, 0].tick_params(axis='x', rotation=45)
    axes[0, 0].grid(True, alpha=0.3)
    
    # 2. Embedding time comparison
    sns.barplot(data=performance_df, x='Algorithm', y='Embedding_Time', 
               hue='Dataset', ax=axes[0, 1])
    axes[0, 1].set_title('Embedding Time by Algorithm and Dataset')
    axes[0, 1].tick_params(axis='x', rotation=45)
    axes[0, 1].set_ylabel('Time (seconds)')
    axes[0, 1].grid(True, alpha=0.3)
    
    # 3. Trustworthiness vs Time scatter, one colour per algorithm
    algorithms = performance_df['Algorithm'].unique()
    colors = plt.cm.Set3(np.linspace(0, 1, len(algorithms)))
    
    for algo, algo_color in zip(algorithms, colors):
        algo_data = performance_df[performance_df['Algorithm'] == algo]
        axes[1, 0].scatter(algo_data['Embedding_Time'], algo_data['Trustworthiness'], 
                          color=algo_color, label=algo, s=100, alpha=0.7)
    
    axes[1, 0].set_xlabel('Embedding Time (seconds)')
    axes[1, 0].set_ylabel('Trustworthiness')
    axes[1, 0].set_title('Trustworthiness vs Embedding Time')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)
    
    # 4. Average performance by algorithm (mean over datasets)
    avg_performance = performance_df.groupby('Algorithm').agg({
        'Trustworthiness': 'mean',
        'Embedding_Time': 'mean'
    }).reset_index()
    
    # Normalize both metrics to 0-1 scale for comparison; the helper guards
    # the zero-range case that used to turn every score into NaN
    avg_performance['Trustworthiness_norm'] = _normalized_score(
        avg_performance['Trustworthiness'])
    
    # Lower embedding time is better, so the scale is flipped
    avg_performance['Speed_norm'] = _normalized_score(
        avg_performance['Embedding_Time'], higher_is_better=False)
    
    # Combined score (quality + speed), equally weighted
    avg_performance['Combined_Score'] = (avg_performance['Trustworthiness_norm'] + 
                                        avg_performance['Speed_norm']) / 2
    
    bars = axes[1, 1].bar(range(len(avg_performance)), avg_performance['Combined_Score'], 
                         color='lightblue', alpha=0.7)
    axes[1, 1].set_xlabel('Algorithm')
    axes[1, 1].set_ylabel('Combined Score (Quality + Speed)')
    axes[1, 1].set_title('Overall Algorithm Performance')
    axes[1, 1].set_xticks(range(len(avg_performance)))
    axes[1, 1].set_xticklabels(avg_performance['Algorithm'], rotation=45, ha='right')
    axes[1, 1].grid(True, alpha=0.3)
    
    # Add value labels just above each bar
    for bar, combined_score in zip(bars, avg_performance['Combined_Score']):
        axes[1, 1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, 
                       f'{combined_score:.3f}',
                       ha='center', va='bottom', fontweight='bold')
    
    plt.tight_layout()
    save_unsupervised_figure(fig, "manifold_learning_performance_analysis", 
                           "Comprehensive manifold learning performance metrics", "manifold")
    plt.show()
    
    # Best algorithm per dataset
    print("\n🏆 Best Algorithm per Dataset (by Trustworthiness):")
    for dataset in performance_df['Dataset'].unique():
        dataset_data = performance_df[performance_df['Dataset'] == dataset]
        if not dataset_data.empty:
            best_algo = dataset_data.loc[dataset_data['Trustworthiness'].idxmax()]
            print(f"  {dataset}: {best_algo['Algorithm']} (Trustworthiness: {best_algo['Trustworthiness']:.3f})")

print("\n✨ Manifold learning performance analysis complete!")

## 8. Cluster Validation and Evaluation {#validation}

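This section evaluates clustering results with internal validation metrics (no ground truth required) and external validation metrics (compared against known labels).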

In [None]:
# Comprehensive cluster validation
print("✅ Comprehensive Cluster Validation and Evaluation...")

class ClusterValidator:
    """Toolkit for scoring clustering results.

    Provides internal metrics (no ground truth), external metrics (against
    ground-truth labels), a resampling-based stability check and an
    elbow-style sweep over the number of clusters.
    """

    def __init__(self):
        self.validation_results = {}

    def internal_validation(self, X, labels, algorithm_name="Unknown"):
        """Compute internal validation metrics (no ground truth required).

        Points with a negative label are treated as noise and excluded
        from the metrics.

        Args:
            X: Feature matrix, one row per sample.
            labels: Cluster label per sample; negative values mark noise.
            algorithm_name: Accepted for call-site symmetry; not used.

        Returns:
            dict with ``silhouette_score``, ``calinski_harabasz_score``,
            ``davies_bouldin_score``, ``n_clusters`` and ``n_noise_points``.
            When fewer than two clusters remain, the three scores are -1
            and an ``error`` key is added. When a metric raises, the dict
            contains only ``error``.
        """
        # Plain Python sequences do not support boolean-mask indexing or
        # element-wise comparison, so promote them to arrays first.
        if isinstance(X, (list, tuple)):
            X = np.asarray(X)
        labels = np.asarray(labels)

        # Remove noise points for validation
        valid_mask = labels >= 0
        X_valid = X[valid_mask]
        labels_valid = labels[valid_mask]

        n_clusters = len(np.unique(labels_valid))
        n_noise_points = int(np.sum(labels == -1))

        if n_clusters < 2:
            return {
                'silhouette_score': -1,
                'calinski_harabasz_score': -1,
                'davies_bouldin_score': -1,
                'n_clusters': n_clusters,
                'n_noise_points': n_noise_points,
                'error': 'Less than 2 clusters found'
            }

        try:
            from sklearn.metrics import davies_bouldin_score

            return {
                'silhouette_score': silhouette_score(X_valid, labels_valid),
                'calinski_harabasz_score': calinski_harabasz_score(X_valid, labels_valid),
                'davies_bouldin_score': davies_bouldin_score(X_valid, labels_valid),
                'n_clusters': n_clusters,
                'n_noise_points': n_noise_points
            }
        except Exception as e:
            # Best-effort: report the failure instead of aborting the sweep
            return {'error': str(e)}

    def external_validation(self, y_true, y_pred):
        """Compute external validation metrics (requires ground truth).

        Args:
            y_true: Ground-truth label per sample.
            y_pred: Predicted cluster label per sample.

        Returns:
            dict with ``adjusted_rand_index``, ``adjusted_mutual_info``,
            ``homogeneity``, ``completeness`` and ``v_measure``; or a dict
            containing only ``error`` when a metric raises.
        """
        try:
            from sklearn.metrics import (adjusted_rand_score, adjusted_mutual_info_score,
                                         homogeneity_score, completeness_score, v_measure_score)

            return {
                'adjusted_rand_index': adjusted_rand_score(y_true, y_pred),
                'adjusted_mutual_info': adjusted_mutual_info_score(y_true, y_pred),
                'homogeneity': homogeneity_score(y_true, y_pred),
                'completeness': completeness_score(y_true, y_pred),
                'v_measure': v_measure_score(y_true, y_pred)
            }
        except Exception as e:
            return {'error': str(e)}

    def stability_analysis(self, X, algorithm, n_runs=10, sample_ratio=0.8, random_state=None):
        """Analyze clustering stability by refitting on resampled data.

        Each run draws ``int(len(X) * sample_ratio)`` rows with replacement,
        refits ``algorithm`` on them and records the silhouette score.

        Args:
            X: Feature matrix supporting integer-array row indexing.
            algorithm: Estimator exposing ``fit_predict``, or ``fit`` plus
                ``predict`` / ``labels_``. It is refitted in place.
            n_runs: Number of resampling runs.
            sample_ratio: Fraction of ``len(X)`` drawn per run.
            random_state: Optional seed for the resampling. ``None`` keeps
                drawing from NumPy's global random state.

        Returns:
            dict with ``mean_silhouette``, ``std_silhouette``,
            ``mean_clusters``, ``std_clusters``, ``stability_coefficient``
            and ``n_valid_runs``; or ``{'error': ...}`` when no run yields a
            usable silhouette score.
        """
        print(f"  Performing stability analysis with {n_runs} runs...")

        # Both the module and RandomState expose .choice with the same signature
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        stability_scores = []
        cluster_counts = []
        n_samples = int(len(X) * sample_ratio)

        for run in range(n_runs):
            # Bootstrap sample
            indices = rng.choice(len(X), size=n_samples, replace=True)
            X_sample = X[indices]

            try:
                if hasattr(algorithm, 'fit_predict'):
                    labels = algorithm.fit_predict(X_sample)
                else:
                    algorithm.fit(X_sample)
                    labels = algorithm.predict(X_sample) if hasattr(algorithm, 'predict') else algorithm.labels_

                validation = self.internal_validation(X_sample, labels)
            except Exception as e:
                # A failed run is skipped, but no longer silently
                print(f"    Run {run + 1} skipped: {e}")
                continue

            # -1 is the sentinel for "fewer than two clusters"
            if validation.get('silhouette_score', -1) > -1:
                stability_scores.append(validation['silhouette_score'])
                cluster_counts.append(validation['n_clusters'])

        if not stability_scores:
            return {'error': 'No valid runs completed'}

        mean_sil = np.mean(stability_scores)
        std_sil = np.std(stability_scores)
        return {
            'mean_silhouette': mean_sil,
            'std_silhouette': std_sil,
            'mean_clusters': np.mean(cluster_counts),
            'std_clusters': np.std(cluster_counts),
            # 1 - coefficient of variation; undefined for non-positive means
            'stability_coefficient': 1 - (std_sil / mean_sil) if mean_sil > 0 else 0,
            'n_valid_runs': len(stability_scores)
        }

    def elbow_analysis(self, X, algorithm_class, max_clusters=10, **kwargs):
        """Sweep the number of clusters and record inertia and silhouette.

        Args:
            X: Feature matrix.
            algorithm_class: Estimator class whose constructor accepts
                ``n_clusters`` or ``n_components``.
            max_clusters: Largest cluster count tried (sweep starts at 2).
            **kwargs: Extra constructor arguments for ``algorithm_class``.

        Returns:
            dict with ``cluster_range``, ``inertias`` and
            ``silhouette_scores``. The three lists always have the same
            length; a cluster count that could not be evaluated is recorded
            as ``None`` inertia and/or ``-1`` silhouette.
        """
        print(f"  Performing elbow analysis up to {max_clusters} clusters...")

        cluster_range = range(2, max_clusters + 1)

        # Find out once which constructor argument sets the cluster count
        try:
            available = algorithm_class().get_params()
        except Exception as e:
            print(f"    Could not inspect {algorithm_class}: {e}")
            available = {}

        if 'n_clusters' in available:
            size_param = 'n_clusters'
        elif 'n_components' in available:
            size_param = 'n_components'
        else:
            size_param = None

        inertias = []
        silhouette_scores = []

        for n_clusters in cluster_range:
            # Placeholders guarantee exactly one entry per cluster count
            inertia, sil_score = None, -1

            if size_param is not None:
                try:
                    alg = algorithm_class(**{size_param: n_clusters}, **kwargs)
                    labels = alg.fit_predict(X)

                    inertia = getattr(alg, 'inertia_', None)

                    if len(np.unique(labels)) > 1:
                        sil_score = silhouette_score(X, labels)
                except Exception as e:
                    print(f"    k={n_clusters} could not be fully evaluated: {e}")

            inertias.append(inertia)
            silhouette_scores.append(sil_score)

        return {
            'cluster_range': list(cluster_range),
            'inertias': inertias,
            'silhouette_scores': silhouette_scores
        }

# Run the validation toolkit over every clustering result produced earlier
validator = ClusterValidator()

# validation_summary[dataset][algorithm] -> {'internal': ..., 'external': ...}
validation_summary = {}

print("\n--- Comprehensive Cluster Validation ---")

for dataset_name, (X, y_true) in test_datasets.items():
    print(f"\n  Validating clustering results for {dataset_name}...")
    validation_summary[dataset_name] = {}

    # A dataset without any recorded clustering run keeps an empty summary
    # instead of aborting the whole cell with a KeyError.
    for algo_name, result in clustering_results.get(dataset_name, {}).items():
        if 'error' in result:
            continue

        print(f"    Validating {algo_name}...")

        y_pred = result['y_pred']

        # Internal validation
        internal_val = validator.internal_validation(X, y_pred, algo_name)

        # External validation
        external_val = validator.external_validation(y_true, y_pred)

        # Combine validation results
        validation_summary[dataset_name][algo_name] = {
            'internal': internal_val,
            'external': external_val
        }

        # Print summary; either validator may have returned an error dict,
        # in which case its score keys are absent.
        if 'error' not in internal_val:
            print(f"      Silhouette: {internal_val['silhouette_score']:.3f}")
            if 'error' not in external_val:
                print(f"      ARI: {external_val['adjusted_rand_index']:.3f}")

# Stability analysis for selected algorithms
print("\n--- Stability Analysis ---")

stability_results = {}

# Test stability on blob dataset with K-Means
if 'K-Means' in clustering_algorithms:
    print("  Testing K-Means stability on Blob Clusters...")
    kmeans_stability = validator.stability_analysis(
        X_blobs, clustering_algorithms['K-Means'], n_runs=15
    )
    stability_results['K-Means_Blobs'] = kmeans_stability

    if 'error' not in kmeans_stability:
        print(f"    Stability coefficient: {kmeans_stability['stability_coefficient']:.3f}")

# Elbow analysis
print("\n--- Elbow Analysis ---")

elbow_results = {}

# Elbow analysis for K-Means on blob dataset
print("  K-Means elbow analysis on Blob Clusters...")
kmeans_elbow = validator.elbow_analysis(
    X_blobs, KMeans, max_clusters=8, random_state=42
)
elbow_results['K-Means_Blobs'] = kmeans_elbow

print("\n✨ Cluster validation and evaluation complete!")

### 8.1 Cluster Validation Visualization

In [None]:
# Visualize cluster validation results on a 2x2 dashboard:
#   [0,0] silhouette vs ARI scatter   [0,1] metric heatmap
#   [1,0] silhouette-by-k curve       [1,1] stability coefficients
print("📊 Visualizing Cluster Validation Results...")

# Create comprehensive validation visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Collect, in one pass, every (dataset, algorithm) pair for which both the
# internal and the external validation succeeded. The scatter plot and the
# heatmap are both built from exactly this selection.
internal_scores = []
external_scores = []
algorithm_names = []
dataset_names = []

metrics = ['silhouette_score', 'adjusted_rand_index', 'v_measure']
metric_labels = ['Silhouette Score', 'Adjusted Rand Index', 'V-Measure']

heatmap_data = []
row_labels = []

for dataset_name, algorithms in validation_summary.items():
    for algo_name, validation in algorithms.items():
        if 'error' in validation['internal'] or 'error' in validation['external']:
            continue

        silhouette = validation['internal']['silhouette_score']
        ari = validation['external']['adjusted_rand_index']

        internal_scores.append(silhouette)
        external_scores.append(ari)
        algorithm_names.append(algo_name)
        dataset_names.append(dataset_name)

        heatmap_data.append([silhouette, ari, validation['external']['v_measure']])
        row_labels.append(f"{dataset_name}_{algo_name}")

# 1. Internal vs External validation scatter plot
if internal_scores and external_scores:
    # dict.fromkeys de-duplicates while keeping first-seen order, so a dataset
    # gets the same colour and legend position on every run. A plain set()
    # would order the names by their (randomised) string hashes.
    datasets_unique = list(dict.fromkeys(dataset_names))
    colors = plt.cm.Set3(np.linspace(0, 1, len(datasets_unique)))

    for color, dataset in zip(colors, datasets_unique):
        x_vals = [s for s, d in zip(internal_scores, dataset_names) if d == dataset]
        y_vals = [s for s, d in zip(external_scores, dataset_names) if d == dataset]

        axes[0, 0].scatter(x_vals, y_vals, color=color, label=dataset, s=100, alpha=0.7)

    axes[0, 0].set_xlabel('Silhouette Score (Internal)')
    axes[0, 0].set_ylabel('Adjusted Rand Index (External)')
    axes[0, 0].set_title('Internal vs External Validation')
    axes[0, 0].legend()
    axes[0, 0].grid(True, alpha=0.3)

# 2. Validation metrics heatmap
if heatmap_data:
    heatmap_array = np.array(heatmap_data)

    im = axes[0, 1].imshow(heatmap_array, cmap='RdYlBu_r', aspect='auto')
    axes[0, 1].set_xticks(range(len(metric_labels)))
    axes[0, 1].set_xticklabels(metric_labels, rotation=45, ha='right')
    axes[0, 1].set_yticks(range(len(row_labels)))
    axes[0, 1].set_yticklabels([label.replace('_', '\n') for label in row_labels])
    axes[0, 1].set_title('Validation Metrics Heatmap')

    # Add colorbar
    cbar = plt.colorbar(im, ax=axes[0, 1])
    cbar.set_label('Score')

# 3. Elbow curve
if 'K-Means_Blobs' in elbow_results:
    elbow_data = elbow_results['K-Means_Blobs']
    cluster_range = elbow_data['cluster_range']
    silhouette_scores = elbow_data['silhouette_scores']

    # -1 marks a cluster count that could not be scored; leave it off the curve
    valid_scores = [(k, s) for k, s in zip(cluster_range, silhouette_scores) if s > -1]
    if valid_scores:
        k_vals, sil_vals = zip(*valid_scores)
        axes[1, 0].plot(k_vals, sil_vals, 'bo-', linewidth=2, markersize=8)
        axes[1, 0].set_xlabel('Number of Clusters')
        axes[1, 0].set_ylabel('Silhouette Score')
        axes[1, 0].set_title('Elbow Analysis: Silhouette Score')
        axes[1, 0].grid(True, alpha=0.3)

        # Mark optimal point
        best_k = k_vals[np.argmax(sil_vals)]
        best_score = max(sil_vals)
        axes[1, 0].scatter([best_k], [best_score], color='red', s=200, marker='*',
                           label=f'Optimal: k={best_k}')
        axes[1, 0].legend()

# 4. Stability analysis results
if stability_results:
    stability_names = []
    stability_coeffs = []

    for name, result in stability_results.items():
        if 'error' not in result:
            stability_names.append(name.replace('_', '\n'))
            stability_coeffs.append(result['stability_coefficient'])

    if stability_coeffs:
        bars = axes[1, 1].bar(range(len(stability_names)), stability_coeffs,
                              color='lightgreen', alpha=0.7)
        axes[1, 1].set_xlabel('Algorithm_Dataset')
        axes[1, 1].set_ylabel('Stability Coefficient')
        axes[1, 1].set_title('Clustering Stability Analysis')
        axes[1, 1].set_xticks(range(len(stability_names)))
        axes[1, 1].set_xticklabels(stability_names)
        axes[1, 1].grid(True, alpha=0.3)

        # Add value labels
        for bar, coeff in zip(bars, stability_coeffs):
            axes[1, 1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                            f'{coeff:.3f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
save_unsupervised_figure(fig, "cluster_validation_comprehensive",
                         "Comprehensive cluster validation analysis", "validation")
plt.show()

print("\n✨ Cluster validation visualization complete!")

## 9. Unsupervised Feature Learning {#feature-learning}

Explore advanced unsupervised feature learning techniques.

In [None]:
# Announce the start of the unsupervised feature learning section
print("🎓 Unsupervised Feature Learning...")

class UnsupervisedFeatureLearner:
    """Toolkit that learns several unsupervised feature representations.

    Every ``*_learning`` / ``*_analysis`` method fits one transformer, stores
    it in ``self.feature_transformers`` and returns a result dict whose
    ``'features'`` entry holds the transformed data. On failure the method
    prints a message and returns ``None``.
    """

    def __init__(self):
        self.feature_transformers = {}
        self.learned_features = {}

    @staticmethod
    def _mean_abs_offdiag_correlation(features):
        """Return the mean absolute correlation between distinct columns.

        The diagonal of the correlation matrix is always 1 and carries no
        information about independence, so it is excluded. With fewer than
        two columns there are no pairs and 0.0 is returned.
        """
        features = np.asarray(features)
        n_columns = features.shape[1]
        if n_columns < 2:
            return 0.0

        corr = np.corrcoef(features.T)
        off_diagonal = ~np.eye(n_columns, dtype=bool)
        return float(np.mean(np.abs(corr[off_diagonal])))

    def dictionary_learning(self, X, n_components=10, alpha=1.0):
        """Learn a dictionary and the sparse codes of ``X`` over it.

        Args:
            X: Input matrix.
            n_components: Number of dictionary atoms.
            alpha: Sparsity penalty passed to ``DictionaryLearning``.

        Returns:
            dict with ``features`` (sparse codes), ``dictionary``,
            ``sparsity`` (fraction of exactly-zero codes) and
            ``reconstruction_error`` (mean squared error); ``None`` on failure.
        """
        print("  Learning dictionary for sparse coding...")

        try:
            dict_learner = DictionaryLearning(
                n_components=n_components,
                alpha=alpha,
                random_state=42,
                max_iter=100
            )

            sparse_codes = dict_learner.fit_transform(X)
            reconstruction = sparse_codes @ dict_learner.components_

            self.feature_transformers['dictionary'] = dict_learner

            return {
                'features': sparse_codes,
                'dictionary': dict_learner.components_,
                'sparsity': np.mean(sparse_codes == 0),
                'reconstruction_error': np.mean((X - reconstruction) ** 2)
            }

        except Exception as e:
            print(f"    Dictionary learning failed: {e}")
            return None

    def independent_component_analysis(self, X, n_components=10):
        """Extract independent components with FastICA.

        Args:
            X: Input matrix.
            n_components: Number of components to extract.

        Returns:
            dict with ``features``, ``mixing_matrix``, ``components`` and
            ``independence_score`` (mean absolute correlation between
            different components; lower means less linearly related);
            ``None`` on failure.
        """
        print("  Performing Independent Component Analysis...")

        try:
            ica = FastICA(
                n_components=n_components,
                random_state=42,
                max_iter=200
            )

            ica_features = ica.fit_transform(X)

            # Simplified independence proxy: pairwise linear correlation only
            independence_score = self._mean_abs_offdiag_correlation(ica_features)

            self.feature_transformers['ica'] = ica

            return {
                'features': ica_features,
                'mixing_matrix': ica.mixing_,
                'components': ica.components_,
                'independence_score': independence_score
            }

        except Exception as e:
            print(f"    ICA failed: {e}")
            return None

    def factor_analysis(self, X, n_components=10):
        """Fit a Factor Analysis model.

        Args:
            X: Input matrix.
            n_components: Number of latent factors.

        Returns:
            dict with ``features``, ``loadings``, ``noise_variance`` and
            ``log_likelihood`` (the value of ``fa.score(X)``); ``None`` on
            failure.
        """
        print("  Performing Factor Analysis...")

        try:
            fa = FactorAnalysis(
                n_components=n_components,
                random_state=42,
                max_iter=100
            )

            fa_features = fa.fit_transform(X)

            self.feature_transformers['factor_analysis'] = fa

            return {
                'features': fa_features,
                'loadings': fa.components_,
                'noise_variance': fa.noise_variance_,
                'log_likelihood': fa.score(X)
            }

        except Exception as e:
            print(f"    Factor Analysis failed: {e}")
            return None

    def non_negative_matrix_factorization(self, X, n_components=10):
        """Fit NMF on a shifted, non-negative copy of ``X``.

        Args:
            X: Input matrix; it is shifted by its global minimum so that all
                entries are positive before fitting.
            n_components: Number of components.

        Returns:
            dict with ``features``, ``components`` and
            ``reconstruction_error`` (mean squared error against the shifted
            data); ``None`` on failure.
        """
        print("  Performing Non-negative Matrix Factorization...")

        try:
            # Ensure non-negative data
            X_nonneg = X - X.min() + 1e-10

            nmf = NMF(
                n_components=n_components,
                random_state=42,
                max_iter=200
            )

            nmf_features = nmf.fit_transform(X_nonneg)

            reconstruction = nmf_features @ nmf.components_
            reconstruction_error = np.mean((X_nonneg - reconstruction) ** 2)

            self.feature_transformers['nmf'] = nmf

            return {
                'features': nmf_features,
                'components': nmf.components_,
                'reconstruction_error': reconstruction_error
            }

        except Exception as e:
            print(f"    NMF failed: {e}")
            return None

    def autoencoder_features(self, X, encoding_dim=10):
        """Extract autoencoder-like features using PCA.

        Args:
            X: Input matrix.
            encoding_dim: Number of principal components kept.

        Returns:
            dict with ``features``, ``components``,
            ``explained_variance_ratio`` and ``reconstruction_error`` (mean
            squared error of the inverse transform); ``None`` on failure.
        """
        print("  Generating autoencoder-like features with PCA...")

        try:
            # Use PCA as a linear autoencoder
            pca = PCA(n_components=encoding_dim, random_state=42)
            encoded_features = pca.fit_transform(X)

            reconstructed = pca.inverse_transform(encoded_features)
            reconstruction_error = np.mean((X - reconstructed) ** 2)

            self.feature_transformers['autoencoder'] = pca

            return {
                'features': encoded_features,
                'components': pca.components_,
                'explained_variance_ratio': pca.explained_variance_ratio_,
                'reconstruction_error': reconstruction_error
            }

        except Exception as e:
            print(f"    Autoencoder features failed: {e}")
            return None

    def learn_all_features(self, X, n_components=10):
        """Run every feature learner on a standardized copy of ``X``.

        Args:
            X: Input matrix.
            n_components: Number of components requested from each method.

        Returns:
            ``self.learned_features``: maps method key to that method's
            result dict. Methods that failed are left out.
        """
        print(f"Learning features with {n_components} components...")

        # Standardize input data
        X_scaled = StandardScaler().fit_transform(X)

        # Order here fixes the order of the result dictionary
        learners = (
            ('dictionary', self.dictionary_learning),
            ('ica', self.independent_component_analysis),
            ('factor_analysis', self.factor_analysis),
            ('nmf', self.non_negative_matrix_factorization),
            ('autoencoder', self.autoencoder_features),
        )

        for key, learner in learners:
            outcome = learner(X_scaled, n_components)
            if outcome is not None:
                self.learned_features[key] = outcome

        print(f"Successfully learned {len(self.learned_features)} feature representations")

        return self.learned_features

# Exercise the feature learner on the high-dimensional dataset and report
# one short text block per learned representation.
print("\n--- Testing Unsupervised Feature Learning ---")

feature_learner = UnsupervisedFeatureLearner()
learned_features = feature_learner.learn_all_features(X_highdim, n_components=8)

print("\n📊 Feature Learning Results:")
print("=" * 60)

# Method key -> function producing that method's extra report lines
method_specific_lines = {
    'dictionary': lambda r: [
        f"  Sparsity: {r['sparsity']:.3f}",
        f"  Reconstruction error: {r['reconstruction_error']:.3f}",
    ],
    'ica': lambda r: [
        f"  Independence score: {r['independence_score']:.3f}",
    ],
    'factor_analysis': lambda r: [
        f"  Log-likelihood: {r['log_likelihood']:.1f}",
        f"  Mean noise variance: {np.mean(r['noise_variance']):.3f}",
    ],
    'nmf': lambda r: [
        f"  Reconstruction error: {r['reconstruction_error']:.3f}",
    ],
    'autoencoder': lambda r: [
        f"  Explained variance: {np.sum(r['explained_variance_ratio']):.3f}",
        f"  Reconstruction error: {r['reconstruction_error']:.3f}",
    ],
}

for method_name, result in learned_features.items():
    # Metrics common to every method
    report = [
        f"\n{method_name.upper()}:",
        f"  Feature shape: {result['features'].shape}",
        f"  Feature variance: {np.var(result['features']):.3f}",
    ]

    # Method-specific metrics (none for an unrecognised key)
    extra_lines = method_specific_lines.get(method_name)
    if extra_lines is not None:
        report.extend(extra_lines(result))

    for line in report:
        print(line)

print("=" * 60)
print("\n✨ Unsupervised feature learning complete!")

### 9.1 Feature Learning Visualization

In [None]:
# Plot and tabulate the learned feature representations:
#   figure 1 - first two components of each representation
#   figure 2 - variance, reconstruction error, code histogram, ICA components
#   table    - one summary row per method
print("📊 Visualizing Feature Learning Results...")

if learned_features:
    n_methods = len(learned_features)

    # 1. Feature comparison visualization
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.ravel()

    # zip stops at the last available panel, so surplus methods are not drawn
    for panel, (method_name, result) in zip(axes, learned_features.items()):
        features = result['features']

        # A 2-D scatter needs at least two learned components
        if features.shape[1] < 2:
            continue

        pretty_name = method_name.replace("_", " ").title()
        scatter = panel.scatter(features[:, 0], features[:, 1],
                                c=y_highdim, cmap='viridis', alpha=0.7)
        panel.set_title(f'{pretty_name}\nFeatures')
        panel.set_xlabel('Component 1')
        panel.set_ylabel('Component 2')
        panel.grid(True, alpha=0.3)

        plt.colorbar(scatter, ax=panel)

    # Hide unused subplots
    for panel in axes[len(learned_features):]:
        panel.set_visible(False)

    plt.suptitle('Learned Feature Representations', fontsize=16)
    plt.tight_layout()
    save_unsupervised_figure(fig, "feature_learning_representations",
                             "Visualization of learned feature representations", "features")
    plt.show()

    # 2. Feature quality comparison
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # --- top-left: overall variance of each representation ---
    methods = list(learned_features.keys())
    variances = [np.var(res['features']) for res in learned_features.values()]
    method_positions = range(len(methods))

    panel = axes[0, 0]
    bars = panel.bar(method_positions, variances, alpha=0.7, color='lightblue')
    panel.set_xlabel('Method')
    panel.set_ylabel('Feature Variance')
    panel.set_title('Feature Variance by Method')
    panel.set_xticks(method_positions)
    panel.set_xticklabels([m.replace('_', '\n') for m in methods], rotation=45, ha='right')
    panel.grid(True, alpha=0.3)

    # Value labels sit 1% of the tallest bar above each bar
    label_offset = max(variances) * 0.01
    for bar, var in zip(bars, variances):
        panel.text(bar.get_x() + bar.get_width()/2, bar.get_height() + label_offset,
                   f'{var:.3f}', ha='center', va='bottom')

    # --- top-right: reconstruction error, for methods that report one ---
    with_recon = [(method, result['reconstruction_error'])
                  for method, result in learned_features.items()
                  if 'reconstruction_error' in result]
    recon_methods = [method for method, _ in with_recon]
    recon_errors = [error for _, error in with_recon]

    if recon_errors:
        panel = axes[0, 1]
        recon_positions = range(len(recon_methods))
        bars = panel.bar(recon_positions, recon_errors, alpha=0.7, color='lightcoral')
        panel.set_xlabel('Method')
        panel.set_ylabel('Reconstruction Error')
        panel.set_title('Reconstruction Error by Method')
        panel.set_xticks(recon_positions)
        panel.set_xticklabels([m.replace('_', '\n') for m in recon_methods], rotation=45, ha='right')
        panel.grid(True, alpha=0.3)

    # --- bottom-left: distribution of dictionary-learning codes ---
    if 'dictionary' in learned_features:
        dict_features = learned_features['dictionary']['features']
        panel = axes[1, 0]
        panel.hist(dict_features.flatten(), bins=50, alpha=0.7, color='lightgreen')
        panel.set_xlabel('Feature Value')
        panel.set_ylabel('Frequency')
        panel.set_title('Dictionary Learning Feature Distribution')
        panel.grid(True, alpha=0.3)

    # --- bottom-right: weights of the first few ICA components ---
    if 'ica' in learned_features:
        ica_components = learned_features['ica']['components']
        panel = axes[1, 1]

        n_show = min(4, ica_components.shape[0])
        for number, component in enumerate(ica_components[:n_show], start=1):
            panel.plot(component, alpha=0.7, label=f'Component {number}')

        panel.set_xlabel('Feature Index')
        panel.set_ylabel('Component Weight')
        panel.set_title('ICA Components')
        panel.legend()
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    save_unsupervised_figure(fig, "feature_learning_analysis",
                             "Comprehensive analysis of feature learning methods", "features")
    plt.show()

    # 3. Feature learning summary table
    rule = "=" * 80
    print("\n📋 Feature Learning Summary:")
    print(rule)
    print(f"{'Method':<20} {'Components':<12} {'Variance':<10} {'Special Metric':<25}")
    print(rule)

    # Method key -> formatter for the method's headline metric
    special_formatters = {
        'dictionary': lambda r: f"Sparsity: {r['sparsity']:.3f}",
        'ica': lambda r: f"Independence: {r['independence_score']:.3f}",
        'factor_analysis': lambda r: f"Log-likelihood: {r['log_likelihood']:.1f}",
        'nmf': lambda r: f"Recon. Error: {r['reconstruction_error']:.3f}",
        'autoencoder': lambda r: f"Explained Var: {np.sum(r['explained_variance_ratio']):.3f}",
    }

    for method, result in learned_features.items():
        components = result['features'].shape[1]
        variance = np.var(result['features'])

        formatter = special_formatters.get(method)
        special_metric = formatter(result) if formatter is not None else ""

        print(f"{method:<20} {components:<12} {variance:<10.3f} {special_metric:<25}")

    print(rule)

print("\n✨ Feature learning visualization complete!")

## 10. Save All Results {#save-results}

Comprehensive saving of all unsupervised learning results, models, and analysis.

In [None]:
# Comprehensive analysis saving functions
def save_clustering_analysis():
    """Save one comparison figure per 2-D dataset with clustering results.

    Reads the notebook globals ``clustering_results`` and ``test_datasets``.
    If either is missing the function only prints its banner. Datasets that
    are not two-dimensional, or that have no successful clustering run, are
    skipped. Each figure is passed to ``save_unsupervised_figure`` and then
    closed.
    """
    print("💾 Saving clustering analysis...")

    # Both globals are needed; an earlier cell may not have been run
    namespace = globals()
    if 'clustering_results' not in namespace or 'test_datasets' not in namespace:
        return

    for dataset_name, (X, _) in test_datasets.items():
        if X.shape[1] != 2:  # Only visualize 2D datasets
            continue

        # A dataset with no recorded run is treated as having no results
        dataset_results = clustering_results.get(dataset_name, {})
        successful_algos = [algo for algo, result in dataset_results.items()
                            if 'error' not in result]
        if not successful_algos:
            continue

        n_algos = len(successful_algos)
        n_cols = min(3, n_algos)
        n_rows = (n_algos + n_cols - 1) // n_cols

        # squeeze=False always yields a 2-D array of Axes, whatever the
        # grid shape, so a single ravel() covers every case.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
        axes = axes.ravel()

        for ax, algo_name in zip(axes, successful_algos):
            result = dataset_results[algo_name]
            y_pred = result['y_pred']

            scatter = ax.scatter(X[:, 0], X[:, 1], c=y_pred, cmap='viridis', alpha=0.7)

            # Highlight noise points if any
            noise_mask = y_pred == -1
            if np.any(noise_mask):
                ax.scatter(X[noise_mask, 0], X[noise_mask, 1],
                           c='red', marker='x', s=50, label='Noise')
                ax.legend()

            plt.colorbar(scatter, ax=ax)

            # Title with metrics
            sil_score = result.get('silhouette_score', -1)
            ari_score = result.get('adjusted_rand_index', -1)
            n_clusters = result.get('n_clusters', 0)

            title = f"{algo_name}\nClusters: {n_clusters}, ARI: {ari_score:.3f}"
            # -1 (or a stored None) means no silhouette score is available
            if sil_score is not None and sil_score > -1:
                title += f", Sil: {sil_score:.3f}"

            ax.set_title(title)
            ax.set_xlabel('Feature 1')
            ax.set_ylabel('Feature 2')
            ax.grid(True, alpha=0.3)

        # Hide unused subplots
        for ax in axes[n_algos:]:
            ax.set_visible(False)

        plt.suptitle(f'Clustering Results - {dataset_name}', fontsize=16)
        plt.tight_layout()
        save_unsupervised_figure(fig, f"clustering_results_{dataset_name.lower().replace(' ', '_')}",
                                 f"Clustering algorithm comparison on {dataset_name} dataset", "clustering")
        plt.close(fig)

def save_dimensionality_reduction_analysis():
    """Save the final comparison figure of the dimensionality reductions.

    Reads these notebook globals:

    * ``successful_reductions`` -- mapping of method name to reduced array;
      the first two columns of each array are plotted.
    * ``reduction_times`` -- mapping of method name to the time shown in the
      panel title (0 is shown when a method has no entry).
    * ``y_highdim`` -- per-sample values used to colour the points.

    One scatter panel is drawn per method on a grid of at most three columns,
    and the finished figure is handed to ``save_unsupervised_figure``.
    Nothing is drawn when ``successful_reductions`` is missing or empty.
    """
    print("💾 Saving dimensionality reduction analysis...")

    # An empty mapping has to be skipped too: with zero methods n_cols would
    # be 0 and the ceiling division below would raise ZeroDivisionError.
    if 'successful_reductions' not in globals() or not successful_reductions:
        return

    n_methods = len(successful_reductions)
    n_cols = min(3, n_methods)
    n_rows = (n_methods + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always returns a 2D array of Axes, so a single ravel()
    # covers the one-panel, one-row and multi-row layouts alike.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, (method_name, X_reduced) in zip(axes, successful_reductions.items()):
        scatter = ax.scatter(X_reduced[:, 0], X_reduced[:, 1],
                             c=y_highdim, cmap='viridis', alpha=0.7)

        ax.set_title(f'{method_name}\n(Time: {reduction_times.get(method_name, 0):.2f}s)')
        ax.set_xlabel('Component 1')
        ax.set_ylabel('Component 2')
        ax.grid(True, alpha=0.3)

        plt.colorbar(scatter, ax=ax)

    # Hide the grid cells left over when n_methods is not a multiple of n_cols.
    for ax in axes[n_methods:]:
        ax.set_visible(False)

    plt.suptitle('Dimensionality Reduction Results (50D → 2D)', fontsize=16)
    plt.tight_layout()
    save_unsupervised_figure(fig, "dimensionality_reduction_final",
                           "Final dimensionality reduction comparison", "reduction")
    plt.close(fig)

def save_anomaly_detection_analysis():
    """Render and save one confusion-style scatter panel per anomaly detector.

    Reads these notebook globals:

    * ``successful_anomaly_algos`` -- mapping of detector name to a result
      dict with the keys ``'y_pred'``, ``'f1_score'``, ``'precision'`` and
      ``'recall'``.
    * ``y_true_anomaly`` -- ground-truth labels, compared against ``-1``
      (anomaly) and ``1`` (normal).
    * ``X_anomaly`` -- sample matrix; columns 0 and 1 are plotted.

    Each panel splits the samples into true normals, detected anomalies,
    false positives and false negatives. The figure is handed to
    ``save_unsupervised_figure``. Nothing is drawn when
    ``successful_anomaly_algos`` is missing or empty.
    """
    print("💾 Saving anomaly detection analysis...")

    if 'successful_anomaly_algos' not in globals() or not successful_anomaly_algos:
        return

    detector_count = len(successful_anomaly_algos)
    cols = min(2, detector_count)
    rows = (detector_count + cols - 1) // cols  # ceiling division

    # squeeze=False keeps the Axes container 2D for every grid size.
    fig, grid = plt.subplots(rows, cols, figsize=(15, 6*rows), squeeze=False)
    panels = grid.ravel()

    for panel, (algo_name, result) in zip(panels, successful_anomaly_algos.items()):
        predicted = result['y_pred']

        actual_anomaly = y_true_anomaly == -1
        actual_normal = y_true_anomaly == 1
        flagged = predicted == -1
        cleared = predicted == 1

        # (mask, scatter style) pairs in draw order, which is also the
        # legend order: true normal, detected anomaly, false positive,
        # false negative.
        groups = [
            (actual_normal & cleared,
             dict(c='lightblue', alpha=0.6, label='True Normal', s=30)),
            (actual_anomaly & flagged,
             dict(c='red', alpha=0.8, label='True Anomaly (Detected)', s=50, marker='x')),
            (actual_normal & flagged,
             dict(c='orange', alpha=0.8, label='False Positive', s=40, marker='s')),
            (actual_anomaly & cleared,
             dict(c='purple', alpha=0.8, label='False Negative', s=40, marker='^')),
        ]
        for mask, style in groups:
            panel.scatter(X_anomaly[mask, 0], X_anomaly[mask, 1], **style)

        panel.set_title(
            f"{algo_name}\nF1: {result['f1_score']:.3f}, "
            f"Precision: {result['precision']:.3f}, "
            f"Recall: {result['recall']:.3f}"
        )
        panel.set_xlabel('Feature 1')
        panel.set_ylabel('Feature 2')
        panel.legend(fontsize=8)
        panel.grid(True, alpha=0.3)

    # Blank out grid cells that have no detector assigned.
    for spare in panels[detector_count:]:
        spare.set_visible(False)

    plt.suptitle('Final Anomaly Detection Results', fontsize=16)
    plt.tight_layout()
    save_unsupervised_figure(fig, "anomaly_detection_final",
                           "Final anomaly detection results", "anomaly")
    plt.close(fig)

def save_manifold_learning_analysis():
    """Save one embedding-comparison figure per manifold dataset.

    Reads the notebook global ``manifold_results``, a mapping of dataset name
    to ``{algorithm name: result dict}``. Result dicts containing an
    ``'error'`` key are skipped; the rest must provide ``'X_embedded'``
    (first two columns are plotted), ``'color'``, ``'trustworthiness'`` and
    ``'embedding_time'``.

    For every dataset with at least one usable result, a grid of scatter
    panels (at most three columns) is drawn and handed to
    ``save_unsupervised_figure``. Nothing happens when ``manifold_results``
    is not defined.
    """
    print("💾 Saving manifold learning analysis...")

    if 'manifold_results' not in globals():
        return

    for dataset_name, results in manifold_results.items():
        embeddings = [(algo, outcome) for algo, outcome in results.items()
                      if 'error' not in outcome]
        if not embeddings:
            continue

        count = len(embeddings)
        cols = min(3, count)
        rows = (count + cols - 1) // cols  # ceiling division

        # squeeze=False keeps the Axes container 2D for every grid size.
        fig, grid = plt.subplots(rows, cols, figsize=(15, 5*rows), squeeze=False)
        panels = grid.ravel()

        for panel, (algo_name, outcome) in zip(panels, embeddings):
            points = outcome['X_embedded']
            mappable = panel.scatter(points[:, 0], points[:, 1],
                                     c=outcome['color'], cmap='viridis', alpha=0.7)

            panel.set_title(
                f"{algo_name}\nTrustworthiness: {outcome['trustworthiness']:.3f}"
                f"\nTime: {outcome['embedding_time']:.2f}s"
            )
            panel.set_xlabel('Component 1')
            panel.set_ylabel('Component 2')
            panel.grid(True, alpha=0.3)

            plt.colorbar(mappable, ax=panel)

        # Blank out grid cells that have no embedding assigned.
        for spare in panels[count:]:
            spare.set_visible(False)

        slug = dataset_name.lower().replace(' ', '_')
        plt.suptitle(f'Final Manifold Learning Results - {dataset_name}', fontsize=16)
        plt.tight_layout()
        save_unsupervised_figure(fig, f"manifold_learning_final_{slug}",
                               f"Final manifold learning results for {dataset_name}", "manifold")
        plt.close(fig)

def save_feature_learning_analysis():
    """Save the final figure of the learned feature representations.

    Reads these notebook globals:

    * ``learned_features`` -- mapping of method name to a result dict whose
      ``'features'`` entry is a 2D array; its first two columns are plotted.
    * ``y_highdim`` -- per-sample values used to colour the points.

    Panels are laid out three per row. The grid has at least two rows (the
    original fixed 2x3 layout) and grows by whole rows when more than six
    methods are present, so no method is silently dropped. A method whose
    features have fewer than two columns cannot be drawn as a 2D scatter;
    its grid cell is hidden instead of being left as an empty frame.

    The figure is handed to ``save_unsupervised_figure``. Nothing is drawn
    when ``learned_features`` is missing or empty.
    """
    print("💾 Saving feature learning analysis...")

    if 'learned_features' not in globals() or not learned_features:
        return

    n_methods = len(learned_features)
    n_cols = 3
    # Never fewer than two rows, so up to six methods render exactly as on
    # the original 2x3 / (18, 12) canvas.
    n_rows = max(2, (n_methods + n_cols - 1) // n_cols)

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6*n_rows))
    axes = axes.ravel()

    for ax, (method_name, result) in zip(axes, learned_features.items()):
        features = result['features']

        if features.shape[1] < 2:
            # Not enough components for a 2D scatter.
            ax.set_visible(False)
            continue

        scatter = ax.scatter(features[:, 0], features[:, 1],
                             c=y_highdim, cmap='viridis', alpha=0.7)
        ax.set_title(f'{method_name.replace("_", " ").title()}\nFeatures')
        ax.set_xlabel('Component 1')
        ax.set_ylabel('Component 2')
        ax.grid(True, alpha=0.3)

        plt.colorbar(scatter, ax=ax)

    # Hide the grid cells that have no method assigned.
    for ax in axes[n_methods:]:
        ax.set_visible(False)

    plt.suptitle('Final Learned Feature Representations', fontsize=16)
    plt.tight_layout()
    save_unsupervised_figure(fig, "feature_learning_final",
                           "Final learned feature representations", "features")
    plt.close(fig)

def _base_model_metadata(name, estimator, task_type):
    """Return the metadata fields shared by every saved unsupervised model."""
    return {
        'algorithm': name,
        'task_type': task_type,
        'model_class': estimator.__class__.__name__,
        # Objects without get_params() are recorded with no parameters.
        'parameters': estimator.get_params() if hasattr(estimator, 'get_params') else {},
    }


def save_all_unsupervised_models():
    """Save all trained models from unsupervised learning analysis.

    Walks the model registries kept as notebook globals and passes each
    model, together with a metadata dict, to the matching save helper.
    A registry that is not defined is skipped.

    * ``clustering_algorithms`` -> ``save_clustering_model``; per-dataset
      entries from ``clustering_results`` are attached when available.
    * ``reduction_algorithms`` -> ``save_reduction_model``; timing and output
      shape from ``reduction_times`` / ``reduction_results`` are attached.
    * ``anomaly_algorithms`` -> ``save_clustering_model`` with an
      ``anomaly_`` name prefix; ``None`` entries are skipped.
    * ``manifold_algorithms`` -> ``save_reduction_model`` with a
      ``manifold_`` name prefix; trustworthiness per dataset is attached.
    * ``feature_learner.feature_transformers`` ->
      ``save_feature_learning_model``; statistics from ``learned_features``
      are attached.

    Result entries containing an ``'error'`` key are never attached.
    """
    print("💾 Saving all unsupervised models...")

    # Save clustering models
    if 'clustering_algorithms' in globals():
        for name, algorithm in clustering_algorithms.items():
            model_metadata = _base_model_metadata(name, algorithm, 'clustering')

            # Add performance metrics if available
            if 'clustering_results' in globals():
                model_metadata['performance_summary'] = {
                    dataset_name: results[name]
                    for dataset_name, results in clustering_results.items()
                    if name in results and 'error' not in results[name]
                }

            save_clustering_model(algorithm, name.lower().replace(' ', '_'),
                                f"Clustering model: {name}", model_metadata)

    # Save dimensionality reduction models
    if 'reduction_algorithms' in globals():
        for name, algorithm in reduction_algorithms.items():
            model_metadata = _base_model_metadata(name, algorithm, 'dimensionality_reduction')

            # Add performance metrics if available
            if ('reduction_results' in globals()
                    and name in reduction_results
                    and reduction_results[name] is not None):
                # The timing dict may be absent even when results exist;
                # fall back to the same 'unknown' marker used for a missing
                # entry instead of raising NameError.
                timings = globals().get('reduction_times', {})
                model_metadata['reduction_time'] = timings.get(name, 'unknown')
                model_metadata['output_shape'] = reduction_results[name].shape

            save_reduction_model(algorithm, name.lower().replace(' ', '_'),
                               f"Dimensionality reduction model: {name}", model_metadata)

    # Save anomaly detection models
    if 'anomaly_algorithms' in globals():
        for name, algorithm in anomaly_algorithms.items():
            if algorithm is None:
                continue

            model_metadata = _base_model_metadata(name, algorithm, 'anomaly_detection')

            # Add performance metrics if available
            if ('anomaly_results' in globals()
                    and name in anomaly_results
                    and 'error' not in anomaly_results[name]):
                model_metadata['performance_metrics'] = anomaly_results[name]

            # NOTE(review): anomaly detectors go through the clustering save
            # helper, exactly as before -- confirm this is intentional.
            save_clustering_model(algorithm, f"anomaly_{name.lower().replace(' ', '_')}",
                                f"Anomaly detection model: {name}", model_metadata)

    # Save manifold learning models
    if 'manifold_algorithms' in globals():
        for name, algorithm in manifold_algorithms.items():
            model_metadata = _base_model_metadata(name, algorithm, 'manifold_learning')

            # Add trustworthiness scores if available
            if 'manifold_results' in globals():
                model_metadata['trustworthiness_summary'] = {
                    dataset_name: results[name].get('trustworthiness', 'unknown')
                    for dataset_name, results in manifold_results.items()
                    if name in results and 'error' not in results[name]
                }

            save_reduction_model(algorithm, f"manifold_{name.lower().replace(' ', '_')}",
                               f"Manifold learning model: {name}", model_metadata)

    # Save feature learning models
    if 'feature_learner' in globals() and hasattr(feature_learner, 'feature_transformers'):
        # The single extra metric recorded for each feature-learning method.
        method_specific_metric = {
            'dictionary': 'sparsity',
            'ica': 'independence_score',
            'factor_analysis': 'log_likelihood',
        }

        for name, transformer in feature_learner.feature_transformers.items():
            model_metadata = _base_model_metadata(name, transformer, 'feature_learning')

            # Add feature learning metrics if available
            if 'learned_features' in globals() and name in learned_features:
                feature_info = learned_features[name]
                model_metadata['output_dimensions'] = feature_info['features'].shape[1]
                model_metadata['feature_variance'] = np.var(feature_info['features'])

                # Method-specific metrics
                metric_key = method_specific_metric.get(name)
                if metric_key is not None and metric_key in feature_info:
                    model_metadata[metric_key] = feature_info[metric_key]

            save_feature_learning_model(transformer, name,
                                      f"Feature learning model: {name}", model_metadata)

def generate_unsupervised_learning_report():
    """Generate comprehensive unsupervised learning analysis report."""
    print("📄 Generating comprehensive report...")
    
    report_content = f"""
# Sklearn-Mastery Unsupervised Learning Report
Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Executive Summary

This report summarizes the comprehensive unsupervised learning analysis performed
in the sklearn-mastery project, including advanced clustering algorithms, 
dimensionality reduction techniques, anomaly detection, manifold learning,
and unsupervised feature learning methods.

## Clustering Analysis

### Datasets Tested

"""
    
    if 'test_datasets' in globals():
        report_content += f"Total clustering datasets: {len(test_datasets)}\n\n"
        for dataset_name, (X, y) in test_datasets.items():
            report_content += f"""
**{dataset_name}**
- Shape: {X.shape}
- Features: {X.shape[1]}
- True Clusters: {len(np.unique(y))}
- Characteristics: {"2D visualization available" if X.shape[1] == 2 else "High-dimensional dataset"}
"""
    
    if 'clustering_results' in globals():
        report_content += "\n### Clustering Performance Summary\n"
        
        # Find best performing algorithm per dataset
        for dataset_name in clustering_results.keys():
            successful_results = {k: v for k, v in clustering_results[dataset_name].items() 
                                 if 'error' not in v}
            if successful_results:
                best_algo = max(successful_results.keys(), 
                               key=lambda x: successful_results[x].get('adjusted_rand_index', -1))
                best_ari = successful_results[best_algo].get('adjusted_rand_index', 'N/A')
                best_sil = successful_results[best_algo].get('silhouette_score', 'N/A')
                
                report_content += f"""
**{dataset_name}**
- Best Algorithm: {best_algo}
- Adjusted Rand Index: {best_ari:.3f if isinstance(best_ari, float) else best_ari}
- Silhouette Score: {best_sil:.3f if isinstance(best_sil, float) else best_sil}
"""
    
    report_content += "\n## Dimensionality Reduction Analysis\n"
    
    if 'reduction_results' in globals():
        successful_reductions = {k: v for k, v in reduction_results.items() if v is not None}
        report_content += f"Successful reduction methods: {len(successful_reductions)}\n\n"
        
        for method_name, result in successful_reductions.items():
            time_taken = reduction_times.get(method_name, 'unknown')
            report_content += f"""
**{method_name}**
- Output Shape: {result.shape}
- Processing Time: {time_taken:.3f}s
- Variance Preserved: {np.var(result):.3f}
"""
    
    report_content += "\n## Anomaly Detection Analysis\n"
    
    if 'anomaly_results' in globals():
        successful_anomaly = {k: v for k, v in anomaly_results.items() if 'error' not in v}
        report_content += f"Successful anomaly detection methods: {len(successful_anomaly)}\n\n"
        
        if successful_anomaly:
            best_anomaly_algo = max(successful_anomaly.keys(), 
                                   key=lambda x: successful_anomaly[x]['f1_score'])
            best_f1 = successful_anomaly[best_anomaly_algo]['f1_score']
            
            report_content += f"""
### Best Performing Anomaly Detector
- Algorithm: {best_anomaly_algo}
- F1-Score: {best_f1:.3f}
- Precision: {successful_anomaly[best_anomaly_algo]['precision']:.3f}
- Recall: {successful_anomaly[best_anomaly_algo]['recall']:.3f}

### All Methods Performance
"""
            for algo_name, metrics in successful_anomaly.items():
                report_content += f"""
**{algo_name}**
- F1-Score: {metrics['f1_score']:.3f}
- Precision: {metrics['precision']:.3f}
- Recall: {metrics['recall']:.3f}
- Accuracy: {metrics['accuracy']:.3f}
"""
    
    report_content += "\n## Manifold Learning Analysis\n"
    
    if 'manifold_results' in globals():
        report_content += f"Manifold datasets tested: {len(manifold_datasets)}\n\n"
        
        # Calculate average trustworthiness per algorithm
        algo_trustworthiness = {}
        for dataset_name, results in manifold_results.items():
            for algo_name, result in results.items():
                if 'error' not in result:
                    if algo_name not in algo_trustworthiness:
                        algo_trustworthiness[algo_name] = []
                    algo_trustworthiness[algo_name].append(result['trustworthiness'])
        
        if algo_trustworthiness:
            report_content += "### Manifold Learning Performance (Average Trustworthiness)\n"
            for algo_name, scores in algo_trustworthiness.items():
                avg_trust = np.mean(scores)
                report_content += f"- **{algo_name}**: {avg_trust:.3f}\n"