In [None]:
# Essential imports for advanced ML implementations
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.datasets import make_classification, make_blobs, load_digits, fetch_20newsgroups
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.feature_extraction.text import TfidfVectorizer
from umap import UMAP  # UMAP lives in the umap-learn package; sklearn.manifold has no UMAP

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, Dataset
import torchvision
import torchvision.transforms as transforms

import gym
import stable_baselines3 as sb3
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.vec_env import DummyVecEnv

warnings.filterwarnings('ignore')

# Fix the NumPy and PyTorch global seeds so stochastic cells repeat on a fresh run
torch.manual_seed(42)
np.random.seed(42)

# Report the environment the notebook is running in
print(
    "All libraries imported successfully!",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)


In [None]:
### 1.1 Advanced Stacked Ensemble with Meta-Learning

# Synthetic classification dataset: 10,000 samples, 20 features
# (15 informative + 5 redundant), two clusters per class
X, y = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    n_clusters_per_class=2,
    class_sep=0.8,
    random_state=42,
)

# Stratified 80/20 hold-out split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardise features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

class AdvancedStackedEnsemble(ClassifierMixin, BaseEstimator):
    """Two-level stacked ensemble for binary classification.

    Level 1 holds four base learners (random forest, gradient boosting,
    RBF SVM, logistic regression). Level 2 is a gradient-boosting
    meta-learner trained on the base learners' out-of-fold positive-class
    probabilities.

    The class derives from scikit-learn's ``ClassifierMixin`` and
    ``BaseEstimator`` so that it can be cloned and scored by utilities such
    as ``cross_val_score`` (which need ``get_params`` and ``score``).

    Only column 1 of each base learner's ``predict_proba`` output is used as
    a meta-feature, so the ensemble is limited to two-class problems.
    """

    def __init__(self):
        # Level 1 base learners (diverse algorithms)
        self.base_learners = {
            'rf': RandomForestClassifier(n_estimators=200, max_depth=10,
                                         min_samples_split=5, random_state=42),
            'gb': GradientBoostingClassifier(n_estimators=100, max_depth=6,
                                             learning_rate=0.1, random_state=42),
            'svm': SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42),
            'lr': LogisticRegression(C=1.0, max_iter=1000, random_state=42)
        }

        # Level 2 meta-learner
        self.meta_learner = GradientBoostingClassifier(n_estimators=50,
                                                       max_depth=3,
                                                       learning_rate=0.05,
                                                       random_state=42)
        # Kept for backward compatibility; not populated by this class.
        self.base_predictions = {}

    def _base_probabilities(self, X):
        """Return an (n_samples, n_base_learners) array of positive-class probabilities."""
        X = np.asarray(X)
        return np.column_stack([
            model.predict_proba(X)[:, 1] for model in self.base_learners.values()
        ])

    def fit(self, X, y):
        """Fit the base learners and the meta-learner.

        Args:
            X: Feature matrix, indexable by integer arrays after ``np.asarray``.
            y: Binary target vector aligned with ``X``.

        Returns:
            self, following the scikit-learn convention.
        """
        # Positional indexing below needs plain arrays (a pandas Series would
        # index by label instead of position).
        X = np.asarray(X)
        y = np.asarray(y)

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        meta_features = np.zeros((X.shape[0], len(self.base_learners)))

        for i, model in enumerate(self.base_learners.values()):
            # Out-of-fold predictions: every row's meta-feature comes from a
            # model that never saw that row during training.
            for train_idx, val_idx in cv.split(X, y):
                fold_model = clone(model)
                fold_model.fit(X[train_idx], y[train_idx])
                meta_features[val_idx, i] = fold_model.predict_proba(X[val_idx])[:, 1]

            # The model used at prediction time is fitted on the full training data.
            model.fit(X, y)

        # Train meta-learner on meta-features
        self.meta_learner.fit(meta_features, y)
        self.classes_ = self.meta_learner.classes_
        return self

    def predict(self, X):
        """Predict class labels by feeding base-learner probabilities to the meta-learner."""
        return self.meta_learner.predict(self._base_probabilities(X))

    def predict_proba(self, X):
        """Predict class probabilities by feeding base-learner probabilities to the meta-learner."""
        return self.meta_learner.predict_proba(self._base_probabilities(X))

# Fit the two-level ensemble on the scaled training split
stacked_ensemble = AdvancedStackedEnsemble()
stacked_ensemble.fit(X_train_scaled, y_train)

# Hold-out outputs: class probabilities and hard labels
y_proba_stacked = stacked_ensemble.predict_proba(X_test_scaled)
y_pred_stacked = stacked_ensemble.predict(X_test_scaled)

# Report hold-out accuracy, the mean 5-fold cross-validation score on the
# training split, and the per-class report
print("=== Advanced Stacked Ensemble Results ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_stacked):.4f}")
print(f"Cross-validation score: {cross_val_score(stacked_ensemble, X_train_scaled, y_train, cv=5).mean():.4f}")
print("\nClassification Report:", classification_report(y_test, y_pred_stacked), sep="\n")


In [None]:
### 1.2 Advanced Deep Neural Network with Residual Connections

class ResidualBlock(nn.Module):
    """Fully connected residual block.

    The main branch is Linear -> BatchNorm -> ReLU -> Dropout -> Linear ->
    BatchNorm. Its output is added to a shortcut branch and passed through a
    final ReLU. The shortcut is a Linear projection when the input and output
    widths differ, and an identity mapping when they match.
    """

    def __init__(self, in_features, out_features, dropout_rate=0.3):
        super().__init__()

        # Main branch. Sub-modules are created in a fixed order because that
        # order determines parameter ordering and how the init RNG is consumed.
        self.fc1 = nn.Linear(in_features, out_features)
        self.bn1 = nn.BatchNorm1d(out_features)
        self.fc2 = nn.Linear(out_features, out_features)
        self.bn2 = nn.BatchNorm1d(out_features)
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU(inplace=True)

        # Shortcut branch
        if in_features == out_features:
            self.skip_connection = nn.Identity()
        else:
            self.skip_connection = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Apply the block to ``x`` and return the activated sum of both branches."""
        shortcut = self.skip_connection(x)

        hidden = self.dropout(self.relu(self.bn1(self.fc1(x))))
        hidden = self.bn2(self.fc2(hidden))

        # In-place add of the shortcut, then the final activation
        hidden += shortcut
        return self.relu(hidden)

class AdvancedResNet(nn.Module):
    """MLP classifier built from an input stem, residual blocks and a linear head.

    Args:
        input_dim: Number of input features.
        num_classes: Number of output logits.
        hidden_dims: Width of each residual block, in order. The stem projects
            the input to ``hidden_dims[0]``.
        dropout_rate: Dropout probability used in the stem and in every block.

    Raises:
        ValueError: If ``hidden_dims`` is empty.
    """

    def __init__(self, input_dim, num_classes, hidden_dims=(512, 256, 128), dropout_rate=0.3):
        super().__init__()

        hidden_dims = list(hidden_dims)
        if not hidden_dims:
            raise ValueError("hidden_dims must contain at least one layer width")

        # Input stem: input_dim -> hidden_dims[0]
        layers = [
            nn.Linear(input_dim, hidden_dims[0]),
            nn.BatchNorm1d(hidden_dims[0]),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout_rate),
        ]

        # The stem already outputs hidden_dims[0] features, so the first
        # residual block must accept that width (not input_dim).
        prev_dim = hidden_dims[0]

        # Residual blocks
        for dim in hidden_dims:
            layers.append(ResidualBlock(prev_dim, dim, dropout_rate))
            prev_dim = dim

        # Output layer
        layers.append(nn.Linear(prev_dim, num_classes))

        self.network = nn.Sequential(*layers)

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Kaiming-normal init for Linear weights; BatchNorm set to scale 1 / shift 0."""
        if isinstance(module, nn.Linear):
            nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
        elif isinstance(module, nn.BatchNorm1d):
            nn.init.constant_(module.weight, 1)
            nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return the logits produced by the sequential network for ``x``."""
        return self.network(x)

# Advanced training loop with learning rate scheduling and early stopping
class AdvancedTrainer:
    """Train a classifier with AdamW, reduce-on-plateau LR scheduling and early stopping.

    Attributes:
        model: The network being trained (moved to ``device`` on construction).
        device: Device that each batch is moved to before the forward pass.
        best_val_loss: Lowest validation loss seen during the latest ``train`` call.
        patience_counter: Consecutive epochs without a validation-loss improvement.
    """

    def __init__(self, model, device='cpu'):
        self.model = model.to(device)
        self.device = device
        self.best_val_loss = float('inf')
        self.patience_counter = 0

    def _run_epoch(self, loader, criterion, optimizer=None):
        """Run one pass over ``loader``; trains when ``optimizer`` is given, otherwise evaluates.

        Returns:
            Tuple ``(mean per-sample loss, accuracy in percent)``.

        Raises:
            ValueError: If ``loader`` yields no samples.
        """
        training = optimizer is not None
        self.model.train(training)

        loss_sum, n_correct, n_seen = 0.0, 0, 0
        with torch.set_grad_enabled(training):
            for batch_x, batch_y in loader:
                batch_x, batch_y = batch_x.to(self.device), batch_y.to(self.device)

                if training:
                    optimizer.zero_grad()

                outputs = self.model(batch_x)
                loss = criterion(outputs, batch_y)

                if training:
                    loss.backward()
                    # Gradient clipping
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                    optimizer.step()

                # The criterion returns a batch mean; weight it by the batch
                # size so a short final batch does not skew the epoch average.
                batch_size = batch_y.size(0)
                loss_sum += loss.item() * batch_size
                n_correct += (outputs.argmax(dim=1) == batch_y).sum().item()
                n_seen += batch_size

        if n_seen == 0:
            raise ValueError("data loader produced no samples")

        return loss_sum / n_seen, 100 * n_correct / n_seen

    def train(self, train_loader, val_loader, epochs=100, lr=0.001, patience=10,
              checkpoint_path='best_model.pth'):
        """Train the model and restore the weights with the lowest validation loss.

        Args:
            train_loader: Iterable of ``(inputs, targets)`` training batches.
            val_loader: Iterable of ``(inputs, targets)`` validation batches.
            epochs: Maximum number of epochs.
            lr: Initial AdamW learning rate.
            patience: Stop after this many consecutive epochs without a
                validation-loss improvement.
            checkpoint_path: Where the best ``state_dict`` is saved on every
                improvement. Pass ``None`` to skip writing to disk.

        Returns:
            Dict with per-epoch lists under 'train_losses', 'val_losses',
            'train_accs' and 'val_accs'.
        """
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(self.model.parameters(), lr=lr, weight_decay=0.01)
        # `verbose` is deprecated in recent PyTorch, so learning-rate drops
        # are reported manually below instead.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                        factor=0.5, patience=5)

        # Each call starts a fresh early-stopping run.
        self.best_val_loss = float('inf')
        self.patience_counter = 0
        best_state = None

        history = {'train_losses': [], 'val_losses': [], 'train_accs': [], 'val_accs': []}

        for epoch in range(epochs):
            avg_train_loss, train_acc = self._run_epoch(train_loader, criterion, optimizer)
            avg_val_loss, val_acc = self._run_epoch(val_loader, criterion)

            history['train_losses'].append(avg_train_loss)
            history['val_losses'].append(avg_val_loss)
            history['train_accs'].append(train_acc)
            history['val_accs'].append(val_acc)

            # Learning rate scheduling
            lr_before = optimizer.param_groups[0]['lr']
            scheduler.step(avg_val_loss)
            lr_after = optimizer.param_groups[0]['lr']
            if lr_after < lr_before:
                print(f'Epoch {epoch+1}: reducing learning rate to {lr_after:.4e}')

            # Early stopping
            if avg_val_loss < self.best_val_loss:
                self.best_val_loss = avg_val_loss
                self.patience_counter = 0
                # Snapshot the best weights in memory so the final restore
                # never depends on a file left behind by an earlier run.
                best_state = {name: tensor.detach().clone()
                              for name, tensor in self.model.state_dict().items()}
                if checkpoint_path is not None:
                    torch.save(best_state, checkpoint_path)
            else:
                self.patience_counter += 1

            if epoch % 10 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.4f}, '
                      f'Val Loss: {avg_val_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%')

            if self.patience_counter >= patience:
                print(f'Early stopping at epoch {epoch+1}')
                break

        # Load best model (absent only when no epoch improved on +inf, e.g. epochs=0)
        if best_state is not None:
            self.model.load_state_dict(best_state)

        return history

# Pick the compute device: GPU when CUDA is available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Wrap the scaled NumPy arrays as float features / integer class targets
X_train_tensor, y_train_tensor = torch.FloatTensor(X_train_scaled), torch.LongTensor(y_train)
X_test_tensor, y_test_tensor = torch.FloatTensor(X_test_scaled), torch.LongTensor(y_test)

# Pair features with targets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Hold out 20% of the training set for validation
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_subset, val_subset = torch.utils.data.random_split(
    train_dataset, [train_size, val_size]
)

# Only the training loader shuffles; validation and test keep dataset order
train_loader = DataLoader(train_subset, batch_size=256, shuffle=True)
val_loader, test_loader = (
    DataLoader(dataset, batch_size=256, shuffle=False)
    for dataset in (val_subset, test_dataset)
)

# Build the network for the feature width of the scaled data and train it
resnet_model = AdvancedResNet(input_dim=X_train_scaled.shape[1], num_classes=2)
trainer = AdvancedTrainer(resnet_model, device)

print("=== Training Advanced ResNet ===")
history = trainer.train(train_loader, val_loader, epochs=50, lr=0.001, patience=10)

# Score the trained network on the held-out test set
resnet_model.eval()
test_predictions, test_probas = [], []
test_correct = test_total = 0

with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        outputs = resnet_model(batch_x)
        probas = outputs.softmax(dim=1)
        # Predicted class = index of the largest logit in each row
        predicted = torch.max(outputs, dim=1).indices

        test_correct += (predicted == batch_y).sum().item()
        test_total += batch_y.size(0)
        test_probas.extend(probas.cpu().numpy())
        test_predictions.extend(predicted.cpu().numpy())

resnet_accuracy = 100 * test_correct / test_total
print(f"\n=== Advanced ResNet Results ===")
print(f"Test Accuracy: {resnet_accuracy:.2f}%")
print(f"Classification Report:")
print(classification_report(y_test, test_predictions))

# Three-panel summary: loss curves, accuracy curves, test confusion matrix
plt.figure(figsize=(15, 5))

# Panels 1-2 share the same layout, so they are described as data:
# (subplot index, title, y-axis label, [(history key, legend label), ...])
curve_panels = (
    (1, 'Loss Curves', 'Loss',
     (('train_losses', 'Train Loss'), ('val_losses', 'Val Loss'))),
    (2, 'Accuracy Curves', 'Accuracy (%)',
     (('train_accs', 'Train Accuracy'), ('val_accs', 'Val Accuracy'))),
)
for panel_index, panel_title, y_axis_label, series in curve_panels:
    plt.subplot(1, 3, panel_index)
    for history_key, legend_label in series:
        plt.plot(history[history_key], label=legend_label)
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_axis_label)
    plt.legend()

# Panel 3: confusion matrix of the test-set predictions
plt.subplot(1, 3, 3)
cm = confusion_matrix(y_test, test_predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - ResNet')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

plt.tight_layout()
plt.show()


In [None]:
### 2.1 Variational Autoencoder (VAE) for Generative Modeling

class VAE(nn.Module):
    """Fully connected variational autoencoder.

    The encoder maps an input through two ReLU hidden layers to the mean and
    log-variance of a diagonal Gaussian over the latent space. The decoder
    maps a latent vector through two ReLU hidden layers and a sigmoid, so
    reconstructions lie in [0, 1].
    """

    def __init__(self, input_dim, hidden_dim=400, latent_dim=20):
        super().__init__()

        # Encoder layers (creation order is kept: it fixes parameter order
        # and how the init RNG is consumed)
        self.encoder_input = nn.Linear(input_dim, hidden_dim)
        self.encoder_hidden = nn.Linear(hidden_dim, hidden_dim)
        self.mu_layer = nn.Linear(hidden_dim, latent_dim)
        self.logvar_layer = nn.Linear(hidden_dim, latent_dim)

        # Decoder layers
        self.decoder_input = nn.Linear(latent_dim, hidden_dim)
        self.decoder_hidden = nn.Linear(hidden_dim, hidden_dim)
        self.decoder_output = nn.Linear(hidden_dim, input_dim)

        # Shared activations
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def encode(self, x):
        """Return ``(mu, logvar)`` of the approximate posterior for ``x``."""
        hidden = self.relu(self.encoder_input(x))
        hidden = self.relu(self.encoder_hidden(hidden))
        return self.mu_layer(hidden), self.logvar_layer(hidden)

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * std`` with ``eps ~ N(0, I)`` and ``std = exp(logvar / 2)``."""
        std = (0.5 * logvar).exp()
        noise = torch.randn_like(std)
        return mu + noise * std

    def decode(self, z):
        """Map latent vectors ``z`` to reconstructions in [0, 1]."""
        hidden = self.relu(self.decoder_input(z))
        hidden = self.relu(self.decoder_hidden(hidden))
        logits = self.decoder_output(hidden)
        return self.sigmoid(logits)

    def forward(self, x):
        """Encode ``x``, sample a latent vector, decode it; return ``(recon_x, mu, logvar)``."""
        mu, logvar = self.encode(x)
        recon_x = self.decode(self.reparameterize(mu, logvar))
        return recon_x, mu, logvar

def vae_loss_function(recon_x, x, mu, logvar, beta=1.0):
    """Return ``(total, reconstruction, kl)`` for a VAE batch.

    ``reconstruction`` is the summed binary cross-entropy between ``recon_x``
    and ``x``; ``kl`` is the summed KL divergence between the diagonal
    Gaussian described by ``(mu, logvar)`` and a standard normal; ``total``
    is ``reconstruction + beta * kl``.
    """
    reconstruction = F.binary_cross_entropy(recon_x, x, reduction='sum')
    kl_divergence = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()).sum()
    total = reconstruction + beta * kl_divergence
    return total, reconstruction, kl_divergence

# Synthetic data for the unsupervised experiments
np.random.seed(42)
n_samples = 5000
n_features = 50

# Three unit-covariance Gaussian blobs whose means differ only in the first
# two coordinates; each blob gets n_samples // 3 points. The generator is
# consumed in order, so the blobs are drawn as data1, data2, data3.
data1, data2, data3 = (
    np.random.multivariate_normal(
        blob_center + [0] * (n_features - 2), np.eye(n_features), n_samples // 3
    )
    for blob_center in ([2, 2], [-2, -2], [0, 3])
)

unsupervised_data = np.vstack([data1, data2, data3])

# Rescale every feature to [0, 1], matching the VAE's sigmoid output range
from sklearn.preprocessing import MinMaxScaler
scaler_unsup = MinMaxScaler()
unsupervised_data_scaled = scaler_unsup.fit(unsupervised_data).transform(unsupervised_data)

# Autoencoder-style dataset: each sample serves as both input and target
X_unsup_tensor = torch.FloatTensor(unsupervised_data_scaled)
unsup_dataset = TensorDataset(X_unsup_tensor, X_unsup_tensor)
unsup_loader = DataLoader(unsup_dataset, batch_size=128, shuffle=True)

# Build the VAE (10-dimensional latent space) and its Adam optimiser
vae_model = VAE(input_dim=n_features, hidden_dim=400, latent_dim=10)
vae_optimizer = optim.Adam(vae_model.parameters(), lr=0.001)

print("=== Training Variational Autoencoder ===")
vae_model.train()
train_losses = []

for epoch in range(100):
    # Running sums of the (sum-reduced) batch losses for this epoch
    total_loss = total_bce = total_kld = 0

    for batch_data, _ in unsup_loader:
        vae_optimizer.zero_grad()

        recon_batch, mu, logvar = vae_model(batch_data)
        loss, bce, kld = vae_loss_function(recon_batch, batch_data, mu, logvar)

        loss.backward()
        vae_optimizer.step()

        total_loss += loss.item()
        total_bce += bce.item()
        total_kld += kld.item()

    # Average the per-batch totals over the number of batches
    avg_loss, avg_bce, avg_kld = (
        epoch_total / len(unsup_loader)
        for epoch_total in (total_loss, total_bce, total_kld)
    )

    train_losses.append(avg_loss)

    # Log every 20th epoch
    if epoch % 20 == 0:
        print(f'Epoch [{epoch+1}/100], Loss: {avg_loss:.4f}, BCE: {avg_bce:.4f}, KLD: {avg_kld:.4f}')

# Use the trained VAE as a generator and as an encoder
vae_model.eval()
with torch.no_grad():
    # Decode 100 draws from a standard normal over the 10-D latent space
    z_sample = torch.randn(100, 10)
    generated_samples = vae_model.decode(z_sample).numpy()

    # Posterior means of the original data serve as its latent representation
    mu, logvar = vae_model.encode(X_unsup_tensor)
    latent_representations = mu.numpy()

print(f"Generated {generated_samples.shape[0]} new samples")
print(f"Latent representation shape: {latent_representations.shape}")

# Three-panel VAE summary: training loss, latent scatter, reconstruction check
plt.figure(figsize=(15, 5))

# Panel 1: average training loss per epoch
plt.subplot(1, 3, 1)
plt.plot(train_losses)
plt.title('VAE Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')

# Panel 2: first two latent dimensions of the encoded data
plt.subplot(1, 3, 2)
plt.scatter(
    latent_representations[:, 0],
    latent_representations[:, 1],
    alpha=0.6,
    s=20,
)
plt.title('Latent Space Representation (First 2D)')
plt.xlabel('Latent Dimension 1')
plt.ylabel('Latent Dimension 2')

# Panel 3: original vs. reconstructed value of the first feature (first 500 samples)
plt.subplot(1, 3, 3)
with torch.no_grad():
    reconstructed = vae_model(X_unsup_tensor[:500])[0].numpy()

plt.scatter(unsupervised_data_scaled[:500, 0], reconstructed[:500, 0], alpha=0.6)
# Dashed diagonal marks a perfect reconstruction
plt.plot([0, 1], [0, 1], 'r--', lw=2)
plt.title('Original vs Reconstructed (Feature 1)')
plt.xlabel('Original')
plt.ylabel('Reconstructed')

plt.tight_layout()
plt.show()


In [None]:
### 2.2 Advanced Clustering and Dimensionality Reduction

from sklearn.cluster import DBSCAN, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.neighbors import kneighbors_graph
import umap

class AdvancedClusteringPipeline:
    """Run several clustering and embedding methods on one dataset.

    Each method stores its output in ``self.results`` under the key
    'spectral', 'gmm', 'dbscan' or 'dimensionality_reduction'.
    """

    def __init__(self, data):
        self.data = data
        self.results = {}

    def _silhouette_or_nan(self, labels):
        """Return the silhouette score of ``labels``, or NaN when fewer than two labels exist.

        ``silhouette_score`` raises for a single-label partition, which can
        happen legitimately (for example when BIC selects one GMM component).
        """
        if len(np.unique(labels)) < 2:
            return float('nan')
        return silhouette_score(self.data, labels)

    def spectral_clustering_advanced(self, n_clusters=3, gamma=1.0):
        """Spectral clustering on a precomputed RBF-kernel similarity matrix.

        Returns:
            Tuple ``(labels, silhouette score)``; the score is NaN if only
            one cluster was produced.
        """
        # Build similarity matrix using RBF kernel
        from sklearn.metrics.pairwise import rbf_kernel
        similarity_matrix = rbf_kernel(self.data, gamma=gamma)

        spectral = SpectralClustering(n_clusters=n_clusters,
                                      affinity='precomputed',
                                      assign_labels='discretize',
                                      random_state=42)
        labels = spectral.fit_predict(similarity_matrix)

        silhouette_avg = self._silhouette_or_nan(labels)
        self.results['spectral'] = {
            'labels': labels,
            'silhouette_score': silhouette_avg,
            'n_clusters': n_clusters
        }

        return labels, silhouette_avg

    def gaussian_mixture_model(self, max_components=10):
        """Fit GMMs with 1..max_components components and keep the lowest-BIC model.

        Returns:
            Tuple ``(labels, silhouette score, best model)``; the score is
            NaN when the selected model assigns every point to one component.

        Raises:
            ValueError: If ``max_components`` is smaller than 1.
        """
        if max_components < 1:
            raise ValueError("max_components must be at least 1")

        bic_scores = []
        aic_scores = []

        best_gmm = None
        best_bic = float('inf')

        for n_components in range(1, max_components + 1):
            gmm = GaussianMixture(n_components=n_components,
                                  covariance_type='full',
                                  random_state=42)
            gmm.fit(self.data)

            bic = gmm.bic(self.data)
            bic_scores.append(bic)
            aic_scores.append(gmm.aic(self.data))

            # The first model is always accepted so best_gmm is never None,
            # even if every BIC value were non-finite.
            if best_gmm is None or bic < best_bic:
                best_bic = bic
                best_gmm = gmm

        labels = best_gmm.predict(self.data)
        silhouette_avg = self._silhouette_or_nan(labels)

        self.results['gmm'] = {
            'labels': labels,
            'silhouette_score': silhouette_avg,
            'n_components': best_gmm.n_components,
            'bic_scores': bic_scores,
            'aic_scores': aic_scores,
            'model': best_gmm
        }

        return labels, silhouette_avg, best_gmm

    def dbscan_advanced(self, eps_range=None):
        """DBSCAN with epsilon chosen by the best silhouette score.

        Args:
            eps_range: Iterable of candidate ``eps`` values. Defaults to
                ``np.linspace(0.1, 2.0, 20)``.

        Returns:
            Tuple ``(best_labels, best_score)``. When no candidate yields at
            least two clusters, ``best_labels`` is None, ``best_score`` is -1
            and nothing is written to ``self.results``.
        """
        if eps_range is None:
            eps_range = np.linspace(0.1, 2.0, 20)

        best_score = -1
        best_eps = None
        best_labels = None

        for eps in eps_range:
            labels = DBSCAN(eps=eps, min_samples=5).fit_predict(self.data)

            # Skip if all points are noise or all points are in one cluster
            # (label -1 marks noise and is not counted as a cluster)
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            if n_clusters < 2:
                continue

            silhouette_avg = self._silhouette_or_nan(labels)

            if silhouette_avg > best_score:
                best_score = silhouette_avg
                best_eps = eps
                best_labels = labels

        if best_labels is not None:
            self.results['dbscan'] = {
                'labels': best_labels,
                'silhouette_score': best_score,
                'eps': best_eps,
                'n_clusters': len(set(best_labels)) - (1 if -1 in best_labels else 0)
            }

        return best_labels, best_score

    def advanced_dimensionality_reduction(self):
        """Project the data to 2-D with PCA, t-SNE and UMAP.

        Returns:
            Tuple ``(data_pca, data_tsne, data_umap)``.
        """
        # PCA
        pca = PCA(n_components=2, random_state=42)
        data_pca = pca.fit_transform(self.data)

        # t-SNE
        tsne = TSNE(n_components=2, random_state=42, perplexity=30)
        data_tsne = tsne.fit_transform(self.data)

        # UMAP
        umap_reducer = umap.UMAP(n_components=2, random_state=42)
        data_umap = umap_reducer.fit_transform(self.data)

        self.results['dimensionality_reduction'] = {
            'pca': data_pca,
            'tsne': data_tsne,
            'umap': data_umap,
            'pca_explained_variance': pca.explained_variance_ratio_
        }

        return data_pca, data_tsne, data_umap

# Run every stage of the clustering pipeline on the scaled unsupervised data
print("=== Advanced Clustering Pipeline ===")
clustering_pipeline = AdvancedClusteringPipeline(unsupervised_data_scaled)

# Stage 1: spectral clustering with three clusters
print("Performing Spectral Clustering...")
spectral_labels, spectral_score = clustering_pipeline.spectral_clustering_advanced(
    n_clusters=3
)
print(f"Spectral Clustering - Silhouette Score: {spectral_score:.4f}")

# Stage 2: Gaussian mixture, trying up to eight components
print("Performing Gaussian Mixture Model...")
gmm_labels, gmm_score, best_gmm = clustering_pipeline.gaussian_mixture_model(
    max_components=8
)
print(f"GMM - Best n_components: {best_gmm.n_components}, Silhouette Score: {gmm_score:.4f}")

# Stage 3: DBSCAN; labels are None when no eps produced a usable clustering
print("Performing DBSCAN...")
dbscan_labels, dbscan_score = clustering_pipeline.dbscan_advanced()
if dbscan_labels is not None:
    print(f"DBSCAN - Best eps: {clustering_pipeline.results['dbscan']['eps']:.3f}, Silhouette Score: {dbscan_score:.4f}")

# Stage 4: 2-D embeddings used by the plots
print("Performing Dimensionality Reduction...")
data_pca, data_tsne, data_umap = clustering_pipeline.advanced_dimensionality_reduction()

# 3 x 4 grid: one row per 2-D projection, one column per labelling
fig, axes = plt.subplots(3, 4, figsize=(20, 15))

# Ground-truth blob membership, known from how the data was generated
true_labels = np.repeat([0.0, 1.0, 2.0], n_samples // 3)

# Columns: (title suffix, point colours, silhouette score or None)
labelings = (
    ('True Clusters', true_labels, None),
    ('Spectral', spectral_labels, spectral_score),
    ('GMM', gmm_labels, gmm_score),
    ('DBSCAN', dbscan_labels, dbscan_score),
)
# Rows: (title prefix, 2-D coordinates)
projections = (('PCA', data_pca), ('t-SNE', data_tsne), ('UMAP', data_umap))

for row_index, (projection_name, coords) in enumerate(projections):
    for col_index, (labeling_name, point_labels, score) in enumerate(labelings):
        # Only DBSCAN can be missing; its panel is then left empty
        if point_labels is None:
            continue

        panel = axes[row_index, col_index]
        panel.scatter(coords[:, 0], coords[:, 1], c=point_labels, cmap='viridis', alpha=0.6)

        panel_title = f'{projection_name} - {labeling_name}'
        # Silhouette scores are shown on the first (PCA) row only
        if row_index == 0 and score is not None:
            panel_title += f' (Score: {score:.3f})'
        panel.set_title(panel_title)

plt.tight_layout()
plt.show()

# Two-panel figure: GMM information criteria and PCA explained variance
plt.figure(figsize=(12, 4))

# Left: BIC and AIC against the number of mixture components (starting at 1)
plt.subplot(1, 2, 1)
for criterion_key, line_format, legend_label in (
    ('bic_scores', 'bo-', 'BIC'),
    ('aic_scores', 'ro-', 'AIC'),
):
    criterion_values = clustering_pipeline.results['gmm'][criterion_key]
    plt.plot(range(1, len(criterion_values) + 1), criterion_values,
             line_format, label=legend_label)
plt.xlabel('Number of Components')
plt.ylabel('Information Criterion')
plt.title('GMM Model Selection')
plt.legend()

# Right: per-component and cumulative explained-variance ratio of the PCA fit
plt.subplot(1, 2, 2)
pca_var_ratio = clustering_pipeline.results['dimensionality_reduction']['pca_explained_variance']
cumulative_var = np.cumsum(pca_var_ratio)
plt.bar(range(len(pca_var_ratio)), pca_var_ratio, alpha=0.6, label='Individual')
plt.plot(range(len(cumulative_var)), cumulative_var, 'ro-', label='Cumulative')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('PCA Explained Variance')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
### 3.1 Advanced Pseudo-labeling with Confidence Thresholding

class AdvancedPseudoLabeling:
    """Iterative self-training with a confidence threshold.

    Args:
        base_model: Template classifier exposing ``get_params``, ``fit``,
            ``predict``, ``predict_proba`` and (after fitting) ``classes_``.
            It is never fitted itself; a fresh copy is built for every round.
        confidence_threshold: Minimum top-class probability for an unlabeled
            sample to receive a pseudo-label.
        max_iterations: Maximum number of pseudo-labeling rounds.

    Attributes:
        history: One dict per completed round of the latest ``fit`` call.
        final_model: The model trained in the last round (set by ``fit``).
    """

    def __init__(self, base_model, confidence_threshold=0.9, max_iterations=10):
        self.base_model = base_model
        self.confidence_threshold = confidence_threshold
        self.max_iterations = max_iterations
        self.history = []

    def _new_model(self):
        """Build an unfitted copy of ``base_model`` from its parameters."""
        return type(self.base_model)(**self.base_model.get_params())

    def fit(self, X_labeled, y_labeled, X_unlabeled):
        """Train on the labeled data, then repeatedly absorb confident pseudo-labels.

        Returns:
            self.
        """
        # Start every fit with a clean log.
        self.history = []

        # np.array copies, so the caller's arrays are never modified.
        X_train = np.array(X_labeled)
        y_train = np.array(y_labeled)
        X_unlabeled = np.asarray(X_unlabeled)

        # Initial training on labeled data
        current_model = self._new_model()
        current_model.fit(X_train, y_train)

        for iteration in range(self.max_iterations):
            # Nothing left to label (also covers an empty initial pool)
            if X_unlabeled.shape[0] == 0:
                break

            # Predict on unlabeled data
            probas = current_model.predict_proba(X_unlabeled)
            max_probas = np.max(probas, axis=1)
            # argmax yields column indices; map them through classes_ to get
            # real label values (they only coincide when labels are 0..k-1).
            predictions = np.asarray(current_model.classes_)[np.argmax(probas, axis=1)]

            # Select high-confidence predictions
            confident_mask = max_probas >= self.confidence_threshold
            n_confident = int(np.sum(confident_mask))

            if n_confident == 0:
                print(f"No confident predictions found at iteration {iteration+1}")
                break

            avg_confidence = float(np.mean(max_probas[confident_mask]))

            # Add pseudo-labeled samples to training set
            X_train = np.vstack([X_train, X_unlabeled[confident_mask]])
            y_train = np.hstack([y_train, predictions[confident_mask]])

            # Remove pseudo-labeled samples from unlabeled set
            X_unlabeled = X_unlabeled[~confident_mask]

            # Retrain model
            current_model = self._new_model()
            current_model.fit(X_train, y_train)

            # Store history
            self.history.append({
                'iteration': iteration + 1,
                'n_pseudo_labeled': n_confident,
                'n_remaining_unlabeled': X_unlabeled.shape[0],
                'avg_confidence': avg_confidence
            })

            print(f"Iteration {iteration+1}: Added {n_confident} pseudo-labels "
                  f"(avg confidence: {avg_confidence:.3f})")

        self.final_model = current_model
        return self

    def predict(self, X):
        """Predict labels with the model from the last training round."""
        return self.final_model.predict(X)

    def predict_proba(self, X):
        """Predict class probabilities with the model from the last training round."""
        return self.final_model.predict_proba(X)

# Build a semi-supervised split of the training data:
# 10% of the rows keep their labels, the rest are treated as unlabeled
n_labeled = int(0.1 * len(X_train_scaled))
labeled_indices = np.random.choice(len(X_train_scaled), n_labeled, replace=False)
unlabeled_indices = np.setdiff1d(np.arange(len(X_train_scaled)), labeled_indices)

X_semi_labeled, y_semi_labeled = X_train_scaled[labeled_indices], y_train[labeled_indices]
X_semi_unlabeled = X_train_scaled[unlabeled_indices]
# True labels of the "unlabeled" rows are kept for evaluation only
y_semi_unlabeled_true = y_train[unlabeled_indices]

print(
    f"Semi-supervised setup:",
    f"Labeled samples: {len(X_semi_labeled)}",
    f"Unlabeled samples: {len(X_semi_unlabeled)}",
    sep="\n",
)

# Baseline: the same classifier trained on the labeled subset alone
baseline_model = RandomForestClassifier(n_estimators=100, random_state=42)
baseline_model.fit(X_semi_labeled, y_semi_labeled)
baseline_accuracy = accuracy_score(y_test, baseline_model.predict(X_test_scaled))

# Self-training run that also draws on the unlabeled rows
print("\n=== Training Pseudo-labeling Model ===")
pseudo_labeler = AdvancedPseudoLabeling(
    base_model=RandomForestClassifier(n_estimators=100, random_state=42),
    confidence_threshold=0.8,
    max_iterations=5,
)
pseudo_labeler.fit(X_semi_labeled, y_semi_labeled, X_semi_unlabeled)
pseudo_accuracy = accuracy_score(y_test, pseudo_labeler.predict(X_test_scaled))

# Compare both models on the common test set
print(
    f"\nResults:",
    f"Baseline (supervised only): {baseline_accuracy:.4f}",
    f"Pseudo-labeling: {pseudo_accuracy:.4f}",
    f"Improvement: {pseudo_accuracy - baseline_accuracy:.4f}",
    sep="\n",
)


In [None]:
### 3.2 Graph-based Semi-supervised Learning

from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.neighbors import kneighbors_graph

class GraphBasedSemiSupervised:
    """Thin wrapper around scikit-learn's graph-based semi-supervised classifiers.

    Parameters
    ----------
    method : str
        ``'label_propagation'`` or ``'label_spreading'``.
    kernel : str
        Kernel forwarded to the estimator (``'rbf'`` or ``'knn'``).
    gamma : float
        RBF kernel parameter, forwarded to the estimator.
    n_neighbors : int
        Neighbour count for the ``'knn'`` kernel, forwarded to the estimator.
    """

    def __init__(self, method='label_propagation', kernel='rbf', gamma=1.0, n_neighbors=7):
        self.method = method
        self.kernel = kernel
        self.gamma = gamma
        self.n_neighbors = n_neighbors
        self.model = None

    def _build_model(self):
        """Instantiate the estimator selected by ``self.method``.

        Raises
        ------
        ValueError
            If ``self.method`` is not one of the two supported names.
        """
        # n_neighbors was previously stored but never forwarded, so the 'knn'
        # kernel silently ran with the estimator's own default.
        shared = dict(kernel=self.kernel, gamma=self.gamma, n_neighbors=self.n_neighbors)
        if self.method == 'label_propagation':
            return LabelPropagation(**shared)
        if self.method == 'label_spreading':
            return LabelSpreading(alpha=0.2, **shared)
        raise ValueError(
            f"Unknown method {self.method!r}; expected 'label_propagation' or 'label_spreading'"
        )

    def fit(self, X_labeled, y_labeled, X_unlabeled):
        """Fit on the union of labelled and unlabelled samples and return ``self``."""
        # Build (and thereby validate) the estimator first so a bad ``method``
        # fails with a clear error instead of an AttributeError on ``None``.
        model = self._build_model()

        # Combine labeled and unlabeled data
        X_combined = np.vstack([X_labeled, X_unlabeled])
        y_combined = np.hstack([y_labeled, np.full(len(X_unlabeled), -1)])  # -1 for unlabeled

        model.fit(X_combined, y_combined)
        self.model = model
        return self

    def predict(self, X):
        """Return predicted labels for ``X`` from the fitted estimator."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return class-probability estimates for ``X`` from the fitted estimator."""
        return self.model.predict_proba(X)

# Compare different graph-based methods
print("=== Graph-based Semi-supervised Learning ===")

# Each entry is (display name, wrapper method, kernel, kernel parameter).
# The last element is used as gamma for 'rbf' rows and as n_neighbors for 'knn' rows.
methods = [
    ('Label Propagation (RBF)', 'label_propagation', 'rbf', 1.0),
    ('Label Propagation (KNN)', 'label_propagation', 'knn', 7),
    ('Label Spreading (RBF)', 'label_spreading', 'rbf', 1.0),
    ('Label Spreading (KNN)', 'label_spreading', 'knn', 7)
]

# Maps display name -> {'accuracy': test accuracy, 'model': fitted wrapper}.
graph_results = {}

for name, method, kernel, param in methods:
    if kernel == 'rbf':
        graph_model = GraphBasedSemiSupervised(method=method, kernel=kernel, gamma=param)
    else:
        graph_model = GraphBasedSemiSupervised(method=method, kernel=kernel, n_neighbors=int(param))

    # Fit on the labelled + unlabelled training rows, then score on the held-out test set.
    graph_model.fit(X_semi_labeled, y_semi_labeled, X_semi_unlabeled)
    accuracy = accuracy_score(y_test, graph_model.predict(X_test_scaled))

    graph_results[name] = {
        'accuracy': accuracy,
        'model': graph_model
    }

    print(f"{name}: {accuracy:.4f}")

# Find best graph method
# best_graph_method is a (name, result-dict) tuple; it is selected on test accuracy.
best_graph_method = max(graph_results.items(), key=lambda x: x[1]['accuracy'])
print(f"\nBest graph method: {best_graph_method[0]} ({best_graph_method[1]['accuracy']:.4f})")

# Visualization of semi-supervised results
plt.figure(figsize=(18, 12))

# Plot 1: Pseudo-labeling history
# Reads the per-iteration records stored on pseudo_labeler.history.
plt.subplot(3, 3, 1)
iterations = [h['iteration'] for h in pseudo_labeler.history]
n_pseudo = [h['n_pseudo_labeled'] for h in pseudo_labeler.history]
plt.bar(iterations, n_pseudo)
plt.title('Pseudo-labels Added per Iteration')
plt.xlabel('Iteration')
plt.ylabel('Number of Pseudo-labels')

# Plot 2: Confidence evolution
plt.subplot(3, 3, 2)
confidences = [h['avg_confidence'] for h in pseudo_labeler.history]
plt.plot(iterations, confidences, 'bo-')
plt.title('Average Confidence per Iteration')
plt.xlabel('Iteration')
plt.ylabel('Average Confidence')

# Plot 3: Method comparison
# Bar order: baseline first, then every graph method, then pseudo-labeling last.
plt.subplot(3, 3, 3)
methods_comp = ['Baseline'] + list(graph_results.keys()) + ['Pseudo-labeling']
accuracies_comp = [baseline_accuracy] + [result['accuracy'] for result in graph_results.values()] + [pseudo_accuracy]
bars = plt.bar(range(len(methods_comp)), accuracies_comp)
plt.title('Semi-supervised Methods Comparison')
plt.ylabel('Accuracy')
plt.xticks(range(len(methods_comp)), methods_comp, rotation=45)

# Color bars: green marks the bar with the highest accuracy (previously the last
# bar was always painted green regardless of its score), red marks the supervised
# baseline, and everything else is blue.
best_bar_idx = int(np.argmax(accuracies_comp))
for i, bar in enumerate(bars):
    if i == best_bar_idx:
        bar.set_color('green')  # Best method
    elif i == 0:
        bar.set_color('red')  # Baseline
    else:
        bar.set_color('blue')

# Plots 4-6: Label propagation visualization on 2D projection
# NOTE(review): data_pca is defined in an earlier cell; this assumes its first
# len(X_train_scaled) rows line up with X_train_scaled row-for-row — confirm.
data_combined_2d = data_pca[:len(X_train_scaled)]  # Use PCA projection from earlier
# NOTE(review): test_data_2d is projected with a freshly fitted PCA (not the one behind
# data_pca) and is not referenced again in the visible part of this cell.
test_data_2d = scaler.transform(X_test)
test_data_2d = PCA(n_components=2).fit(X_train_scaled).transform(test_data_2d)

# Get the best graph model predictions on the combined dataset
best_model = best_graph_method[1]['model']

# Panel 4: labelled points coloured by class, unlabelled points in gray.
plt.subplot(3, 3, 4)
plt.scatter(data_combined_2d[labeled_indices, 0], data_combined_2d[labeled_indices, 1], 
           c=y_semi_labeled, cmap='viridis', s=60, alpha=0.8, edgecolors='black')
plt.scatter(data_combined_2d[unlabeled_indices, 0], data_combined_2d[unlabeled_indices, 1], 
           c='gray', s=20, alpha=0.5)
plt.title('Original Labeled Data')
plt.xlabel('PC1')
plt.ylabel('PC2')

# Panel 5: unlabelled points coloured by the best graph model's predictions.
plt.subplot(3, 3, 5)
# Predict labels for unlabeled data using best graph method
unlabeled_predictions = best_model.predict(X_semi_unlabeled)
plt.scatter(data_combined_2d[labeled_indices, 0], data_combined_2d[labeled_indices, 1], 
           c=y_semi_labeled, cmap='viridis', s=60, alpha=0.8, edgecolors='black')
plt.scatter(data_combined_2d[unlabeled_indices, 0], data_combined_2d[unlabeled_indices, 1], 
           c=unlabeled_predictions, cmap='viridis', s=20, alpha=0.7)
plt.title(f'After {best_graph_method[0]}')
plt.xlabel('PC1')
plt.ylabel('PC2')

# Panel 6: unlabelled points coloured by their held-back true labels, for visual comparison.
plt.subplot(3, 3, 6)
plt.scatter(data_combined_2d[labeled_indices, 0], data_combined_2d[labeled_indices, 1], 
           c=y_semi_labeled, cmap='viridis', s=60, alpha=0.8, edgecolors='black')
plt.scatter(data_combined_2d[unlabeled_indices, 0], data_combined_2d[unlabeled_indices, 1], 
           c=y_semi_unlabeled_true, cmap='viridis', s=20, alpha=0.7)
plt.title('True Labels (for comparison)')
plt.xlabel('PC1')
plt.ylabel('PC2')

# Plots 7-9: Performance analysis
plt.subplot(3, 3, 7)
# Accuracy vs percentage of labeled data
labeled_percentages = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3]
baseline_accs = []
graph_accs = []

# For each labelled fraction, draw a fresh random labelled subset (global NumPy RNG)
# and compare a supervised-only forest against RBF label propagation on the same split.
for pct in labeled_percentages:
    n_lab = int(pct * len(X_train_scaled))
    lab_idx = np.random.choice(len(X_train_scaled), n_lab, replace=False)
    unlab_idx = np.setdiff1d(np.arange(len(X_train_scaled)), lab_idx)

    # Baseline
    # Uses 50 trees here, whereas the baseline_model trained earlier uses 100.
    base_temp = RandomForestClassifier(n_estimators=50, random_state=42)
    base_temp.fit(X_train_scaled[lab_idx], y_train[lab_idx])
    base_acc = accuracy_score(y_test, base_temp.predict(X_test_scaled))
    baseline_accs.append(base_acc)

    # Graph method
    graph_temp = GraphBasedSemiSupervised(method='label_propagation', kernel='rbf', gamma=1.0)
    graph_temp.fit(X_train_scaled[lab_idx], y_train[lab_idx], X_train_scaled[unlab_idx])
    graph_acc = accuracy_score(y_test, graph_temp.predict(X_test_scaled))
    graph_accs.append(graph_acc)

plt.plot(labeled_percentages, baseline_accs, 'r-o', label='Supervised Only')
plt.plot(labeled_percentages, graph_accs, 'b-o', label='Semi-supervised')
plt.title('Performance vs Labeled Data %')
plt.xlabel('Percentage of Labeled Data')
plt.ylabel('Accuracy')
plt.legend()

# Plot 8: Confusion matrices
# Left: the supervised-only baseline on the test set.
plt.subplot(3, 3, 8)
cm_baseline = confusion_matrix(y_test, baseline_model.predict(X_test_scaled))
sns.heatmap(cm_baseline, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Baseline Confusion Matrix')

# Right: the best graph-based model on the same test set.
plt.subplot(3, 3, 9)
cm_best = confusion_matrix(y_test, best_model.predict(X_test_scaled))
sns.heatmap(cm_best, annot=True, fmt='d', cmap='Greens', cbar=False)
plt.title(f'{best_graph_method[0]} Confusion Matrix')

plt.tight_layout()
plt.show()

# The "best semi-supervised" figures below consider only the graph-based methods in
# graph_results; the pseudo-labeling accuracy is not part of this comparison.
print(f"\n=== Semi-supervised Learning Summary ===")
print(f"Baseline accuracy (10% labeled): {baseline_accuracy:.4f}")
print(f"Best semi-supervised method: {best_graph_method[0]}")
print(f"Best semi-supervised accuracy: {best_graph_method[1]['accuracy']:.4f}")
print(f"Improvement over baseline: {best_graph_method[1]['accuracy'] - baseline_accuracy:.4f}")


In [None]:
### 4.1 Contrastive Learning (SimCLR-style)

class ContrastiveLearningModel(nn.Module):
    """MLP encoder followed by a projection head, for SimCLR-style training.

    ``forward`` returns L2-normalised projections; ``get_representation``
    returns the encoder output that sits before the projection head.
    """

    def __init__(self, input_dim, hidden_dim=256, projection_dim=128):
        super().__init__()

        # Encoder: three linear layers with ReLU between them (no activation on the last).
        encoder_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Projection head: maps encoder features into the space the loss operates on.
        head_layers = [
            nn.Linear(hidden_dim, projection_dim),
            nn.ReLU(),
            nn.Linear(projection_dim, projection_dim),
        ]
        self.projection_head = nn.Sequential(*head_layers)

    def forward(self, x):
        """Encode ``x``, project it, and L2-normalise each row."""
        projected = self.projection_head(self.encoder(x))
        return F.normalize(projected, dim=1)

    def get_representation(self, x):
        """Return the encoder output for ``x`` without applying the projection head."""
        features = self.encoder(x)
        return features

def create_augmented_pairs(X, noise_factor=0.1):
    """Build two noisy views of every row in ``X``.

    Returns ``(pairs, labels)``: ``pairs`` stacks the first view on top of the
    second (so row ``i`` and row ``i + len(X)`` come from the same sample), and
    ``labels`` holds the source-row index of each stacked row.
    """
    n_rows = X.shape[0]

    # Two independent Gaussian perturbations of the same input, drawn in order.
    views = []
    for _ in range(2):
        views.append(X + torch.randn_like(X) * noise_factor)

    pairs = torch.cat(views, dim=0)
    row_ids = torch.arange(n_rows)
    labels = torch.cat([row_ids, row_ids], dim=0)

    return pairs, labels

def contrastive_loss(z, labels, temperature=0.1):
    """NT-Xent loss (Normalized Temperature-scaled Cross Entropy).

    Parameters
    ----------
    z : Tensor of shape (2 * batch_size, dim)
        Projections of the two views, first view stacked on top of the second,
        so row ``i`` and row ``i + batch_size`` form a positive pair.
    labels : Tensor of length 2 * batch_size
        Only its length is used (to derive ``batch_size``).
    temperature : float
        Softmax temperature applied to the similarity matrix.

    Returns
    -------
    Scalar tensor: mean cross-entropy over all 2 * batch_size anchors.

    Raises
    ------
    ValueError
        If ``z`` does not have exactly ``2 * (len(labels) // 2)`` rows.
    """
    batch_size = labels.shape[0] // 2
    n_views = 2 * batch_size
    if z.shape[0] != n_views:
        raise ValueError(
            f"z has {z.shape[0]} rows but labels imply {n_views} augmented samples"
        )
    device = z.device

    # Compute similarity matrix
    sim_matrix = torch.mm(z, z.t()) / temperature

    # Exclude self-similarity from the softmax (out-of-place, on z's device).
    self_mask = torch.eye(n_views, dtype=torch.bool, device=device)
    sim_matrix = sim_matrix.masked_fill(self_mask, float('-inf'))

    # Positive pairs are at positions (i, i+batch_size) and (i+batch_size, i)
    pos_indices = torch.cat([torch.arange(batch_size, n_views, device=device),
                             torch.arange(batch_size, device=device)], dim=0)

    # Use the positive's column as the target class directly. The previous version
    # prepended the positive logit to the full row, so it appeared twice in the
    # softmax denominator, which is not the NT-Xent definition.
    return F.cross_entropy(sim_matrix, pos_indices)

# Train contrastive model
print("=== Training Contrastive Learning Model ===")

# Use the same dataset but without labels for self-supervised learning
X_self_supervised = torch.FloatTensor(X_train_scaled)
# Shuffled mini-batches of 128 rows; each batch is a 1-tuple because TensorDataset wraps one tensor.
self_sup_loader = DataLoader(TensorDataset(X_self_supervised), batch_size=128, shuffle=True)

contrastive_model = ContrastiveLearningModel(input_dim=X_train_scaled.shape[1])
contrastive_optimizer = optim.Adam(contrastive_model.parameters(), lr=0.001)

# One average loss value per epoch, for plotting later.
contrastive_losses = []
contrastive_model.train()

for epoch in range(50):
    total_loss = 0
    n_batches = 0

    for batch in self_sup_loader:
        X_batch = batch[0]

        # Create augmented pairs
        augmented_pairs, pair_labels = create_augmented_pairs(X_batch)

        contrastive_optimizer.zero_grad()

        # Forward pass
        z = contrastive_model(augmented_pairs)
        loss = contrastive_loss(z, pair_labels)

        loss.backward()
        contrastive_optimizer.step()

        total_loss += loss.item()
        n_batches += 1

    avg_loss = total_loss / n_batches
    contrastive_losses.append(avg_loss)

    # Log every 10th epoch (epochs 1, 11, 21, ...).
    if epoch % 10 == 0:
        print(f'Epoch [{epoch+1}/50], Contrastive Loss: {avg_loss:.4f}')

# Extract learned representations
# NOTE(review): X_train_tensor / X_test_tensor come from an earlier cell; presumably they
# hold the same scaled features as X_train_scaled / X_test_scaled — confirm.
contrastive_model.eval()
with torch.no_grad():
    train_representations = contrastive_model.get_representation(X_train_tensor).numpy()
    test_representations = contrastive_model.get_representation(X_test_tensor).numpy()

print(f"Learned representation shape: {train_representations.shape}")

# Evaluate representations with a linear classifier
# The encoder is kept fixed; only the logistic regression is fit on the labels.
linear_classifier = LogisticRegression(random_state=42, max_iter=1000)
linear_classifier.fit(train_representations, y_train)
contrastive_accuracy = accuracy_score(y_test, linear_classifier.predict(test_representations))

print(f"Contrastive Learning + Linear Classifier Accuracy: {contrastive_accuracy:.4f}")


In [None]:
### 4.2 Masked Autoencoder for Self-supervised Learning

class MaskedAutoencoder(nn.Module):
    """Autoencoder trained to reconstruct randomly zeroed-out input features.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_dim : int
        Width of the hidden layers in encoder and decoder.
    latent_dim : int
        Size of the encoder output.
    mask_ratio : float
        Fraction of features zeroed per sample; ``int(input_dim * mask_ratio)``
        features are masked in every row.
    """

    def __init__(self, input_dim, hidden_dim=256, latent_dim=128, mask_ratio=0.3):
        super(MaskedAutoencoder, self).__init__()
        self.input_dim = input_dim
        self.mask_ratio = mask_ratio

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, latent_dim)
        )

        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, input_dim)
        )

    def create_mask(self, batch_size, device=None):
        """Create a random keep/mask matrix for a batch.

        Returns a ``(batch_size, input_dim)`` float tensor holding 1 for kept
        features and 0 for masked ones, with exactly
        ``int(input_dim * mask_ratio)`` zeros in every row. ``device`` selects
        where the mask is allocated (default: PyTorch's default device).
        """
        n_masked = int(self.input_dim * self.mask_ratio)
        mask = torch.ones(batch_size, self.input_dim, device=device)

        if n_masked > 0 and batch_size > 0:
            # Arg-sorting i.i.d. uniform noise yields an independent random permutation
            # per row; its first n_masked entries are the features to hide. This
            # replaces a Python loop that called randperm once per sample.
            noise = torch.rand(batch_size, self.input_dim, device=device)
            masked_indices = noise.argsort(dim=1)[:, :n_masked]
            mask.scatter_(1, masked_indices, 0.0)

        return mask

    def forward(self, x):
        """Mask ``x``, encode and decode it; return ``(reconstructed, mask, latent)``."""
        batch_size = x.shape[0]

        # Create the mask on the input's device so x * mask works for non-CPU inputs.
        mask = self.create_mask(batch_size, device=x.device)

        # Apply mask (set masked features to 0)
        masked_x = x * mask

        # Encode
        latent = self.encoder(masked_x)

        # Decode
        reconstructed = self.decoder(latent)

        return reconstructed, mask, latent

    def get_representation(self, x):
        """Get latent representation without masking"""
        return self.encoder(x)

def masked_reconstruction_loss(reconstructed, original, mask):
    """Compute reconstruction loss only on masked features.

    Parameters
    ----------
    reconstructed, original : Tensors of identical shape
        Model output and the unmasked target.
    mask : Tensor of the same shape
        0 marks a masked (to-be-reconstructed) position, anything else is ignored.

    Returns
    -------
    Scalar tensor: mean squared error over the masked positions, or a zero that
    is still attached to ``reconstructed``'s graph when nothing was masked.
    """
    # Focus on reconstructing only the masked parts
    masked_positions = (mask == 0)

    if masked_positions.sum() == 0:
        # Keep the zero on the same device/dtype and connected to the graph, so
        # backward() yields zero gradients rather than none at all (the previous
        # fresh CPU leaf tensor was detached from the model).
        return reconstructed.sum() * 0.0

    # MSE loss only on masked features
    return F.mse_loss(reconstructed[masked_positions], original[masked_positions])

# Train Masked Autoencoder
print("=== Training Masked Autoencoder ===")

# mask_ratio=0.4 overrides the class default of 0.3.
mae_model = MaskedAutoencoder(input_dim=X_train_scaled.shape[1], mask_ratio=0.4)
mae_optimizer = optim.Adam(mae_model.parameters(), lr=0.001)

# One average loss value per epoch, for plotting later.
mae_losses = []
mae_model.train()

for epoch in range(100):
    total_loss = 0
    n_batches = 0

    # Reuses self_sup_loader, the unlabeled loader built in the contrastive-learning cell.
    for batch in self_sup_loader:
        X_batch = batch[0]

        mae_optimizer.zero_grad()

        # Forward pass
        # A fresh random mask is drawn inside the model on every call.
        reconstructed, mask, latent = mae_model(X_batch)
        loss = masked_reconstruction_loss(reconstructed, X_batch, mask)

        loss.backward()
        mae_optimizer.step()

        total_loss += loss.item()
        n_batches += 1

    avg_loss = total_loss / n_batches
    mae_losses.append(avg_loss)

    # Log every 20th epoch (epochs 1, 21, 41, ...).
    if epoch % 20 == 0:
        print(f'Epoch [{epoch+1}/100], MAE Loss: {avg_loss:.4f}')

# Extract learned representations from MAE
# get_representation encodes the unmasked input; eval() disables the dropout layers.
# NOTE(review): X_train_tensor / X_test_tensor come from an earlier cell — confirm they
# hold the same scaled features used for training here.
mae_model.eval()
with torch.no_grad():
    mae_train_representations = mae_model.get_representation(X_train_tensor).numpy()
    mae_test_representations = mae_model.get_representation(X_test_tensor).numpy()

# Evaluate MAE representations
# Linear probe: the encoder is fixed and only the logistic regression sees labels.
mae_classifier = LogisticRegression(random_state=42, max_iter=1000)
mae_classifier.fit(mae_train_representations, y_train)
mae_accuracy = accuracy_score(y_test, mae_classifier.predict(mae_test_representations))

print(f"Masked Autoencoder + Linear Classifier Accuracy: {mae_accuracy:.4f}")


In [None]:
### 4.3 Self-supervised Learning Analysis and Visualization

# Compare all representation learning methods
print("=== Self-supervised Learning Comparison ===")

# Baseline: PCA
# PCA cannot return more components than min(n_samples, n_features); asking for a
# fixed 128 raises on narrower data, so clamp the request to what the data allows.
pca_n_components = min(128, *X_train_scaled.shape)
pca_baseline = PCA(n_components=pca_n_components, random_state=42)
pca_train_rep = pca_baseline.fit_transform(X_train_scaled)
pca_test_rep = pca_baseline.transform(X_test_scaled)
pca_classifier = LogisticRegression(random_state=42, max_iter=1000)
pca_classifier.fit(pca_train_rep, y_train)
pca_accuracy = accuracy_score(y_test, pca_classifier.predict(pca_test_rep))

# Baseline: Random projections
from sklearn.random_projection import GaussianRandomProjection
rp_baseline = GaussianRandomProjection(n_components=128, random_state=42)
rp_train_rep = rp_baseline.fit_transform(X_train_scaled)
rp_test_rep = rp_baseline.transform(X_test_scaled)
rp_classifier = LogisticRegression(random_state=42, max_iter=1000)
rp_classifier.fit(rp_train_rep, y_train)
rp_accuracy = accuracy_score(y_test, rp_classifier.predict(rp_test_rep))

# VAE latent codes: linear probe fit on the train rows, scored on the test rows.
# The previous one-liner fit on *all* rows with y_train and predicted on the slice
# after len(y_train), which cannot have consistent lengths for both calls.
# NOTE(review): assumes latent_representations (from an earlier cell) stacks the
# training rows first and the test rows after them — confirm against the VAE cell.
n_train_samples = len(y_train)
if len(latent_representations) == n_train_samples + len(y_test):
    vae_classifier = LogisticRegression(random_state=42, max_iter=1000)
    vae_classifier.fit(latent_representations[:n_train_samples], y_train)
    vae_accuracy = accuracy_score(
        y_test, vae_classifier.predict(latent_representations[n_train_samples:])
    )
else:
    vae_accuracy = None
    print(f"Skipping VAE comparison: expected {n_train_samples + len(y_test)} latent rows "
          f"(train + test), found {len(latent_representations)}")

# Compare methods (insertion order is the plotting order used later)
methods_comparison = {
    'PCA': pca_accuracy,
    'Random Projection': rp_accuracy,
}
if vae_accuracy is not None:
    methods_comparison['VAE (Latent)'] = vae_accuracy
methods_comparison.update({
    'Contrastive Learning': contrastive_accuracy,
    'Masked Autoencoder': mae_accuracy,
    'Supervised ResNet': resnet_accuracy / 100  # Convert from percentage
})

print("Representation Learning Methods Comparison:")
for method, acc in methods_comparison.items():
    print(f"{method}: {acc:.4f}")

# Visualization
# 2x3 grid: top row = training curves and accuracy bars, bottom row = t-SNE embeddings.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Plot 1: Training losses
axes[0, 0].plot(contrastive_losses, label='Contrastive Loss')
axes[0, 0].set_title('Contrastive Learning Training')
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('Loss')
axes[0, 0].legend()

axes[0, 1].plot(mae_losses, label='MAE Loss', color='orange')
axes[0, 1].set_title('Masked Autoencoder Training')
axes[0, 1].set_xlabel('Epoch')
axes[0, 1].set_ylabel('Reconstruction Loss')
axes[0, 1].legend()

# Plot 2: Method comparison
methods_list = list(methods_comparison.keys())
accuracies_list = list(methods_comparison.values())
bars = axes[0, 2].bar(range(len(methods_list)), accuracies_list)
axes[0, 2].set_title('Representation Learning Methods')
axes[0, 2].set_ylabel('Accuracy')
axes[0, 2].set_xticks(range(len(methods_list)))
axes[0, 2].set_xticklabels(methods_list, rotation=45)

# Color bars by performance
# Green (top accuracy) takes precedence over red (name contains "supervised"),
# so a supervised entry that is also the best is drawn green.
max_acc = max(accuracies_list)
for i, (bar, acc) in enumerate(zip(bars, accuracies_list)):
    if acc == max_acc:
        bar.set_color('green')
    elif 'supervised' in methods_list[i].lower():
        bar.set_color('red')
    else:
        bar.set_color('blue')

# Plot 3-5: t-SNE visualization of different representations
# The same TSNE object is re-fit for each representation; each fit_transform is independent.
tsne_viz = TSNE(n_components=2, random_state=42, perplexity=30)

# Contrastive representations
contrastive_2d = tsne_viz.fit_transform(train_representations[:1000])  # Subsample for speed
axes[1, 0].scatter(contrastive_2d[:, 0], contrastive_2d[:, 1], c=y_train[:1000], cmap='viridis', alpha=0.6)
axes[1, 0].set_title('Contrastive Learning (t-SNE)')

# MAE representations
mae_2d = tsne_viz.fit_transform(mae_train_representations[:1000])
axes[1, 1].scatter(mae_2d[:, 0], mae_2d[:, 1], c=y_train[:1000], cmap='viridis', alpha=0.6)
axes[1, 1].set_title('Masked Autoencoder (t-SNE)')

# PCA baseline
pca_2d = tsne_viz.fit_transform(pca_train_rep[:1000])
axes[1, 2].scatter(pca_2d[:, 0], pca_2d[:, 1], c=y_train[:1000], cmap='viridis', alpha=0.6)
axes[1, 2].set_title('PCA Baseline (t-SNE)')

plt.tight_layout()
plt.show()

# Feature importance analysis for self-supervised methods
print("\n=== Feature Importance Analysis ===")

# Score every learned feature by how informative it is about the class label
def compute_feature_importance(representations, labels):
    """Return the mutual information between each representation column and ``labels``."""
    from sklearn.feature_selection import mutual_info_classif

    scores = mutual_info_classif(representations, labels, random_state=42)
    return scores

# One mutual-information score per learned feature, for each representation.
contrastive_importance = compute_feature_importance(train_representations, y_train)
mae_importance = compute_feature_importance(mae_train_representations, y_train)

# Plot feature importance
plt.figure(figsize=(15, 5))

# Scores are sorted in descending order, so the x-axis is a rank, not the original feature index.
plt.subplot(1, 3, 1)
plt.bar(range(len(contrastive_importance)), sorted(contrastive_importance, reverse=True))
plt.title('Contrastive Learning\nFeature Importance')
plt.xlabel('Feature Index (sorted)')
plt.ylabel('Mutual Information')

plt.subplot(1, 3, 2)
plt.bar(range(len(mae_importance)), sorted(mae_importance, reverse=True))
plt.title('Masked Autoencoder\nFeature Importance')
plt.xlabel('Feature Index (sorted)')
plt.ylabel('Mutual Information')

plt.subplot(1, 3, 3)
# Correlation between different representation methods
# np.corrcoef on the two stacked feature sets gives one big correlation matrix; the slice
# keeps only the off-diagonal block (rows = contrastive features, columns = MAE features).
corr_contrastive_mae = np.corrcoef(train_representations.T, mae_train_representations.T)[:len(train_representations.T), len(train_representations.T):]
plt.imshow(corr_contrastive_mae, cmap='coolwarm', aspect='auto')
plt.title('Cross-correlation:\nContrastive vs MAE Features')
plt.xlabel('MAE Features')
plt.ylabel('Contrastive Features')
plt.colorbar()

plt.tight_layout()
plt.show()

print("\n=== Self-supervised Learning Summary ===")
# Rank only the entries whose name does not contain "supervised" (the same filter the
# "gap" line already used). Previously the first three lines took the maximum over every
# entry, so the supervised ResNet could be reported as the best self-supervised method.
# The PCA and random-projection baselines remain candidates, as that filter implies.
unsupervised_scores = {k: v for k, v in methods_comparison.items() if 'supervised' not in k.lower()}
best_ssl_method, best_ssl_accuracy = max(unsupervised_scores.items(), key=lambda x: x[1])
print(f"Best self-supervised method: {best_ssl_method}")
print(f"Best self-supervised accuracy: {best_ssl_accuracy:.4f}")
print(f"Improvement over PCA: {best_ssl_accuracy - pca_accuracy:.4f}")
print(f"Gap to supervised learning: {methods_comparison['Supervised ResNet'] - best_ssl_accuracy:.4f}")


In [None]:
### 5.1 Custom Environment and Advanced RL Algorithms

# First, let's create a custom environment for portfolio optimization
class PortfolioEnvironment:
    """Single-asset trading simulator with three actions (0=hold, 1=buy, 2=sell).

    An observation is the last ``window_size`` prices, expressed as relative change
    from the first price in the window, followed by three portfolio ratios:
    cash / initial balance, position value / initial balance, and
    net worth / running maximum net worth.
    """

    def __init__(self, data, initial_balance=10000, window_size=20):
        self.data = data  # Price data
        self.initial_balance = initial_balance
        self.window_size = window_size
        self.current_step = 0
        self._reset_portfolio()

    def _reset_portfolio(self):
        """Put cash, position and bookkeeping back to their starting values."""
        self.balance = self.initial_balance
        self.shares = 0
        self.net_worth = self.initial_balance
        self.max_net_worth = self.initial_balance
        self.history = []

    def reset(self):
        """Start a new episode at index ``window_size`` and return the first observation."""
        self.current_step = self.window_size
        self._reset_portfolio()
        return self._get_state()

    def _get_state(self):
        """Build the observation for the current step."""
        window_start = max(0, self.current_step - self.window_size)
        window = self.data[window_start:self.current_step]

        # Express prices relative to the first one in the window
        if len(window) <= 1:
            window = np.array([0])
        else:
            window = (window - window[0]) / window[0]

        # Left-pad with zeros when fewer than window_size prices are available
        missing = self.window_size - len(window)
        if missing > 0:
            window = np.pad(window, (missing, 0))

        # Position is valued at the most recent price already observed
        last_seen_price = self.data[self.current_step - 1]
        portfolio_state = np.array([
            self.balance / self.initial_balance,
            self.shares * last_seen_price / self.initial_balance,
            self.net_worth / self.max_net_worth
        ])

        return np.concatenate([window, portfolio_state])

    def step(self, action):
        """Apply ``action`` at the current price; return ``(state, reward, done, info)``."""
        price = self.data[self.current_step]

        # Action: 0=hold, 1=buy, 2=sell
        if action == 1:
            if self.balance > price:
                # Buy as many whole shares as the cash allows
                quantity = self.balance // price
                self.shares += quantity
                self.balance -= quantity * price
        elif action == 2:
            if self.shares > 0:
                # Liquidate the whole position
                self.balance += self.shares * price
                self.shares = 0

        self.net_worth = self.balance + self.shares * price

        # Reward is the relative change in net worth since the previous step
        if self.history:
            previous_worth = self.history[-1]
            reward = (self.net_worth - previous_worth) / previous_worth
        else:
            reward = 0

        # Extra penalty while net worth is below half of its running maximum
        if self.net_worth < 0.5 * self.max_net_worth:
            reward -= 0.1

        self.max_net_worth = max(self.max_net_worth, self.net_worth)
        self.history.append(self.net_worth)

        self.current_step += 1
        done = self.current_step >= len(self.data) - 1

        return self._get_state(), reward, done, {}

# Generate synthetic stock price data
# Re-seeds the global NumPy RNG, so the price path is the same on every run of this cell
# (and every later np.random draw continues from this seed).
np.random.seed(42)
n_days = 1000
initial_price = 100
prices = [initial_price]

# Multiplicative random walk: each day's return is drawn from N(0.001, 0.02).
for _ in range(n_days - 1):
    change = np.random.normal(0.001, 0.02)  # Small positive drift with volatility
    new_price = prices[-1] * (1 + change)
    prices.append(max(new_price, 1))  # Ensure price doesn't go negative

prices = np.array(prices)

# Create environment
# Uses the class defaults for initial_balance and window_size.
env = PortfolioEnvironment(prices)

print("=== Custom Portfolio Environment Created ===")
print(f"Price data shape: {prices.shape}")
print(f"State space dimension: {len(env.reset())}")
print(f"Action space: 3 (Hold, Buy, Sell)")

# Plot price data
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
plt.plot(prices)
plt.title('Synthetic Stock Price Data')
plt.xlabel('Days')
plt.ylabel('Price ($)')

# Simple daily returns: (p[t+1] - p[t]) / p[t].
plt.subplot(1, 2, 2)
plt.plot(np.diff(prices) / prices[:-1])
plt.title('Daily Returns')
plt.xlabel('Days')
plt.ylabel('Return')
plt.tight_layout()
plt.show()


In [None]:
### 5.2 Advanced Deep Q-Network (DQN) Implementation

class AdvancedDQN(nn.Module):
    """Fully connected Q-network: (Linear -> ReLU -> Dropout) per hidden width, then a linear head.

    All linear layers are re-initialised with Kaiming-normal weights and zero biases.
    """

    def __init__(self, state_dim, action_dim, hidden_dims=[256, 256, 128]):
        super().__init__()

        # Consecutive entries of `widths` are the (in, out) sizes of each hidden block.
        widths = [state_dim, *hidden_dims]
        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(0.1)]

        # Output head: one Q-value per action.
        modules.append(nn.Linear(widths[-1], action_dim))
        self.network = nn.Sequential(*modules)

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Kaiming-normal weights and zero bias for ``nn.Linear``; other modules are left alone."""
        if not isinstance(module, nn.Linear):
            return
        nn.init.kaiming_normal_(module.weight)
        if module.bias is not None:
            nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return the Q-values for a batch of states ``x``."""
        q_values = self.network(x)
        return q_values

class ExperienceReplayBuffer:
    """Fixed-capacity ring buffer of ``(state, action, reward, next_state, done)`` tuples."""

    def __init__(self, capacity=10000):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest entry once the buffer is full."""
        transition = (state, action, reward, next_state, done)
        if len(self.buffer) < self.capacity:
            # Still growing: the write position is the end of the list.
            self.buffer.append(transition)
        else:
            self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them as five stacked arrays."""
        import random
        chosen = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into per-field columns, then stack each column.
        state, action, reward, next_state, done = (np.stack(column) for column in zip(*chosen))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)

class DQNAgent:
    """Epsilon-greedy DQN agent with experience replay and a periodically synced target network.

    Parameters
    ----------
    state_dim, action_dim : int
        Sizes of the observation vector and the discrete action set.
    lr : float
        Adam learning rate for the online network.
    gamma : float
        Discount factor.
    epsilon, epsilon_decay, epsilon_min : float
        Exploration rate, its multiplicative decay (applied once per ``replay``
        update) and its floor.
    target_update_freq : int
        Number of ``replay`` updates between target-network syncs.
    """

    def __init__(self, state_dim, action_dim, lr=0.001, gamma=0.99, epsilon=1.0, 
                 epsilon_decay=0.995, epsilon_min=0.01, target_update_freq=100):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.target_update_freq = target_update_freq

        # Networks
        self.q_network = AdvancedDQN(state_dim, action_dim)
        self.target_network = AdvancedDQN(state_dim, action_dim)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)

        # Experience replay
        self.replay_buffer = ExperienceReplayBuffer()

        # Update target network
        self.target_network.load_state_dict(self.q_network.state_dict())
        # The target network is only ever used for inference; eval mode switches off
        # its dropout layers so bootstrap targets are not randomly perturbed.
        self.target_network.eval()

        self.step_count = 0
        self.losses = []

    def act(self, state, training=True):
        """Choose an action for ``state``.

        With ``training=True`` a uniformly random action is taken with probability
        ``epsilon``; otherwise the arg-max of the online network's Q-values is returned.
        """
        if training and np.random.random() < self.epsilon:
            return np.random.randint(self.action_dim)

        # Evaluate greedily with dropout disabled, then restore whatever mode the
        # network was in so subsequent replay() updates are unaffected.
        was_training = self.q_network.training
        self.q_network.eval()
        try:
            with torch.no_grad():
                state_tensor = torch.FloatTensor(state).unsqueeze(0)
                q_values = self.q_network(state_tensor)
                return q_values.argmax().item()
        finally:
            self.q_network.train(was_training)

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.replay_buffer.push(state, action, reward, next_state, done)

    def replay(self, batch_size=32):
        """Run one gradient update on a sampled mini-batch.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        Also syncs the target network every ``target_update_freq`` updates and
        decays ``epsilon``.
        """
        if len(self.replay_buffer) < batch_size:
            return

        # Sample batch
        state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)

        # Convert to tensors
        state = torch.FloatTensor(state)
        action = torch.LongTensor(action)
        reward = torch.FloatTensor(reward)
        next_state = torch.FloatTensor(next_state)
        done = torch.BoolTensor(done)

        # Current Q values of the actions actually taken. squeeze(1) drops only the
        # gather dimension, so a batch of one keeps shape (1,) and matches the targets.
        current_q_values = self.q_network(state).gather(1, action.unsqueeze(1)).squeeze(1)

        # Next Q values from target network; terminal transitions do not bootstrap.
        with torch.no_grad():
            next_q_values = self.target_network(next_state).max(1)[0]
            target_q_values = reward + self.gamma * next_q_values * (~done).float()

        # Compute loss
        loss = F.mse_loss(current_q_values, target_q_values)

        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.losses.append(loss.item())

        # Update target network
        self.step_count += 1
        if self.step_count % self.target_update_freq == 0:
            self.target_network.load_state_dict(self.q_network.state_dict())

        # Decay epsilon
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Train DQN Agent
print("=== Training DQN Agent ===")

state_dim = len(env.reset())
action_dim = 3  # Hold, Buy, Sell
agent = DQNAgent(state_dim, action_dim)

# Training parameters
episodes = 500
max_steps_per_episode = 200

# Training metrics
episode_rewards = []
episode_lengths = []
portfolio_values = []

for episode in range(episodes):
    # reset() always restarts at index window_size, so every episode replays the
    # same stretch of the price series (capped at max_steps_per_episode steps).
    state = env.reset()
    total_reward = 0
    steps = 0

    for step in range(min(max_steps_per_episode, len(prices) - env.window_size - 1)):
        action = agent.act(state, training=True)
        next_state, reward, done, _ = env.step(action)

        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        steps += 1

        if done:
            break

    # Train the agent
    # Learning starts only once more than 1000 transitions are buffered; epsilon is
    # decayed inside replay(), so it stays at its initial value until then.
    if len(agent.replay_buffer) > 1000:
        for _ in range(10):  # Multiple training steps per episode
            agent.replay()

    episode_rewards.append(total_reward)
    episode_lengths.append(steps)
    portfolio_values.append(env.net_worth)

    # Report trailing averages over (up to) the last 50 episodes.
    if episode % 50 == 0:
        avg_reward = np.mean(episode_rewards[-50:])
        avg_portfolio = np.mean(portfolio_values[-50:])
        print(f"Episode {episode}, Avg Reward: {avg_reward:.4f}, "
              f"Avg Portfolio Value: ${avg_portfolio:.2f}, Epsilon: {agent.epsilon:.3f}")

# These figures describe the last training episode, which still used epsilon-greedy exploration.
print(f"Training completed! Final portfolio value: ${env.net_worth:.2f}")
print(f"Total return: {(env.net_worth / env.initial_balance - 1) * 100:.2f}%")


In [None]:
### 5.3 RL Analysis and Comparison with Baselines

# Evaluate trained agent
print("=== Evaluating Trained DQN Agent ===")

# Test the agent (no exploration)
# A fresh environment over the same price series that was used for training.
test_env = PortfolioEnvironment(prices)
state = test_env.reset()
test_rewards = []
test_actions = []
# Starts with the initial net worth, so this list is one element longer than test_rewards.
test_portfolio_values = [test_env.net_worth]

# Unlike training, the rollout is not capped at 200 steps: it runs until the environment reports done.
for step in range(len(prices) - test_env.window_size - 1):
    action = agent.act(state, training=False)  # No exploration
    next_state, reward, done, _ = test_env.step(action)

    test_rewards.append(reward)
    test_actions.append(action)
    test_portfolio_values.append(test_env.net_worth)

    if done:
        break
    state = next_state
# Compare with baselines
class SimpleBaselines:
    """Non-learning reference strategies used to put the DQN agent's result in context.

    Every method is a ``@staticmethod`` that takes a price series plus a starting
    cash balance and returns the final portfolio value in the same units as
    ``initial_balance``.

    Action codes passed to ``PortfolioEnvironment.step`` are 0 = hold, 1 = buy,
    2 = sell.
    """

    @staticmethod
    def buy_and_hold(prices, initial_balance=10000):
        """Buy as many whole shares as possible at ``prices[0]`` and hold to ``prices[-1]``.

        Args:
            prices: Indexable price series; only the first and last entries are read.
            initial_balance: Starting cash.

        Returns:
            Value of the shares at the last price plus the cash that could not be
            converted into a whole share.

        NOTE(review): unlike the other baselines, this one is computed in closed form
        and does not go through ``PortfolioEnvironment``, so whatever trading rules
        that class applies are not reflected here — confirm the comparison is intended.
        """
        shares = initial_balance // prices[0]
        leftover_cash = initial_balance - shares * prices[0]
        return shares * prices[-1] + leftover_cash

    @staticmethod
    def random_strategy(prices, initial_balance=10000, seed=42):
        """Run one episode choosing uniformly among the three actions at every step.

        Args:
            prices: Price series handed to ``PortfolioEnvironment``.
            initial_balance: Starting cash handed to ``PortfolioEnvironment``.
            seed: Seed for the action sampler; the same seed gives the same actions.

        Returns:
            ``env.net_worth`` after the episode ends.
        """
        # A private generator keeps this baseline reproducible WITHOUT calling
        # np.random.seed(), which would silently reset NumPy's global RNG for every
        # cell executed afterwards. RandomState(seed).randint(3) yields the same
        # action stream that np.random.seed(seed) + np.random.randint(3) did.
        # NOTE(review): if PortfolioEnvironment itself draws from NumPy's global RNG,
        # it is no longer implicitly seeded by this method — seed it explicitly there.
        rng = np.random.RandomState(seed)
        env = PortfolioEnvironment(prices, initial_balance)
        env.reset()

        # The observations and rewards are irrelevant to a random policy; only the
        # termination flag is read.
        for _ in range(len(prices) - env.window_size - 1):
            _, _, done, _ = env.step(rng.randint(3))
            if done:
                break

        return env.net_worth

    @staticmethod
    def momentum_strategy(prices, initial_balance=10000, window=5, threshold=0.02):
        """Run one episode trading on the sign of the recent ``window``-step return.

        At each step the return between ``prices[current_step - window - 1]`` and
        ``prices[current_step - 1]`` is computed. The strategy buys when it exceeds
        ``threshold``, sells when it is below ``-threshold``, and holds otherwise
        (including while ``current_step <= window``, when the look-back would run
        off the start of the series).

        Args:
            prices: Price series handed to ``PortfolioEnvironment`` and indexed here.
            initial_balance: Starting cash handed to ``PortfolioEnvironment``.
            window: Look-back length, in steps, of the momentum signal.
            threshold: Absolute return that must be exceeded before trading.

        Returns:
            ``env.net_worth`` after the episode ends.
        """
        env = PortfolioEnvironment(prices, initial_balance)
        env.reset()

        for _ in range(len(prices) - env.window_size - 1):
            current_idx = env.current_step
            action = 0  # Hold unless the signal is strong enough
            if current_idx > window:
                base_price = prices[current_idx - window - 1]
                recent_return = (prices[current_idx - 1] - base_price) / base_price
                if recent_return > threshold:  # Strong positive momentum
                    action = 1  # Buy
                elif recent_return < -threshold:  # Strong negative momentum
                    action = 2  # Sell

            _, _, done, _ = env.step(action)
            if done:
                break

        return env.net_worth

# Compare strategies
# Each baseline is run once, in this order, from the same $10,000 stake.
buy_hold_value, random_value, momentum_value = [
    run_baseline(prices, 10000)
    for run_baseline in (
        SimpleBaselines.buy_and_hold,
        SimpleBaselines.random_strategy,
        SimpleBaselines.momentum_strategy,
    )
]
# The DQN figure is the net worth the evaluation environment ended on.
dqn_value = test_env.net_worth

# Insertion order is the display order used by the tables and charts that follow.
strategies = dict(zip(
    ['Buy & Hold', 'Random Strategy', 'Momentum Strategy', 'DQN Agent'],
    [buy_hold_value, random_value, momentum_value, dqn_value],
))

print("=== Strategy Comparison ===")
for label, final_value in strategies.items():
    # Percentage gain or loss relative to the $10,000 starting balance
    pct_change = (final_value / 10000 - 1) * 100
    print(f"{label}: ${final_value:.2f} ({pct_change:+.2f}%)")

# Visualization
# One 2x3 figure: the top row covers training and the final comparison, the bottom
# row covers the evaluation rollout and the training loss.
fig, axes = plt.subplots(2, 3, figsize=(20, 12))

# Plot 1: Training progress
axes[0, 0].plot(episode_rewards)
axes[0, 0].set_title('DQN Training Rewards')
axes[0, 0].set_xlabel('Episode')
axes[0, 0].set_ylabel('Total Reward')

# Plot 2: Portfolio values during training
# The dashed line marks the $10,000 reference level.
axes[0, 1].plot(portfolio_values)
axes[0, 1].axhline(y=10000, color='r', linestyle='--', label='Initial Value')
axes[0, 1].set_title('Portfolio Value During Training')
axes[0, 1].set_xlabel('Episode')
axes[0, 1].set_ylabel('Portfolio Value ($)')
axes[0, 1].legend()

# Plot 3: Strategy comparison
# Bars are placed at integer positions and labelled afterwards so the names can be rotated.
strategy_names = list(strategies.keys())
strategy_values = list(strategies.values())
bars = axes[0, 2].bar(range(len(strategy_names)), strategy_values)
axes[0, 2].set_title('Final Portfolio Values')
axes[0, 2].set_ylabel('Portfolio Value ($)')
axes[0, 2].set_xticks(range(len(strategy_names)))
axes[0, 2].set_xticklabels(strategy_names, rotation=45)
axes[0, 2].axhline(y=10000, color='r', linestyle='--', alpha=0.7)

# Color bars
# Precedence: the best value is green (this check comes first, so a winning DQN bar
# is green rather than blue), the DQN bar is otherwise blue, everything else gray.
for i, (bar, value) in enumerate(zip(bars, strategy_values)):
    if value == max(strategy_values):
        bar.set_color('green')
    elif 'DQN' in strategy_names[i]:
        bar.set_color('blue')
    else:
        bar.set_color('gray')

# Plot 4: Test portfolio evolution
# The price slice starts at test_env.window_size and is cut to the length of
# test_portfolio_values. Price goes on the left axis and portfolio value on a
# twin right axis because the two series have different scales.
axes[1, 0].plot(prices[test_env.window_size:test_env.window_size+len(test_portfolio_values)], 
               alpha=0.7, label='Stock Price')
ax_twin = axes[1, 0].twinx()
ax_twin.plot(test_portfolio_values, color='red', label='Portfolio Value')
axes[1, 0].set_title('DQN Agent Performance')
axes[1, 0].set_xlabel('Time Steps')
axes[1, 0].set_ylabel('Stock Price', color='blue')
ax_twin.set_ylabel('Portfolio Value ($)', color='red')
# Each Axes only knows its own line, hence one legend per axis.
axes[1, 0].legend(loc='upper left')
ax_twin.legend(loc='upper right')

# Plot 5: Action distribution
# minlength=3 keeps a count for every action code even if the agent never chose it;
# the label order maps code 0/1/2 to Hold/Buy/Sell.
action_counts = np.bincount(test_actions, minlength=3)
action_labels = ['Hold', 'Buy', 'Sell']
axes[1, 1].pie(action_counts, labels=action_labels, autopct='%1.1f%%')
axes[1, 1].set_title('DQN Agent Action Distribution')

# Plot 6: Training loss
# Drawn only when agent.losses is non-empty; otherwise this panel stays blank.
if agent.losses:
    axes[1, 2].plot(agent.losses)
    axes[1, 2].set_title('DQN Training Loss')
    axes[1, 2].set_xlabel('Training Step')
    axes[1, 2].set_ylabel('Loss')

plt.tight_layout()
plt.show()

# Risk-adjusted performance metrics
def calculate_sharpe_ratio(returns, risk_free_rate=0.02, periods_per_year=252):
    """Annualized Sharpe ratio of a series of per-period simple returns.

    The annual ``risk_free_rate`` is spread evenly over ``periods_per_year``
    periods and subtracted from every return. The mean of those excess returns is
    divided by their (population) standard deviation and scaled by
    ``sqrt(periods_per_year)``.

    Args:
        returns: Sequence or array of per-period returns (e.g. 0.01 for +1%).
        risk_free_rate: Annual risk-free rate.
        periods_per_year: Periods in a year; 252 treats each return as one
            trading day.

    Returns:
        The annualized Sharpe ratio, or ``0.0`` when it is undefined: an empty
        series, or one with (numerically) zero volatility, such as an agent that
        never trades. Without this guard those cases produce NaN or +/-inf together
        with a NumPy RuntimeWarning.
    """
    excess_returns = np.asarray(returns, dtype=float) - risk_free_rate / periods_per_year
    if excess_returns.size == 0:
        return 0.0

    volatility = np.std(excess_returns)
    # A constant series can come out as ~1e-20 rather than exactly 0 because of
    # rounding in the mean, so compare against a tiny tolerance instead of == 0.
    if volatility < 1e-12:
        return 0.0

    return np.mean(excess_returns) / volatility * np.sqrt(periods_per_year)

def calculate_max_drawdown(portfolio_values):
    """Largest peak-to-trough decline of a value series, as a fraction of the peak.

    Walks the series once, tracking the highest value seen so far and the worst
    relative drop below it.

    Args:
        portfolio_values: Non-empty sequence of portfolio values in time order.

    Returns:
        The maximum of ``(peak - value) / peak`` over the series; ``0`` if the
        series never falls below its running peak.
    """
    running_peak = portfolio_values[0]
    worst_drawdown = 0
    for current in portfolio_values:
        # max() keeps the existing peak unless `current` is strictly greater
        running_peak = max(running_peak, current)
        worst_drawdown = max(worst_drawdown, (running_peak - current) / running_peak)
    return worst_drawdown

# Calculate metrics for RL agent
# Step-over-step simple returns of the evaluation run: each change in portfolio
# value divided by the value it started from.
previous_values = np.asarray(test_portfolio_values[:-1])
rl_returns = np.diff(test_portfolio_values) / previous_values
rl_max_dd = calculate_max_drawdown(test_portfolio_values)
rl_sharpe = calculate_sharpe_ratio(rl_returns)

# One print call, one line per argument.
print(
    "\n=== DQN Agent Risk Metrics ===",
    f"Sharpe Ratio: {rl_sharpe:.3f}",
    f"Maximum Drawdown: {rl_max_dd:.3f}",
    f"Final Return: {(dqn_value/10000 - 1)*100:.2f}%",
    sep="\n",
)


In [None]:
### 6.1 Final Comparison and Key Insights

# Comprehensive comparison of all methods.
# This cell only reports. Every result it reads (y_pred_stacked, resnet_accuracy,
# latent_representations, clustering_pipeline, baseline_accuracy, best_graph_method,
# dqn_value, strategies, rl_sharpe) must already exist in the kernel.
print("=" * 80)
print("COMPREHENSIVE MACHINE LEARNING METHODS COMPARISON")
print("=" * 80)

print("\n1. SUPERVISED LEARNING RESULTS:")
print(f"   • Advanced Stacked Ensemble: {accuracy_score(y_test, y_pred_stacked):.4f}")
# resnet_accuracy is divided by 100 to match accuracy_score's 0-1 scale
# (presumably it is stored as a percentage — confirm).
print(f"   • ResNet with Advanced Training: {resnet_accuracy/100:.4f}")
print("   Key Insight: Ensemble methods often outperform single models")

print("\n2. UNSUPERVISED LEARNING RESULTS:")
print(f"   • VAE Latent Space Quality: {len(latent_representations)} samples, {latent_representations.shape[1]}D latent space")
# Methods without a silhouette score rank below every scored method. Silhouette
# scores can be negative, so a default of 0 would let an unscored method win.
best_clustering_method = max(
    clustering_pipeline.results,
    key=lambda method: clustering_pipeline.results[method].get('silhouette_score', float('-inf')),
    default='n/a',
)
print(f"   • Best Clustering Method: {best_clustering_method}")
print("   Key Insight: Different techniques excel at different types of patterns")

print("\n3. SEMI-SUPERVISED LEARNING RESULTS:")
print(f"   • Baseline (10% labeled): {baseline_accuracy:.4f}")
# best_graph_method is indexed as a (name, result_dict) pair
print(f"   • Best Semi-supervised: {best_graph_method[0]} - {best_graph_method[1]['accuracy']:.4f}")
print(f"   • Improvement: {best_graph_method[1]['accuracy'] - baseline_accuracy:.4f}")
print("   Key Insight: Significant gains possible with limited labeled data")

print("\n4. SELF-SUPERVISED LEARNING RESULTS:")
# methods_comparison is optional: its section may not have been run.
if 'methods_comparison' in globals():
    # Drop fully supervised reference rows only. A plain "'supervised' not in name"
    # test would also throw away every entry named "Self-supervised ...", which is
    # exactly what this section is meant to rank.
    self_sup_candidates = [
        (name, score)
        for name, score in methods_comparison.items()
        if 'supervised' not in name.lower() or 'self' in name.lower()
    ]
    if self_sup_candidates:
        best_self_sup_method = max(self_sup_candidates, key=lambda item: item[1])
        best_self_sup = best_self_sup_method[1]
        print(f"   • Best Self-supervised: {best_self_sup_method[0]} - {best_self_sup:.4f}")
        if 'Supervised ResNet' in methods_comparison:
            print(f"   • Gap to full supervision: {methods_comparison['Supervised ResNet'] - best_self_sup:.4f}")
print("   Key Insight: Self-supervised methods can learn meaningful representations")

print("\n5. REINFORCEMENT LEARNING RESULTS:")
print(f"   • DQN Agent Return: {(dqn_value/10000 - 1)*100:.2f}%")
print(f"   • Best Strategy: {max(strategies.items(), key=lambda x: x[1])[0]}")
print(f"   • Risk-adjusted Performance (Sharpe): {rl_sharpe:.3f}")
print("   Key Insight: RL can learn complex sequential decision-making")

# Create a comprehensive visualization
# One 2x2 figure: accuracy comparison, representation quality, clustering quality,
# and trading-strategy returns.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Plot 1: Accuracy comparison across supervised/semi-supervised methods
# The i-th supervised score is drawn next to the i-th semi-supervised score.
supervised_methods = ['Baseline RF', 'Stacked Ensemble', 'ResNet']
supervised_scores = [
    # A default random forest is fitted here to serve as the simple reference.
    accuracy_score(y_test, RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train).predict(X_test_scaled)),
    accuracy_score(y_test, y_pred_stacked),
    resnet_accuracy/100
]

semi_methods = ['10% Labeled', 'Pseudo-labeling', 'Best Graph Method']
semi_scores = [baseline_accuracy, pseudo_accuracy, best_graph_method[1]['accuracy']]

x_pos = np.arange(len(supervised_methods))
axes[0, 0].bar(x_pos - 0.2, supervised_scores, 0.4, label='Supervised', alpha=0.8)
axes[0, 0].bar(x_pos + 0.2, semi_scores, 0.4, label='Semi-supervised', alpha=0.8)
axes[0, 0].set_title('Supervised vs Semi-supervised Learning')
axes[0, 0].set_ylabel('Accuracy')
axes[0, 0].set_xticks(x_pos)
# Each group is labelled with the two methods actually plotted in it. Generic tier
# names would, for example, file the graph-based method under "Deep Learning".
axes[0, 0].set_xticklabels(
    [f"{sup} /\n{semi}" for sup, semi in zip(supervised_methods, semi_methods)]
)
axes[0, 0].legend()

# Plot 2: Self-supervised representation quality
# Optional panel: left blank when methods_comparison was never created.
if 'methods_comparison' in globals():
    repr_methods = [k for k in methods_comparison.keys() if k not in ['Supervised ResNet']]
    repr_scores = [methods_comparison[k] for k in repr_methods]

    if repr_methods:
        bars = axes[0, 1].bar(range(len(repr_methods)), repr_scores)
        axes[0, 1].set_title('Representation Learning Methods')
        axes[0, 1].set_ylabel('Linear Probe Accuracy')
        axes[0, 1].set_xticks(range(len(repr_methods)))
        axes[0, 1].set_xticklabels(repr_methods, rotation=45)

        # Highlight best method
        max_idx = repr_scores.index(max(repr_scores))
        bars[max_idx].set_color('gold')

# Plot 3: Clustering comparison
# Only methods that recorded a silhouette score are shown.
clustering_methods = []
clustering_scores = []
for method, results in clustering_pipeline.results.items():
    if 'silhouette_score' in results:
        clustering_methods.append(method.upper())
        clustering_scores.append(results['silhouette_score'])

if clustering_methods:
    axes[1, 0].bar(clustering_methods, clustering_scores)
    axes[1, 0].set_title('Unsupervised Clustering Methods')
    axes[1, 0].set_ylabel('Silhouette Score')
    axes[1, 0].tick_params(axis='x', rotation=45)

# Plot 4: RL strategy comparison
# Deliberately NOT named rl_returns: that name holds the per-step return array
# computed in section 5.3, and reusing it for a list of percentages would silently
# change what later code (or a re-run cell) sees under that name.
rl_methods = list(strategies.keys())
strategy_returns_pct = [(v/10000 - 1)*100 for v in strategies.values()]

bars = axes[1, 1].bar(rl_methods, strategy_returns_pct)
axes[1, 1].set_title('Reinforcement Learning Strategies')
axes[1, 1].set_ylabel('Return (%)')
axes[1, 1].tick_params(axis='x', rotation=45)
axes[1, 1].axhline(y=0, color='black', linestyle='-', alpha=0.3)

# Color best performing bar
max_idx = strategy_returns_pct.index(max(strategy_returns_pct))
bars[max_idx].set_color('green')

plt.tight_layout()
plt.show()

# Key takeaways and recommendations
# The closing report is assembled as data and written with a single print.
takeaway_rule = "=" * 80

# Section heading -> bullet points, in display order
when_to_use = {
    "SUPERVISED LEARNING": (
        "Use when you have abundant labeled data",
        "Stacking/ensembles for maximum performance",
        "Deep learning for complex patterns",
    ),
    "UNSUPERVISED LEARNING": (
        "Exploratory data analysis and pattern discovery",
        "Dimensionality reduction before supervised learning",
        "Customer segmentation, anomaly detection",
    ),
    "SEMI-SUPERVISED LEARNING": (
        "Limited labeled data but abundant unlabeled data",
        "Medical diagnosis, document classification",
        "Can provide 10-30% improvement over supervised-only",
    ),
    "SELF-SUPERVISED LEARNING": (
        "Large datasets without labels",
        "Pre-training for downstream tasks",
        "Computer vision, NLP applications",
    ),
    "REINFORCEMENT LEARNING": (
        "Sequential decision making",
        "Game playing, robotics, trading",
        "Optimization of long-term rewards",
    ),
}

techniques_demonstrated = (
    "Residual connections and batch normalization",
    "Experience replay and target networks",
    "Contrastive learning and masked modeling",
    "Graph-based semi-supervised learning",
    "Variational autoencoders for generation",
)

practical_recommendations = (
    "Start with simple baselines before complex methods",
    "Use ensemble methods for critical applications",
    "Consider semi-supervised when labels are expensive",
    "Self-supervised pre-training often improves downstream performance",
    "RL requires careful environment design and reward engineering",
)

# A leading "\n" on an entry produces the blank line that precedes it.
takeaway_lines = ["\n" + takeaway_rule, "KEY TAKEAWAYS AND RECOMMENDATIONS", takeaway_rule]

takeaway_lines.append("\n🎯 WHEN TO USE EACH APPROACH:")
for heading, points in when_to_use.items():
    takeaway_lines.append(f"\n• {heading}:")
    takeaway_lines.extend(f"  - {point}" for point in points)

takeaway_lines.append("\n🚀 ADVANCED TECHNIQUES DEMONSTRATED:")
takeaway_lines.extend(f"• {item}" for item in techniques_demonstrated)

takeaway_lines.append("\n💡 PRACTICAL RECOMMENDATIONS:")
takeaway_lines.extend(f"• {item}" for item in practical_recommendations)

takeaway_lines += [
    "\n" + takeaway_rule,
    "NOTEBOOK COMPLETION: ALL ADVANCED ML TYPES DEMONSTRATED",
    takeaway_rule,
]

print("\n".join(takeaway_lines))
