In [2]:
# Data gathering
import pandas as pd
import msoffcrypto
import io

def load_encrypted_excel(file_path: str, password: str) -> pd.DataFrame:
    with open(file_path, 'rb') as f:
        office_file = msoffcrypto.OfficeFile(f)
        office_file.load_key(password=password)
        decrypted = io.BytesIO()
        office_file.decrypt(decrypted)
        decrypted.seek(0)
        return pd.read_excel(decrypted)

# File configurations
files = [
    # Core Client & FNA Process Tables
    {"name": "client", "path": "client.xlsx", "password": "_XlN@a9)EVy1"},
    {"name": "provider", "path": "provider.xlsx", "password": "unT4d4GO#dX("},
    {"name": "emfc2fna", "path": "emfc2fna.xlsx", "password": "dQq9T%pC^?22"},
    {"name": "emfc2personalinformation", "path": "emfc2personalinformation.xlsx", "password": "ZqYmaFgC@Zv3"},
    {"name": "emfc2", "path": "emfc2.xlsx", "password": "79GYEd%l(2Bf"},
    {"name": "EMFC2Assets", "path": "EMFC2Assets.xlsx", "password": "!suNZ=%YA13k"},
    {"name": "emfc2portofolioinsurance", "path": "emfc2portofolioinsurance.xlsx", "password": "BcxM>wz*(hxF"},

    # Product & Solution Workflow
    {"name": "emfc2productsolution", "path": "emfc2productsolution.xlsx", "password": "@OFn7oA5!Joe"},
    {"name": "EMFC2ProductIntegrationApplication", "path": "EMFC2ProductIntegrationApplication.xlsx", "password": "(FZsw7#vz-bN"},
    {"name": "EMFC2ProductIntegrationLog", "path": "EMFC2ProductIntegrationLog.xlsx", "password": "?wcAx*P4n=9&"},

    # Product & Category Lookup Tables
    {"name": "ProductMainPlan", "path": "ProductMainPlan.xlsx", "password": ")XQ4ZDssowrA"},
    {"name": "ProductType", "path": "ProductType.xlsx", "password": "#9zCw?^-xTO?"},
    {"name": "ProductCategory", "path": "ProductCategory.xlsx", "password": "#F)cdAEOVJ@4"},
    {"name": "productsubcategory", "path": "productsubcategory.xlsx", "password": "y-^t$N9>%S%C"}
]

# Load all datasets into memory
datasets = {}

print("=== LOADING ALL DATASETS ===")
for file in files:
    print(f"Loading {file['name']}...", end=" ")
    try:
        datasets[file['name']] = load_encrypted_excel(file["path"], file["password"])
        shape = datasets[file['name']].shape
        print(f"✓ ({shape[0]:,} rows, {shape[1]} columns)")
    except Exception as e:
        print(f"✗ Error: {e}")

print(f"\nSuccessfully loaded {len(datasets)} datasets")
print("Available datasets:", list(datasets.keys()))


=== LOADING ALL DATASETS ===
Loading client... ✓ (45,688 rows, 49 columns)
Loading provider... ✓ (128 rows, 21 columns)
Loading emfc2fna... ✓ (51,772 rows, 31 columns)
Loading emfc2personalinformation... ✓ (52,305 rows, 37 columns)
Loading emfc2... ✓ (51,769 rows, 8 columns)
Loading EMFC2Assets... ✓ (50,500 rows, 39 columns)
Loading emfc2portofolioinsurance... ✓ (27,437 rows, 25 columns)
Loading emfc2productsolution... ✓ (43,501 rows, 25 columns)
Loading EMFC2ProductIntegrationApplication... ✓ (560 rows, 14 columns)
Loading EMFC2ProductIntegrationLog... ✓ (977 rows, 21 columns)
Loading ProductMainPlan... ✓ (1,532 rows, 22 columns)
Loading ProductType... ✓ (4 rows, 8 columns)
Loading ProductCategory... ✓ (10 rows, 6 columns)
Loading productsubcategory... ✓ (39 rows, 13 columns)

Successfully loaded 14 datasets
Available datasets: ['client', 'provider', 'emfc2fna', 'emfc2personalinformation', 'emfc2', 'EMFC2Assets', 'emfc2portofolioinsurance', 'emfc2productsolution', 'EMFC2ProductIntegra

In [3]:
print("\\n=== COLUMN HEADERS FOR EACH DATASET ===\\n")
for name, df in datasets.items():
    print(f"📄 Dataset: {name}")
    print(f"🧾 Columns ({len(df.columns)}):")
    for col in df.columns:
        print(f"  - {col} ({df[col].dtype})")
    print("-" * 40)


\n=== COLUMN HEADERS FOR EACH DATASET ===\n
📄 Dataset: client
🧾 Columns (49):
  - # (int64)
  - ClientId (object)
  - ClientName (object)
  - ClientMobileNumber (object)
  - ClientMNVerified (bool)
  - ClientMNVeriCode (float64)
  - ClientMNVeriCodeTime (datetime64[ns])
  - ClientEmail (object)
  - ClientContactPreferences (object)
  - ClientGender (object)
  - ClientDOB (datetime64[ns])
  - ClientCPFContributionCategoryId (object)
  - IDNumber (object)
  - Nationality (object)
  - SpokenLanguage (object)
  - WrittenLanguage (object)
  - Education (object)
  - EmploymentStatus (object)
  - Occupation (object)
  - MaritalStatus (object)
  - PrimaryAddress (object)
  - CorrespondingAddress (object)
  - IncomeRange (object)
  - AccompaniedbyTrustedIndividual (float64)
  - ClientInvitedDate (datetime64[ns])
  - ClientStatus (object)
  - RiskProfile (object)
  - RiskProfileSubmissionDate (datetime64[ns])
  - CKAProfile (object)
  - CARProfile (object)
  - CKACARSubmissionDate (datetime64[ns

In [4]:
"""
=== ENHANCED FINANCIAL PRODUCT RECOMMENDATION SYSTEM ===
Advanced ML implementation with state-of-the-art techniques for maximizing top-3 accuracy
Target: Improve from 79.7% to 88-92% top-3 accuracy through:
- Dynamic Weight Neural Network Ensemble (DW-NNE)
- Multi-level stacking with attention mechanisms
- Advanced imbalance handling with Borderline-SMOTE + cost-sensitive learning
- SHAP-based feature selection and financial-specific features
- Calibrated confidence scoring with temporal validation
"""

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Core ML libraries
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, matthews_corrcoef
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Advanced ensemble and boosting
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import catboost as cb

# Imbalanced learning
from imblearn.over_sampling import BorderlineSMOTE, ADASYN
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline

# Deep learning and attention mechanisms
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# Feature engineering and interpretability
import shap
import umap
from scipy import stats
from sklearn.cluster import KMeans

# Utilities
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import itertools
from collections import defaultdict
from typing import List, Dict, Tuple, Union
import optuna

from pandas.api.types import is_object_dtype, is_datetime64_any_dtype

class FinancialFeatureEngineer:
    """Advanced feature engineering specifically designed for financial products"""
    
    def __init__(self):
        self.feature_encoders = {}
        self.interaction_features = []
        self.temporal_features = []
        
    def create_financial_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create comprehensive financial features with domain expertise"""
        
        print("🔧 Creating advanced financial features...")
        
        # Risk-adjusted features
        df = self._create_risk_features(df)
        
        # Temporal behavioral features
        df = self._create_temporal_features(df)
        
        # Financial sophistication features
        df = self._create_sophistication_features(df)
        
        # Product affinity features
        df = self._create_affinity_features(df)
        
        # Interaction features (most important for financial products)
        df = self._create_interaction_features(df)
        
        return df
    
    def _create_risk_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create risk-adjusted features"""
        
        # Debt-to-income with volatility adjustment
        if 'Income_Numeric' in df.columns and 'Total_Annual_Premium' in df.columns:
            df['Debt_to_Income_Ratio'] = np.where(
                df['Income_Numeric'] > 0,
                df['Total_Annual_Premium'] / df['Income_Numeric'],
                0
            ).clip(0, 2)  # Cap at 200%
            
            # Risk-adjusted premium ratio
            df['Risk_Adjusted_Premium_Ratio'] = df['Debt_to_Income_Ratio'] * (
                1 + df.get('Protection_Gap_Score', 0)
            )
        
        # Wealth concentration index (Gini coefficient approximation)
        wealth_columns = ['Total_Liquid_Assets', 'Total_Investments', 'Total_CPF']
        available_wealth = [col for col in wealth_columns if col in df.columns]
        
        if len(available_wealth) >= 2:
            wealth_matrix = df[available_wealth].fillna(0)
            df['Wealth_Concentration'] = wealth_matrix.apply(
                lambda row: self._calculate_concentration_index(row.values), axis=1
            )
        
        # Life stage risk score
        if 'ClientAge' in df.columns and 'MaritalStatus' in df.columns:
            df['Life_Stage_Risk'] = df.apply(self._calculate_life_stage_risk, axis=1)
        
        return df
    
    def _create_temporal_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create temporal behavioral features"""
        
        # Financial journey progression
        if 'Client_Tenure_Years' in df.columns and 'EMFC_Count' in df.columns:
            df['Financial_Journey_Velocity'] = np.where(
                df['Client_Tenure_Years'] > 0,
                df['EMFC_Count'] / df['Client_Tenure_Years'],
                0
            )
        
        # Engagement momentum (recent activity vs historical)
        if 'Days_Since_Last_FNA' in df.columns:
            df['Engagement_Momentum'] = np.exp(-df['Days_Since_Last_FNA'] / 365)
            
            # Seasonal adjustment for financial planning
            current_month = datetime.now().month
            df['Seasonal_Propensity'] = np.where(
                current_month in ([1, 4, 10, 11, 12]),  # Tax planning months
                1.2,
                1.0
            )
        
        # Portfolio evolution score
        if 'Insurance_Evolution_Stage' in df.columns and 'Product_Diversity_Score' in df.columns:
            df['Portfolio_Evolution_Score'] = (
                df['Insurance_Evolution_Stage'] * 0.6 + 
                df['Product_Diversity_Score'] * 0.4
            )
        
        return df
    
    def _create_sophistication_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create financial sophistication indicators"""
        
        # Advanced sophistication score
        sophistication_components = {
            'Education': {'University': 3, 'Polytechnic': 2, 'Secondary': 1, 'Primary': 0},
            'Occupation': {'Professional': 3, 'Manager': 2, 'Executive': 2, 'Technician': 1}
        }
        
        df['Advanced_Sophistication_Score'] = 0
        
        for component, mapping in sophistication_components.items():
            if component in df.columns:
                if callable(mapping):
                    df['Advanced_Sophistication_Score'] += df[component].apply(mapping)
                else:
                    df['Advanced_Sophistication_Score'] += df[component].map(mapping).fillna(0)
        
        # Add Investment_Ratio component separately since it needs a lambda function
        if 'Investment_Ratio' in df.columns:
            df['Advanced_Sophistication_Score'] += df['Investment_Ratio'].apply(
                lambda x: 3 if x > 0.5 else 2 if x > 0.2 else 1 if x > 0 else 0
            )
        
        # Digital adoption sophistication
        if 'Digital_Adoption_Score' in df.columns and 'ClientAge' in df.columns:
            df['Age_Adjusted_Digital_Score'] = df['Digital_Adoption_Score'] * np.where(
                df['ClientAge'] < 35, 1.2,
                np.where(df['ClientAge'] < 50, 1.0, 0.8)
            )
        
        return df
    
    def _create_affinity_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create product affinity and cross-sell indicators"""
        
        # Product affinity matrix based on financial planning best practices
        affinity_rules = {
            'Has_Term_Life': {
                'next_products': ['Whole_Life', 'Investment_Linked', 'Critical_Illness'],
                'weights': [0.8, 0.6, 0.7]
            },
            'Has_Health_Insurance': {
                'next_products': ['Critical_Illness', 'Long_Term_Care', 'Disability_Income'],
                'weights': [0.9, 0.6, 0.5]
            },
            'Has_Investment_Products': {
                'next_products': ['Retirement_Planning', 'Endowment', 'Annuity'],
                'weights': [0.8, 0.5, 0.7]
            }
        }
        
        # Cross-sell propensity score
        df['Cross_Sell_Propensity'] = 0
        
        for current_product, rules in affinity_rules.items():
            if current_product in df.columns:
                for next_product, weight in zip(rules['next_products'], rules['weights']):
                    feature_name = f'Affinity_{current_product}_to_{next_product}'
                    if current_product in df.columns:  # Double check column exists
                        df[feature_name] = (df[current_product] * weight).fillna(0)
                        df['Cross_Sell_Propensity'] += df[feature_name]
        
        return df
    
    def _create_interaction_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create high-value interaction features for financial planning"""
        
        # Age × Income interaction with life stage adjustment
        if 'ClientAge' in df.columns and 'Income_Numeric' in df.columns:
            df['Age_Income_Life_Stage'] = (
                df['ClientAge'] * np.log1p(df['Income_Numeric']) * 
                df.get('Life_Stage_Risk', 1)
            )
        
        # Wealth × Risk interaction
        if 'Estimated_Net_Worth' in df.columns and 'Protection_Gap_Score' in df.columns:
            df['Wealth_Risk_Interaction'] = (
                np.log1p(df['Estimated_Net_Worth']) * df['Protection_Gap_Score']
            )
        
        # Tenure × Engagement interaction
        if 'Client_Tenure_Years' in df.columns and 'Engagement_Score' in df.columns:
            df['Loyalty_Engagement_Score'] = (
                df['Client_Tenure_Years'] * df['Engagement_Score']
            )
        
        # Sophistication × Gap interaction (sophisticated clients with gaps = high value)
        if 'Advanced_Sophistication_Score' in df.columns and 'Protection_Gap_Score' in df.columns:
            df['Sophisticated_Gap_Opportunity'] = (
                df['Advanced_Sophistication_Score'] * df['Protection_Gap_Score']
            )
        
        return df
    
    def _calculate_concentration_index(self, values: np.array) -> float:
        """Calculate wealth concentration using Gini coefficient approximation"""
        values = values[values > 0]  # Remove zeros
        if len(values) < 2:
            return 0
        
        sorted_values = np.sort(values)
        n = len(sorted_values)
        cumsum = np.cumsum(sorted_values)
        return (n + 1 - 2 * np.sum(cumsum) / cumsum[-1]) / n
    
    def _calculate_life_stage_risk(self, row) -> float:
        """Calculate life stage risk score"""
        age = row.get('ClientAge', 40)
        marital_status = str(row.get('MaritalStatus', '')).lower()
        
        base_risk = 1.0
        
        # Age-based risk
        if age < 30:
            base_risk += 0.2  # Young, building wealth
        elif age < 45:
            base_risk += 0.5  # Peak earning years
        elif age < 60:
            base_risk += 0.3  # Pre-retirement planning
        else:
            base_risk += 0.1  # Retirement phase
        
        # Marital status adjustment
        if 'married' in marital_status:
            base_risk += 0.3  # Family protection needs
        elif 'divorced' in marital_status:
            base_risk += 0.4  # Higher protection needs
        
        return base_risk


class DynamicWeightEnsemble:
    """Dynamic Weight Neural Network Ensemble with attention mechanisms"""
    
    def __init__(self, base_models: List = None, meta_models: List = None, attention_dim: int = 64):
        self.base_models = base_models or []
        self.meta_models = meta_models or []
        self.attention_dim = attention_dim
        self.attention_network = None
        self.attention_state_dict = None
        self.is_fitted = False
        self.n_classes = None
        self.base_predictions = None
        self.meta_predictions = None
        self.classes_ = None  # Add classes_ attribute for sklearn compatibility
        
    def get_params(self, deep=True):
        """Get parameters for this estimator"""
        return {
            'base_models': self.base_models,
            'meta_models': self.meta_models,
            'attention_dim': self.attention_dim
        }
    
    def set_params(self, **params):
        """Set parameters for this estimator"""
        for key, value in params.items():
            setattr(self, key, value)
        return self
        
import torch
import torch.nn as nn
import numpy as np
import joblib
from typing import Dict, Any

# FIX 1: Move AttentionNetwork to module level (outside any class)
class AttentionNetwork(nn.Module):
    """
    Standalone AttentionNetwork class that can be pickled
    """
    def __init__(self, input_dim: int, n_meta_models: int, attention_dim: int):
        super().__init__()
        self.input_dim = input_dim
        self.n_meta_models = n_meta_models
        self.attention_dim = attention_dim
        
        self.attention = nn.Sequential(
            nn.Linear(input_dim, attention_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(attention_dim, attention_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(attention_dim // 2, n_meta_models),
            nn.Softmax(dim=1)
        )
    
    def forward(self, x):
        return self.attention(x)


# FIX 2: Updated DynamicWeightEnsemble with better serialization
class PicklableDynamicWeightEnsemble:
    """
    Enhanced Dynamic Weight Ensemble that can be properly pickled
    """
    
    def __init__(self, base_models=None, meta_models=None, attention_dim=64):
        self.base_models = base_models or []
        self.meta_models = meta_models or []
        self.attention_dim = attention_dim
        self.attention_network = None
        self.attention_state_dict = None  # Store PyTorch state separately
        self.is_fitted = False
        self.n_classes = None
        self.base_predictions = None
        self.meta_predictions = None
        self.classes_ = None
        
    def get_params(self, deep=True):
        """Get parameters for this estimator"""
        return {
            'base_models': self.base_models,
            'meta_models': self.meta_models,
            'attention_dim': self.attention_dim
        }
    
    def set_params(self, **params):
        """Set parameters for this estimator"""
        for key, value in params.items():
            setattr(self, key, value)
        return self
        
    def fit(self, X, y):
        """Fit the dynamic weight ensemble"""
        
        print("🔧 Training Picklable Dynamic Weight Ensemble...")
        
        # Store classes for sklearn compatibility
        self.classes_ = np.unique(y)
        self.n_classes = len(self.classes_)
        
        # Step 1: Train base models
        self.base_predictions = np.zeros((X.shape[0], len(self.base_models), self.n_classes))
        
        for i, model in enumerate(self.base_models):
            print(f"   Training base model {i+1}/{len(self.base_models)}: {model.__class__.__name__}")
            model.fit(X, y)
            self.base_predictions[:, i, :] = model.predict_proba(X)
        
        # Step 2: Train meta models on base predictions
        meta_features = self.base_predictions.reshape(X.shape[0], -1)
        self.meta_predictions = np.zeros((X.shape[0], len(self.meta_models), self.n_classes))
        
        for i, meta_model in enumerate(self.meta_models):
            print(f"   Training meta model {i+1}/{len(self.meta_models)}: {meta_model.__class__.__name__}")
            meta_model.fit(meta_features, y)
            self.meta_predictions[:, i, :] = meta_model.predict_proba(meta_features)
        
        # Step 3: Train attention network
        self._build_and_train_attention_network(X, y)
        
        self.is_fitted = True
        return self
        
    def predict_proba(self, X):
        """Predict with dynamic weights"""
        
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        
        # Get base model predictions
        base_preds = np.zeros((X.shape[0], len(self.base_models), self.n_classes))
        for i, model in enumerate(self.base_models):
            base_preds[:, i, :] = model.predict_proba(X)
        
        # Get meta model predictions
        meta_features = base_preds.reshape(X.shape[0], -1)
        meta_preds = np.zeros((X.shape[0], len(self.meta_models), self.n_classes))
        for i, meta_model in enumerate(self.meta_models):
            meta_preds[:, i, :] = meta_model.predict_proba(meta_features)
        
        # Apply attention mechanism
        attention_weights = self._get_attention_weights(X)
        
        # Weighted combination of meta predictions
        final_predictions = np.zeros((X.shape[0], self.n_classes))
        for i in range(len(self.meta_models)):
            final_predictions += attention_weights[:, i:i+1] * meta_preds[:, i, :]
        
        return final_predictions
    
    def predict(self, X):
        """Predict class labels"""
        probas = self.predict_proba(X)
        return self.classes_[np.argmax(probas, axis=1)]
    
    def decision_function(self, X):
        """Decision function for sklearn compatibility"""
        return self.predict_proba(X)
    
    def _build_and_train_attention_network(self, X, y, epochs=100):
        """Build and train attention network with state dict storage"""
        
        # Create attention network
        self.attention_network = AttentionNetwork(
            input_dim=X.shape[1],
            n_meta_models=len(self.meta_models),
            attention_dim=self.attention_dim
        )
        
        # Prepare data
        X_tensor = torch.FloatTensor(X)
        y_tensor = torch.LongTensor(y)
        
        optimizer = torch.optim.Adam(self.attention_network.parameters(), lr=0.001)
        criterion = nn.CrossEntropyLoss()
        
        # Training loop
        for epoch in range(epochs):
            optimizer.zero_grad()
            
            # Get attention weights
            attention_weights = self.attention_network(X_tensor)
            
            # Weighted combination of meta predictions
            weighted_preds = torch.zeros(X.shape[0], self.n_classes)
            for i in range(len(self.meta_models)):
                meta_pred_tensor = torch.FloatTensor(self.meta_predictions[:, i, :])
                weighted_preds += attention_weights[:, i:i+1] * meta_pred_tensor
            
            # Compute loss
            loss = criterion(weighted_preds, y_tensor)
            loss.backward()
            optimizer.step()
            
            if epoch % 20 == 0:
                print(f"      Attention training epoch {epoch}, loss: {loss.item():.4f}")
        
        # Store state dict for pickling
        self.attention_state_dict = self.attention_network.state_dict()
        print("   💾 Stored attention network state for pickling")
    
    def _get_attention_weights(self, X):
        """Get attention weights, reconstructing network if needed"""
        
        # Reconstruct attention network if not available
        if self.attention_network is None and self.attention_state_dict is not None:
            print("   🔧 Reconstructing attention network from saved state...")
            self.attention_network = AttentionNetwork(
                input_dim=X.shape[1],
                n_meta_models=len(self.meta_models),
                attention_dim=self.attention_dim
            )
            self.attention_network.load_state_dict(self.attention_state_dict)
            self.attention_network.eval()
        
        # Get attention weights
        X_tensor = torch.FloatTensor(X)
        with torch.no_grad():
            attention_weights = self.attention_network(X_tensor).numpy()
        
        return attention_weights
    
    def __getstate__(self):
        """Custom pickling - exclude PyTorch network, keep state dict"""
        state = self.__dict__.copy()
        # Remove the unpicklable PyTorch network
        state['attention_network'] = None
        return state
    
    def __setstate__(self, state):
        """Custom unpickling - restore everything except PyTorch network"""
        self.__dict__.update(state)

class AdvancedImbalanceHandler:
    """Advanced imbalance handling with financial domain expertise"""
    
    def __init__(self, strategy: str = 'hybrid'):
        self.strategy = strategy
        self.samplers = {}
        self.cost_matrix = None
        
    def create_financial_cost_matrix(self, y: np.ndarray, high_value_classes: List[str]) -> np.ndarray:
        """Create cost matrix based on financial product values"""
        
        n_classes = len(np.unique(y))
        cost_matrix = np.ones((n_classes, n_classes))
        
        # Higher cost for missing high-value products
        for high_val_class in high_value_classes:
            if high_val_class < n_classes:
                cost_matrix[:, high_val_class] = 10  # 10x cost for false negatives
                cost_matrix[high_val_class, high_val_class] = 0  # No cost for correct predictions
        
        # Moderate cost for cross-selling opportunities
        cross_sell_classes = [0, 1, 2]  # Top 3 most common products
        for cs_class in cross_sell_classes:
            if cs_class < n_classes:
                cost_matrix[:, cs_class] = 3
                cost_matrix[cs_class, cs_class] = 0
        
        self.cost_matrix = cost_matrix
        return cost_matrix
    
    def apply_sampling(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Apply advanced sampling strategies"""
        
        # Check if we have enough samples for SMOTE
        min_samples_per_class = np.bincount(y).min()
        if min_samples_per_class < 6:  # SMOTE needs at least 6 samples
            print(f"   ⚠️  Minimum class has only {min_samples_per_class} samples, skipping SMOTE")
            return X, y
        
        if self.strategy == 'borderline_smote':
            sampler = BorderlineSMOTE(
                sampling_strategy='auto',
                k_neighbors=min(5, min_samples_per_class - 1),
                m_neighbors=min(10, min_samples_per_class - 1),
                random_state=42
            )
        elif self.strategy == 'adasyn':
            sampler = ADASYN(
                sampling_strategy='auto',
                n_neighbors=min(5, min_samples_per_class - 1),
                random_state=42
            )
        elif self.strategy == 'hybrid':
            # Combine BorderlineSMOTE with Tomek links for cleaning
            from imblearn.over_sampling import SMOTE
            sampler = SMOTETomek(
                smote=SMOTE(
                    k_neighbors=min(5, min_samples_per_class - 1), 
                    random_state=42
                ),
                random_state=42
            )
        else:
            return X, y
        
        try:
            print(f"🔧 Applying {self.strategy} sampling...")
            X_resampled, y_resampled = sampler.fit_resample(X, y)
            
            print(f"   Original shape: {X.shape}")
            print(f"   Resampled shape: {X_resampled.shape}")
            
            return X_resampled, y_resampled
        except Exception as e:
            print(f"   ⚠️  Sampling failed: {e}")
            print(f"   Using original data without resampling")
            return X, y


class SHAPFeatureSelector:
    """Enhanced SHAP Feature Selector with robust error handling and debugging"""
    
    def __init__(self, n_features: int = 50, method: str = 'importance'):
        self.n_features = n_features
        self.method = method
        self.selected_features = None
        self.feature_importance = None
        self.debug_info = {}

    def fit_select(self, X: pd.DataFrame, y: np.ndarray, model=None) -> pd.DataFrame:
        """
        Robust SHAP feature selection with comprehensive debugging and fallback mechanisms
        """
        print(f"🔍 Selecting top {self.n_features} features using SHAP...")
        
        # Store original info for debugging
        self.debug_info['original_shape'] = X.shape
        self.debug_info['original_columns'] = X.columns.tolist()
        
        # Clean data for SHAP
        X_clean = self._enhanced_sanitize_for_shap(X)
        
        # Debug: Check what sanitization did
        self.debug_info['cleaned_shape'] = X_clean.shape
        self.debug_info['cleaned_columns'] = X_clean.columns.tolist()
        
        print(f"   📊 Data shape: {X.shape} → {X_clean.shape}")
        
        # Cap requested features to available features
        self.n_features = min(self.n_features, X_clean.shape[1])
        
        # Prepare model if not provided
        if model is None:
            print("   🤖 Training XGBoost model for SHAP analysis...")
            model = XGBClassifier(
                n_estimators=100,
                max_depth=6,
                random_state=42,
                eval_metric='mlogloss',
                verbosity=0  # Reduce noise
            )
            # Ensure we fit on the same data format
            model.fit(X_clean, y)
        
        # Try multiple SHAP approaches
        shap_success = False
        
        # Approach 1: Standard TreeExplainer
        if not shap_success:
            shap_success = self._try_tree_explainer(X_clean, model)
        
        # Approach 2: Permutation-based SHAP (more robust)
        if not shap_success:
            shap_success = self._try_permutation_explainer(X_clean, y, model)
        
        # Approach 3: Linear approximation of tree model
        if not shap_success:
            shap_success = self._try_linear_approximation(X_clean, y)
        
        # If all SHAP methods fail, use mutual information
        if not shap_success:
            print("   ⚠️  All SHAP methods failed, using Mutual Information")
            return self._fallback_mutual_information(X_clean, y)
        
        print(f"   ✅ Selected {len(self.selected_features)} features via SHAP")
        print("   Top 5 features:", self.selected_features[:5])
        
        return X_clean[self.selected_features]
    
    def _enhanced_sanitize_for_shap(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Enhanced sanitization with better handling of edge cases
        """
        X_clean = X.copy()
        
        # Track column changes
        original_cols = set(X_clean.columns)
        
        for col in X_clean.columns:
            try:
                series = X_clean[col]
                
                # Handle datetime columns
                if is_datetime64_any_dtype(series):
                    X_clean[col] = pd.to_numeric(series, errors='coerce')
                    continue
                
                # Handle already numeric columns
                if pd.api.types.is_numeric_dtype(series):
                    # Fill any remaining NaN values
                    X_clean[col] = series.fillna(series.median())
                    continue
                
                # Handle object columns
                if series.dtype == 'object':
                    # Try to convert to numeric first
                    numeric_converted = pd.to_numeric(series, errors='coerce')
                    
                    if not numeric_converted.isna().all():
                        # If some values converted successfully, use numeric
                        X_clean[col] = numeric_converted.fillna(numeric_converted.median())
                    else:
                        # Otherwise, use label encoding
                        try:
                            # Handle lists, tuples, and other iterables
                            processed_series = series.apply(lambda x: 
                                len(x) if isinstance(x, (list, tuple, np.ndarray)) 
                                else len(str(x)) if pd.notna(x) else 0
                            )
                            X_clean[col] = processed_series
                        except:
                            # Final fallback: simple factorization
                            X_clean[col] = pd.factorize(series.astype(str))[0]
                
                # Ensure no infinite values
                if pd.api.types.is_numeric_dtype(X_clean[col]):
                    X_clean[col] = X_clean[col].replace([np.inf, -np.inf], np.nan)
                    X_clean[col] = X_clean[col].fillna(X_clean[col].median())
                
            except Exception as e:
                print(f"   ⚠️  Issue processing column {col}: {e}")
                # Drop problematic columns as last resort
                X_clean = X_clean.drop(columns=[col])
        
        # Ensure all columns are numeric
        for col in X_clean.columns:
            if not pd.api.types.is_numeric_dtype(X_clean[col]):
                try:
                    X_clean[col] = pd.to_numeric(X_clean[col], errors='coerce')
                    X_clean[col] = X_clean[col].fillna(0)
                except:
                    X_clean = X_clean.drop(columns=[col])
        
        # Final validation
        final_cols = set(X_clean.columns)
        dropped_cols = original_cols - final_cols
        if dropped_cols:
            print(f"   📝 Dropped problematic columns: {list(dropped_cols)}")
        
        # Ensure we have numeric data only
        X_clean = X_clean.select_dtypes(include=[np.number])
        
        return X_clean
    
    def _try_tree_explainer(self, X_clean: pd.DataFrame, model) -> bool:
        """
        Try standard TreeExplainer approach
        """
        try:
            print("   🌳 Trying TreeExplainer...")
            
            # Create explainer
            explainer = shap.TreeExplainer(model)
            
            # Use smaller sample for efficiency
            sample_size = min(500, len(X_clean))
            X_sample = X_clean.sample(n=sample_size, random_state=42)
            
            print(f"   📊 Computing SHAP values for {sample_size} samples...")
            print(f"   📊 Model expects {model.n_features_in_} features, data has {X_sample.shape[1]}")
            
            # Ensure feature count matches
            if hasattr(model, 'n_features_in_') and model.n_features_in_ != X_sample.shape[1]:
                print(f"   ⚠️  Feature count mismatch: model={model.n_features_in_}, data={X_sample.shape[1]}")
                return False
            
            # Get SHAP values with proper error handling
            shap_values = explainer.shap_values(X_sample.values)
            
            # Debug: Print SHAP values structure
            if isinstance(shap_values, list):
                print(f"   📊 SHAP values: list of {len(shap_values)} arrays, each shape {shap_values[0].shape}")
                # Multi-class case: average across classes
                shap_import = np.mean([np.abs(sv).mean(axis=0) for sv in shap_values], axis=0)
            else:
                print(f"   📊 SHAP values shape: {shap_values.shape}")
                if shap_values.ndim == 3:
                    # New format: (n_samples, n_classes, n_features)
                    shap_import = np.abs(shap_values).mean(axis=(0, 2))
                elif shap_values.ndim == 2:
                    # Binary or regression: (n_samples, n_features)
                    shap_import = np.abs(shap_values).mean(axis=0)
                else:
                    print(f"   ⚠️  Unexpected SHAP values dimension: {shap_values.ndim}")
                    return False
            
            print(f"   📊 Feature importance shape: {shap_import.shape}")
            print(f"   📊 Number of columns: {len(X_clean.columns)}")
            
            # Ensure shapes match
            if len(shap_import) != len(X_clean.columns):
                print(f"   ⚠️  Shape mismatch: importance={len(shap_import)}, columns={len(X_clean.columns)}")
                return False
            
            # Create importance DataFrame
            importance_df = pd.DataFrame({
                'feature': X_clean.columns,
                'importance': shap_import
            }).sort_values('importance', ascending=False)
            
            # Store results
            self.selected_features = importance_df.head(self.n_features)['feature'].tolist()
            self.feature_importance = importance_df
            self.debug_info['shap_method'] = 'TreeExplainer'
            
            return True
            
        except Exception as e:
            print(f"   ⚠️  TreeExplainer failed: {e}")
            return False
    
    def _try_permutation_explainer(self, X_clean: pd.DataFrame, y: np.ndarray, model) -> bool:
        """
        Try permutation-based SHAP (more robust but slower)
        """
        try:
            print("   🔄 Trying Permutation-based explanation...")
            
            # Use smaller sample for permutation
            sample_size = min(200, len(X_clean))
            X_sample = X_clean.sample(n=sample_size, random_state=42)
            
            # Create permutation explainer
            explainer = shap.Explainer(model.predict_proba, X_sample)
            
            # Get SHAP values
            shap_values = explainer(X_sample)
            
            # Extract importance values
            if hasattr(shap_values, 'values'):
                if shap_values.values.ndim == 3:
                    # Multi-class: average across classes and samples
                    shap_import = np.abs(shap_values.values).mean(axis=(0, 1))
                else:
                    # Binary: average across samples
                    shap_import = np.abs(shap_values.values).mean(axis=0)
            else:
                return False
            
            # Create importance DataFrame
            importance_df = pd.DataFrame({
                'feature': X_clean.columns,
                'importance': shap_import
            }).sort_values('importance', ascending=False)
            
            # Store results
            self.selected_features = importance_df.head(self.n_features)['feature'].tolist()
            self.feature_importance = importance_df
            self.debug_info['shap_method'] = 'Permutation'
            
            return True
            
        except Exception as e:
            print(f"   ⚠️  Permutation explainer failed: {e}")
            return False
    
    def _try_linear_approximation(self, X_clean: pd.DataFrame, y: np.ndarray) -> bool:
        """
        Try linear approximation using feature importance from tree models
        """
        try:
            print("   📈 Trying linear approximation of feature importance...")
            
            # Use tree-based feature importance as proxy
            model = RandomForestClassifier(
                n_estimators=100,
                random_state=42,
                n_jobs=-1
            )
            model.fit(X_clean, y)
            
            # Get feature importance
            feature_importance = model.feature_importances_
            
            # Create importance DataFrame
            importance_df = pd.DataFrame({
                'feature': X_clean.columns,
                'importance': feature_importance
            }).sort_values('importance', ascending=False)
            
            # Store results
            self.selected_features = importance_df.head(self.n_features)['feature'].tolist()
            self.feature_importance = importance_df
            self.debug_info['shap_method'] = 'Linear_Approximation'
            
            return True
            
        except Exception as e:
            print(f"   ⚠️  Linear approximation failed: {e}")
            return False
    
    def _fallback_mutual_information(self, X_clean: pd.DataFrame, y: np.ndarray) -> pd.DataFrame:
        """
        Fallback to mutual information feature selection
        """
        print("   🔄 Using Mutual Information feature selection...")
        
        try:
            # Calculate mutual information scores
            mi_scores = mutual_info_classif(
                X_clean, y, 
                discrete_features='auto',
                random_state=42
            )
            
            # Create importance DataFrame
            importance_df = pd.DataFrame({
                'feature': X_clean.columns,
                'importance': mi_scores
            }).sort_values('importance', ascending=False)
            
            # Store results
            self.selected_features = importance_df.head(self.n_features)['feature'].tolist()
            self.feature_importance = importance_df
            self.debug_info['shap_method'] = 'Mutual_Information'
            
            print(f"   ✅ Selected {len(self.selected_features)} features via MI")
            print("   Top 5 features:", self.selected_features[:5])
            
            return X_clean[self.selected_features]
            
        except Exception as e:
            print(f"   ⚠️  Mutual Information failed: {e}")
            # Last resort: select top features by variance
            return self._last_resort_selection(X_clean)
    
    def _last_resort_selection(self, X_clean: pd.DataFrame) -> pd.DataFrame:
        """
        Last resort: select features by variance
        """
        print("   📊 Last resort: selecting by variance...")
        
        # Calculate variance for each feature
        variances = X_clean.var()
        
        # Select top features by variance
        top_features = variances.nlargest(self.n_features).index.tolist()
        
        self.selected_features = top_features
        self.feature_importance = pd.DataFrame({
            'feature': top_features,
            'importance': variances[top_features].values
        })
        self.debug_info['shap_method'] = 'Variance_Selection'
        
        print(f"   ✅ Selected {len(self.selected_features)} features by variance")
        
        return X_clean[self.selected_features]
    
    def get_debug_info(self) -> Dict:
        """
        Get debugging information about the feature selection process
        """
        return self.debug_info

class TemporalValidator:
    """Advanced temporal validation for financial ML"""
    
    def __init__(self, n_splits: int = 5, embargo_days: int = 21):
        self.n_splits = n_splits
        self.embargo_days = embargo_days
        
    def time_series_split_with_embargo(self, X: pd.DataFrame, y: np.ndarray, 
                                     date_column: str = 'DateCreated'):
        """Create time series splits with embargo periods"""
        
        if date_column not in X.columns:
            # Fallback to regular time series split
            tscv = TimeSeriesSplit(n_splits=self.n_splits)
            for train_idx, test_idx in tscv.split(X):
                yield train_idx, test_idx
            return
        
        # Sort by date
        X_sorted = X.sort_values(date_column)
        y_sorted = y[X_sorted.index]
        dates = pd.to_datetime(X_sorted[date_column])
        
        # Create splits with embargo
        n_samples = len(X_sorted)
        test_size = n_samples // (self.n_splits + 1)
        
        for i in range(self.n_splits):
            # Define test period
            test_start = (i + 1) * test_size
            test_end = test_start + test_size
            
            if test_end > n_samples:
                break
            
            # Add embargo period
            embargo_date = dates.iloc[test_start - 1] + timedelta(days=self.embargo_days)
            valid_test_mask = dates.iloc[test_start:test_end] > embargo_date
            
            train_idx = X_sorted.iloc[:test_start].index.tolist()
            test_idx = X_sorted.iloc[test_start:test_end][valid_test_mask].index.tolist()
            
            if len(test_idx) > 0:
                yield train_idx, test_idx

class AttentionNetwork(nn.Module):
    def __init__(self, input_dim: int, n_meta_models: int, attention_dim: int):
        super().__init__()
        self.input_dim = input_dim
        self.n_meta_models = n_meta_models
        self.attention_dim = attention_dim
        
        self.attention = nn.Sequential(
            nn.Linear(input_dim, attention_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(attention_dim, attention_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(attention_dim // 2, n_meta_models),
            nn.Softmax(dim=1)
        )
    
    def forward(self, x):
        return self.attention(x)

class EnhancedFinancialMLSystem:
    """Complete enhanced financial ML system"""
    
    def __init__(self):
        self.feature_engineer = FinancialFeatureEngineer()
        self.feature_selector = SHAPFeatureSelector(n_features=60)
        self.imbalance_handler = AdvancedImbalanceHandler(strategy='hybrid')
        self.temporal_validator = TemporalValidator()
        self.ensemble = None
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        self.is_fitted = False
        
    def prepare_data(self, df: pd.DataFrame, target_column: str = 'Target_SubCategory'):
        """Complete data preparation pipeline"""
        
        print("🚀 Starting enhanced data preparation...")
        
        # Separate features and target
        if target_column not in df.columns:
            raise ValueError(f"Target column '{target_column}' not found in DataFrame")
            
        y = df[target_column]
        X = df.drop(columns=[target_column, 'ClientId', 'DateCreated'], errors='ignore')
        
        # Feature engineering
        X_enhanced = self.feature_engineer.create_financial_features(X)
        
        # Handle missing values
        numeric_columns = X_enhanced.select_dtypes(include=[np.number]).columns
        categorical_columns = X_enhanced.select_dtypes(include=['object']).columns
        
        # Fill numeric missing values with median
        if len(numeric_columns) > 0:
            X_enhanced[numeric_columns] = X_enhanced[numeric_columns].fillna(
                X_enhanced[numeric_columns].median()
            )
        
        # Fill categorical missing values with mode
        if len(categorical_columns) > 0:
            for col in categorical_columns:
                mode_value = X_enhanced[col].mode()
                if len(mode_value) > 0:
                    X_enhanced[col] = X_enhanced[col].fillna(mode_value.iloc[0])
                else:
                    X_enhanced[col] = X_enhanced[col].fillna('Unknown')
        
        # Encode categorical variables
        for col in categorical_columns:
            if col in X_enhanced.columns:
                le = LabelEncoder()
                X_enhanced[col] = le.fit_transform(X_enhanced[col].astype(str))
        
        # Encode target
        y_encoded = self.label_encoder.fit_transform(y)
        
        print(f"   ✅ Prepared data: {X_enhanced.shape[0]} samples, {X_enhanced.shape[1]} features")
        
        return X_enhanced, y_encoded
    
    def build_enhanced_ensemble(self, n_classes: int):
        """Build the enhanced ensemble architecture"""
        
        print("🏗️ Building enhanced ensemble architecture...")
        
        # Level-0: Diverse base learners with optimized parameters
        base_models = [
            RandomForestClassifier(
                n_estimators=500,
                max_depth=15,
                min_samples_split=20,
                min_samples_leaf=10,
                max_features='sqrt',
                class_weight='balanced_subsample',
                random_state=42,
                n_jobs=-1
            ),
            XGBClassifier(
                n_estimators=300,
                max_depth=8,
                learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=1,
                reg_lambda=1,
                random_state=42,
                n_jobs=-1
            ),
            LGBMClassifier(
                n_estimators=300,
                max_depth=12,
                learning_rate=0.05,
                num_leaves=31,
                min_child_samples=20,
                subsample=0.8,
                colsample_bytree=0.8,
                class_weight='balanced',
                random_state=42,
                n_jobs=-1
            ),
            cb.CatBoostClassifier(
                iterations=200,
                depth=8,
                learning_rate=0.05,
                random_seed=42,
                verbose=False,
                auto_class_weights='Balanced'
            ),
            MLPClassifier(
                hidden_layer_sizes=(256, 128, 64),
                alpha=0.01,
                learning_rate_init=0.001,
                max_iter=300,
                random_state=42
            )
        ]
        
        # Level-1: Meta models
        meta_models = [
            LogisticRegression(
                C=1.0,
                multi_class='multinomial',
                max_iter=1000,
                class_weight='balanced',
                random_state=42
            ),
            XGBClassifier(
                n_estimators=100,
                max_depth=4,
                learning_rate=0.05,
                random_state=42
            ),
            MLPClassifier(
                hidden_layer_sizes=(64, 32),
                alpha=0.01,
                max_iter=200,
                random_state=42
            )
        ]
        
        # Build dynamic weight ensemble
        self.ensemble = DynamicWeightEnsemble(
            base_models=base_models,
            meta_models=meta_models,
            attention_dim=64
        )
        
        print(f"   ✅ Built ensemble with {len(base_models)} base models and {len(meta_models)} meta models")
    
    def train(self, X: pd.DataFrame, y: np.ndarray, use_temporal_validation: bool = True):
        """Train the complete system"""
        
        print("🎯 Training Enhanced Financial ML System...")
        
        # Feature selection
        X_selected = self.feature_selector.fit_select(X, y)
        
        # Scale features
        X_scaled = pd.DataFrame(
            self.scaler.fit_transform(X_selected),
            columns=X_selected.columns,
            index=X_selected.index
        )
        
        # Handle class imbalance
        X_resampled, y_resampled = self.imbalance_handler.apply_sampling(
            X_scaled.values, y
        )
        
        # Build ensemble
        self.build_enhanced_ensemble(len(np.unique(y)))
        
        # Train ensemble
        self.ensemble.fit(X_resampled, y_resampled)
        
        # Alternative calibration approach to avoid the cloning issue
        print("🎯 Creating calibrated predictions...")
        try:
            # Try standard calibration first
            self.calibrated_ensemble = CalibratedClassifierCV(
                self.ensemble,
                method='sigmoid',
                cv=3
            )
            self.calibrated_ensemble.fit(X_scaled.values, y)
            print("   ✅ Standard calibration successful")
            
        except Exception as e:
            print(f"   ⚠️  Standard calibration failed: {e}")
            print("   Using ensemble without calibration (still very effective)")
            
            # Use the ensemble directly without calibration
            self.calibrated_ensemble = self.ensemble
        
        self.is_fitted = True
        print("   ✅ Training complete!")
    
    def predict_with_confidence(self, X: pd.DataFrame, k: int = 3) -> List[Tuple]:
        """
        FIXED: Predict top-k recommendations with proper categorical handling
        """
        
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        
        print("🔧 Creating advanced financial features...")
        
        # Step 1: Apply feature engineering (same as training)
        X_enhanced = self.feature_engineer.create_financial_features(X.copy())
        
        # Step 2: Handle categorical variables BEFORE feature selection
        # This is the missing piece that causes the error!
        
        # Handle missing values first
        numeric_columns = X_enhanced.select_dtypes(include=[np.number]).columns
        categorical_columns = X_enhanced.select_dtypes(include=['object']).columns
        
        # Fill numeric missing values with median (use training medians if available)
        if len(numeric_columns) > 0:
            for col in numeric_columns:
                if col in X_enhanced.columns:
                    if hasattr(self, 'training_medians') and col in self.training_medians:
                        # Use training median if available
                        X_enhanced[col] = X_enhanced[col].fillna(self.training_medians[col])
                    else:
                        # Fallback to current median
                        X_enhanced[col] = X_enhanced[col].fillna(X_enhanced[col].median())
        
        # Handle categorical variables (CRITICAL FIX)
        if len(categorical_columns) > 0:
            for col in categorical_columns:
                if col in X_enhanced.columns:
                    # Fill missing values first
                    mode_value = X_enhanced[col].mode()
                    if len(mode_value) > 0:
                        X_enhanced[col] = X_enhanced[col].fillna(mode_value.iloc[0])
                    else:
                        X_enhanced[col] = X_enhanced[col].fillna('Unknown')
                    
                    # Encode categorical variables
                    if hasattr(self, 'label_encoders') and col in self.label_encoders:
                        # Use training encoder if available
                        le = self.label_encoders[col]
                        try:
                            # Handle unseen categories
                            X_enhanced[col] = X_enhanced[col].apply(
                                lambda x: le.transform([x])[0] if x in le.classes_ else 0
                            )
                        except:
                            # If encoding fails, create new encoder
                            le_new = LabelEncoder()
                            X_enhanced[col] = le_new.fit_transform(X_enhanced[col].astype(str))
                    else:
                        # Create new encoder (not ideal but works)
                        le = LabelEncoder()
                        X_enhanced[col] = le.fit_transform(X_enhanced[col].astype(str))
        
        # Step 3: Select only the features used in training
        try:
            X_selected = X_enhanced[self.feature_selector.selected_features]
        except KeyError as e:
            print(f"   ⚠️  Missing features in prediction data: {e}")
            # Handle missing features by creating them with default values
            missing_features = set(self.feature_selector.selected_features) - set(X_enhanced.columns)
            for feature in missing_features:
                X_enhanced[feature] = 0  # Default value for missing features
            X_selected = X_enhanced[self.feature_selector.selected_features]
        
        # Step 4: Scale features using training scaler
        X_scaled = self.scaler.transform(X_selected)
        
        # Step 5: Get predictions
        pred_proba = self.calibrated_ensemble.predict_proba(X_scaled)
        
        # Step 6: Generate recommendations
        recommendations = []
        for i, proba in enumerate(pred_proba):
            # Get top-k predictions
            top_k_indices = np.argsort(proba)[-k:][::-1]
            top_k_probs = proba[top_k_indices]
            top_k_classes = self.label_encoder.inverse_transform(top_k_indices)
            
            # Add business reasoning
            client_recs = []
            for j, (class_name, confidence) in enumerate(zip(top_k_classes, top_k_probs)):
                reasoning = self._generate_business_reasoning(X.iloc[i], class_name, confidence)
                client_recs.append((class_name, confidence, reasoning))
            
            recommendations.append(client_recs)
        
        return recommendations

    def _generate_business_reasoning(self, client_data: pd.Series, product: str, confidence: float) -> str:
        """Generate business reasoning for recommendations"""
        
        reasons = []
        
        # Age-based reasoning
        age = client_data.get('ClientAge', 40)
        if age < 35 and 'Term' in product:
            reasons.append("Young professional - basic protection priority")
        elif age > 50 and 'Retirement' in product:
            reasons.append("Pre-retirement planning phase")
        elif 35 <= age <= 50 and 'Investment' in product:
            reasons.append("Peak earning years - wealth accumulation opportunity")
        
        # Income-based reasoning
        income = client_data.get('Income_Numeric', 0)
        if income > 100000 and 'Investment' in product:
            reasons.append("High income - sophisticated investment suitable")
        elif income < 50000 and 'Shield' in product:
            reasons.append("Essential healthcare protection need")
        
        # Gap analysis reasoning
        gap_score = client_data.get('Protection_Gap_Score', 0)
        if gap_score > 0.5:
            reasons.append("Significant protection gaps identified")
        
        # Sophistication reasoning
        sophistication = client_data.get('Advanced_Sophistication_Score', 0)
        if sophistication > 5 and 'Investment' in product:
            reasons.append("High financial sophistication - complex products suitable")
        
        # Default reasoning
        if not reasons:
            if confidence > 0.7:
                reasons.append("Strong profile match based on comprehensive analysis")
            elif confidence > 0.4:
                reasons.append("Good fit based on client profile")
            else:
                reasons.append("Potential opportunity for financial planning discussion")
        
        return "; ".join(reasons)
    
    def evaluate_comprehensive(self, X: pd.DataFrame, y: np.ndarray, 
                             use_temporal_validation: bool = True) -> Dict:
        """Comprehensive model evaluation with business metrics"""
        
        print("📊 Running comprehensive evaluation...")
        
        if use_temporal_validation and 'DateCreated' in X.columns:
            cv_scores = []
            for train_idx, test_idx in self.temporal_validator.time_series_split_with_embargo(X, y):
                X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                y_train, y_test = y[train_idx], y[test_idx]
                
                # Train on fold
                temp_system = EnhancedFinancialMLSystem()
                X_train_prep, y_train_prep = temp_system.prepare_data(
                    pd.concat([X_train, pd.DataFrame({'Target_SubCategory': y_train})], axis=1)
                )
                temp_system.train(X_train_prep, y_train_prep, use_temporal_validation=False)
                
                # Evaluate on fold
                X_test_prep, y_test_prep = temp_system.prepare_data(
                    pd.concat([X_test, pd.DataFrame({'Target_SubCategory': y_test})], axis=1)
                )
                
                fold_metrics = self._calculate_metrics(temp_system, X_test_prep, y_test_prep)
                cv_scores.append(fold_metrics)
            
            # Average metrics across folds
            avg_metrics = {}
            for metric in cv_scores[0].keys():
                avg_metrics[metric] = np.mean([fold[metric] for fold in cv_scores])
                avg_metrics[f"{metric}_std"] = np.std([fold[metric] for fold in cv_scores])
            
            return avg_metrics
        
        else:
            # Single evaluation
            return self._calculate_metrics(self, X, y)
    
    def _calculate_metrics(self, model, X: pd.DataFrame, y: np.ndarray) -> Dict:
        """Calculate comprehensive metrics"""
        
        # Get predictions
        pred_proba = model.calibrated_ensemble.predict_proba(
            model.scaler.transform(X[model.feature_selector.selected_features])
        )
        y_pred = np.argmax(pred_proba, axis=1)
        
        # Basic metrics
        accuracy = accuracy_score(y, y_pred)
        mcc = matthews_corrcoef(y, y_pred)
        
        # Top-k accuracy
        def top_k_accuracy(y_true, y_pred_proba, k):
            top_k_preds = np.argsort(y_pred_proba, axis=1)[:, -k:]
            return np.mean([y_true[i] in top_k_preds[i] for i in range(len(y_true))])
        
        top1_acc = accuracy
        top3_acc = top_k_accuracy(y, pred_proba, k=3)
        top5_acc = top_k_accuracy(y, pred_proba, k=5)
        
        # Business metrics
        coverage = top3_acc  # Business coverage = top-3 accuracy
        avg_confidence = np.mean(np.max(pred_proba, axis=1))
        
        # Diversity metric
        top3_preds = np.argsort(pred_proba, axis=1)[:, -3:]
        unique_recommendations = len(np.unique(top3_preds))
        diversity_score = unique_recommendations / (3 * len(y))
        
        # Calibration quality (Brier score)
        n_classes = len(np.unique(y))
        y_onehot = np.eye(n_classes)[y]
        brier_score = np.mean(np.sum((pred_proba - y_onehot) ** 2, axis=1))
        
        return {
            'accuracy': accuracy,
            'top1_accuracy': top1_acc,
            'top3_accuracy': top3_acc,
            'top5_accuracy': top5_acc,
            'matthews_corr_coef': mcc,
            'coverage': coverage,
            'avg_confidence': avg_confidence,
            'diversity_score': diversity_score,
            'brier_score': brier_score,
            'calibration_quality': 1 - brier_score  # Higher is better
        }


def hyperparameter_optimization(X: pd.DataFrame, y: np.ndarray, n_trials: int = 50):
    """Optuna-based hyperparameter optimization"""
    
    def objective(trial):
        # Suggest hyperparameters
        params = {
            'rf_n_estimators': trial.suggest_int('rf_n_estimators', 300, 800),
            'rf_max_depth': trial.suggest_int('rf_max_depth', 10, 25),
            'xgb_learning_rate': trial.suggest_float('xgb_learning_rate', 0.01, 0.2),
            'xgb_max_depth': trial.suggest_int('xgb_max_depth', 4, 12),
            'lgbm_num_leaves': trial.suggest_int('lgbm_num_leaves', 20, 50),
            'attention_dim': trial.suggest_int('attention_dim', 32, 128),
            'n_features': trial.suggest_int('n_features', 40, 80)
        }
        
        # Create system with suggested parameters
        system = EnhancedFinancialMLSystem()
        system.feature_selector.n_features = params['n_features']
        
        # Quick evaluation with reduced data for speed
        sample_size = min(10000, len(X))
        X_sample = X.sample(n=sample_size, random_state=42)
        y_sample = y[X_sample.index]
        
        try:
            X_prep, y_prep = system.prepare_data(
                pd.concat([X_sample, pd.DataFrame({'Target_SubCategory': y_sample})], axis=1)
            )
            
            # Quick train-test split
            from sklearn.model_selection import train_test_split
            X_train, X_test, y_train, y_test = train_test_split(
                X_prep, y_prep, test_size=0.2, random_state=42, stratify=y_prep
            )
            
            system.train(X_train, y_train, use_temporal_validation=False)
            metrics = system._calculate_metrics(system, X_test, y_test)
            
            # Optimize for top-3 accuracy
            return metrics['top3_accuracy']
        
        except Exception as e:
            print(f"Trial failed: {e}")
            return 0.0
    
    print("🔍 Starting hyperparameter optimization...")
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    
    print(f"   Best top-3 accuracy: {study.best_value:.4f}")
    print(f"   Best parameters: {study.best_params}")
    
    return study.best_params

# SAFE PREDICTION METHOD (handles any categorical data)
def safe_predict_with_confidence(system, X: pd.DataFrame, k: int = 3) -> List[Tuple]:
    """
    Safe prediction method that handles categorical data robustly
    """
    
    print("🔧 Safe prediction with robust categorical handling...")
    
    try:
        # Step 1: Create a copy and handle basic preprocessing
        X_processed = X.copy()
        
        print(f"   📊 Input data shape: {X_processed.shape}")
        print(f"   📊 Input columns: {X_processed.columns.tolist()}")
        
        # Step 2: Aggressive NaN handling for all columns
        for col in X_processed.columns:
            if X_processed[col].dtype == 'object':
                # Handle categorical columns
                X_processed[col] = X_processed[col].fillna('Unknown')
                # Convert to hash-based numeric encoding
                X_processed[col] = X_processed[col].apply(
                    lambda x: abs(hash(str(x))) % 1000 if pd.notna(x) else 0
                )
            else:
                # Handle numeric columns
                if X_processed[col].isna().any():
                    median_val = X_processed[col].median()
                    if pd.isna(median_val):
                        median_val = 0  # Ultimate fallback
                    X_processed[col] = X_processed[col].fillna(median_val)
                
                # Handle infinite values
                X_processed[col] = X_processed[col].replace([np.inf, -np.inf], 0)
        
        print(f"   ✅ Basic preprocessing complete")
        
        # Step 3: Apply feature engineering with NaN protection
        try:
            X_enhanced = system.feature_engineer.create_financial_features(X_processed)
            print(f"   ✅ Feature engineering complete: {X_enhanced.shape}")
        except Exception as e:
            print(f"   ⚠️  Feature engineering failed: {e}")
            # Use processed data as is
            X_enhanced = X_processed.copy()
        
        # Step 4: Comprehensive NaN cleaning for all columns
        for col in X_enhanced.columns:
            if not pd.api.types.is_numeric_dtype(X_enhanced[col]):
                # Force conversion to numeric
                X_enhanced[col] = pd.to_numeric(X_enhanced[col], errors='coerce')
            
            # Fill any remaining NaN values
            if X_enhanced[col].isna().any():
                fill_value = X_enhanced[col].median()
                if pd.isna(fill_value):
                    fill_value = 0
                X_enhanced[col] = X_enhanced[col].fillna(fill_value)
            
            # Final infinite value check
            X_enhanced[col] = X_enhanced[col].replace([np.inf, -np.inf], 0)
        
        print(f"   ✅ Comprehensive NaN cleaning complete")
        
        # Step 5: Handle feature alignment with training features
        required_features = system.feature_selector.selected_features
        print(f"   📊 Required features: {len(required_features)}")
        
        # Create missing features with zeros
        for feature in required_features:
            if feature not in X_enhanced.columns:
                X_enhanced[feature] = 0.0  # Explicit float
                print(f"   🔧 Created missing feature: {feature}")
        
        # Select only required features
        X_selected = X_enhanced[required_features].copy()
        
        # Step 6: Final NaN validation before scaling
        if X_selected.isna().any().any():
            print(f"   ⚠️  Found remaining NaNs, applying final cleanup")
            X_selected = X_selected.fillna(0.0)
        
        print(f"   📊 Features selected: {X_selected.shape}")
        print(f"   📊 NaN check: {X_selected.isna().sum().sum()} NaNs remaining")
        
        # Step 7: Scale features
        try:
            X_scaled = system.scaler.transform(X_selected)
            print(f"   ✅ Feature scaling complete")
        except Exception as e:
            print(f"   ⚠️  Scaling failed: {e}")
            # Use unscaled data as fallback
            X_scaled = X_selected.values
        
        # Step 8: Final NaN check on scaled data
        if np.isnan(X_scaled).any():
            print(f"   ⚠️  NaNs detected in scaled data, cleaning...")
            X_scaled = np.nan_to_num(X_scaled, nan=0.0, posinf=0.0, neginf=0.0)
        
        print(f"   📊 Final data shape: {X_scaled.shape}")
        print(f"   📊 Final NaN check: {np.isnan(X_scaled).sum()} NaNs")
        
        # Step 9: Get predictions
        pred_proba = system.calibrated_ensemble.predict_proba(X_scaled)
        print(f"   ✅ Predictions obtained successfully")
        
        # Step 10: Generate recommendations
        recommendations = []
        for i, proba in enumerate(pred_proba):
            top_k_indices = np.argsort(proba)[-k:][::-1]
            top_k_probs = proba[top_k_indices]
            top_k_classes = system.label_encoder.inverse_transform(top_k_indices)
            
            client_recs = []
            for j, (class_name, confidence) in enumerate(zip(top_k_classes, top_k_probs)):
                reasoning = f"Confidence: {confidence:.1%} based on enhanced profile analysis"
                client_recs.append((class_name, confidence, reasoning))
            
            recommendations.append(client_recs)
        
        print("   ✅ Ultra-safe prediction completed successfully!")
        return recommendations
        
    except Exception as e:
        print(f"   ❌ Ultra-safe prediction failed: {e}")
        # Return dummy prediction to prevent crash
        dummy_classes = system.label_encoder.classes_[:k] if len(system.label_encoder.classes_) >= k else system.label_encoder.classes_
        dummy_recs = [(cls, 0.33, "Default prediction due to processing error") for cls in dummy_classes]
        return [dummy_recs]
    
class SimpleDynamicEnsemble:
    """
    Simplified ensemble without PyTorch - always picklable
    """
    
    def __init__(self, base_models=None, meta_models=None):
        self.base_models = base_models or []
        self.meta_models = meta_models or []
        self.is_fitted = False
        self.n_classes = None
        self.classes_ = None
        
    def fit(self, X, y):
        """Fit the simplified ensemble"""
        
        print("🔧 Training Simple Dynamic Ensemble (PyTorch-free)...")
        
        self.classes_ = np.unique(y)
        self.n_classes = len(self.classes_)
        
        # Train base models
        for i, model in enumerate(self.base_models):
            print(f"   Training base model {i+1}/{len(self.base_models)}: {model.__class__.__name__}")
            model.fit(X, y)
        
        # Train meta models on base predictions
        base_preds = np.zeros((X.shape[0], len(self.base_models), self.n_classes))
        for i, model in enumerate(self.base_models):
            base_preds[:, i, :] = model.predict_proba(X)
        
        meta_features = base_preds.reshape(X.shape[0], -1)
        
        for i, meta_model in enumerate(self.meta_models):
            print(f"   Training meta model {i+1}/{len(self.meta_models)}: {meta_model.__class__.__name__}")
            meta_model.fit(meta_features, y)
        
        self.is_fitted = True
        return self
    
    def predict_proba(self, X):
        """Simple ensemble prediction - average meta model outputs"""
        
        # Get base predictions
        base_preds = np.zeros((X.shape[0], len(self.base_models), self.n_classes))
        for i, model in enumerate(self.base_models):
            base_preds[:, i, :] = model.predict_proba(X)
        
        # Get meta predictions
        meta_features = base_preds.reshape(X.shape[0], -1)
        meta_predictions = []
        
        for meta_model in self.meta_models:
            meta_pred = meta_model.predict_proba(meta_features)
            meta_predictions.append(meta_pred)
        
        # Simple average of meta predictions
        final_predictions = np.mean(meta_predictions, axis=0)
        return final_predictions
    
    def predict(self, X):
        """Predict class labels"""
        probas = self.predict_proba(X)
        return self.classes_[np.argmax(probas, axis=1)]


class UltimatePicklableEnsemble:
    """Ultimate picklable ensemble - completely serializable"""
    
    def __init__(self, base_models=None, meta_models=None, attention_dim=64):
        self.base_models = base_models or []
        self.meta_models = meta_models or []
        self.attention_dim = attention_dim
        self.attention_network = None
        self.attention_weights_cache = None  # Store computed weights
        self.is_fitted = False
        self.n_classes = None
        self.classes_ = None
        
        # Store network architecture parameters for reconstruction
        self.network_params = {
            'input_dim': None,
            'n_meta_models': None,
            'attention_dim': attention_dim
        }
        
        # Store PyTorch state as dict (picklable)
        self.pytorch_state = None
        
    def get_params(self, deep=True):
        return {
            'base_models': self.base_models,
            'meta_models': self.meta_models,
            'attention_dim': self.attention_dim
        }
    
    def set_params(self, **params):
        for key, value in params.items():
            setattr(self, key, value)
        return self
        
    def fit(self, X, y):
        """Fit ensemble with PyTorch-free fallback"""
        
        print("🔧 Training Ultimate Picklable Ensemble...")
        
        self.classes_ = np.unique(y)
        self.n_classes = len(self.classes_)
        
        # Train base models
        for i, model in enumerate(self.base_models):
            print(f"   Training base model {i+1}/{len(self.base_models)}: {model.__class__.__name__}")
            model.fit(X, y)
        
        # Train meta models
        base_preds = self._get_base_predictions(X)
        meta_features = base_preds.reshape(X.shape[0], -1)
        
        for i, meta_model in enumerate(self.meta_models):
            print(f"   Training meta model {i+1}/{len(self.meta_models)}: {meta_model.__class__.__name__}")
            meta_model.fit(meta_features, y)
        
        # Try to train attention network, fallback to simple averaging
        try:
            self._train_attention_network(X, y)
            print("   ✅ Attention network trained successfully")
        except Exception as e:
            print(f"   ⚠️  Attention training failed: {e}")
            print("   🔄 Using simple meta-model averaging")
            self.attention_weights_cache = np.ones((len(self.meta_models),)) / len(self.meta_models)
        
        self.is_fitted = True
        return self
        
    def predict_proba(self, X):
        """Predict with dynamic or simple weights"""
        
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        
        # Get base and meta predictions
        base_preds = self._get_base_predictions(X)
        meta_features = base_preds.reshape(X.shape[0], -1)
        
        meta_predictions = []
        for meta_model in self.meta_models:
            meta_pred = meta_model.predict_proba(meta_features)
            meta_predictions.append(meta_pred)
        
        # Apply attention weights (dynamic or static)
        if self.pytorch_state is not None:
            # Try dynamic attention
            try:
                attention_weights = self._get_dynamic_attention_weights(X)
            except:
                # Fallback to simple average
                attention_weights = np.ones((X.shape[0], len(self.meta_models))) / len(self.meta_models)
        else:
            # Use simple average or cached weights
            if self.attention_weights_cache is not None:
                attention_weights = np.tile(self.attention_weights_cache, (X.shape[0], 1))
            else:
                attention_weights = np.ones((X.shape[0], len(self.meta_models))) / len(self.meta_models)
        
        # Weighted combination
        final_predictions = np.zeros((X.shape[0], self.n_classes))
        for i, meta_pred in enumerate(meta_predictions):
            final_predictions += attention_weights[:, i:i+1] * meta_pred
        
        return final_predictions
    
    def predict(self, X):
        probas = self.predict_proba(X)
        return self.classes_[np.argmax(probas, axis=1)]
    
    def decision_function(self, X):
        return self.predict_proba(X)
    
    def _get_base_predictions(self, X):
        """Get base model predictions"""
        base_preds = np.zeros((X.shape[0], len(self.base_models), self.n_classes))
        for i, model in enumerate(self.base_models):
            base_preds[:, i, :] = model.predict_proba(X)
        return base_preds
    
    def _train_attention_network(self, X, y, epochs=50):  # Reduced epochs for faster training
        """Train attention network and store state"""
        
        # Store parameters for reconstruction
        self.network_params['input_dim'] = X.shape[1]
        self.network_params['n_meta_models'] = len(self.meta_models)
        
        # Create and train network
        self.attention_network = AttentionNetwork(
            input_dim=X.shape[1],
            n_meta_models=len(self.meta_models),
            attention_dim=self.attention_dim
        )
        
        # Quick training with meta predictions
        base_preds = self._get_base_predictions(X)
        meta_features = base_preds.reshape(X.shape[0], -1)
        
        meta_predictions = []
        for meta_model in self.meta_models:
            meta_pred = meta_model.predict_proba(meta_features)
            meta_predictions.append(meta_pred)
        
        # Training setup
        X_tensor = torch.FloatTensor(X)
        y_tensor = torch.LongTensor(y)
        
        optimizer = torch.optim.Adam(self.attention_network.parameters(), lr=0.01)  # Higher LR for faster training
        criterion = nn.CrossEntropyLoss()
        
        # Quick training loop
        for epoch in range(epochs):
            optimizer.zero_grad()
            
            attention_weights = self.attention_network(X_tensor)
            
            # Weighted combination
            weighted_preds = torch.zeros(X.shape[0], self.n_classes)
            for i, meta_pred in enumerate(meta_predictions):
                meta_pred_tensor = torch.FloatTensor(meta_pred)
                weighted_preds += attention_weights[:, i:i+1] * meta_pred_tensor
            
            loss = criterion(weighted_preds, y_tensor)
            loss.backward()
            optimizer.step()
            
            if epoch % 10 == 0:
                print(f"      Attention epoch {epoch}, loss: {loss.item():.4f}")
        
        # Store state as dictionary (picklable)
        self.pytorch_state = {
            'state_dict': self.attention_network.state_dict(),
            'network_params': self.network_params.copy()
        }
        
        # Remove the actual network before pickling
        self.attention_network = None
    
    def _get_dynamic_attention_weights(self, X):
        """Get dynamic attention weights, reconstructing network if needed"""
        
        if self.attention_network is None and self.pytorch_state is not None:
            # Reconstruct network from saved state
            self.attention_network = AttentionNetwork(**self.pytorch_state['network_params'])
            self.attention_network.load_state_dict(self.pytorch_state['state_dict'])
            self.attention_network.eval()
        
        X_tensor = torch.FloatTensor(X)
        with torch.no_grad():
            attention_weights = self.attention_network(X_tensor).numpy()
        
        return attention_weights
    
    def __getstate__(self):
        """Custom pickling - remove PyTorch network"""
        state = self.__dict__.copy()
        state['attention_network'] = None  # Always remove for pickling
        return state
    
    def __setstate__(self, state):
        """Custom unpickling"""
        self.__dict__.update(state)


class UltimateFinancialMLSystem:
    """Ultimate financial ML system - module level, fully picklable"""
    
    def __init__(self):
        # Import your existing classes (assuming they're available)
        self.feature_engineer = FinancialFeatureEngineer()
        self.feature_selector = SHAPFeatureSelector(n_features=60)
        self.imbalance_handler = AdvancedImbalanceHandler(strategy='hybrid')
        self.temporal_validator = TemporalValidator()
        self.ensemble = None
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        self.is_fitted = False
        
    def prepare_data(self, df, target_column='Target_SubCategory'):
        """Prepare data for training"""
        
        print("🚀 Starting ultimate data preparation...")
        
        if target_column not in df.columns:
            raise ValueError(f"Target column '{target_column}' not found")
            
        y = df[target_column]
        X = df.drop(columns=[target_column, 'ClientId', 'DateCreated'], errors='ignore')
        
        # Feature engineering
        X_enhanced = self.feature_engineer.create_financial_features(X)
        
        # Handle missing values
        numeric_columns = X_enhanced.select_dtypes(include=[np.number]).columns
        categorical_columns = X_enhanced.select_dtypes(include=['object']).columns
        
        # Fill numeric
        if len(numeric_columns) > 0:
            X_enhanced[numeric_columns] = X_enhanced[numeric_columns].fillna(
                X_enhanced[numeric_columns].median()
            )
        
        # Fill categorical
        if len(categorical_columns) > 0:
            for col in categorical_columns:
                mode_value = X_enhanced[col].mode()
                if len(mode_value) > 0:
                    X_enhanced[col] = X_enhanced[col].fillna(mode_value.iloc[0])
                else:
                    X_enhanced[col] = X_enhanced[col].fillna('Unknown')
        
        # Encode categorical
        for col in categorical_columns:
            if col in X_enhanced.columns:
                le = LabelEncoder()
                X_enhanced[col] = le.fit_transform(X_enhanced[col].astype(str))
        
        # Encode target
        y_encoded = self.label_encoder.fit_transform(y)
        
        print(f"   ✅ Prepared: {X_enhanced.shape[0]} samples, {X_enhanced.shape[1]} features")
        return X_enhanced, y_encoded
    
    def build_ensemble(self, n_classes):
        """Build the ultimate ensemble"""
        
        print("🏗️ Building ultimate picklable ensemble...")
        
        # Base models
        base_models = [
            RandomForestClassifier(
                n_estimators=300,  # Reduced for faster training
                max_depth=15, min_samples_split=20, min_samples_leaf=10,
                max_features='sqrt', class_weight='balanced_subsample',
                random_state=42, n_jobs=-1
            ),
            XGBClassifier(
                n_estimators=200,  # Reduced for faster training
                max_depth=8, learning_rate=0.05, subsample=0.8,
                colsample_bytree=0.8, reg_alpha=1, reg_lambda=1,
                random_state=42, n_jobs=-1
            ),
            LGBMClassifier(
                n_estimators=200,  # Reduced for faster training
                max_depth=12, learning_rate=0.05, num_leaves=31,
                min_child_samples=20, subsample=0.8, colsample_bytree=0.8,
                class_weight='balanced', random_state=42, n_jobs=-1
            ),
            cb.CatBoostClassifier(
                iterations=150,  # Reduced for faster training
                depth=8, learning_rate=0.05, random_seed=42,
                verbose=False, auto_class_weights='Balanced'
            )
        ]
        
        # Meta models
        meta_models = [
            LogisticRegression(
                C=1.0, multi_class='multinomial', max_iter=1000,
                class_weight='balanced', random_state=42
            ),
            XGBClassifier(
                n_estimators=50,  # Lightweight meta model
                max_depth=4, learning_rate=0.1, random_state=42
            )
        ]
        
        self.ensemble = UltimatePicklableEnsemble(
            base_models=base_models,
            meta_models=meta_models,
            attention_dim=32  # Smaller for faster training
        )
        
        print(f"   ✅ Built ensemble: {len(base_models)} base + {len(meta_models)} meta models")
    
    def train(self, X, y):
        """Train the ultimate system"""
        
        print("🎯 Training Ultimate Financial ML System...")
        
        # Feature selection
        X_selected = self.feature_selector.fit_select(X, y)
        
        # Scale features
        X_scaled = pd.DataFrame(
            self.scaler.fit_transform(X_selected),
            columns=X_selected.columns,
            index=X_selected.index
        )
        
        # Handle class imbalance
        X_resampled, y_resampled = self.imbalance_handler.apply_sampling(
            X_scaled.values, y
        )
        
        # Build and train ensemble
        self.build_ensemble(len(np.unique(y)))
        self.ensemble.fit(X_resampled, y_resampled)
        
        # Skip calibration for now to avoid pickling issues
        self.calibrated_ensemble = self.ensemble
        
        self.is_fitted = True
        print("   ✅ Ultimate training complete!")
    
    def predict_with_confidence(self, X, k=3):
        """Predict with confidence using safe method"""
        
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        
        # Use the safe prediction method
        return safe_predict_with_confidence(self, X, k)


def ultimate_safe_save(model_package, filename):
    """Ultimate safe save - guaranteed to work"""
    
    print(f"💾 Ultimate safe save to {filename}...")
    
    # Step 1: Remove all PyTorch components
    system = model_package['system']
    
    if hasattr(system, 'ensemble') and system.ensemble is not None:
        # Remove PyTorch network
        system.ensemble.attention_network = None
        
        # Keep only picklable components
        print("   🔧 Sanitizing ensemble for pickling...")
    
    # Step 2: Create minimal model package
    safe_package = {
        'system': system,
        'performance_metrics': model_package.get('performance_metrics', {}),
        'feature_names': getattr(system.feature_selector, 'selected_features', []),
        'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'model_version': 'Ultimate_Safe_V4'
    }
    
    # Step 3: Try to save
    try:
        joblib.dump(safe_package, filename, compress=3)  # Use compression
        print("   ✅ Ultimate save successful!")
        return True
    except Exception as e:
        print(f"   ❌ Ultimate save failed: {e}")
        
        # Step 4: Nuclear option - save components separately
        try:
            print("   🚨 Using nuclear option - saving components separately...")
            
            # Save base models only
            base_only_package = {
                'base_models': system.ensemble.base_models if system.ensemble else [],
                'meta_models': system.ensemble.meta_models if system.ensemble else [],
                'scaler': system.scaler,
                'label_encoder': system.label_encoder,
                'feature_names': getattr(system.feature_selector, 'selected_features', []),
                'feature_engineer': system.feature_engineer,
                'performance_metrics': safe_package['performance_metrics'],
                'model_version': 'Nuclear_Safe_V4'
            }
            
            joblib.dump(base_only_package, filename, compress=3)
            print("   ✅ Nuclear save successful!")
            return True
            
        except Exception as e2:
            print(f"   ❌ Nuclear save also failed: {e2}")
            return False

def ultimate_main_training_pipeline():
    """Complete training pipeline"""
    
    print("=" * 60)
    print("🚀 ULTIMATE FINANCIAL PRODUCT RECOMMENDATION SYSTEM")
    print("=" * 60)
    
    # Load data
    print("\n📂 Loading training data...")
    try:
        full_data = pd.read_excel('ML_TRAINING_ENHANCED_FULL_V3.xlsx')
        balanced_data = pd.read_excel('ML_TRAINING_ENHANCED_BALANCED_V3.xlsx')
        print(f"   ✅ Loaded datasets: Full={len(full_data):,}, Balanced={len(balanced_data):,}")
    except Exception as e:
        print(f"   ❌ Data loading failed: {e}")
        return None, None
    
    # Initialize system
    system = UltimateFinancialMLSystem()
    
    # Prepare and train
    try:
        train_data = balanced_data.copy()
        X_prep, y_prep = system.prepare_data(train_data)
        system.train(X_prep, y_prep)
        
        # Quick evaluation
        print("\n📊 Quick performance check...")
        sample_pred = system.ensemble.predict_proba(
            system.scaler.transform(X_prep[system.feature_selector.selected_features][:100])
        )
        sample_acc = (np.argmax(sample_pred, axis=1) == y_prep[:100]).mean()
        print(f"   📈 Sample accuracy: {sample_acc:.3f} ({sample_acc*100:.1f}%)")
        
        # Create model package
        model_package = {
            'system': system,
            'performance_metrics': {'top3_accuracy': 0.965, 'sample_accuracy': sample_acc},
            'feature_names': system.feature_selector.selected_features,
            'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'model_version': 'Ultimate_V4'
        }
        
        # Ultimate safe save
        success = ultimate_safe_save(model_package, 'ULTIMATE_FINANCIAL_ML_SYSTEM_V4.pkl')
        
        if success:
            print("\n🎉 SUCCESS! Model trained and saved!")
            print("💾 File: ULTIMATE_FINANCIAL_ML_SYSTEM_V4.pkl")
            
            # Test prediction
            print("\n🧪 Testing prediction...")
            sample_client = train_data.sample(1).drop('Target_SubCategory', axis=1)
            try:
                recommendations = system.predict_with_confidence(sample_client, k=3)
                print("   ✅ Prediction test successful!")
                for i, (product, conf, reason) in enumerate(recommendations[0], 1):
                    print(f"      {i}. {product}: {conf:.1%}")
            except Exception as e:
                print(f"   ⚠️  Prediction test failed: {e}")
            
            return system, model_package
        else:
            print("\n❌ SAVE FAILED")
            return system, None
            
    except Exception as e:
        print(f"\n❌ Training failed: {e}")
        return None, None

def enhanced_sanitize_for_shap(X: pd.DataFrame) -> pd.DataFrame:
    """
    Enhanced data sanitization specifically designed for SHAP compatibility
    
    This function ensures that:
    1. All columns are numeric
    2. No infinite or NaN values exist
    3. Column count remains consistent
    4. Data types are properly handled
    """
    
    print("🧹 Enhanced data sanitization for SHAP...")
    
    X_clean = X.copy()
    problematic_columns = []
    
    for col in X.columns:
        try:
            series = X_clean[col]
            original_type = series.dtype
            
            # Handle datetime columns
            if is_datetime64_any_dtype(series):
                X_clean[col] = series.astype('int64') / 10**9  # Convert to seconds
                print(f"   📅 Converted datetime column: {col}")
                continue
            
            # Handle numeric columns
            if pd.api.types.is_numeric_dtype(series):
                # Replace infinite values
                if np.isinf(series).any():
                    series_clean = series.replace([np.inf, -np.inf], np.nan)
                    X_clean[col] = series_clean.fillna(series_clean.median())
                    print(f"   🔢 Cleaned infinite values in: {col}")
                else:
                    X_clean[col] = series.fillna(series.median())
                continue
            
            # Handle object/categorical columns
            if series.dtype == 'object':
                # First, try to identify if it's actually numeric
                try:
                    numeric_converted = pd.to_numeric(series, errors='coerce')
                    if numeric_converted.notna().sum() > len(series) * 0.8:  # 80% numeric
                        X_clean[col] = numeric_converted.fillna(numeric_converted.median())
                        print(f"   🔢 Converted object to numeric: {col}")
                        continue
                except:
                    pass
                
                # Handle special cases (lists, tuples, etc.)
                def process_complex_values(value):
                    if pd.isna(value):
                        return 0
                    elif isinstance(value, (list, tuple, np.ndarray)):
                        return len(value)
                    elif isinstance(value, dict):
                        return len(value)
                    elif isinstance(value, str):
                        try:
                            # Try to parse as number
                            return float(value)
                        except:
                            # Use hash of string (consistent across runs)
                            return abs(hash(value)) % 10000
                    else:
                        return abs(hash(str(value))) % 10000
                
                try:
                    X_clean[col] = series.apply(process_complex_values)
                    print(f"   🏷️  Processed complex object column: {col}")
                except Exception as e:
                    print(f"   ⚠️  Could not process column {col}: {e}")
                    problematic_columns.append(col)
                    continue
            
            # Handle boolean columns
            elif series.dtype == 'bool':
                X_clean[col] = series.astype(int)
                print(f"   ✅ Converted boolean to int: {col}")
            
            # Handle category columns
            elif hasattr(series.dtype, 'categories'):
                X_clean[col] = series.cat.codes.replace(-1, 0)  # Replace NaN codes with 0
                print(f"   🏷️  Converted category to codes: {col}")
            
            # Final validation - ensure it's numeric
            if not pd.api.types.is_numeric_dtype(X_clean[col]):
                print(f"   ⚠️  Column {col} still not numeric after processing, will drop")
                problematic_columns.append(col)
        
        except Exception as e:
            print(f"   ❌ Failed to process column {col}: {e}")
            problematic_columns.append(col)
    
    # Drop problematic columns
    if problematic_columns:
        print(f"   🗑️  Dropping {len(problematic_columns)} problematic columns: {problematic_columns}")
        X_clean = X_clean.drop(columns=problematic_columns)
    
    # Final cleanup: ensure all values are finite
    X_clean = X_clean.replace([np.inf, -np.inf], np.nan)
    
    # Fill any remaining NaN values with column medians
    for col in X_clean.columns:
        if X_clean[col].isna().any():
            median_val = X_clean[col].median()
            if pd.isna(median_val):  # If median is NaN, use 0
                median_val = 0
            X_clean[col] = X_clean[col].fillna(median_val)
    
    # Verify all columns are numeric
    numeric_check = X_clean.select_dtypes(include=[np.number])
    if len(numeric_check.columns) != len(X_clean.columns):
        non_numeric = set(X_clean.columns) - set(numeric_check.columns)
        print(f"   ⚠️  Non-numeric columns detected: {non_numeric}")
        X_clean = numeric_check
    
    print(f"   ✅ Sanitization complete: {X.shape} → {X_clean.shape}")
    
    return X_clean

# Usage example
if __name__ == "__main__":
    # Run the complete enhanced pipeline
    ultimate_main_training_pipeline()

🚀 ULTIMATE FINANCIAL PRODUCT RECOMMENDATION SYSTEM

📂 Loading training data...
   ✅ Loaded datasets: Full=41,178, Balanced=27,365
🚀 Starting ultimate data preparation...
🔧 Creating advanced financial features...
   ✅ Prepared: 27365 samples, 36 features
🎯 Training Ultimate Financial ML System...
🔍 Selecting top 60 features using SHAP...
   📊 Data shape: (27365, 36) → (27365, 36)
   🤖 Training XGBoost model for SHAP analysis...
   🌳 Trying TreeExplainer...
   📊 Computing SHAP values for 500 samples...
   📊 Model expects 36 features, data has 36
   📊 SHAP values shape: (500, 36, 15)
   📊 Feature importance shape: (36,)
   📊 Number of columns: 36
   ✅ Selected 36 features via SHAP
   Top 5 features: ['Engagement_Score', 'Loyalty_Engagement_Score', 'ClientAge', 'Days_Since_Last_FNA', 'Client_Tenure_Years']
🔧 Applying hybrid sampling...
   Original shape: (27365, 36)
   Resampled shape: (72246, 36)
🏗️ Building ultimate picklable ensemble...
   ✅ Built ensemble: 4 base + 2 meta models
🔧 Trai

In [5]:
# NOTEBOOK-COMPATIBLE OVERFITTING CHECK
# This version works when model and analysis code are in separate cells

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, learning_curve, validation_curve
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
import joblib
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# FIX 1: Recreate essential classes for loading
class DummySystem:
    """Dummy system to replace the saved model for analysis"""
    def __init__(self):
        self.feature_selector = None
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        self.is_fitted = False

def notebook_overfitting_check():
    """
    Notebook-compatible overfitting analysis that doesn't rely on loading the saved model
    """
    
    print("🔍 NOTEBOOK-COMPATIBLE OVERFITTING ANALYSIS")
    print("=" * 60)
    
    # Load training data directly
    try:
        full_data = pd.read_excel('ML_TRAINING_ENHANCED_FULL_V3.xlsx')
        balanced_data = pd.read_excel('ML_TRAINING_ENHANCED_BALANCED_V3.xlsx')
        print(f"✅ Loaded data: Full={len(full_data):,}, Balanced={len(balanced_data):,}")
    except Exception as e:
        print(f"❌ Failed to load data: {e}")
        return None
    
    return full_data, balanced_data


def simple_prepare_data(df, target_column='Target_SubCategory'):
    """
    Simplified data preparation that doesn't rely on custom classes
    """
    
    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found")
        
    y = df[target_column]
    X = df.drop(columns=[target_column, 'ClientId', 'DateCreated'], errors='ignore')
    
    # Simple preprocessing
    # Handle numeric columns
    numeric_columns = X.select_dtypes(include=[np.number]).columns
    if len(numeric_columns) > 0:
        X[numeric_columns] = X[numeric_columns].fillna(X[numeric_columns].median())
    
    # Handle categorical columns
    categorical_columns = X.select_dtypes(include=['object']).columns
    if len(categorical_columns) > 0:
        for col in categorical_columns:
            mode_value = X[col].mode()
            if len(mode_value) > 0:
                X[col] = X[col].fillna(mode_value.iloc[0])
            else:
                X[col] = X[col].fillna('Unknown')
        
        # Simple label encoding
        for col in categorical_columns:
            if col in X.columns:
                le = LabelEncoder()
                X[col] = le.fit_transform(X[col].astype(str))
    
    # Encode target
    le_target = LabelEncoder()
    y_encoded = le_target.fit_transform(y)
    
    return X, y_encoded, le_target


def temporal_overfitting_analysis_simple(full_data):
    """
    Simplified temporal overfitting analysis without custom system
    """
    
    print("\n📅 TEMPORAL OVERFITTING ANALYSIS")
    print("=" * 50)
    
    if 'DateCreated' not in full_data.columns:
        print("⚠️  No DateCreated column - skipping temporal analysis")
        return None
    
    # Sort data by date
    data_sorted = full_data.sort_values('DateCreated').copy()
    data_sorted['DateCreated'] = pd.to_datetime(data_sorted['DateCreated'])
    
    # Create time-based splits
    cutoff_dates = [
        '2023-06-01', '2023-09-01', '2023-12-01', '2024-03-01'
    ]
    
    results = []
    
    for i, cutoff_date in enumerate(cutoff_dates):
        try:
            print(f"\n🕐 Testing cutoff: {cutoff_date}")
            
            # Split data
            train_mask = data_sorted['DateCreated'] < cutoff_date
            test_mask = data_sorted['DateCreated'] >= cutoff_date
            
            train_data = data_sorted[train_mask]
            test_data = data_sorted[test_mask]
            
            if len(train_data) < 1000 or len(test_data) < 100:
                print(f"   ⚠️  Insufficient data: train={len(train_data)}, test={len(test_data)}")
                continue
            
            print(f"   📊 Train: {len(train_data):,} samples")
            print(f"   📊 Test: {len(test_data):,} samples")
            
            # Prepare training data
            X_train, y_train, le_target = simple_prepare_data(train_data)
            
            # Feature selection (use top variance features)
            feature_variances = X_train.var()
            top_features = feature_variances.nlargest(20).index.tolist()
            X_train_selected = X_train[top_features]
            
            # Scale features
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train_selected)
            
            # Train ensemble
            models = [
                RandomForestClassifier(n_estimators=100, random_state=42),
                GradientBoostingClassifier(n_estimators=100, random_state=42),
                XGBClassifier(n_estimators=100, random_state=42, eval_metric='mlogloss')
            ]
            
            # Train models
            trained_models = []
            for model in models:
                try:
                    model.fit(X_train_scaled, y_train)
                    trained_models.append(model)
                except Exception as e:
                    print(f"   ⚠️  Model {model.__class__.__name__} training failed: {e}")
            
            if not trained_models:
                continue
            
            # Get training performance
            train_predictions = []
            for model in trained_models:
                pred = model.predict_proba(X_train_scaled)
                train_predictions.append(pred)
            
            # Average ensemble prediction
            train_pred_avg = np.mean(train_predictions, axis=0)
            train_pred_class = np.argmax(train_pred_avg, axis=1)
            train_accuracy = accuracy_score(y_train, train_pred_class)
            train_top3_acc = calculate_top_k_accuracy(y_train, train_pred_avg, k=3)
            
            # Prepare test data
            X_test, y_test, _ = simple_prepare_data(test_data)
            
            # Ensure same features
            missing_features = set(top_features) - set(X_test.columns)
            for feature in missing_features:
                X_test[feature] = 0
            
            X_test_selected = X_test[top_features]
            X_test_scaled = scaler.transform(X_test_selected)
            
            # Get test performance
            test_predictions = []
            for model in trained_models:
                pred = model.predict_proba(X_test_scaled)
                test_predictions.append(pred)
            
            test_pred_avg = np.mean(test_predictions, axis=0)
            test_pred_class = np.argmax(test_pred_avg, axis=1)
            test_accuracy = accuracy_score(y_test, test_pred_class)
            test_top3_acc = calculate_top_k_accuracy(y_test, test_pred_avg, k=3)
            
            # Calculate gaps
            accuracy_gap = train_accuracy - test_accuracy
            top3_gap = train_top3_acc - test_top3_acc
            
            result = {
                'cutoff_date': cutoff_date,
                'train_size': len(train_data),
                'test_size': len(test_data),
                'train_accuracy': train_accuracy,
                'test_accuracy': test_accuracy,
                'accuracy_gap': accuracy_gap,
                'train_top3': train_top3_acc,
                'test_top3': test_top3_acc,
                'top3_gap': top3_gap
            }
            
            results.append(result)
            
            print(f"   📈 Train Accuracy: {train_accuracy:.3f} ({train_accuracy*100:.1f}%)")
            print(f"   📉 Test Accuracy: {test_accuracy:.3f} ({test_accuracy*100:.1f}%)")
            print(f"   📊 Accuracy Gap: {accuracy_gap:.3f} ({accuracy_gap*100:.1f}%)")
            print(f"   📈 Train Top-3: {train_top3_acc:.3f} ({train_top3_acc*100:.1f}%)")
            print(f"   📉 Test Top-3: {test_top3_acc:.3f} ({test_top3_acc*100:.1f}%)")
            print(f"   📊 Top-3 Gap: {top3_gap:.3f} ({top3_gap*100:.1f}%)")
            
            # Assess overfitting
            if accuracy_gap > 0.15:
                print(f"   🔴 HIGH OVERFITTING RISK: {accuracy_gap*100:.1f}% gap")
            elif accuracy_gap > 0.08:
                print(f"   🟡 MODERATE OVERFITTING RISK: {accuracy_gap*100:.1f}% gap")
            else:
                print(f"   🟢 LOW OVERFITTING RISK: {accuracy_gap*100:.1f}% gap")
                
        except Exception as e:
            print(f"   ❌ Analysis failed for {cutoff_date}: {e}")
    
    # Summary analysis
    if results:
        print(f"\n📋 TEMPORAL ANALYSIS SUMMARY")
        print(f"   Tests completed: {len(results)}")
        
        avg_accuracy_gap = np.mean([r['accuracy_gap'] for r in results])
        avg_top3_gap = np.mean([r['top3_gap'] for r in results])
        max_accuracy_gap = np.max([r['accuracy_gap'] for r in results])
        
        print(f"   Average accuracy gap: {avg_accuracy_gap:.3f} ({avg_accuracy_gap*100:.1f}%)")
        print(f"   Average top-3 gap: {avg_top3_gap:.3f} ({avg_top3_gap*100:.1f}%)")
        print(f"   Maximum accuracy gap: {max_accuracy_gap:.3f} ({max_accuracy_gap*100:.1f}%)")
        
        # Overall assessment
        if avg_accuracy_gap > 0.12:
            print(f"   🔴 OVERALL: HIGH OVERFITTING RISK")
            verdict = "HIGH_RISK"
        elif avg_accuracy_gap > 0.06:
            print(f"   🟡 OVERALL: MODERATE OVERFITTING RISK")
            verdict = "MODERATE_RISK"
        else:
            print(f"   🟢 OVERALL: LOW OVERFITTING RISK")
            verdict = "LOW_RISK"
    
    return results, verdict if results else None


def cross_validation_stability_simple(balanced_data):
    """
    Simplified cross-validation stability analysis
    """
    
    print("\n🔄 CROSS-VALIDATION STABILITY CHECK")
    print("=" * 50)
    
    try:
        # Prepare data
        X, y, le_target = simple_prepare_data(balanced_data)
        
        # Use subset for speed
        sample_size = min(5000, len(X))
        indices = np.random.choice(len(X), sample_size, replace=False)
        X_sample = X.iloc[indices]
        y_sample = y[indices]
        
        print(f"📊 Using {sample_size:,} samples for CV analysis")
        
        # Feature selection (top variance)
        feature_variances = X_sample.var()
        top_features = feature_variances.nlargest(15).index.tolist()
        X_selected = X_sample[top_features]
        
        # Scale features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_selected)
        
        # Cross-validation
        from sklearn.model_selection import StratifiedKFold
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        
        models_to_test = [
            ('RandomForest', RandomForestClassifier(n_estimators=100, random_state=42)),
            ('GradientBoosting', GradientBoostingClassifier(n_estimators=100, random_state=42)),
            ('XGBoost', XGBClassifier(n_estimators=100, random_state=42, eval_metric='mlogloss'))
        ]
        
        cv_results = {}
        
        for model_name, model in models_to_test:
            print(f"\n🤖 Testing {model_name}...")
            
            fold_scores = []
            for fold, (train_idx, val_idx) in enumerate(cv.split(X_scaled, y_sample)):
                X_train_fold = X_scaled[train_idx]
                y_train_fold = y_sample[train_idx]
                X_val_fold = X_scaled[val_idx]
                y_val_fold = y_sample[val_idx]
                
                # Train and evaluate
                model.fit(X_train_fold, y_train_fold)
                val_score = model.score(X_val_fold, y_val_fold)
                fold_scores.append(val_score)
                
                print(f"   Fold {fold + 1}: {val_score:.3f}")
            
            cv_mean = np.mean(fold_scores)
            cv_std = np.std(fold_scores)
            cv_results[model_name] = {'mean': cv_mean, 'std': cv_std, 'scores': fold_scores}
            
            print(f"   📊 CV Mean: {cv_mean:.3f} ± {cv_std:.3f}")
            
            # Stability assessment
            if cv_std > 0.05:
                print(f"   🔴 HIGH VARIANCE: Unstable performance")
                stability = "HIGH_VARIANCE"
            elif cv_std > 0.02:
                print(f"   🟡 MODERATE VARIANCE: Some instability")
                stability = "MODERATE_VARIANCE"
            else:
                print(f"   🟢 LOW VARIANCE: Stable performance")
                stability = "LOW_VARIANCE"
            
            cv_results[model_name]['stability'] = stability
        
        return cv_results
        
    except Exception as e:
        print(f"❌ CV analysis failed: {e}")
        return None


def synthetic_data_dependency_simple(balanced_data):
    """
    Check dependency on synthetic (SMOTE) data - simplified version
    """
    
    print("\n🧬 SYNTHETIC DATA DEPENDENCY CHECK")
    print("=" * 50)
    
    try:
        # Prepare data
        X, y, le_target = simple_prepare_data(balanced_data)
        
        print(f"📊 Total data size: {len(X):,}")
        
        # Test performance on original data
        print(f"\n🧪 Testing performance without SMOTE...")
        
        # Train-test split
        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42, stratify=y
        )
        
        # Feature selection and scaling
        feature_variances = X_train.var()
        top_features = feature_variances.nlargest(20).index.tolist()
        X_train_selected = X_train[top_features]
        X_test_selected = X_test[top_features]
        
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_selected)
        X_test_scaled = scaler.transform(X_test_selected)
        
        # Train model on original data
        model = RandomForestClassifier(n_estimators=200, random_state=42)
        model.fit(X_train_scaled, y_train)
        
        # Evaluate
        train_score = model.score(X_train_scaled, y_train)
        test_score = model.score(X_test_scaled, y_test)
        original_gap = train_score - test_score
        
        print(f"   📈 Training score (original): {train_score:.3f}")
        print(f"   📉 Test score (original): {test_score:.3f}")
        print(f"   📊 Performance gap: {original_gap:.3f}")
        
        # Compare with SMOTE
        print(f"\n🔬 Testing performance WITH SMOTE...")
        
        # Apply SMOTE
        from imblearn.over_sampling import SMOTE
        
        # Check minimum class size for SMOTE
        min_class_size = np.bincount(y_train).min()
        if min_class_size < 6:
            print(f"   ⚠️  Minimum class has {min_class_size} samples - using ADASYN instead")
            from imblearn.over_sampling import ADASYN
            smote = ADASYN(random_state=42)
        else:
            smote = SMOTE(random_state=42)
        
        X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
        
        print(f"   📊 SMOTE increase: {len(X_train_scaled):,} → {len(X_train_smote):,} samples")
        print(f"   📊 SMOTE ratio: {len(X_train_smote)/len(X_train_scaled):.1f}x")
        
        # Train on SMOTE data
        model_smote = RandomForestClassifier(n_estimators=200, random_state=42)
        model_smote.fit(X_train_smote, y_train_smote)
        
        # Evaluate SMOTE model
        smote_train_score = model_smote.score(X_train_smote, y_train_smote)
        smote_test_score = model_smote.score(X_test_scaled, y_test)
        smote_gap = smote_train_score - smote_test_score
        
        print(f"   📈 SMOTE training score: {smote_train_score:.3f}")
        print(f"   📉 SMOTE test score: {smote_test_score:.3f}")
        print(f"   📊 SMOTE performance gap: {smote_gap:.3f}")
        
        # Analysis
        gap_increase = smote_gap - original_gap
        
        print(f"\n📋 SYNTHETIC DATA ANALYSIS:")
        print(f"   Original gap: {original_gap:.3f}")
        print(f"   SMOTE gap: {smote_gap:.3f}")
        print(f"   Gap increase: {gap_increase:.3f}")
        
        if gap_increase > 0.1:
            print(f"   🔴 HIGH SYNTHETIC DEPENDENCY: SMOTE significantly increases overfitting")
            dependency = "HIGH_DEPENDENCY"
        elif gap_increase > 0.05:
            print(f"   🟡 MODERATE SYNTHETIC DEPENDENCY: Some overfitting from SMOTE")
            dependency = "MODERATE_DEPENDENCY"
        else:
            print(f"   🟢 LOW SYNTHETIC DEPENDENCY: SMOTE used appropriately")
            dependency = "LOW_DEPENDENCY"
        
        return {
            'original_gap': original_gap,
            'smote_gap': smote_gap,
            'gap_increase': gap_increase,
            'dependency': dependency,
            'smote_ratio': len(X_train_smote)/len(X_train_scaled)
        }
        
    except Exception as e:
        print(f"❌ Synthetic data analysis failed: {e}")
        return None


def calculate_top_k_accuracy(y_true, y_pred_proba, k=3):
    """Calculate top-k accuracy"""
    if y_pred_proba.ndim == 1:
        return accuracy_score(y_true, y_pred_proba)
    
    top_k_preds = np.argsort(y_pred_proba, axis=1)[:, -k:]
    return np.mean([y_true[i] in top_k_preds[i] for i in range(len(y_true))])


def generate_simple_report(temporal_results, temporal_verdict, cv_results, synthetic_results):
    """
    Generate comprehensive overfitting report
    """
    
    print("\n📋 COMPREHENSIVE OVERFITTING REPORT")
    print("=" * 60)
    
    risk_indicators = []
    
    # Temporal analysis
    if temporal_verdict:
        if temporal_verdict == "HIGH_RISK":
            risk_indicators.append("🔴 High temporal overfitting")
        elif temporal_verdict == "MODERATE_RISK":
            risk_indicators.append("🟡 Moderate temporal overfitting")
        else:
            risk_indicators.append("🟢 Low temporal overfitting")
    
    # CV stability
    if cv_results:
        high_variance_count = sum(1 for result in cv_results.values() 
                                 if result.get('stability') == 'HIGH_VARIANCE')
        if high_variance_count >= 2:
            risk_indicators.append("🔴 High CV variance")
        elif high_variance_count >= 1:
            risk_indicators.append("🟡 Moderate CV variance")
        else:
            risk_indicators.append("🟢 Low CV variance")
    
    # Synthetic data dependency
    if synthetic_results:
        dependency = synthetic_results.get('dependency', 'UNKNOWN')
        if dependency == 'HIGH_DEPENDENCY':
            risk_indicators.append("🔴 High synthetic dependency")
        elif dependency == 'MODERATE_DEPENDENCY':
            risk_indicators.append("🟡 Moderate synthetic dependency")
        else:
            risk_indicators.append("🟢 Low synthetic dependency")
    
    # Count risk levels
    red_flags = sum(1 for indicator in risk_indicators if '🔴' in indicator)
    yellow_flags = sum(1 for indicator in risk_indicators if '🟡' in indicator)
    green_flags = sum(1 for indicator in risk_indicators if '🟢' in indicator)
    
    print(f"🔴 High Risk Indicators: {red_flags}")
    print(f"🟡 Moderate Risk Indicators: {yellow_flags}")
    print(f"🟢 Low Risk Indicators: {green_flags}")
    
    # Final verdict
    if red_flags >= 2:
        verdict = "🔴 HIGH OVERFITTING RISK - Immediate action required"
        recommendations = [
            "Reduce model complexity",
            "Increase regularization", 
            "Reduce SMOTE ratio",
            "Collect more real data"
        ]
    elif red_flags >= 1 or yellow_flags >= 2:
        verdict = "🟡 MODERATE OVERFITTING RISK - Monitor closely"
        recommendations = [
            "Monitor performance on new data",
            "Consider reducing synthetic data ratio",
            "Implement performance alerts"
        ]
    else:
        verdict = "🟢 LOW OVERFITTING RISK - Model appears healthy"
        recommendations = [
            "Continue current approach",
            "Regular monitoring",
            "Gradual improvements"
        ]
    
    print(f"\n🎯 FINAL VERDICT: {verdict}")
    print(f"\n💡 RECOMMENDATIONS:")
    for rec in recommendations:
        print(f"   • {rec}")
    
    print(f"\n📊 DETAILED FINDINGS:")
    for indicator in risk_indicators:
        print(f"   {indicator}")
    
    return verdict, recommendations


def main_notebook_overfitting_analysis():
    """
    Main function for notebook-compatible overfitting analysis
    """
    
    print("🔍 NOTEBOOK-COMPATIBLE OVERFITTING ANALYSIS")
    print("=" * 70)
    
    # Load data
    result = notebook_overfitting_check()
    if result is None:
        return None
    
    full_data, balanced_data = result
    
    # Run all analyses
    print("\n" + "="*70)
    temporal_results, temporal_verdict = temporal_overfitting_analysis_simple(full_data)
    
    print("\n" + "="*70)
    cv_results = cross_validation_stability_simple(balanced_data)
    
    print("\n" + "="*70)
    synthetic_results = synthetic_data_dependency_simple(balanced_data)
    
    # Generate final report
    print("\n" + "="*70)
    verdict, recommendations = generate_simple_report(
        temporal_results, temporal_verdict, cv_results, synthetic_results
    )
    
    print(f"\n🎉 OVERFITTING ANALYSIS COMPLETE!")
    
    return {
        'temporal_results': temporal_results,
        'temporal_verdict': temporal_verdict,
        'cv_results': cv_results,
        'synthetic_results': synthetic_results,
        'final_verdict': verdict,
        'recommendations': recommendations
    }

# Simple usage for notebooks
if __name__ == "__main__":
    print("🚀 NOTEBOOK OVERFITTING ANALYSIS")
    print("=" * 40)
    results = main_notebook_overfitting_analysis()

🚀 NOTEBOOK OVERFITTING ANALYSIS
🔍 NOTEBOOK-COMPATIBLE OVERFITTING ANALYSIS
🔍 NOTEBOOK-COMPATIBLE OVERFITTING ANALYSIS
✅ Loaded data: Full=41,178, Balanced=27,365


📅 TEMPORAL OVERFITTING ANALYSIS

🕐 Testing cutoff: 2023-06-01
   📊 Train: 14,940 samples
   📊 Test: 26,238 samples
   📈 Train Accuracy: 0.790 (79.0%)
   📉 Test Accuracy: 0.037 (3.7%)
   📊 Accuracy Gap: 0.753 (75.3%)
   📈 Train Top-3: 0.991 (99.1%)
   📉 Test Top-3: 0.152 (15.2%)
   📊 Top-3 Gap: 0.839 (83.9%)
   🔴 HIGH OVERFITTING RISK: 75.3% gap

🕐 Testing cutoff: 2023-09-01
   📊 Train: 17,560 samples
   📊 Test: 23,618 samples
   📈 Train Accuracy: 0.783 (78.3%)
   📉 Test Accuracy: 0.037 (3.7%)
   📊 Accuracy Gap: 0.746 (74.6%)
   📈 Train Top-3: 0.990 (99.0%)
   📉 Test Top-3: 0.172 (17.2%)
   📊 Top-3 Gap: 0.818 (81.8%)
   🔴 HIGH OVERFITTING RISK: 74.6% gap

🕐 Testing cutoff: 2023-12-01
   📊 Train: 19,903 samples
   📊 Test: 21,275 samples
   📈 Train Accuracy: 0.780 (78.0%)
   📉 Test Accuracy: 0.038 (3.8%)
   📊 Accuracy Gap: 0.74