In [None]:
# Load the engineered dataset produced by an earlier step. `load_engineered_dataset`
# is not defined in this view (presumably an earlier cell or module — TODO confirm).
# The call is unpacked into four values: the feature-engineered frame, the list of
# model feature names, the rolling train/validation splits, and run metadata.
# NOTE(review): the `.pkl` suffix suggests a pickle file — only load artifacts from a trusted source.
df_final, features, rolling_splits, metadata = load_engineered_dataset('sales_forecast_engineered_dataset_20250528_170657.pkl')

In [None]:
# Step 5: Advanced Embedding-Based Deep Learning (Complete)
# No nested methods, clean indentation, with prediction saving functionality

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model, callbacks
from tensorflow.keras.optimizers import AdamW
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Define custom metrics as standalone functions
def mape_metric_original_scale(y_true, y_pred):
    """Keras metric: smoothed MAPE (in percent) computed on back-transformed values.

    Both tensors are mapped through ``exp(x) - 1`` (the inverse of ``log1p``),
    clipped to ``[1, 1e6]``, and compared with a +1 smoothing term in the
    denominator. The batch mean is reported as a percentage capped at 1000.
    """
    floor, ceiling = 1.0, 1e6

    # Undo the log transform, then bound the values so a single wild
    # prediction cannot dominate the monitored metric.
    actual = tf.clip_by_value(tf.exp(y_true) - 1, floor, ceiling)
    forecast = tf.clip_by_value(tf.exp(y_pred) - 1, floor, ceiling)

    smoothing = 1.0
    relative_error = tf.abs(actual - forecast) / (actual + smoothing)
    percent_error = tf.reduce_mean(relative_error) * 100
    return tf.clip_by_value(percent_error, 0.0, 1000.0)

def rmse_metric_original_scale(y_true, y_pred):
    """Keras metric: RMSE computed on back-transformed values.

    Both tensors are mapped through ``exp(x) - 1`` (the inverse of ``log1p``)
    and clipped to ``[1, 1e6]`` before the root-mean-square error is taken.
    """
    floor, ceiling = 1.0, 1e6
    actual = tf.clip_by_value(tf.exp(y_true) - 1, floor, ceiling)
    forecast = tf.clip_by_value(tf.exp(y_pred) - 1, floor, ceiling)

    squared_residual = tf.square(actual - forecast)
    return tf.sqrt(tf.reduce_mean(squared_residual))

class AdvancedEmbeddingModel:
    """
    Advanced embedding-based deep learning for sales forecasting
    Clean structure without nested methods
    """
    
    def __init__(self, random_seed=42):
        """Set up empty encoder/scaler registries and seed NumPy and TensorFlow.

        Args:
            random_seed: Seed passed to both ``np.random.seed`` and
                ``tf.random.set_seed``.
        """
        self.random_seed = random_seed
        # Fitted bucket edges and scalers are stored here by the preparation step.
        self.encoders, self.scalers = {}, {}

        np.random.seed(random_seed)
        tf.random.set_seed(random_seed)

        banner = "=" * 70
        print(banner)
        print("ADVANCED EMBEDDING-BASED FRAMEWORK INITIALIZED")
        print(banner)
    
    def safe_mape_calculation(self, y_true, y_pred):
        """Compute a smoothed, capped MAPE (percent) from log1p-scale values.

        Both inputs are flattened to 1-D first. Without this, a ``(n, 1)``
        prediction array (the shape returned by Keras ``predict``) compared
        against a ``(n,)`` target silently broadcasts to an ``n x n`` matrix
        and yields a meaningless all-pairs error instead of a per-sample one.

        Args:
            y_true: Array-like of targets on the log1p scale.
            y_pred: Array-like of predictions on the log1p scale; must hold
                the same number of elements as ``y_true`` (or be a scalar).

        Returns:
            float: mean of ``|true - pred| / (true + 1)`` in percent on the
            original scale, capped at 1000.0. Values are clipped to
            ``[0.1, 1e6]`` after the inverse transform.
        """
        # Flatten so element i of y_true is always paired with element i of y_pred.
        y_true_orig = np.expm1(np.ravel(y_true))
        y_pred_orig = np.expm1(np.ravel(y_pred))

        y_pred_orig = np.clip(y_pred_orig, 0.1, 1e6)
        y_true_orig = np.clip(y_true_orig, 0.1, 1e6)

        # +1 in the denominator keeps tiny actuals from exploding the ratio.
        epsilon = 1.0
        ape = np.abs(y_true_orig - y_pred_orig) / (y_true_orig + epsilon)
        mape = np.mean(ape) * 100
        return float(min(mape, 1000.0))
    
    def save_detailed_predictions(self, val_data, val_pred_orig, val_true_orig, split_num, description, model_type, timestamp):
        """Attach predictions and error columns to the validation rows and write two CSVs.

        Adds ``predicted_sales``, ``actual_sales``, ``absolute_error``,
        ``absolute_percentage_error`` (with +1 in the denominator),
        ``is_perfect_prediction`` (error below one unit) and ``error_category``
        to a copy of ``val_data``. The full frame and a reduced summary view are
        written to the current working directory.

        Args:
            val_data: Validation DataFrame; it is copied, not modified.
            val_pred_orig: Predictions on the original scale.
            val_true_orig: Actuals on the original scale.
            split_num: Split identifier used in the file names.
            description: Accepted for interface compatibility; not used here.
            model_type: Model label used in the file names.
            timestamp: Timestamp string used in the file names.

        Returns:
            tuple: ``(results_df, files)`` where ``files`` maps
            ``'detailed_predictions'`` and ``'summary_predictions'`` to the
            written file names.
        """
        abs_error = np.abs(val_pred_orig - val_true_orig)

        results_df = val_data.copy()
        results_df['predicted_sales'] = val_pred_orig
        results_df['actual_sales'] = val_true_orig
        results_df['absolute_error'] = abs_error
        results_df['absolute_percentage_error'] = abs_error / (val_true_orig + 1) * 100
        results_df['is_perfect_prediction'] = results_df['absolute_error'] < 1

        # Ordered (exclusive upper bound, label) tiers; the first bound that the
        # error falls under wins, anything else (including NaN) is "Poor".
        tiers = (
            (5, "Excellent (<5%)"),
            (10, "Very Good (5-10%)"),
            (20, "Good (10-20%)"),
            (50, "Fair (20-50%)"),
        )
        results_df['error_category'] = results_df['absolute_percentage_error'].apply(
            lambda pct: next((label for bound, label in tiers if pct < bound), "Poor (>50%)")
        )

        # Full per-row output.
        file_suffix = f"split_{split_num}_{model_type}_{timestamp}.csv"
        predictions_filename = f"detailed_predictions_{file_suffix}"
        results_df.to_csv(predictions_filename, index=False)

        # Reduced view: only the identifying and headline columns that exist.
        wanted = ['sales_month', 'primary_platform', 'store_name', 'brand_name',
                  'actual_sales', 'predicted_sales', 'absolute_percentage_error', 'error_category']
        present = [name for name in wanted if name in results_df.columns]
        summary_filename = f"summary_predictions_{file_suffix}"
        results_df[present].to_csv(summary_filename, index=False)

        print(f"  📊 Predictions saved:")
        print(f"    Detailed: {predictions_filename}")
        print(f"    Summary: {summary_filename}")

        saved_files = {
            'detailed_predictions': predictions_filename,
            'summary_predictions': summary_filename,
        }
        return results_df, saved_files
    
    def save_split_analysis_report(self, results_df, split_num, description, model_type, timestamp):
        """Write a plain-text error analysis report for one validation split.

        The report is written to the current working directory as UTF-8. The
        encoding is set explicitly because the report contains non-ASCII warning
        symbols, which raise ``UnicodeEncodeError`` when the platform default
        encoding is not UTF-8.

        Sections: overall metrics, error-category distribution, APE percentiles,
        optional breakdowns by ``primary_platform``, month of ``sales_month``,
        ``brand_name`` and ``store_name`` (each only when the column exists),
        leakage heuristics, prediction range coverage, and the ten worst and
        best predictions.

        Args:
            results_df: DataFrame with at least ``absolute_percentage_error``,
                ``absolute_error``, ``error_category``, ``is_perfect_prediction``,
                ``predicted_sales`` and ``actual_sales``. ``sales_month``, when
                present, must support the ``.dt`` accessor.
            split_num: Split identifier used in the file name and header.
            description: Free-text split description written to the header.
            model_type: Model label used in the file name and header.
            timestamp: Timestamp string used in the file name.

        Returns:
            str: The name of the report file that was written.
        """
        filename = f"training_analysis_report_split_{split_num}_{model_type}_{timestamp}.txt"
        ape_key = 'absolute_percentage_error'

        with open(filename, 'w', encoding='utf-8') as f:
            f.write("TRAINING SPLIT ANALYSIS REPORT\n")
            f.write("=" * 50 + "\n")
            f.write(f"Split: {split_num} - {description}\n")
            f.write(f"Model Type: {model_type}\n")
            f.write(f"Generated: {pd.Timestamp.now()}\n")
            f.write(f"Total Predictions: {len(results_df):,}\n\n")

            # Overall metrics
            f.write("OVERALL PERFORMANCE METRICS\n")
            f.write("-" * 30 + "\n")
            f.write(f"Mean Absolute Percentage Error: {results_df[ape_key].mean():.2f}%\n")
            f.write(f"Median Absolute Percentage Error: {results_df[ape_key].median():.2f}%\n")
            f.write(f"Standard Deviation of APE: {results_df[ape_key].std():.2f}%\n")
            f.write(f"Mean Absolute Error: {results_df['absolute_error'].mean():.0f} units\n")
            f.write(f"Root Mean Square Error: {np.sqrt(np.mean(results_df['absolute_error']**2)):.0f} units\n\n")

            # Error distribution
            f.write("ERROR DISTRIBUTION BY CATEGORY\n")
            f.write("-" * 30 + "\n")
            error_dist = results_df['error_category'].value_counts()
            for category in error_dist.index:
                count = error_dist[category]
                percentage = count / len(results_df) * 100
                f.write(f"{category}: {count:,} ({percentage:.1f}%)\n")
            f.write("\n")

            # Percentile analysis
            f.write("ERROR PERCENTILES\n")
            f.write("-" * 18 + "\n")
            percentiles = [10, 25, 50, 75, 90, 95, 99]
            ape_percentiles = np.percentile(results_df[ape_key], percentiles)
            for p, value in zip(percentiles, ape_percentiles):
                f.write(f"{p:2d}th percentile: {value:.2f}%\n")
            f.write("\n")

            # Platform analysis (if available)
            if 'primary_platform' in results_df.columns:
                f.write("PERFORMANCE BY PLATFORM\n")
                f.write("-" * 25 + "\n")
                platform_stats = results_df.groupby('primary_platform').agg({
                    ape_key: ['mean', 'median', 'std', 'count'],
                    'is_perfect_prediction': 'sum'
                }).round(2)

                for platform in platform_stats.index:
                    f.write(f"{platform}:\n")
                    f.write(f"  Mean APE: {platform_stats.loc[platform, (ape_key, 'mean')]:.2f}%\n")
                    f.write(f"  Median APE: {platform_stats.loc[platform, (ape_key, 'median')]:.2f}%\n")
                    f.write(f"  Samples: {platform_stats.loc[platform, (ape_key, 'count')]:,}\n")
                    f.write(f"  Perfect predictions: {platform_stats.loc[platform, ('is_perfect_prediction', 'sum')]:,}\n\n")

            # Time-based analysis (if available): grouped by calendar month number
            if 'sales_month' in results_df.columns:
                f.write("PERFORMANCE BY MONTH\n")
                f.write("-" * 20 + "\n")
                monthly_stats = results_df.groupby(results_df['sales_month'].dt.month).agg({
                    ape_key: ['mean', 'count'],
                    'is_perfect_prediction': 'sum'
                }).round(2)

                for month in sorted(monthly_stats.index):
                    f.write(f"Month {month:2d}: ")
                    f.write(f"{monthly_stats.loc[month, (ape_key, 'mean')]:.2f}% MAPE ")
                    f.write(f"({monthly_stats.loc[month, (ape_key, 'count')]:,} samples)\n")

            # Brand and store analysis (if available): identical layout, so one
            # loop produces both sections in the original order.
            for group_column, heading in (('brand_name', 'BRANDS'), ('store_name', 'STORES')):
                if group_column not in results_df.columns:
                    continue
                f.write(f"\nTOP 10 {heading} BY SAMPLE COUNT\n")
                f.write("-" * 30 + "\n")
                group_stats = results_df.groupby(group_column).agg({
                    ape_key: ['mean', 'count'],
                    'is_perfect_prediction': 'sum'
                }).round(2)

                top_groups = group_stats.nlargest(10, (ape_key, 'count'))
                for group in top_groups.index:
                    group_ape = top_groups.loc[group, (ape_key, 'mean')]
                    count = top_groups.loc[group, (ape_key, 'count')]
                    perfect = top_groups.loc[group, ('is_perfect_prediction', 'sum')]
                    f.write(f"{group}: {group_ape:.2f}% MAPE ({count:,} samples, {perfect:,} perfect)\n")

            # Suspicious patterns
            f.write("\nSUSPICION ANALYSIS\n")
            f.write("-" * 18 + "\n")
            perfect_count = results_df['is_perfect_prediction'].sum()
            perfect_pct = perfect_count / len(results_df) * 100
            f.write(f"Perfect predictions (<1 unit error): {perfect_count:,} ({perfect_pct:.1f}%)\n")

            if perfect_pct > 5:
                f.write("⚠️ WARNING: High percentage of perfect predictions may indicate data leakage\n")

            mean_ape = results_df[ape_key].mean()
            if mean_ape < 5:
                f.write(f"⚠️ WARNING: Very low MAPE ({mean_ape:.2f}%) may indicate technical issues\n")

            # Prediction range analysis
            pred_min = results_df['predicted_sales'].min()
            pred_max = results_df['predicted_sales'].max()
            actual_min = results_df['actual_sales'].min()
            actual_max = results_df['actual_sales'].max()

            f.write("\nPREDICTION RANGE ANALYSIS\n")
            f.write("-" * 25 + "\n")
            f.write(f"Predicted sales range: [{pred_min:.0f}, {pred_max:.0f}]\n")
            f.write(f"Actual sales range: [{actual_min:.0f}, {actual_max:.0f}]\n")
            actual_span = actual_max - actual_min
            if actual_span > 0:
                f.write(f"Range coverage ratio: {(pred_max - pred_min) / actual_span:.2f}\n")
            else:
                # Constant (or missing) actuals: the ratio is undefined, so say
                # so instead of dividing by zero and printing inf/nan.
                f.write("Range coverage ratio: N/A (actual sales range is zero)\n")

            # Worst and best predictions share one layout; only the title, the
            # rule length and the selection direction differ.
            extremes = (
                ("WORST", 27, results_df.nlargest),
                ("BEST", 26, results_df.nsmallest),
            )
            for title, rule_length, select in extremes:
                f.write(f"\n{title} PREDICTIONS (Top 10)\n")
                f.write("-" * rule_length + "\n")
                for _, row in select(10, ape_key).iterrows():
                    f.write(f"Actual: {row['actual_sales']:8.0f}, Predicted: {row['predicted_sales']:8.0f}, ")
                    f.write(f"APE: {row[ape_key]:6.1f}%")
                    if 'store_name' in row and 'brand_name' in row:
                        f.write(f" ({row['store_name']}, {row['brand_name']})")
                    f.write("\n")

        print(f"  📊 Analysis report saved: {filename}")
        return filename
    
    def categorize_features_for_embeddings(self, df, features):
        """Sort feature names into the groups used by the embedding pipeline.

        Rules are applied in order and the first match wins:

        1. Name contains ``month``/``quarter``/``day``: ``numerical_discrete``
           when it also contains ``sin`` or ``cos``, otherwise ``temporal``.
        2. Name contains ``lag_``, ``rolling_``, ``sales_``, ``momentum`` or
           ``volatility``: ``numerical_continuous``.
        3. Name contains ``store_type_`` or the column dtype is bool: ``binary``.
        4. Name contains ``interaction``: ``interactions``.
        5. Remaining int64/float64/int32/float32 columns: ``numerical_discrete``
           when they have fewer than 20 distinct values, else
           ``numerical_continuous``. Other dtypes are left out.

        Features that are not columns of ``df`` are ignored.

        Args:
            df: DataFrame used to look up dtypes and distinct-value counts.
            features: Iterable of candidate feature names.

        Returns:
            dict: group name -> list of feature names, in input order.
        """
        print("=== ANALYZING FEATURES FOR EMBEDDING STRATEGIES ===")

        temporal_tokens = ['month', 'quarter', 'day']
        continuous_tokens = ['lag_', 'rolling_', 'sales_', 'momentum', 'volatility']
        plain_numeric = ['int64', 'float64', 'int32', 'float32']

        group_names = ('temporal', 'numerical_continuous', 'numerical_discrete',
                       'binary', 'interactions')
        feature_categories = {group: [] for group in group_names}

        for name in features:
            if name not in df.columns:
                continue

            column = df[name]
            kind = column.dtype
            distinct = column.nunique()

            if any(token in name for token in temporal_tokens):
                is_cyclical = 'sin' in name or 'cos' in name
                group = 'numerical_discrete' if is_cyclical else 'temporal'
            elif any(token in name for token in continuous_tokens):
                group = 'numerical_continuous'
            elif 'store_type_' in name or kind == 'bool':
                group = 'binary'
            elif 'interaction' in name:
                group = 'interactions'
            elif kind in plain_numeric:
                group = 'numerical_discrete' if distinct < 20 else 'numerical_continuous'
            else:
                # Non-numeric leftovers are not assigned to any group.
                continue

            feature_categories[group].append(name)

        print("Feature categories:")
        for group, members in feature_categories.items():
            if members:
                print(f"  {group}: {len(members)} features")

        return feature_categories
    
    def prepare_embedding_features(self, df, feature_categories, is_training=True):
        """Prepare features for embedding-based model"""
        df_work = df.copy()
        
        if 'sales_quantity_log' not in df_work.columns:
            df_work['sales_quantity_log'] = np.log1p(df_work['sales_quantity'])
        
        prepared_data = {}
        
        # Temporal features - create embeddings
        temporal_features = feature_categories['temporal']
        if temporal_features:
            temporal_data = []
            for feature in temporal_features:
                if feature in df_work.columns:
                    values = df_work[feature].fillna(0).values.astype(int)
                    if feature == 'month':
                        values = np.clip(values, 1, 12) - 1  # 0-11 for embedding
                    elif feature == 'quarter':
                        values = np.clip(values, 1, 4) - 1   # 0-3 for embedding
                    else:
                        values = np.clip(values, 0, 100)     # General clipping
                    temporal_data.append(values)
            
            if temporal_data:
                prepared_data['temporal'] = np.column_stack(temporal_data)
        
        # Numerical continuous - bucketize and embed
        continuous_features = feature_categories['numerical_continuous']
        if continuous_features:
            continuous_data = []
            for feature in continuous_features:
                if feature in df_work.columns:
                    values = df_work[feature].replace([np.inf, -np.inf], np.nan).fillna(0).values
                    
                    if is_training:
                        # Create quantile-based buckets
                        try:
                            buckets = np.quantile(values[values != 0], np.linspace(0, 1, 51))  # 50 buckets
                            buckets = np.unique(buckets)
                            self.encoders[f'{feature}_buckets'] = buckets
                        except:
                            self.encoders[f'{feature}_buckets'] = np.array([0, 1])
                    
                    bucket_edges = self.encoders.get(f'{feature}_buckets', np.array([0, 1]))
                    bucket_indices = np.digitize(values, bucket_edges)
                    bucket_indices = np.clip(bucket_indices, 0, len(bucket_edges))
                    continuous_data.append(bucket_indices)
            
            if continuous_data:
                prepared_data['continuous'] = np.column_stack(continuous_data)
        
        # Direct numerical features
        direct_features = (feature_categories['numerical_discrete'] + 
                          feature_categories['binary'] + 
                          feature_categories['interactions'])
        
        if direct_features:
            existing_features = [f for f in direct_features if f in df_work.columns]
            if existing_features:
                direct_data = df_work[existing_features].values.astype(np.float32)
                direct_data = np.nan_to_num(direct_data, nan=0.0, posinf=1e6, neginf=-1e6)
                
                if is_training:
                    self.scalers['direct'] = RobustScaler()
                    direct_data = self.scalers['direct'].fit_transform(direct_data)
                else:
                    direct_data = self.scalers['direct'].transform(direct_data)
                
                prepared_data['direct'] = direct_data
        
        # Target
        target = df_work['sales_quantity_log'].values.astype(np.float32)
        target = np.nan_to_num(target, nan=0.0, posinf=10.0, neginf=-1.0)
        
        return prepared_data, target
    
    def create_advanced_embedding_model(self, feature_categories, data_shapes):
        """Build the Keras model that consumes the prepared input groups.

        Each temporal and continuous column gets its own ``Embedding`` layer;
        direct features go through a Dense/BatchNorm/Dropout stem. The pieces
        are concatenated, projected to 256 units, mixed with four parallel
        64-unit tanh projections added back as a residual, and passed through
        a 256-128-64 dense stack to a single linear output.

        Temporal embedding sizes are chosen by feature NAME (``month`` -> 12
        codes, ``quarter`` -> 4 codes, anything else -> 101 codes), matching
        the per-name clipping applied in ``prepare_embedding_features``.
        Choosing them by column position would feed out-of-range indices to
        the embedding whenever the temporal list is not ordered month,
        quarter, .... If the names cannot be matched to the columns (count
        mismatch or no usable ``feature_categories``), every temporal column
        gets the 101-code vocabulary, which covers all clipped ranges.

        Args:
            feature_categories: Dict of feature groups; only the ``'temporal'``
                list is consulted here.
            data_shapes: Dict mapping input group name (``'temporal'``,
                ``'continuous'``, ``'direct'``) to its number of columns.

        Returns:
            tuple: ``(model, input_names)`` where ``input_names`` lists the
            input groups in the order the model expects them.

        Raises:
            ValueError: If ``data_shapes`` contains none of the known groups.
        """
        print("\n=== CREATING ADVANCED EMBEDDING MODEL ===")

        inputs = {}
        embedding_outputs = []
        total_embedding_dim = 0

        # Temporal embeddings
        if 'temporal' in data_shapes:
            n_temporal = data_shapes['temporal']
            temporal_input = layers.Input(shape=(n_temporal,), name='temporal_input')
            inputs['temporal'] = temporal_input

            temporal_names = []
            if isinstance(feature_categories, dict):
                temporal_names = list(feature_categories.get('temporal', []))
            if len(temporal_names) != n_temporal:
                # Names cannot be aligned with columns: use the widest vocabulary.
                temporal_names = [None] * n_temporal

            temporal_embeddings = []
            temporal_dim = 0
            for i, feature_name in enumerate(temporal_names):
                # Extract single temporal feature (idx bound as a default to
                # avoid the late-binding closure pitfall)
                single_temporal = layers.Lambda(lambda x, idx=i: x[:, idx:idx+1])(temporal_input)

                if feature_name == 'month':
                    vocab_size, emb_dim, layer_name = 12, 8, 'month_embedding'
                elif feature_name == 'quarter':
                    vocab_size, emb_dim, layer_name = 4, 4, 'quarter_embedding'
                else:
                    vocab_size, emb_dim, layer_name = 101, 8, f'temporal_{i}_embedding'

                emb = layers.Embedding(vocab_size, emb_dim, name=layer_name)(single_temporal)
                temporal_embeddings.append(layers.Flatten()(emb))
                temporal_dim += emb_dim

            total_embedding_dim += temporal_dim

            if len(temporal_embeddings) > 1:
                temporal_combined = layers.Concatenate(name='temporal_combined')(temporal_embeddings)
            else:
                temporal_combined = temporal_embeddings[0]

            embedding_outputs.append(temporal_combined)
            print(f"  Temporal embeddings: {len(temporal_embeddings)} features, total dim: {temporal_dim}")

        # Continuous feature embeddings
        if 'continuous' in data_shapes:
            continuous_input = layers.Input(shape=(data_shapes['continuous'],), name='continuous_input')
            inputs['continuous'] = continuous_input

            # Process each continuous feature with smaller embeddings
            continuous_embeddings = []
            embedding_dim_per_feature = 8  # Smaller dimension
            # Up to 51 bucket edges are fitted, so digitize yields indices 0..51.
            continuous_vocab_size = 52

            for i in range(data_shapes['continuous']):
                single_continuous = layers.Lambda(lambda x, idx=i: x[:, idx:idx+1])(continuous_input)
                emb = layers.Embedding(continuous_vocab_size, embedding_dim_per_feature,
                                       name=f'continuous_{i}_embedding')(single_continuous)
                continuous_embeddings.append(layers.Flatten()(emb))
                total_embedding_dim += embedding_dim_per_feature

            if len(continuous_embeddings) > 1:
                continuous_combined = layers.Concatenate(name='continuous_combined')(continuous_embeddings)
            else:
                continuous_combined = continuous_embeddings[0]

            embedding_outputs.append(continuous_combined)
            print(f"  Continuous embeddings: {len(continuous_embeddings)} features, total dim: {len(continuous_embeddings) * embedding_dim_per_feature}")

        # Direct numerical features
        direct_dim = 0
        if 'direct' in data_shapes:
            direct_input = layers.Input(shape=(data_shapes['direct'],), name='direct_input')
            inputs['direct'] = direct_input

            # Process direct features to fixed dimension
            direct_processed = layers.Dense(32, activation='relu', name='direct_dense')(direct_input)
            direct_processed = layers.BatchNormalization(name='direct_bn')(direct_processed)
            direct_processed = layers.Dropout(0.2, name='direct_dropout')(direct_processed)

            embedding_outputs.append(direct_processed)
            direct_dim = 32
            total_embedding_dim += direct_dim
            print(f"  Direct features: {data_shapes['direct']} → {direct_dim} dimensions")

        if not embedding_outputs:
            raise ValueError(
                "data_shapes must contain at least one of 'temporal', 'continuous' or 'direct'"
            )

        print(f"  Expected total embedding dimension: {total_embedding_dim}")

        # Combine all embeddings
        if len(embedding_outputs) > 1:
            combined = layers.Concatenate(name='combine_all')(embedding_outputs)
        else:
            combined = embedding_outputs[0]

        # Project the concatenated features to a fixed 256-unit width
        standardized = layers.Dense(256, activation='relu', name='standardize')(combined)
        standardized = layers.BatchNormalization(name='std_bn')(standardized)
        standardized = layers.Dropout(0.3, name='std_dropout')(standardized)

        # Four parallel 64-unit tanh projections ("heads"); note these are plain
        # Dense layers, not a query/key/value attention mechanism.
        attention_1 = layers.Dense(64, activation='tanh', name='attention_1')(standardized)
        attention_2 = layers.Dense(64, activation='tanh', name='attention_2')(standardized)
        attention_3 = layers.Dense(64, activation='tanh', name='attention_3')(standardized)
        attention_4 = layers.Dense(64, activation='tanh', name='attention_4')(standardized)

        multi_head = layers.Concatenate(name='multi_head')([attention_1, attention_2, attention_3, attention_4])

        # Residual connection - both inputs are 256-dim (4 x 64 == 256)
        attended = layers.Add(name='residual_attention')([standardized, multi_head])
        attended = layers.LayerNormalization(name='layer_norm')(attended)

        # Deep layers
        x1 = layers.Dense(256, activation='relu', name='deep1')(attended)
        x1 = layers.BatchNormalization(name='bn1')(x1)
        x1 = layers.Dropout(0.3, name='drop1')(x1)

        x2 = layers.Dense(128, activation='relu', name='deep2')(x1)
        x2 = layers.BatchNormalization(name='bn2')(x2)
        x2 = layers.Dropout(0.2, name='drop2')(x2)

        x3 = layers.Dense(64, activation='relu', name='deep3')(x2)
        x3 = layers.Dropout(0.2, name='drop3')(x3)

        # Output: single linear unit predicting the log-scale target
        output = layers.Dense(1, activation='linear', name='sales_prediction')(x3)

        model = Model(inputs=list(inputs.values()), outputs=output, name='AdvancedEmbeddingModel')

        print(f"  Model created with {model.count_params():,} parameters")
        print(f"  Input types: {list(inputs.keys())}")

        return model, list(inputs.keys())
    
    def enhanced_sanity_check_results(self, results):
        """Run plausibility heuristics over per-split results and report a verdict.

        Checks, in order (the first failure returns False):

        0. Every split's MAPE must be finite. NaN/inf would otherwise slip
           through all comparisons below and be reported as a pass.
        1. Average MAPE below 5% is treated as suspiciously good.
        2. With more than one split, a MAPE standard deviation below 2 combined
           with an average below 15% is treated as suspicious. A single split
           always has a standard deviation of 0, so the check is skipped there.
        3. More than 10% "perfect" predictions across all splits is treated as
           a leakage signal. Splits without counts contribute 0 perfect out of
           1 prediction; a total of zero predictions yields a ratio of 0.
        4. Average MAPE below 10% prints a business reality check (informational).

        Args:
            results: Dict of split key -> result dict. Each result must contain
                ``'mape'`` and may contain ``'perfect_predictions'`` and
                ``'total_predictions'``.

        Returns:
            bool: True when no check flags the results, False otherwise
            (including when ``results`` is empty).
        """
        print("\n" + "=" * 50)
        print("ENHANCED SANITY CHECKS ON RESULTS")
        print("=" * 50)

        if not results:
            print("❌ No results to check")
            return False

        mapes = [result['mape'] for result in results.values()]

        # Check 0: metrics must be real numbers before any threshold is applied
        if not np.all(np.isfinite(mapes)):
            print("🚨 INVALID: At least one split reported a non-finite MAPE (NaN/inf)")
            print("   Fix the metric calculation before trusting these results")
            return False

        avg_mape = np.mean(mapes)

        # Check 1: Too good to be true?
        if avg_mape < 5:
            print(f"🚨 SUSPICIOUS: Average MAPE ({avg_mape:.2f}%) is suspiciously low")
            print("   This may indicate data leakage or incorrect calculation")
            return False

        # Check 2: All splits performing similarly well? (needs at least two splits)
        if len(mapes) > 1:
            mape_std = np.std(mapes)
            if mape_std < 2 and avg_mape < 15:
                print(f"🚨 SUSPICIOUS: All splits perform very similarly ({mape_std:.2f}% std)")
                print("   This may indicate overfitting or data leakage")
                return False

        # Check 3: Check for perfect predictions across splits
        total_perfect = sum(result.get('perfect_predictions', 0) for result in results.values())
        total_predictions = sum(result.get('total_predictions', 1) for result in results.values())
        perfect_ratio = total_perfect / total_predictions if total_predictions > 0 else 0.0

        if perfect_ratio > 0.1:  # More than 10% perfect predictions
            print(f"🚨 SUSPICIOUS: {perfect_ratio*100:.1f}% perfect predictions across all splits")
            print("   This strongly suggests data leakage")
            return False

        # Check 4: Dramatic improvement from baseline?
        if avg_mape < 10:
            print(f"📊 BUSINESS REALITY CHECK:")
            print(f"   Average MAPE: {avg_mape:.2f}%")
            print(f"   This means predictions are typically within {avg_mape:.1f}% of actual sales")
            print(f"   For a product selling 1000 units, predictions would be ±{avg_mape*10:.0f} units")
            print(f"   Please validate this level of accuracy with business stakeholders")

        print(f"✅ Results pass enhanced sanity checks")
        return True
    
    def train_advanced_embedding_model(self, df, features, rolling_splits):
        """Train advanced embedding model on rolling splits with prediction saving"""
        print("=" * 80)
        print("TRAINING ADVANCED EMBEDDING-BASED MODELS")
        print("=" * 80)
        
        # Analyze features
        feature_categories = self.categorize_features_for_embeddings(df, features)
        
        all_results = {}
        timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
        
        for split_idx, (train_data, val_data, description) in enumerate(rolling_splits):
            print(f"\nSplit {split_idx + 1}: {description}")
            print("-" * 50)
            
            try:
                # VALIDATE DATA INTEGRITY FIRST
                print(f"DATA INTEGRITY CHECKS:")
                print(f"  Train date range: {train_data['sales_month'].min()} to {train_data['sales_month'].max()}")
                print(f"  Val date range: {val_data['sales_month'].min()} to {val_data['sales_month'].max()}")
                
                # Check for temporal overlap
                train_max_date = train_data['sales_month'].max()
                val_min_date = val_data['sales_month'].min()
                
                if train_max_date >= val_min_date:
                    print(f"  🚨 POTENTIAL DATA LEAKAGE: Training data overlaps with validation!")
                    print(f"    Train max: {train_max_date}, Val min: {val_min_date}")
                
                # Check for identical records
                train_key = train_data[['store_name', 'brand_name', 'sales_month', 'sales_quantity']].copy()
                val_key = val_data[['store_name', 'brand_name', 'sales_month', 'sales_quantity']].copy()
                
                # Create composite keys
                train_key['composite'] = train_key['store_name'] + '_' + train_key['brand_name'] + '_' + train_key['sales_month'].astype(str)
                val_key['composite'] = val_key['store_name'] + '_' + val_key['brand_name'] + '_' + val_key['sales_month'].astype(str)
                
                overlapping_keys = set(train_key['composite']).intersection(set(val_key['composite']))
                
                if overlapping_keys:
                    print(f"  🚨 IDENTICAL RECORDS DETECTED: {len(overlapping_keys)} records appear in both train and validation!")
                else:
                    print(f"  ✓ No identical records between train and validation")
                
                # Check target distribution
                train_sales_stats = train_data['sales_quantity'].describe()
                val_sales_stats = val_data['sales_quantity'].describe()
                
                print(f"  Train sales stats: mean={train_sales_stats['mean']:.0f}, std={train_sales_stats['std']:.0f}")
                print(f"  Val sales stats: mean={val_sales_stats['mean']:.0f}, std={val_sales_stats['std']:.0f}")
                
                # Prepare data
                X_train, y_train = self.prepare_embedding_features(train_data, feature_categories, is_training=True)
                X_val, y_val = self.prepare_embedding_features(val_data, feature_categories, is_training=False)
                
                print(f"Prepared {len(X_train)} input types for training")
                
                # Get data shapes for model creation
                data_shapes = {key: data.shape[1] for key, data in X_train.items()}
                print(f"Data shapes: {data_shapes}")
                
                # Create model
                model, input_order = self.create_advanced_embedding_model(feature_categories, data_shapes)
                
                # Compile with consistent metrics
                model.compile(
                    optimizer=AdamW(learning_rate=0.001, weight_decay=0.01),
                    loss='mae',
                    metrics=[mape_metric_original_scale, rmse_metric_original_scale]
                )
                
                # Prepare inputs in correct order
                X_train_ordered = [X_train[key] for key in input_order if key in X_train]
                X_val_ordered = [X_val[key] for key in input_order if key in X_val]
                
                # Callbacks
                callbacks_list = [
                    callbacks.EarlyStopping(
                        patience=20,
                        restore_best_weights=True,
                        monitor='val_mape_metric_original_scale',
                        mode='min'
                    ),
                    callbacks.ReduceLROnPlateau(
                        patience=10,
                        factor=0.5,
                        monitor='val_mape_metric_original_scale',
                        mode='min'
                    )
                ]
                
                # Train
                print("Training advanced embedding model...")
                history = model.fit(
                    X_train_ordered, y_train,
                    validation_data=(X_val_ordered, y_val),
                    epochs=100,
                    batch_size=512,
                    callbacks=callbacks_list,
                    verbose=1 if split_idx == 0 else 0
                )
                
                # Evaluate with detailed analysis
                val_pred_log = model.predict(X_val_ordered, verbose=0)
                
                # Detailed debugging
                print(f"\nDETAILED EVALUATION ANALYSIS:")
                print(f"  Predictions (log space): [{val_pred_log.min():.3f}, {val_pred_log.max():.3f}]")
                
                # Convert to original scale
                val_pred_orig = np.expm1(val_pred_log.flatten())
                val_true_orig = np.expm1(y_val)
                
                print(f"  Predictions (original): [{val_pred_orig.min():.0f}, {val_pred_orig.max():.0f}] units")
                print(f"  Actuals (original): [{val_true_orig.min():.0f}, {val_true_orig.max():.0f}] units")
                
                # Check for potential issues
                zero_predictions = np.sum(val_pred_orig < 1)
                extreme_predictions = np.sum(val_pred_orig > 100000)
                
                print(f"  Zero/near-zero predictions: {zero_predictions}/{len(val_pred_orig)} ({zero_predictions/len(val_pred_orig)*100:.1f}%)")
                print(f"  Extreme predictions (>100K): {extreme_predictions}/{len(val_pred_orig)} ({extreme_predictions/len(val_pred_orig)*100:.1f}%)")
                
                # Sample comparison
                print(f"  Sample comparisons (first 10):")
                for i in range(min(10, len(val_pred_orig))):
                    actual = val_true_orig[i]
                    predicted = val_pred_orig[i]
                    error = abs(actual - predicted) / (actual + 1) * 100
                    print(f"    Actual: {actual:8.0f}, Predicted: {predicted:8.0f}, APE: {error:6.1f}%")
                
                # Calculate MAPE step by step
                raw_ape = np.abs(val_true_orig - val_pred_orig) / (val_true_orig + 1.0)
                raw_mape = np.mean(raw_ape) * 100
                
                print(f"  Raw MAPE calculation: {raw_mape:.2f}%")
                print(f"  APE distribution: [{np.min(raw_ape)*100:.1f}%, {np.median(raw_ape)*100:.1f}%, {np.max(raw_ape)*100:.1f}%] (min/median/max)")
                
                # Use safe MAPE calculation
                mape = self.safe_mape_calculation(y_val, val_pred_log.flatten())
                
                # Compare with sklearn MAPE
                try:
                    sklearn_mape = mean_absolute_percentage_error(val_true_orig, val_pred_orig) * 100
                    print(f"  Sklearn MAPE: {sklearn_mape:.2f}%")
                    print(f"  Our MAPE: {mape:.2f}%")
                    
                    if abs(sklearn_mape - mape) > 5:
                        print(f"  ⚠️ MAPE calculation discrepancy detected!")
                except:
                    print(f"  Could not calculate sklearn MAPE")
                
                # Check for data leakage indicators
                perfect_predictions = np.sum(np.abs(val_true_orig - val_pred_orig) < 1)
                print(f"  Perfect/near-perfect predictions: {perfect_predictions}/{len(val_pred_orig)} ({perfect_predictions/len(val_pred_orig)*100:.1f}%)")
                
                if perfect_predictions > len(val_pred_orig) * 0.1:  # More than 10% perfect
                    print(f"  🚨 POTENTIAL DATA LEAKAGE: Too many perfect predictions!")
                
                if mape < 5:
                    print(f"  🚨 SUSPICIOUSLY LOW MAPE: Results may indicate data leakage!")
                
                # Additional validation metrics
                val_pred_clipped = np.clip(val_pred_orig, 1, 1e6)
                val_true_clipped = np.clip(val_true_orig, 1, 1e6)
                
                rmse = np.sqrt(mean_squared_error(val_true_clipped, val_pred_clipped))
                r2 = r2_score(val_true_clipped, val_pred_clipped)
                mae = np.mean(np.abs(val_true_clipped - val_pred_clipped))
                
                print(f"  RMSE: {rmse:,.0f}")
                print(f"  MAE: {mae:,.0f}")
                print(f"  R²: {r2:.4f}")
                
                # Save model for inspection
                model_filename = f"advanced_embedding_model_split_{split_idx+1}_{timestamp}.h5"
                model.save(model_filename)
                print(f"  Model saved as: {model_filename}")
                
                # SAVE DETAILED PREDICTIONS AND ANALYSIS
                print(f"\nSAVING PREDICTIONS AND ANALYSIS:")
                results_df, saved_files = self.save_detailed_predictions(
                    val_data, val_pred_orig, val_true_orig, 
                    split_idx+1, description, "AdvancedEmbedding", timestamp
                )
                
                analysis_file = self.save_split_analysis_report(
                    results_df, split_idx+1, description, "AdvancedEmbedding", timestamp
                )
                
                # Get training metrics
                final_val_mape = history.history.get('val_mape_metric_original_scale', [None])[-1]
                
                print(f"\nFINAL METRICS:")
                print(f"  Training MAPE: {final_val_mape:.2f}%" if final_val_mape else "  Training MAPE: N/A")
                print(f"  Evaluation MAPE: {mape:.2f}%")
                
                if final_val_mape and abs(final_val_mape - mape) > 2:
                    print(f"  ⚠️ Training vs Evaluation inconsistency: {abs(final_val_mape - mape):.2f}% difference")
                
                # Store comprehensive results
                all_results[f'split_{split_idx+1}'] = {
                    'description': description,
                    'mape': mape,
                    'train_mape': final_val_mape,
                    'perfect_predictions': perfect_predictions,
                    'total_predictions': len(val_pred_orig),
                    'rmse': rmse,
                    'r2': r2,
                    'mae': mae,
                    'saved_files': {
                        'model': model_filename,
                        'detailed_predictions': saved_files['detailed_predictions'],
                        'summary_predictions': saved_files['summary_predictions'],
                        'analysis_report': analysis_file
                    }
                }
                
                # Early exit if performance is poor
                if mape > 500:
                    print("Model performing poorly, trying next split...")
                    continue
                
            except Exception as e:
                print(f"Error in split {split_idx + 1}: {str(e)}")
                import traceback
                traceback.print_exc()
                continue
        
        # Print final results
        self.print_final_results(all_results)
        return all_results
    
    def print_final_results(self, results):
        """Print a summary of per-split and overall results, including saved files.

        Args:
            results: dict mapping a split name to that split's result dict.
                Each result must contain 'description' and 'mape'. The keys
                'train_mape', 'perfect_predictions', 'total_predictions' and
                'saved_files' are optional.

        Returns:
            None. Everything is written to stdout. An empty ``results`` prints
            a failure notice and returns before any statistics are computed.
        """
        print("\n" + "=" * 70)
        print("ADVANCED EMBEDDING MODEL RESULTS")
        print("=" * 70)
        
        if not results:
            print("❌ No successful training completed")
            return
        
        mapes = [result['mape'] for result in results.values()]
        avg_mape = np.mean(mapes)
        
        print(f"Results by split:")
        for split_name, result in results.items():
            perfect_preds = result.get('perfect_predictions', 0)
            total_preds = result.get('total_predictions', 0)
            perfect_pct = (perfect_preds / total_preds * 100) if total_preds > 0 else 0
            
            # The training-time MAPE may be absent or stored as None when the
            # metric was not recorded; format it only when it is numeric so the
            # report never crashes on a missing value.
            try:
                train_mape_text = f"{float(result.get('train_mape')):.2f}%"
            except (TypeError, ValueError):
                train_mape_text = "N/A"
            
            print(f"  {result['description']}:")
            print(f"    MAPE: {result['mape']:.2f}% (train: {train_mape_text})")
            print(f"    Perfect predictions: {perfect_preds:,}/{total_preds:,} ({perfect_pct:.1f}%)")
            
            # Show saved files
            if 'saved_files' in result:
                files = result['saved_files']
                print(f"    📁 Saved files:")
                print(f"      Model: {files.get('model', 'N/A')}")
                print(f"      Predictions: {files.get('summary_predictions', 'N/A')}")
                print(f"      Analysis: {files.get('analysis_report', 'N/A')}")
        
        print(f"\nOverall Performance:")
        print(f"  Average MAPE: {avg_mape:.2f}%")
        print(f"  Best MAPE: {min(mapes):.2f}%")
        print(f"  Worst MAPE: {max(mapes):.2f}%")
        print(f"  Standard Deviation: {np.std(mapes):.2f}%")
        
        # Aggregate prediction counts over all splits. Missing counts default
        # to 0 (same as the per-split loop above) so they do not inflate totals.
        total_perfect = sum(result.get('perfect_predictions', 0) for result in results.values())
        total_predictions = sum(result.get('total_predictions', 0) for result in results.values())
        overall_perfect_pct = (total_perfect / total_predictions * 100) if total_predictions > 0 else 0
        
        print(f"\nOVERALL QUALITY ANALYSIS:")
        print(f"  Total predictions across all splits: {total_predictions:,}")
        print(f"  Total perfect predictions: {total_perfect:,} ({overall_perfect_pct:.1f}%)")
        
        # Perform enhanced sanity checks
        is_sane = self.enhanced_sanity_check_results(results)
        
        if is_sane:
            if avg_mape <= 20:
                print(f"\n🎉 BREAKTHROUGH! Average MAPE ({avg_mape:.2f}%) broke 20% barrier!")
                if avg_mape <= 10:
                    print(f"🌟 EXCELLENT! Achieved business-usable accuracy!")
                    print(f"📋 RECOMMENDATION: Validate these results with business stakeholders")
                    print(f"📋 Review saved prediction files for detailed analysis")
            else:
                print(f"\n⚠️ Still above 20% threshold ({avg_mape:.2f}%)")
        else:
            print(f"\n🚨 RESULTS FAILED SANITY CHECKS - INVESTIGATE POTENTIAL ISSUES")
            print(f"   📋 RECOMMENDATION: Review saved prediction files to identify issues")
            print(f"   📋 Check for data leakage in detailed prediction analysis")
        
        # Summary of all saved files
        print(f"\n📁 ALL SAVED FILES SUMMARY:")
        all_files = []
        for result in results.values():
            if 'saved_files' in result:
                files = result['saved_files']
                all_files.extend([
                    files.get('model', ''),
                    files.get('detailed_predictions', ''),
                    files.get('summary_predictions', ''),
                    files.get('analysis_report', '')
                ])
        
        # Drop empty placeholders so only real file names are counted.
        valid_files = [f for f in all_files if f and f != 'N/A']
        if valid_files:
            print(f"  Total files saved: {len(valid_files)}")
            print(f"  File types: Models, Predictions (detailed & summary), Analysis reports")
            print(f"  Use these files for:")
            print(f"    - Business validation of results")
            print(f"    - Identifying data leakage patterns")
            print(f"    - Understanding model performance by platform/brand/store")
            print(f"    - Generating business insights and recommendations")
        else:
            print(f"  No files were saved")

# Build the shared framework instance that the training cell operates on
advanced_framework = AdvancedEmbeddingModel()

# Announce readiness and show the call that starts training
print(
    "\nAdvanced Embedding Framework Ready!",
    "Run: results = advanced_framework.train_advanced_embedding_model(df_final, features, rolling_splits)",
    sep="\n",
)

In [None]:
# Run training over the rolling splits. The returned dict is keyed
# 'split_<n>' and holds each successful split's metrics and saved-file names;
# a split that raises is logged with its traceback and left out.
results = advanced_framework.train_advanced_embedding_model(df_final, features, rolling_splits)