> **Special Thanks 🙏**  
> Huge thanks to the **NVIDIA Developer Blog** team for sharing *The Kaggle Grandmasters Playbook: 7 Battle-Tested Modeling Techniques for Tabular Data*.  
>  
> It’s an incredibly insightful resource that inspired several improvements in this notebook.  
>  
> 📖 [Read the full article here →](https://developer.nvidia.com/blog/the-kaggle-grandmasters-playbook-7-battle-tested-modeling-techniques-for-tabular-data/)


In [None]:

import os
import glob
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import ks_2samp, wasserstein_distance
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm

# Try to use cuDF for GPU acceleration if available.
# GPU_AVAILABLE records the outcome of the import attempt so the rest of the
# file can choose between the cuDF and the pandas code path.
try:
    import cudf
    import cupy as cp
    GPU_AVAILABLE = True
    print("‚úÖ NVIDIA cuDF detected - Using GPU acceleration")
except ImportError:
    # Reached when either cudf or cupy cannot be imported
    GPU_AVAILABLE = False
    print("‚ö†Ô∏è  NVIDIA cuDF not available - Using pandas CPU")

# Suppress all warnings from this point on
warnings.filterwarnings('ignore')
# ================================================================================
# ENHANCED EDA CLASS FOR SMART ANALYSIS
# ================================================================================

class SmartEDA:
    """
    Advanced EDA that goes beyond basic statistics to detect:
    1. Train-test distribution shifts
    2. Temporal patterns and concept drift
    3. Multivariate distribution differences

    GPU processing is used only when ``gpu_acceleration`` is requested AND the
    module-level ``GPU_AVAILABLE`` flag is true. The scipy statistical tests
    always run on host NumPy arrays.
    """

    def __init__(self, gpu_acceleration=False):
        """Store the effective GPU setting and an (initially empty) results dict."""
        self.gpu_acceleration = gpu_acceleration and GPU_AVAILABLE
        self.results = {}

    def load_data_gpu(self, df):
        """Return *df* as a cuDF object when GPU mode is on, otherwise unchanged.

        Only pandas objects are converted; anything else (e.g. data that is
        already on the GPU) is passed through instead of being re-wrapped.
        """
        if self.gpu_acceleration and isinstance(df, (pd.DataFrame, pd.Series)):
            return cudf.from_pandas(df)
        return df

    def to_numpy(self, data):
        """Convert GPU data to numpy for scipy compatibility"""
        if hasattr(data, 'to_numpy'):
            return data.to_numpy()
        elif hasattr(data, 'values'):
            return data.values
        return data

    def _to_pandas(self, data):
        """Return *data* as a host-side pandas object (no-op if it has no ``to_pandas``)."""
        return data.to_pandas() if hasattr(data, 'to_pandas') else data

    def compute_distribution_distance(self, train_data, test_data, feature):
        """Compute train-vs-test distance metrics for one feature.

        Args:
            train_data: frame holding the training values of ``feature``.
            test_data: frame holding the test values of ``feature``.
            feature: column name to compare.

        Returns:
            ``{}`` when either side has no non-null values, otherwise a dict
            with ``ks_statistic``, ``ks_pvalue``, ``wasserstein_distance``,
            ``mean_difference``, ``mean_difference_pct``, ``variance_ratio``
            and the boolean ``distribution_shift`` (KS p-value below 0.05 or
            mean moved by more than 10%).
        """
        # Extract data and convert to numpy for scipy compatibility
        train_vals = self.to_numpy(train_data[feature].dropna())
        test_vals = self.to_numpy(test_data[feature].dropna())

        if len(train_vals) == 0 or len(test_vals) == 0:
            return {}

        # KS Test for distribution similarity
        try:
            ks_stat, ks_pvalue = ks_2samp(train_vals, test_vals)
        except Exception as e:
            print(f"Warning: KS test failed for {feature}: {e}")
            ks_stat, ks_pvalue = 0, 1

        # Wasserstein distance (Earth Mover's Distance); report the failure
        # instead of swallowing it silently
        try:
            wasserstein_dist = wasserstein_distance(train_vals, test_vals)
        except Exception as e:
            print(f"Warning: Wasserstein distance failed for {feature}: {e}")
            wasserstein_dist = 0

        train_mean = np.mean(train_vals)
        train_var = np.var(train_vals)

        # Mean difference; the percentage is relative to |train mean| so its
        # sign always matches the sign of the raw difference
        mean_diff = np.mean(test_vals) - train_mean
        mean_diff_pct = (mean_diff / abs(train_mean)) * 100 if train_mean != 0 else 0

        # Variance ratio
        var_ratio = np.var(test_vals) / train_var if train_var != 0 else 1

        return {
            'ks_statistic': ks_stat,
            'ks_pvalue': ks_pvalue,
            'wasserstein_distance': wasserstein_dist,
            'mean_difference': mean_diff,
            'mean_difference_pct': mean_diff_pct,
            'variance_ratio': var_ratio,
            'distribution_shift': bool(ks_pvalue < 0.05 or abs(mean_diff_pct) > 10)
        }

    def analyze_temporal_patterns(self, df, time_column, target_columns):
        """Analyze temporal patterns and concept drift.

        Args:
            df: frame containing the period column and the target columns.
            time_column: name of the period column; when it is not a column
                of ``df`` the method falls back to a column named ``'week'``.
            target_columns: list of columns to summarise per period.

        Returns:
            ``{}`` when no period column is found. Otherwise a dict with
            ``'weekly_patterns'`` (per-period mean/std/count as a pandas
            frame) and, when there is more than one period,
            ``'concept_drift'`` (a KS test of the first period against the
            last one, per target column).
        """
        print("üîç Analyzing temporal patterns...")

        temporal_insights = {}

        # Convert to GPU if available
        df_gpu = self.load_data_gpu(df)

        # list() keeps the membership test safe for any time_column value
        period_col = time_column if time_column in list(df_gpu.columns) else 'week'
        if period_col not in df_gpu.columns:
            return temporal_insights

        # cuDF evaluates eagerly, so the aggregation result is used directly
        weekly_stats = df_gpu.groupby(period_col)[target_columns].agg(['mean', 'std', 'count'])
        temporal_insights['weekly_patterns'] = self._to_pandas(weekly_stats)

        # Pull the distinct periods to the host before sorting: a GPU series
        # cannot be iterated by sorted()
        periods = sorted(self.to_numpy(df_gpu[period_col].dropna().unique()).tolist())
        if len(periods) > 1:
            # Keep these as column-addressable host frames for the loop below
            first_period = self._to_pandas(df_gpu[df_gpu[period_col] == periods[0]][target_columns])
            last_period = self._to_pandas(df_gpu[df_gpu[period_col] == periods[-1]][target_columns])

            drift_scores = {}
            for col in target_columns:
                first_vals = self.to_numpy(first_period[col].dropna())
                last_vals = self.to_numpy(last_period[col].dropna())
                if len(first_vals) == 0 or len(last_vals) == 0:
                    continue

                ks_stat, ks_pvalue = ks_2samp(first_vals, last_vals)
                drift_scores[col] = {
                    'ks_statistic': ks_stat,
                    'ks_pvalue': ks_pvalue,
                    'concept_drift': bool(ks_pvalue < 0.05)
                }

            temporal_insights['concept_drift'] = drift_scores

        return temporal_insights

    def create_distribution_shift_dashboard(self, train_df, test_df, features):
        """Run ``compute_distribution_distance`` for every feature present in both frames.

        Returns:
            dict mapping feature name to its metrics dict; features missing
            from either frame are left out.
        """
        print("üìä Creating distribution shift dashboard...")

        distribution_analysis = {}

        # Convert to GPU for faster computation (except for statistical tests)
        train_gpu = self.load_data_gpu(train_df)
        test_gpu = self.load_data_gpu(test_df)

        for feature in tqdm(features, desc="Analyzing feature distributions"):
            if feature in train_gpu.columns and feature in test_gpu.columns:
                distribution_analysis[feature] = self.compute_distribution_distance(
                    train_gpu, test_gpu, feature
                )

        return distribution_analysis

    def visualize_distribution_shifts(self, train_df, test_df, distribution_analysis, top_n=15):
        """Print and plot the features with the largest train/test shifts.

        Args:
            train_df: training frame used for the density plots.
            test_df: test frame used for the density plots.
            distribution_analysis: output of ``create_distribution_shift_dashboard``.
            top_n: maximum number of shifted features to return.

        Returns:
            ``None`` when no feature is flagged; otherwise a list of
            ``(feature, ks_statistic)`` tuples sorted by KS statistic,
            largest first. At most nine of them are plotted (3x3 grid) and the
            figure is saved as ``distribution_shifts.png``.
        """
        # Get features with significant shifts
        shifted_features = [
            (feature, metrics['ks_statistic'])
            for feature, metrics in distribution_analysis.items()
            if metrics.get('distribution_shift', False)
        ]

        # Sort by KS statistic (most significant first)
        shifted_features.sort(key=lambda item: item[1], reverse=True)
        top_shifted = shifted_features[:top_n]

        if not top_shifted:
            print("‚úÖ No significant distribution shifts detected!")
            return None

        print(f"üö® Found {len(shifted_features)} features with distribution shifts")
        print("Top shifted features:")
        for feature, ks_stat in top_shifted[:10]:
            metrics = distribution_analysis[feature]
            print(f"  ‚Ä¢ {feature}: KS={ks_stat:.3f}, MeanDiff={metrics['mean_difference_pct']:.1f}%")

        # Plotting needs host data
        train_plot = self._to_pandas(train_df)
        test_plot = self._to_pandas(test_df)

        n_features = min(len(top_shifted), 9)  # Show up to 3x3 grid
        nrows = int(np.ceil(n_features / 3))
        ncols = min(3, n_features)

        # squeeze=False always yields a 2-D array, so no single-axes special case
        fig, axes = plt.subplots(nrows, ncols, figsize=(5*ncols, 4*nrows), squeeze=False)
        axes = axes.flatten()

        for ax, (feature, ks_stat) in zip(axes, top_shifted[:n_features]):
            train_vals = train_plot[feature].dropna()
            test_vals = test_plot[feature].dropna()

            if len(train_vals) > 0 and len(test_vals) > 0:
                sns.kdeplot(train_vals, ax=ax, label='Train', fill=True, alpha=0.5)
                sns.kdeplot(test_vals, ax=ax, label='Test', fill=True, alpha=0.5)

                metrics = distribution_analysis[feature]
                ax.set_title(f'{feature}\nKS: {ks_stat:.3f}, ŒîŒº: {metrics["mean_difference_pct"]:.1f}%', 
                           fontsize=10, fontweight='bold')
                ax.legend()

        # Hide empty subplots
        for ax in axes[n_features:]:
            ax.set_visible(False)

        plt.suptitle('Top Feature Distribution Shifts: Train vs Test', fontsize=14, fontweight='bold')
        plt.tight_layout()
        plt.savefig('distribution_shifts.png', dpi=120, bbox_inches='tight')
        plt.show()

        return top_shifted

# ================================================================================
# ENHANCED DATA LOADING WITH TEMPORAL ANALYSIS
# ================================================================================

def _week_from_path(path):
    """Return the week number encoded as ``..._w<digits>.<ext>`` in *path*, or None."""
    stem = os.path.splitext(os.path.basename(path))[0]
    _, sep, digits = stem.rpartition('_w')
    return int(digits) if sep and digits.isdecimal() else None


def load_data_with_temporal_analysis(data_dir):
    """Load the weekly training CSVs and the test CSVs, tagging rows with their week.

    Args:
        data_dir: directory containing ``train/input_2023_w*.csv``,
            ``train/output_2023_w*.csv``, ``test_input.csv`` and ``test.csv``.

    Returns:
        ``(df_in, df_out, test_in, test_template)``. ``df_in`` and ``df_out``
        carry a ``week`` column taken from the file name. When both the
        training and the test input have a ``game_id`` column, ``test_in``
        and ``test_template`` also get a ``temporal_period`` column
        (``'future'`` or ``'same_period'``).

    Raises:
        FileNotFoundError: if no matching input/output training files exist.
    """

    print("üïí Loading data with temporal analysis...")

    # Load training data with week information
    input_files = sorted(glob.glob(os.path.join(data_dir, "train/input_2023_w*.csv")))
    output_files = sorted(glob.glob(os.path.join(data_dir, "train/output_2023_w*.csv")))
    if not input_files or not output_files:
        raise FileNotFoundError(
            f"No weekly training files found under {os.path.join(data_dir, 'train')}"
        )

    input_weeks = [_week_from_path(path) for path in input_files]
    output_weeks = [_week_from_path(path) for path in output_files]

    if None in input_weeks or None in output_weeks:
        # File names do not carry a parseable week: fall back to pairing by
        # sorted position and numbering from 1
        week_pairs = [
            (position + 1, in_file, out_file)
            for position, (in_file, out_file) in enumerate(zip(input_files, output_files))
        ]
    else:
        # Pair input and output by the week in the file name, so a missing
        # week or unpadded names (w1, w10, w2) cannot mislabel or mispair data
        output_by_week = dict(zip(output_weeks, output_files))
        week_pairs = [
            (week, in_file, output_by_week[week])
            for week, in_file in sorted(zip(input_weeks, input_files))
            if week in output_by_week
        ]
        unmatched = sorted(set(input_weeks) ^ set(output_weeks))
        if unmatched:
            print(f"Warning: skipping weeks without both input and output files: {unmatched}")

    if not week_pairs:
        raise FileNotFoundError(
            f"No week has both an input and an output file under {os.path.join(data_dir, 'train')}"
        )

    train_weeks = []

    # Load each week separately to preserve temporal structure
    for week_num, in_file, out_file in week_pairs:
        df_in_week = pd.read_csv(in_file)
        df_out_week = pd.read_csv(out_file)

        # Add week information
        df_in_week['week'] = week_num
        df_out_week['week'] = week_num

        train_weeks.append((df_in_week, df_out_week))

    # Combine with week information
    df_in = pd.concat([week[0] for week in train_weeks], ignore_index=True)
    df_out = pd.concat([week[1] for week in train_weeks], ignore_index=True)

    # Load test data
    test_in = pd.read_csv(os.path.join(data_dir, "test_input.csv"))
    test_template = pd.read_csv(os.path.join(data_dir, "test.csv"))

    # Try to infer test week if possible (needs game_id on both sides)
    if 'game_id' in test_in.columns and 'game_id' in df_in.columns:
        # Simple heuristic: if game_id patterns differ from training, mark as different temporal period
        train_game_prefixes = set(df_in['game_id'].astype(str).str[:3].unique())
        test_game_prefixes = set(test_in['game_id'].astype(str).str[:3].unique())

        period = 'future' if test_game_prefixes - train_game_prefixes else 'same_period'
        test_in['temporal_period'] = period
        test_template['temporal_period'] = period

    print(f"‚úÖ Loaded {len(train_weeks)} weeks of training data")
    print(f"‚úÖ Training periods: Weeks {df_in['week'].min()} to {df_in['week'].max()}")

    return df_in, df_out, test_in, test_template

# ================================================================================
# GPU-ACCELERATED CORRELATION ANALYSIS
# ================================================================================

def accelerated_correlation_analysis(train_df, test_df, features):
    """Correlate ``features`` within train and within test, and diff the two.

    Rows with a missing value in any of ``features`` are dropped first. The
    correlation runs through cuDF when ``GPU_AVAILABLE`` is set and through
    pandas otherwise.

    Returns:
        ``(train_corr, test_corr, corr_diff)`` as pandas frames, where
        ``corr_diff`` is ``test_corr - train_corr``.
    """
    print("üöÄ Performing GPU-accelerated correlation analysis...")

    # Both backends work on the same complete-case rows
    train_complete = train_df[features].dropna()
    test_complete = test_df[features].dropna()

    if not GPU_AVAILABLE:
        # pandas fallback
        train_corr = train_complete.corr()
        test_corr = test_complete.corr()
    else:
        # Move to the GPU, correlate there, bring the matrices back to the host
        train_on_gpu = cudf.from_pandas(train_complete)
        test_on_gpu = cudf.from_pandas(test_complete)
        train_corr = train_on_gpu.corr().to_pandas()
        test_corr = test_on_gpu.corr().to_pandas()

    return train_corr, test_corr, test_corr - train_corr

# ================================================================================
# MAIN ENHANCED EDA EXECUTION
# ================================================================================

def _plot_weekly_trends(df_in, temporal_features):
    """Plot per-week mean and std of each feature; return (weekly table, {feature: trend slope})."""
    present = [f for f in temporal_features if f in df_in.columns]
    if not present:
        return pd.DataFrame(), {}

    # Create weekly aggregates
    weekly_behavior = df_in.groupby('week')[present].agg(['mean', 'std']).reset_index()
    week_values = np.asarray(weekly_behavior['week'], dtype=float).ravel()

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.flatten()

    slopes = {}
    for ax, feature in zip(axes, present):
        means = weekly_behavior[(feature, 'mean')]
        stds = weekly_behavior[(feature, 'std')]

        # Plot mean with a one-standard-deviation band
        ax.plot(week_values, means, marker='o', linewidth=2, label='Weekly Mean')
        ax.fill_between(week_values, means - stds, means + stds, alpha=0.2, label='¬±1 Std Dev')

        # Add trend line; a line fit needs at least two finite points
        mean_values = means.to_numpy(dtype=float)
        usable = np.isfinite(mean_values) & np.isfinite(week_values)
        if usable.sum() >= 2:
            z = np.polyfit(week_values[usable], mean_values[usable], 1)
            p = np.poly1d(z)
            ax.plot(week_values, p(week_values), "r--", alpha=0.8, label=f'Trend (slope: {z[0]:.3f})')
            slopes[feature] = float(z[0])

        ax.set_xlabel('Week')
        ax.set_ylabel(feature)
        ax.set_title(f'Temporal Evolution: {feature}', fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Hide panels that received no feature
    for ax in axes[len(present):]:
        ax.set_visible(False)

    plt.suptitle('Player Behavior Temporal Analysis Across Weeks', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.savefig('temporal_analysis.png', dpi=120, bbox_inches='tight')
    plt.show()

    return weekly_behavior, slopes


def _plot_correlation_comparison(train_corr, test_corr, corr_diff):
    """Draw train, test and (test - train) correlation heatmaps side by side."""
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    panels = [
        (train_corr, 'coolwarm', 'Training Data Correlations'),
        (test_corr, 'coolwarm', 'Test Data Correlations'),
        (corr_diff, 'RdYlBu_r', 'Correlation Differences (Test - Train)'),
    ]
    for ax, (matrix, cmap, title) in zip(axes, panels):
        sns.heatmap(matrix, annot=True, fmt='.2f', cmap=cmap, center=0,
                    ax=ax, square=True, cbar_kws={'shrink': 0.8})
        ax.set_title(title, fontweight='bold')

    plt.suptitle('GPU-Accelerated Correlation Analysis: Train vs Test', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig('correlation_analysis.png', dpi=120, bbox_inches='tight')
    plt.show()


def _plot_joint_distributions(train_df, test_df, feature_pairs, common_features):
    """Draw 2-D histograms of each feature pair for train (top row) and test (bottom row)."""
    fig, axes = plt.subplots(2, len(feature_pairs), figsize=(18, 12), squeeze=False)
    rows = [(train_df, 'Train', 'Blues'), (test_df, 'Test', 'Reds')]

    for col_idx, (feat1, feat2) in enumerate(feature_pairs):
        pair_available = feat1 in common_features and feat2 in common_features
        for row_idx, (frame, label, cmap) in enumerate(rows):
            ax = axes[row_idx, col_idx]
            # hist2d cannot derive bin edges from NaN, so keep complete pairs only
            complete = frame[[feat1, feat2]].dropna() if pair_available else None
            if complete is None or complete.empty:
                ax.set_visible(False)
                continue
            sample = complete.sample(min(5000, len(complete)))
            hist = ax.hist2d(sample[feat1], sample[feat2], bins=30, cmap=cmap, alpha=0.8)
            ax.set_title(f'{label}: {feat1} vs {feat2}', fontweight='bold')
            ax.set_xlabel(feat1)
            ax.set_ylabel(feat2)
            plt.colorbar(hist[3], ax=ax)

    plt.suptitle('Multivariate Distribution Comparison: Train vs Test', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.savefig('multivariate_comparison.png', dpi=120, bbox_inches='tight')
    plt.show()


def perform_enhanced_eda(data_dir="/kaggle/input/nfl-big-data-bowl-2026-prediction/"):
    """Perform comprehensive EDA with distribution shift and temporal analysis.

    Args:
        data_dir: dataset directory handed to ``load_data_with_temporal_analysis``.

    Returns:
        dict with
        ``'distribution_analysis'`` - per-feature train/test metrics,
        ``'temporal_analysis'`` - per-week mean/std table,
        ``'shifted_features'`` - result of ``visualize_distribution_shifts``
        (``None`` when nothing is flagged),
        ``'temporal_trends'`` - ``{feature: slope}`` of the linear trend
        fitted to the weekly means.

    Side effects: prints progress and writes ``temporal_analysis.png``,
    ``correlation_analysis.png`` and ``multivariate_comparison.png`` (plus the
    figure saved by ``visualize_distribution_shifts``).
    """

    print("üöÄ STARTING ENHANCED EXPLORATORY DATA ANALYSIS")
    print("="*90)

    # Initialize smart EDA
    eda = SmartEDA(gpu_acceleration=True)

    # Load data
    df_in, df_out, test_in, test_template = load_data_with_temporal_analysis(data_dir)

    # Explain importance. These helpers are not defined in the visible part of
    # this file, so they are looked up and called only when they exist.
    for helper_name in ('explain_distribution_shift_importance',
                        'explain_temporal_analysis_importance'):
        helper = globals().get(helper_name)
        if callable(helper):
            helper()

    # ============================================================================
    # DISTRIBUTION SHIFT ANALYSIS
    # ============================================================================

    print("\n" + "="*90)
    print("üîç ANALYZING TRAIN-TEST DISTRIBUTION SHIFTS")
    print("="*90)

    # Prepare comparable datasets
    # For training, use the last observed positions (similar to test setup).
    # NOTE: groupby(...).last() takes the last non-null value per column.
    train_last_obs = df_in.sort_values(['game_id','play_id','nfl_id','frame_id']).groupby(
        ['game_id','play_id','nfl_id'], as_index=False
    ).last()

    # Select key features for distribution comparison
    key_features = ['x', 'y', 's', 'a', 'o', 'dir', 'ball_land_x', 'ball_land_y', 
                   'absolute_yardline_number', 'player_weight']

    # Only use features present in both datasets
    common_features = [f for f in key_features if f in train_last_obs.columns and f in test_in.columns]

    print(f"Analyzing {len(common_features)} common features for distribution shifts...")

    # Perform distribution shift analysis
    distribution_analysis = eda.create_distribution_shift_dashboard(
        train_last_obs, test_in, common_features
    )

    # Visualize results
    shifted_features = eda.visualize_distribution_shifts(
        train_last_obs, test_in, distribution_analysis
    )

    # ============================================================================
    # TEMPORAL ANALYSIS
    # ============================================================================

    print("\n" + "="*90)
    print("‚è∞ ANALYZING TEMPORAL PATTERNS AND CONCEPT DRIFT")
    print("="*90)

    # Analyze how player behavior changes across weeks
    temporal_features = ['s', 'a', 'x', 'y']  # Speed, acceleration, positions

    # The fitted slopes are what the result dict reports as 'temporal_trends'
    weekly_behavior, week_correlations = _plot_weekly_trends(df_in, temporal_features)

    # ============================================================================
    # GPU-ACCELERATED CORRELATION ANALYSIS
    # ============================================================================

    print("\n" + "="*90)
    print("üöÄ PERFORMING GPU-ACCELERATED CORRELATION ANALYSIS")
    print("="*90)

    # Use GPU for fast correlation computation
    correlation_features = [f for f in common_features if f in ['x', 'y', 's', 'a', 'o', 'dir']]

    if len(correlation_features) >= 3:  # Need at least 3 features for meaningful correlation
        train_corr, test_corr, corr_diff = accelerated_correlation_analysis(
            train_last_obs, test_in, correlation_features
        )

        _plot_correlation_comparison(train_corr, test_corr, corr_diff)

        # Identify significant correlation changes. The matrix is symmetric,
        # so each unordered pair is checked (and reported) once.
        labels = list(corr_diff.index)
        changed_pairs = []
        for position, i in enumerate(labels):
            for j in labels[position + 1:]:
                if abs(corr_diff.loc[i, j]) > 0.1:
                    changed_pairs.append((i, j))

        if changed_pairs:
            print("üö® Significant correlation changes detected:")
            for i, j in changed_pairs:
                print(f"  ‚Ä¢ {i} vs {j}: Œî = {corr_diff.loc[i, j]:.3f}")

    # ============================================================================
    # MULTIVARIATE DISTRIBUTION ANALYSIS
    # ============================================================================

    print("\n" + "="*90)
    print("üìä ANALYZING MULTIVARIATE DISTRIBUTION DIFFERENCES")
    print("="*90)

    # Compare joint distributions of key feature pairs
    feature_pairs = [('s', 'a'), ('x', 'y'), ('ball_land_x', 'ball_land_y')]
    _plot_joint_distributions(train_last_obs, test_in, feature_pairs, common_features)

    # GPU performance report
    if GPU_AVAILABLE:
        print("\n‚ö° **GPU ACCELERATION**: Enabled - ~10-100x speedup for large datasets")
        print("   ‚Ä¢ Faster correlation computation")
        print("   ‚Ä¢ Efficient multivariate analysis")
        print("   ‚Ä¢ Scalable to millions of rows")

    return {
        'distribution_analysis': distribution_analysis,
        'temporal_analysis': weekly_behavior,
        'shifted_features': shifted_features,
        'temporal_trends': week_correlations
    }

# ================================================================================
# EXECUTE ENHANCED EDA
# ================================================================================

if __name__ == "__main__":
    # Run the whole pipeline; the returned dict drives the summary below
    eda_results = perform_enhanced_eda()

    print("\n" + "="*90)
    print("‚úÖ ENHANCED EDA COMPLETE!")
    print("="*90)

    # Shift summary: a missing, None or empty entry counts as "nothing flagged"
    if not eda_results.get('shifted_features'):
        print("‚úÖ No significant distribution shifts detected")
    else:
        print(f"üö® Distribution shifts detected: {len(eda_results['shifted_features'])} features")

    # Trend summary: count entries whose absolute value exceeds 0.01
    significant_trends = len(
        [v for v in eda_results.get('temporal_trends', {}).values() if abs(v) > 0.01]
    )
    if significant_trends <= 0:
        print("‚úÖ No significant temporal trends detected")
    else:
        print(f"üìà Temporal trends detected: {significant_trends} features")