# In [None]:
#!/usr/bin/env python3
"""
S&P 500 Market Timing Competition - Kaggle Notebook
Optimized for Kaggle environment with proper API integration
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Kaggle-specific imports
import os
import gc
from pathlib import Path

class KaggleSP500Pipeline:
    def __init__(self):
        """Set up default paths, the scaler and empty model/feature registries."""
        # Name of the regression target column in the training frame.
        self.target_column = 'market_forward_excess_returns'

        # Directory locations used inside a Kaggle notebook session.
        self.input_path = Path('/kaggle/input/hull-tactical-market-prediction')
        self.working_path = Path('/kaggle/working')

        # Estimators keyed by short name, and the ensemble weight of each one
        # (the weights are filled in by validate_and_optimize_weights).
        self.models = {}
        self.model_weights = {}

        # Columns fed to the models; populated by prepare_training_data.
        self.feature_columns = []

        # RobustScaler was chosen as being more robust to outliers; within this
        # class it is only applied to the inputs of the 'ridge' model.
        self.scaler = RobustScaler()
        
    def load_data(self):
        """Load data from multiple possible locations"""
        print("Loading data...")
        
        # Possible data locations (ordered by preference)
        possible_paths = [
            # Current directory (for uploaded files)
            Path('.'),
            # Kaggle competition paths  
            Path('/kaggle/input/hull-tactical-market-prediction'),
            Path('/kaggle/input/hull-tactical-market-prediction'),
            # Local paths
            Path('./input'),
            Path('./data'), 
            # Current working directory
            Path(os.getcwd())
        ]
        
        self.train_df = None
        self.test_df = None
        
        # Try to find the data files
        for base_path in possible_paths:
            print(f"Checking path: {base_path}")
            
            # Try different file patterns
            train_files = [
                base_path / 'train.csv',
                base_path / 'sp-500-market-timing' / 'train.csv'
            ]
            
            test_files = [
                base_path / 'test.csv', 
                base_path / 'sp-500-market-timing' / 'test.csv'
            ]
            
            # Look for train.csv
            for train_path in train_files:
                if train_path.exists():
                    self.train_df = pd.read_csv(train_path)
                    print(f"‚úì Found training data at: {train_path}")
                    print(f"  Shape: {self.train_df.shape}")
                    break
            
            # Look for test.csv
            for test_path in test_files:
                if test_path.exists():
                    self.test_df = pd.read_csv(test_path)
                    print(f"‚úì Found test data at: {test_path}")
                    print(f"  Shape: {self.test_df.shape}")
                    break
            
            # If both found, stop searching
            if self.train_df is not None and self.test_df is not None:
                break
        
        # Check if files were found
        if self.train_df is None:
            print("‚ùå Training data not found in any location!")
            # List available files for debugging
            print("\nAvailable files in current directory:")
            for item in Path('.').iterdir():
                print(f"  {item}")
            raise FileNotFoundError("Could not find train.csv")
            
        if self.test_df is None:
            print("‚ùå Test data not found in any location!")
            raise FileNotFoundError("Could not find test.csv")
            
        return self.train_df, self.test_df
    
    def quick_eda(self):
        """Quick exploratory data analysis for Kaggle"""
        print("\n" + "="*50)
        print("QUICK EDA")
        print("="*50)
        
        # Basic info
        print(f"Training period: {self.train_df['date_id'].min()} to {self.train_df['date_id'].max()}")
        print(f"Test period: {self.test_df['date_id'].min()} to {self.test_df['date_id'].max()}")
        
        # Target statistics (recent data only to avoid early missing values)
        recent_data = self.train_df.iloc[-1000:]  # Last 1000 rows
        target_stats = recent_data[self.target_column].describe()
        print(f"\nTarget ({self.target_column}) statistics (recent 1000 days):")
        print(target_stats)
        
        # Feature groups
        feature_groups = {}
        for col in self.train_df.columns:
            if col not in ['date_id', 'forward_returns', 'risk_free_rate', 'market_forward_excess_returns']:
                prefix = col[0] if col[0].isalpha() else 'Other'
                feature_groups[prefix] = feature_groups.get(prefix, 0) + 1
        
        print(f"\nFeature groups: {feature_groups}")
        
        # Missing data check (recent data)
        missing_recent = recent_data.isnull().sum()
        missing_features = missing_recent[missing_recent > 0]
        print(f"\nFeatures with missing values (recent data): {len(missing_features)}")
        
    def create_engineered_features(self, df, is_train=True):
        """Optimized feature engineering for Kaggle environment"""
        print(f"Creating features... (is_train={is_train})")
        
        # Start with original features
        df_features = df.copy()
        
        # Get base feature columns
        exclude_cols = ['date_id', 'forward_returns', 'risk_free_rate', 'market_forward_excess_returns']
        if not is_train:
            exclude_cols.extend(['is_scored', 'lagged_forward_returns', 'lagged_risk_free_rate', 'lagged_market_forward_excess_returns'])
        
        base_features = [col for col in df.columns if col not in exclude_cols]
        numeric_features = [col for col in base_features if df[col].dtype in ['float64', 'int64']]
        
        print(f"Base numeric features: {len(numeric_features)}")
        
        # 1. Rolling features (trend indicators)
        windows = [5, 20, 60]  # Short, medium, long term
        for window in windows:
            for i, feature in enumerate(numeric_features[:15]):  # Limit to avoid memory issues
                if i % 5 == 0:  # Progress indicator
                    print(f"Processing rolling features: {i+1}/{min(15, len(numeric_features))}")
                
                df_features[f'{feature}_ma{window}'] = df[feature].rolling(window=window, min_periods=1).mean()
                
                # Only add std for longer windows to reduce feature count
                if window >= 20:
                    df_features[f'{feature}_std{window}'] = df[feature].rolling(window=window, min_periods=1).std()
        
        # 2. Lag features (previous day information)
        lags = [1, 2, 5]
        for lag in lags:
            for feature in numeric_features[:8]:  # Top features only
                df_features[f'{feature}_lag{lag}'] = df[feature].shift(lag)
        
        # 3. Technical indicators - ALWAYS CREATE THESE FEATURES
        # Volatility regime (always create, use defaults if not enough data)
        if 'V1' in df.columns:
            if df['V1'].notna().sum() > 50:  # Lowered threshold
                vol_threshold = df['V1'].rolling(min(252, len(df)), min_periods=5).quantile(0.75)
                df_features['high_vol_regime'] = (df['V1'] > vol_threshold).astype(int)
            else:
                # Default: neutral regime for small datasets
                df_features['high_vol_regime'] = 0
        else:
            df_features['high_vol_regime'] = 0
        
        # Momentum signal (always create)
        if 'M1' in df.columns:
            if df['M1'].notna().sum() > 10:  # Lowered threshold
                momentum_ma = df['M1'].rolling(min(20, len(df)), min_periods=2).mean()
                df_features['momentum_signal'] = np.sign(df['M1'] - momentum_ma)
            else:
                # Default: neutral momentum for small datasets
                df_features['momentum_signal'] = 0
        else:
            df_features['momentum_signal'] = 0
        
        # 4. Cross-feature interactions (limited to avoid explosion)
        vol_features = [col for col in base_features if col.startswith('V')][:3]
        sent_features = [col for col in base_features if col.startswith('S')][:3]
        
        for v_feat in vol_features:
            for s_feat in sent_features:
                if v_feat in df.columns and s_feat in df.columns:
                    df_features[f'{v_feat}_{s_feat}_interaction'] = df[v_feat] * df[s_feat]
        
        # 5. Add lagged returns if available (test set)
        if 'lagged_forward_returns' in df.columns:
            df_features['lagged_return_sign'] = np.sign(df['lagged_forward_returns'])
            df_features['lagged_return_magnitude'] = np.abs(df['lagged_forward_returns'])
            
            # Rolling averages of lagged returns
            df_features['lagged_return_ma5'] = df['lagged_forward_returns'].rolling(5, min_periods=1).mean()
            df_features['lagged_return_ma20'] = df['lagged_forward_returns'].rolling(min(20, len(df)), min_periods=1).mean()
        
        print(f"Final feature count: {df_features.shape[1]}")
        
        # Memory cleanup
        gc.collect()
        
        return df_features
    
    def prepare_training_data(self):
        """Prepare clean training data"""
        print("Preparing training data...")
        
        # Create features
        train_features = self.create_engineered_features(self.train_df, is_train=True)
        
        # Remove rows with missing target
        train_clean = train_features.dropna(subset=[self.target_column])
        print(f"Training rows after target cleaning: {len(train_clean)}")
        
        # Use more recent data for better model performance (last 3 years ‚âà 750 trading days)
        if len(train_clean) > 1500:
            train_clean = train_clean.iloc[-1500:]
            print(f"Using recent {len(train_clean)} rows for training")
        
        # Get feature columns
        exclude_cols = ['date_id', 'forward_returns', 'risk_free_rate', 'market_forward_excess_returns']
        self.feature_columns = [col for col in train_clean.columns if col not in exclude_cols]
        
        print(f"Using {len(self.feature_columns)} features")
        
        # Prepare X and y with proper missing value handling
        X = train_clean[self.feature_columns]
        
        # Smart missing value imputation
        X = X.fillna(X.median())  # Use median for robustness
        
        y = train_clean[self.target_column]
        date_ids = train_clean['date_id']
        
        return X, y, date_ids
    
    def train_ensemble_models(self, X, y):
        """Create the four ensemble members and fit each of them on ``(X, y)``.

        The members are stored in ``self.models`` under the keys ``'lgb'``,
        ``'xgb'``, ``'ridge'`` and ``'rf'``.  Only the Ridge model is trained
        on scaled inputs; ``self.scaler`` is fitted on ``X`` for that purpose
        and must be reused to transform inputs at prediction time.

        Args:
            X: feature matrix (rows are samples).
            y: target values aligned with ``X``.
        """
        print("Training ensemble models...")

        # 1. LightGBM.  Row subsampling (``subsample``) is only applied when
        #    ``subsample_freq`` is positive; with the default of 0 the 0.8
        #    setting would be silently ignored.
        self.models['lgb'] = lgb.LGBMRegressor(
            n_estimators=200,
            max_depth=8,
            learning_rate=0.05,
            num_leaves=31,
            subsample=0.8,
            subsample_freq=1,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1
        )

        # 2. XGBoost
        self.models['xgb'] = xgb.XGBRegressor(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbosity=0
        )

        # 3. Regularized linear model
        self.models['ridge'] = Ridge(alpha=10.0)

        # 4. Random forest, added for ensemble diversity
        self.models['rf'] = RandomForestRegressor(
            n_estimators=100,
            max_depth=12,
            random_state=42,
            n_jobs=-1
        )

        for name, model in self.models.items():
            print(f"Training {name}...")
            if name == 'ridge':
                # The linear model is the only member that sees scaled inputs.
                model.fit(self.scaler.fit_transform(X), y)
            else:
                model.fit(X, y)

        print("‚úì All models trained!")
    
    def validate_and_optimize_weights(self, X, y):
        """Score each model with time-series CV and derive ensemble weights.

        Every model in ``self.models`` is evaluated on a 3-fold
        ``TimeSeriesSplit``.  The evaluation is done on *clones* of the models
        and of the scaler, so the estimators previously fitted on the full
        training set are left untouched; refitting them in place would leave
        the prediction models trained on a fold subset only.

        ``self.model_weights`` is set to weights proportional to the inverse
        of each model's mean validation MSE (they sum to 1).

        Args:
            X: feature DataFrame, rows in chronological order.
            y: target Series aligned with ``X``.
        """
        # Local import: sklearn is already a dependency of this file.
        from sklearn.base import clone

        print("Validating models with time series CV...")

        n_splits = 3
        tscv = TimeSeriesSplit(n_splits=n_splits)
        model_performance = {name: [] for name in self.models}

        for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
            print(f"Fold {fold + 1}/{n_splits}")

            X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
            y_train_cv, y_val_cv = y.iloc[train_idx], y.iloc[val_idx]

            for name, model in self.models.items():
                # Unfitted copy with identical hyper-parameters.
                cv_model = clone(model)

                if name == 'ridge':
                    # Fit a separate scaler on the fold's training part only,
                    # so validation rows do not leak into the scaling and
                    # self.scaler keeps its full-data fit.
                    cv_scaler = clone(self.scaler)
                    X_train_scaled = cv_scaler.fit_transform(X_train_cv)
                    X_val_scaled = cv_scaler.transform(X_val_cv)

                    cv_model.fit(X_train_scaled, y_train_cv)
                    pred = cv_model.predict(X_val_scaled)
                else:
                    cv_model.fit(X_train_cv, y_train_cv)
                    pred = cv_model.predict(X_val_cv)

                model_performance[name].append(mean_squared_error(y_val_cv, pred))

        # Mean validation error per model.
        avg_performance = {name: np.mean(scores)
                           for name, scores in model_performance.items()}

        print("\nModel Performance (MSE):")
        for name, score in avg_performance.items():
            print(f"{name}: {score:.6f}")

        # Inverse-MSE weighting: lower error gives a higher weight.  The floor
        # keeps a perfect score (MSE == 0) from producing inf/NaN weights.
        inverse_mse = {name: 1.0 / max(score, 1e-12)
                       for name, score in avg_performance.items()}
        total_inv_mse = sum(inverse_mse.values())
        self.model_weights = {name: value / total_inv_mse
                              for name, value in inverse_mse.items()}

        print("\nOptimized Model Weights:")
        for name, weight in self.model_weights.items():
            print(f"{name}: {weight:.3f}")
    
    def predict_allocations(self, test_df):
        """Generate allocation predictions for test data"""
        print("Generating predictions...")
        
        # Create test features
        test_features = self.create_engineered_features(test_df, is_train=False)
        
        # Ensure test features match training features
        # Add any missing features with default values
        for col in self.feature_columns:
            if col not in test_features.columns:
                print(f"Adding missing feature {col} with default value 0")
                test_features[col] = 0
        
        # Select only the features used during training
        X_test = test_features[self.feature_columns]
        
        # Handle missing values
        X_test = X_test.fillna(X_test.median())
        
        # If median results in NaN (all values are NaN), fill with 0
        X_test = X_test.fillna(0)
        
        print(f"Test feature matrix shape: {X_test.shape}")
        
        # Get predictions from all models
        predictions = {}
        for name, model in self.models.items():
            if name == 'ridge':
                X_test_scaled = self.scaler.transform(X_test)
                predictions[name] = model.predict(X_test_scaled)
            else:
                predictions[name] = model.predict(X_test)
        
        # Weighted ensemble prediction
        ensemble_pred = np.zeros(len(X_test))
        for name, pred in predictions.items():
            ensemble_pred += self.model_weights[name] * pred
        
        # Convert predictions to allocations
        allocations = self.convert_to_allocations(ensemble_pred)
        
        return allocations, ensemble_pred
    
    def convert_to_allocations(self, predictions):
        """Convert excess return predictions to allocation weights (0-2)"""
        allocations = np.ones_like(predictions)  # Start with market weight
        
        # Dynamic thresholds based on prediction distribution
        high_threshold = np.percentile(predictions, 70)
        low_threshold = np.percentile(predictions, 30)
        
        # Conservative allocation strategy
        for i, pred in enumerate(predictions):
            if pred > high_threshold:
                # Positive signal: increase allocation
                confidence = min((pred - high_threshold) / (np.max(predictions) - high_threshold + 1e-8), 1.0)
                allocations[i] = 1.0 + 1.0 * confidence  # Max 2x leverage
            elif pred < low_threshold:
                # Negative signal: reduce allocation
                confidence = min((low_threshold - pred) / (low_threshold - np.min(predictions) + 1e-8), 1.0)
                allocations[i] = 1.0 - 0.8 * confidence  # Min 0.2x (keep some market exposure)
        
        # Ensure bounds [0, 2]
        allocations = np.clip(allocations, 0, 2)
        
        return allocations
    
    def analyze_results(self, allocations, predictions, test_df):
        """Print allocation summary statistics and draw three diagnostic plots.

        The plots are a histogram of the allocations, the allocations in row
        order, and a scatter of prediction against allocation.

        Args:
            allocations: array of allocation weights, one per test row.
            predictions: array of ensemble excess-return predictions.
            test_df: part of the interface; not read by this method.
        """
        rule = "=" * 50
        print("\n" + rule)
        print("RESULTS ANALYSIS")
        print(rule)

        print("Allocation Statistics:")
        # Labels carry their own padding so the values line up.
        for label, stat in (("Mean: ", np.mean), ("Std:  ", np.std),
                            ("Min:  ", np.min), ("Max:  ", np.max)):
            print(f"  {label}{stat(allocations):.3f}")

        # Days noticeably above / below market weight (5% tolerance band).
        leverage_days = np.sum(allocations > 1.05)
        conservative_days = np.sum(allocations < 0.95)

        print("\nStrategy Breakdown:")
        n_days = len(allocations)
        print(f"  Leverage days (>1.05):     {leverage_days}/{n_days} ({leverage_days/n_days*100:.1f}%)")
        print(f"  Conservative days (<0.95): {conservative_days}/{n_days} ({conservative_days/n_days*100:.1f}%)")
        print(f"  Neutral days:              {n_days-leverage_days-conservative_days}/{n_days}")

        # Shared styling for the dashed red reference lines.
        reference = {'color': 'red', 'linestyle': '--', 'alpha': 0.7}

        plt.figure(figsize=(15, 5))

        # Panel 1: distribution of allocation weights.
        plt.subplot(1, 3, 1)
        plt.hist(allocations, bins=20, alpha=0.7, edgecolor='black')
        plt.xlabel('Allocation Weight')
        plt.ylabel('Frequency')
        plt.title('Allocation Distribution')
        plt.axvline(1.0, label='Market Weight', **reference)
        plt.legend()

        # Panel 2: allocation per test row, in row order.
        plt.subplot(1, 3, 2)
        plt.plot(range(n_days), allocations, alpha=0.8)
        plt.xlabel('Test Day')
        plt.ylabel('Allocation Weight')
        plt.title('Allocations Over Time')
        plt.axhline(1.0, **reference)

        # Panel 3: how predictions map onto allocations.
        plt.subplot(1, 3, 3)
        plt.scatter(predictions, allocations, alpha=0.6)
        plt.xlabel('Excess Return Prediction')
        plt.ylabel('Allocation Weight')
        plt.title('Prediction vs Allocation')
        plt.axhline(1.0, **reference)
        plt.axvline(0.0, **reference)

        plt.tight_layout()
        plt.show()

def main():
    """Run the whole pipeline and write the submission CSV.

    Steps: load data, print the quick EDA, build the training matrix, train
    the ensemble, compute ensemble weights, predict allocations for the test
    frame, show the analysis and save ``submission.csv`` (under
    ``/kaggle/working`` when that directory exists, otherwise in the current
    directory).

    Returns:
        tuple: ``(pipeline, submission_df)`` on success, ``(None, None)`` if
        the data files are missing or any other exception occurs (the error
        is printed rather than re-raised).
    """
    print("üöÄ Starting S&P 500 Market Timing Pipeline")
    print("=" * 60)

    pipeline = KaggleSP500Pipeline()

    try:
        # Data loading and a quick look at it.
        _, test_df = pipeline.load_data()
        pipeline.quick_eda()

        # Fit the ensemble, then derive its weights from validation scores.
        X, y, _ = pipeline.prepare_training_data()
        pipeline.train_ensemble_models(X, y)
        pipeline.validate_and_optimize_weights(X, y)

        # Predict and inspect.
        allocations, predictions = pipeline.predict_allocations(test_df)
        pipeline.analyze_results(allocations, predictions, test_df)

        # Submission format: one allocation per test date_id.
        submission_df = pd.DataFrame({
            'date_id': test_df['date_id'],
            'allocation': allocations,
        })

        # Prefer the Kaggle working directory; fall back to the current one.
        kaggle_target = Path('/kaggle/working/submission.csv')
        if kaggle_target.parent.exists():
            submission_path = kaggle_target
        else:
            submission_path = Path('./submission.csv')

        submission_df.to_csv(submission_path, index=False)

        print("\n‚úÖ Pipeline completed successfully!")
        print(f"üìä Submission saved to: {submission_path}")
        print(f"üéØ Mean allocation: {np.mean(allocations):.3f}")
        print(f"üìà Leverage usage: {np.sum(allocations > 1.05)/len(allocations)*100:.1f}%")

        return pipeline, submission_df

    except FileNotFoundError as e:
        print(f"\n‚ùå Data files not found: {e}")
        hints = (
            "\nüí° Solutions:",
            "1. Make sure you're in a Kaggle notebook with the competition dataset attached",
            "2. Or upload train.csv and test.csv to your current directory",
            "3. Or run this code in the competition environment",
        )
        for hint in hints:
            print(hint)
        return None, None

    except Exception as e:
        # Top-level boundary: report the failure with its traceback and
        # signal it to the caller through the (None, None) return value.
        print(f"‚ùå Error in pipeline: {e!s}")
        import traceback
        traceback.print_exc()
        return None, None

# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported. main() returns (None, None) on failure,
# so both names may be None afterwards.
if __name__ == "__main__":
    pipeline, submission = main()