# Hull Tactical - Market Prediction: Exploratory Data Analysis 


# 0. Load Packages and Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import jarque_bera
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 200)
pd.set_option('display.precision', 6)

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [None]:
file_path = r"/kaggle/input/hull-tactical-market-prediction/"

In [None]:
# Load datasets
train_df = pd.read_csv(f'{file_path}/train.csv')
test_df = pd.read_csv(f'{file_path}/test.csv')

In [None]:
import yfinance as yf

In [None]:
sp500 = yf.Ticker('^GSPC')
df_sp500 = sp500.history(period="max", start='1990-01-01')

In [None]:
df_sp500.shape

In [None]:
df_sp500['Close'].plot()

In [None]:
def reconstruct_price(pct_changes, initial_price):
    """Compound a sequence of fractional returns into a price path.

    Args:
        pct_changes: Iterable of per-period returns expressed as fractions
            (0.01 means +1%).
        initial_price: Starting price; it is the first element of the result.

    Returns:
        A list holding the initial price followed by the price obtained after
        applying each successive return.
    """
    path = [initial_price]
    level = initial_price

    for change in pct_changes:
        # Every step scales the most recent price by (1 + return).
        level = level * (1 + change)
        path.append(level)

    return path

In [None]:
prices = reconstruct_price(train_df['forward_returns'].values, 100)

In [None]:
# reconstruct_price returns the initial price plus one price per return, i.e.
# one value more than there are rows. Dropping the last value keeps, for each
# row, the price level *before* that row's forward return is applied.
train_df['price'] = prices[:-1]

In [None]:
# Attach the S&P 500 close by row position (not by date): the first
# len(train_df) closes of the downloaded history are copied in order.
# NOTE(review): this presumes the first train row matches the first downloaded
# trading day and that both series cover the same trading days — TODO confirm,
# because no calendar date is used to align the two here.
train_df['SP500'] = df_sp500['Close'].values[:len(train_df)]

In [None]:
train_df.shape

# 1. Data Overview and Statistics

In [None]:
# Basic information
print(f"\nDataset Shape: {train_df.shape}")
print(f"Number of rows: {train_df.shape[0]} | Number of columns: {train_df.shape[1]}")


In [None]:
train_df.head(5)

In [None]:
# Data types
print(train_df.dtypes.value_counts())


In [None]:
# Memory usage
print(f"Total memory usage: {train_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

In [None]:
# Basic statistics
train_df.describe()

In [None]:
# date_id - An identifier for a single trading day.
# M* - Market Dynamics/Technical features.
# E* - Macro Economic features.
# I* - Interest Rate features.
# P* - Price/Valuation features.
# V* - Volatility features.
# S* - Sentiment features.
# MOM* - Momentum features.
# D* - Dummy/Binary features.
# forward_returns - The returns from buying the S&P 500 and selling it a day later. Train set only.
# risk_free_rate - The federal funds rate. Train set only.
# market_forward_excess_returns - Forward returns relative to expectations. Computed by subtracting the rolling five-year mean forward returns and winsorizing the result using a median absolute deviation (MAD) with a criterion of 4. Train set only.
    

def _has_feature_prefix(col, prefix):
    """Return True when `col` is `prefix` followed by a non-letter (e.g. 'M1').

    A plain `startswith` is too greedy for this schema: 'M' would also match
    the 'MOM*' columns and 'S' would match the helper column 'SP500'.
    """
    return col.startswith(prefix) and not col[len(prefix):len(prefix) + 1].isalpha()


# Category name -> column-name prefix, following the data dictionary above.
feature_prefixes = {
    'Momentum_Features': 'MOM',    # Momentum features
    'Market_Features': 'M',        # Market Dynamics/Technical features
    'Economic_Features': 'E',      # Macro Economic features
    'Interest_Features': 'I',      # Interest Rate features
    'Price_Features': 'P',         # Price/Valuation features
    'Volatility_Features': 'V',    # Volatility features
    'Sentiment_Features': 'S',     # Sentiment features
    'Dummy_Features': 'D',         # Dummy/Binary features
}

# Insertion order is kept so the printed summary lists the groups in the same
# order as before: identifier, prefix-based groups, targets, reference data.
feature_categories = {'Identifier': ['date_id']}
for category, prefix in feature_prefixes.items():
    feature_categories[category] = [
        col for col in train_df.columns if _has_feature_prefix(col, prefix)
    ]
feature_categories['Target_Variables'] = ['forward_returns', 'risk_free_rate', 'market_forward_excess_returns']
feature_categories['Reference_Data'] = ['price', 'SP500']

# Every column now belongs to at most one category, so the total is not
# inflated by columns counted twice.
total_feats = 0
for category, features in feature_categories.items():
    available = [f for f in features if f in train_df.columns]
    total_feats += len(available)
    print(f"{category:25s}: {len(available):3d} features")

print(f"Total number of features: {total_feats}")

In [None]:
pd.DataFrame(data = {'category': feature_categories.keys(), 'feats': feature_categories.values()})

In [None]:
train_df.head(10)

# 2. Target Variable Analysis

In [None]:
target = 'forward_returns'
train_df[target].describe()

In [None]:
print(f"Skewness:    {train_df[target].skew():.4f}")
print(f"Kurtosis:    {train_df[target].kurtosis():.4f}")


In [None]:
# Percentile analysis
percentiles = [1, 5, 10, 25, 50, 75, 90, 95, 99]
for p in percentiles:
    print(f"{p:2d}th percentile: {train_df[target].quantile(p/100):.8f}")


In [None]:

#Positive (True) vs Negative (False) returns

(train_df[target] > 0).value_counts()/len(train_df[target])


In [None]:
# Risk metrics
# np.percentile returns nan as soon as a single input is nan, so the
# historical VaR figures are computed on the non-null returns only.
target_returns = train_df[target].dropna()
daily_vol = target_returns.std()
print(f"   Daily Volatility:       {daily_vol:.6f}")
# 252 trading periods per year are assumed for annualization.
print(f"   Annualized Volatility:  {daily_vol * np.sqrt(252):.4f}")
print(f"   Value at Risk (95%):    {np.percentile(target_returns, 5):.6f}")
print(f"   Value at Risk (99%):    {np.percentile(target_returns, 1):.6f}")

In [None]:
# Normality tests
# Missing values are removed first: with a nan in the input the test returns
# p = nan, and `nan < 0.05` is False, which would wrongly print 'Yes' below.
stat_jb, p_jb = jarque_bera(train_df[target].dropna())
print(f"\n Normality Test (Jarque-Bera):")
print(f"   Test Statistic: {stat_jb:.4f}")
print(f"   P-value:        {p_jb:.6f}")
print(f"   Normal?:        {'No (returns are NOT normally distributed)' if p_jb < 0.05 else 'Yes'}")

In [None]:
# Detailed returns analysis
def analyze_returns(df):
    """Print a detailed summary of the `forward_returns` column.

    Reports the share of positive, negative and zero returns, selected
    percentiles, the annualized volatility and, when a `risk_free_rate`
    column is present, the annualized Sharpe ratio. Annualization assumes
    252 periods per year.

    Args:
        df: DataFrame with a `forward_returns` column and, optionally, a
            `risk_free_rate` column aligned row by row with it.

    Returns:
        None. All results are printed.
    """
    returns = df['forward_returns'].dropna()

    print("\n" + "="*80)
    print("DETAILED RETURNS ANALYSIS")
    print("="*80)

    # Without observations every statistic below is undefined.
    if returns.empty:
        print("\nNo non-null forward returns to analyze.")
        return

    n_obs = len(returns)

    # Distribution statistics
    n_positive = (returns > 0).sum()
    n_negative = (returns < 0).sum()
    n_zero = (returns == 0).sum()
    print(f"\nPositive Returns: {n_positive} ({n_positive/n_obs*100:.2f}%)")
    print(f"Negative Returns: {n_negative} ({n_negative/n_obs*100:.2f}%)")
    print(f"Zero Returns: {n_zero} ({n_zero/n_obs*100:.2f}%)")

    # Percentiles
    percentiles = [1, 5, 10, 25, 50, 75, 90, 95, 99]
    print("\nPercentiles:")
    for p in percentiles:
        print(f"{p}th: {np.percentile(returns, p):.6f}")

    # Volatility
    print(f"\nAnnualized Volatility (assuming daily data): {returns.std() * np.sqrt(252):.4f}")

    # Sharpe Ratio (only when a risk-free rate column is available)
    if 'risk_free_rate' in df.columns:
        # Subtract each row's own risk-free rate. Subtracting the sample mean
        # would ignore how the rate moves over time and leave the volatility
        # of the excess returns equal to that of the raw returns.
        excess_returns = (df['forward_returns'] - df['risk_free_rate']).dropna()
        excess_std = excess_returns.std()
        if excess_std > 0:
            sharpe = excess_returns.mean() / excess_std * np.sqrt(252)
            print(f"Sharpe Ratio: {sharpe:.4f}")
        else:
            # Covers zero volatility and the nan std of a single observation.
            print("Sharpe Ratio: undefined (zero or missing volatility)")

analyze_returns(train_df)

In [None]:
# Visualization: six diagnostic views of the target on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
(ax_hist, ax_box, ax_qq), (ax_series, ax_rolling, ax_cumulative) = axes

date_ids = train_df['date_id']
target_values = train_df[target]
target_non_null = target_values.dropna()
target_mean = target_values.mean()
target_median = target_values.median()

# Histogram with mean and median markers
ax_hist.hist(target_non_null, bins=100, edgecolor='black', alpha=0.7)
ax_hist.axvline(target_mean, color='red', linestyle='--', label=f'Mean: {target_mean:.6f}')
ax_hist.axvline(target_median, color='green', linestyle='--', label=f'Median: {target_median:.6f}')
ax_hist.set_title('Forward Returns Distribution')
ax_hist.set_xlabel('Returns')
ax_hist.set_ylabel('Frequency')
ax_hist.legend()

# Box plot
ax_box.boxplot(target_non_null)
ax_box.set_title('Forward Returns - Box Plot')
ax_box.set_ylabel('Returns')

# Q-Q plot against the normal distribution
stats.probplot(target_non_null, dist="norm", plot=ax_qq)
ax_qq.set_title('Q-Q Plot (Normality Test)')

# Time series with a zero reference line
ax_series.plot(date_ids, target_values, linewidth=0.8, alpha=0.7)
ax_series.axhline(0, color='red', linestyle='--', alpha=0.5)
ax_series.set_title('Forward Returns Over Time')
ax_series.set_xlabel('Date ID')
ax_series.set_ylabel('Returns')
ax_series.grid(True, alpha=0.3)

# Rolling mean and standard deviation over a fixed window
window = 50
rolling_target = target_values.rolling(window=window)
ax_rolling.plot(date_ids, target_values, label='Actual', alpha=0.3)
ax_rolling.plot(date_ids, rolling_target.mean(), label=f'{window}-period MA', linewidth=2)
ax_rolling.plot(date_ids, rolling_target.std(), label=f'{window}-period Std', linewidth=2)
ax_rolling.set_title('Forward Returns with Rolling Statistics')
ax_rolling.set_xlabel('Date ID')
ax_rolling.legend()
ax_rolling.grid(True, alpha=0.3)

# Cumulative (compounded) returns
cumulative_return = (1 + target_values).cumprod() - 1
ax_cumulative.plot(date_ids, cumulative_return)
ax_cumulative.set_title('Cumulative Returns')
ax_cumulative.set_xlabel('Date ID')
ax_cumulative.set_ylabel('Cumulative Return')
ax_cumulative.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 3. Missing Data Analysis

In [None]:
# Share of missing cells per feature group (identifier and targets skipped)
skipped_categories = ('Identifier', 'Target_Variables')
for category, features in feature_categories.items():
    if category in skipped_categories:
        continue
    available = [f for f in features if f in train_df.columns]
    if not available:
        continue
    # Missing cells divided by all cells (rows x columns) of the group.
    missing_cells = train_df[available].isnull().sum().sum()
    total_cells = len(train_df) * len(available)
    missing_pct = (missing_cells / total_cells) * 100
    print(f"{category:25s}: {missing_pct:6.2f}% missing")


In [None]:
# Per-column missing-value summary, most incomplete columns first
null_counts = train_df.isnull().sum()
first_valid = train_df.apply(lambda column: column.first_valid_index())
last_valid = train_df.apply(lambda column: column.last_valid_index())

missing_stats = pd.DataFrame({
    'Missing_Count': null_counts,
    'Missing_Percentage': (null_counts / len(train_df)) * 100,
    'Present_Count': train_df.notna().sum(),
    'First_Valid_Index': first_valid,
    'Last_Valid_Index': last_valid
})
missing_stats = missing_stats.sort_values('Missing_Percentage', ascending=False)

print("Top 30 Columns with Missing Values")
# Last expression of the cell: displayed by the notebook.
missing_stats[missing_stats['Missing_Count'] > 0].head(30)


In [None]:
# Visualize missing data: ranking of columns (top) and pattern over rows (bottom)
fig, axes = plt.subplots(2, 1, figsize=(16, 12))
ax_bar, ax_heat = axes

# Horizontal bars for the 40 columns with the largest missing share
has_missing = missing_stats['Missing_Count'] > 0
top_missing = missing_stats[has_missing].head(40)
bar_positions = range(len(top_missing))
ax_bar.barh(bar_positions, top_missing['Missing_Percentage'], color='coral')
ax_bar.set_yticks(bar_positions)
ax_bar.set_yticklabels(top_missing.index, fontsize=8)
ax_bar.set_xlabel('Missing Percentage (%)')
ax_bar.set_title('Top 40 Features with Missing Values', fontsize=12, fontweight='bold')
ax_bar.grid(alpha=0.3, axis='x')

# Heatmap-style view: one row per feature, one column per train row,
# 1 where the value is missing and 0 where it is present
sampled_prefixes = ('E', 'M', 'P', 'S', 'V')
feature_sample = [col for col in train_df.columns if col.startswith(sampled_prefixes)][:30]
missing_over_time = train_df[feature_sample].isnull().astype(int).T
ax_heat.imshow(missing_over_time, aspect='auto', cmap='RdYlGn_r', interpolation='nearest')
ax_heat.set_title('Missing Data Pattern Over Time (Sample of 30 Features)', fontsize=12, fontweight='bold')
ax_heat.set_xlabel('date_id (time)')
ax_heat.set_ylabel('Features')
ax_heat.set_yticks(range(len(feature_sample)))
ax_heat.set_yticklabels(feature_sample, fontsize=7)

plt.tight_layout()
plt.show()

# 4. Dummy/Binary Features Analysis

In [None]:
d_features = [col for col in train_df.columns if col.startswith('D')]

for col in d_features:
    print(f"\n{col} distribution:")
    print(train_df[col].value_counts().sort_index())


In [None]:
len(d_features) 

In [None]:

# Forward-return statistics for every observed value of every dummy feature
d_target_analysis = []
for col in d_features:
    observed_values = train_df[col].dropna().unique()
    for val in observed_values:
        # Forward returns of the rows where this feature takes this value.
        subset_returns = train_df.loc[train_df[col] == val, 'forward_returns']
        row = {
            'Feature': col,
            'Value': val,
            'Count': len(subset_returns),
            'Mean_Return': subset_returns.mean(),
            'Std_Return': subset_returns.std(),
            'Median_Return': subset_returns.median()
        }
        d_target_analysis.append(row)

d_analysis_df = pd.DataFrame(d_target_analysis)
d_analysis_df

In [None]:

# Mean forward return per value of each dummy feature, on a 3x3 grid
fig, axes = plt.subplots(3, 3, figsize=(16, 12))
axes = axes.ravel()

# zip stops after the nine available panels, so any further D feature is ignored.
for ax, col in zip(axes, d_features):
    mean_by_value = train_df.groupby(col)['forward_returns'].mean()
    mean_by_value.plot(kind='bar', ax=ax, color='steelblue')
    ax.set_title(f'{col} vs Mean Forward Returns', fontsize=10, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Mean Forward Returns')
    ax.grid(alpha=0.3, axis='y')
    ax.axhline(0, color='red', linestyle='--', linewidth=1)

plt.tight_layout()
plt.show()

# 5. Market & Reference Data Analysis

In [None]:
# Reference data panels: S&P 500 level, reconstructed price, risk-free rate
fig, axes = plt.subplots(3, 2, figsize=(16, 15))

date_ids = train_df['date_id']
sp500_level = train_df['SP500']
asset_price = train_df['price']

# S&P 500 level over time
ax_sp500 = axes[0, 0]
ax_sp500.plot(date_ids, sp500_level, linewidth=1)
ax_sp500.set_title('S&P 500 Index Over Time')
ax_sp500.set_xlabel('Date ID')
ax_sp500.set_ylabel('S&P 500')
ax_sp500.grid(True, alpha=0.3)

# Reconstructed price over time
ax_price = axes[0, 1]
ax_price.plot(date_ids, asset_price, linewidth=1, color='green')
ax_price.set_title('Price Over Time')
ax_price.set_xlabel('Date ID')
ax_price.set_ylabel('Price')
ax_price.grid(True, alpha=0.3)

# S&P 500 level against forward returns
ax_level_vs_return = axes[1, 0]
ax_level_vs_return.scatter(sp500_level, train_df[target], alpha=0.3, s=10)
ax_level_vs_return.set_title('S&P 500 vs Forward Returns')
ax_level_vs_return.set_xlabel('S&P 500')
ax_level_vs_return.set_ylabel('Forward Returns')
ax_level_vs_return.grid(True, alpha=0.3)

# Risk-free rate over time
ax_rate = axes[1, 1]
ax_rate.plot(date_ids, train_df['risk_free_rate'], linewidth=1, color='orange')
ax_rate.set_title('Risk-Free Rate Over Time')
ax_rate.set_xlabel('Date ID')
ax_rate.set_ylabel('Risk-Free Rate')
ax_rate.grid(True, alpha=0.3)

# Correlation between the two level series; shown in the scatter title below
corr_price_sp500 = asset_price.corr(sp500_level)

# Both level series on one panel, each with its own y-axis
ax1 = axes[2, 0]
ax2 = ax1.twinx()
ax1.plot(date_ids, asset_price, color='green', label='Asset Price', linewidth=1.5)
ax2.plot(date_ids, sp500_level, color='blue', label='S&P 500', linewidth=1.5, alpha=0.7)
ax1.set_xlabel('date_id')
ax1.set_ylabel('Asset Price', color='green')
ax2.set_ylabel('S&P 500', color='blue')
ax1.set_title('Asset Price vs S&P 500', fontsize=12, fontweight='bold')
ax1.grid(alpha=0.3)

# Reconstructed price against the S&P 500 level
ax_scatter = axes[2, 1]
ax_scatter.scatter(sp500_level, asset_price, alpha=0.5, s=10)
ax_scatter.set_title(f'Asset Price vs S&P 500 (Corr: {corr_price_sp500:.4f})',
                     fontsize=12, fontweight='bold')
ax_scatter.set_xlabel('S&P 500')
ax_scatter.set_ylabel('Asset Price')
ax_scatter.grid(alpha=0.3)

# Add regression line
# Fit on rows where BOTH series are present. Dropping NaNs from each column
# separately can leave arrays of different length (np.polyfit then raises) or
# pair up values that come from different rows.
paired_levels = train_df[['SP500', 'price']].dropna()
z = np.polyfit(paired_levels['SP500'], paired_levels['price'], 1)
p = np.poly1d(z)
axes[2, 1].plot(train_df['SP500'], p(train_df['SP500']), "r--", linewidth=2, alpha=0.8)

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis

reference_cols = ['price', 'SP500', 'risk_free_rate', 'market_forward_excess_returns']
for col in reference_cols:
    if col in train_df.columns:
        corr = train_df[col].corr(train_df[target])
        print(f"{col:35s}: {corr:7.4f}")

# 6. Feature Group Correlation Analysis

In [None]:
# Correlation of every feature with the target, reported group by group
all_correlations = []
non_feature_groups = ('Identifier', 'Target_Variables', 'Reference_Data')

for category, features in feature_categories.items():
    if category in non_feature_groups:
        continue

    available = [f for f in features if f in train_df.columns]
    if not available:
        continue

    # Features whose correlation cannot be computed come back as NaN; drop them.
    corrs = train_df[available].corrwith(train_df[target]).dropna()
    if len(corrs) == 0:
        continue

    print(f"\n--- {category} ---")
    print(f"Features with data: {len(corrs)}")
    print("Top 5 positive correlations:")
    print(corrs.nlargest(5))
    print("Top 5 negative correlations:")
    print(corrs.nsmallest(5))

    # Keep one record per feature for the overall ranking built afterwards.
    all_correlations.extend(
        {'Feature': feat, 'Category': category, 'Correlation': corr_val}
        for feat, corr_val in corrs.items()
    )

In [None]:
# Create correlation dataframe, strongest absolute correlation first
corr_df = pd.DataFrame(all_correlations).sort_values('Correlation', key=abs, ascending=False)

# Visualize top correlations
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Filter by sign before ranking: nlargest/nsmallest alone would fill a panel
# with wrong-sign values whenever fewer than 20 features have that sign,
# contradicting the panel title.
top_positive = corr_df[corr_df['Correlation'] > 0].nlargest(20, 'Correlation')
top_negative = corr_df[corr_df['Correlation'] < 0].nsmallest(20, 'Correlation')

panels = (
    (axes[0], top_positive, None, 'Top 20 Positive Correlations with Forward Returns'),
    (axes[1], top_negative, 'red', 'Top 20 Negative Correlations with Forward Returns'),
)
for ax, top_features, bar_color, title in panels:
    # color=None keeps matplotlib's default bar color for the positive panel.
    ax.barh(range(len(top_features)), top_features['Correlation'].values, color=bar_color)
    ax.set_yticks(range(len(top_features)))
    ax.set_yticklabels(top_features['Feature'].values, fontsize=8)
    ax.set_xlabel('Correlation')
    ax.set_title(title)
    ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

In [None]:
# Advanced correlation analysis for feature groups
def analyze_feature_group_correlation(df, prefix, target='forward_returns'):
    """Plot and summarize the pairwise correlations inside one feature group.

    A column belongs to the group when its name is `prefix` followed by a
    non-letter (for example 'M1' for prefix 'M'). A bare `startswith` would
    also pull the 'MOM*' columns into the 'M' group and a helper column such
    as 'SP500' into the 'S' group.

    Nothing happens when the group has fewer than two columns. Otherwise a
    heatmap of the correlation matrix is shown and every pair of columns with
    |r| > 0.8 is printed.

    Args:
        df: DataFrame holding the feature columns.
        prefix: Alphabetic column-name prefix identifying the group.
        target: Not used by the analysis; kept so existing calls keep working.

    Returns:
        None.
    """
    cols = [
        col for col in df.columns
        if col.startswith(prefix) and not col[len(prefix):len(prefix) + 1].isalpha()
    ]

    # A correlation matrix needs at least two columns.
    if len(cols) < 2:
        return

    # Correlation within group
    group_corr = df[cols].corr()

    # Plot heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(group_corr, cmap='coolwarm', center=0,
                square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
    plt.title(f'Correlation Matrix: {prefix} Features')
    plt.tight_layout()
    plt.show()

    # Find highly correlated pairs. Only the upper triangle (j > i) is
    # visited so each pair is reported once and the diagonal is skipped.
    names = list(group_corr.columns)
    corr_pairs = []
    for i, first in enumerate(names):
        for j in range(i + 1, len(names)):
            pair_corr = group_corr.iloc[i, j]
            # A NaN correlation never satisfies the comparison and is skipped.
            if abs(pair_corr) > 0.8:
                corr_pairs.append({
                    'Feature1': first,
                    'Feature2': names[j],
                    'Correlation': pair_corr
                })

    if corr_pairs:
        print(f"\nHighly correlated pairs in {prefix} features (|r| > 0.8):")
        print(pd.DataFrame(corr_pairs))

# Run for each feature group
for prefix in ['E', 'M', 'P', 'S', 'V']:
    analyze_feature_group_correlation(train_df, prefix)

# 7. Feature Availability Analysis

In [None]:
# For each feature group: first date_id with data, row coverage, completeness
availability_info = []
excluded_groups = ('Identifier', 'Target_Variables', 'Reference_Data', 'Dummy_Features')

for category, features in feature_categories.items():
    if category in excluded_groups:
        continue

    available = [f for f in features if f in train_df.columns]
    if not available:
        continue

    # True for rows where at least one column of the group is populated.
    has_any_value = train_df[available].notna().any(axis=1)

    # Find first date with non-null data
    first_data = train_df[has_any_value]['date_id'].min()
    # Count rows with any data
    rows_with_data = train_df[available].notna().any(axis=1).sum()
    # Populated cells as a share of all cells in the group
    populated_cells = train_df[available].notna().sum().sum()
    completeness = (populated_cells / (len(train_df) * len(available))) * 100

    availability_info.append({
        'Category': category,
        'Num_Features': len(available),
        'First_Data_Date': first_data,
        'Rows_With_Data': rows_with_data,
        'Overall_Completeness_%': completeness
    })

availability_df = pd.DataFrame(availability_info).sort_values('First_Data_Date')
availability_df

# 8. Volatility Analysis

In [None]:

# Rolling volatility of the target for several window lengths, one panel each
windows = [20, 50, 100]
fig, axes = plt.subplots(len(windows), 1, figsize=(16, 12))

for ax, window in zip(axes, windows):
    rolling_vol = train_df[target].rolling(window=window).std()

    ax.plot(train_df['date_id'], rolling_vol, linewidth=1)
    ax.set_title(f'{window}-Period Rolling Volatility')
    ax.set_xlabel('Date ID')
    ax.set_ylabel('Volatility')
    ax.grid(True, alpha=0.3)

    # Summary of the rolling series itself (level, dispersion, peak)
    print(f"\n{window}-period rolling volatility:")
    print(f"  Mean: {rolling_vol.mean():.6f}")
    print(f"  Std:  {rolling_vol.std():.6f}")
    print(f"  Max:  {rolling_vol.max():.6f}")

plt.tight_layout()
plt.show()

# 9. Excess Returns & Risk-free Rate Analysis

In [None]:

print(f"Risk-Free Rate Statistics:")
print(train_df['risk_free_rate'].describe())

print(f"Market Forward Excess Returns Statistics:")
print(train_df['market_forward_excess_returns'].describe())


In [None]:

# Verify relationship
# Check whether the provided excess-return column is simply
# forward_returns - risk_free_rate. The data dictionary in this notebook
# describes the column differently (rolling five-year mean removed, then
# winsorized), so a mismatch is a legitimate outcome and is reported
# explicitly instead of printing an empty line.
calculated_excess = train_df['forward_returns'] - train_df['risk_free_rate']
max_diff = (calculated_excess - train_df['market_forward_excess_returns']).abs().max()
print(f"Verification: market_forward_excess_returns = forward_returns - risk_free_rate")
print(f"  Max Difference: {max_diff:.10f}")
if max_diff < 1e-6:
    print(f"   Relationship confirmed!")
else:
    # Also reached when max_diff is nan (no comparable rows).
    print("   Relationship NOT confirmed: the column is not a plain risk-free-rate adjustment.")


In [None]:

# Sharpe Ratio
sharpe_ratio = train_df['market_forward_excess_returns'].mean() / train_df['market_forward_excess_returns'].std() * np.sqrt(252)
print(f" Sharpe Ratio (annualized): {sharpe_ratio:.4f}")

In [None]:

# Risk-free rate and excess-return diagnostics on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
(ax_rate, ax_excess), (ax_dist, ax_sharpe) = axes

date_ids = train_df['date_id']
excess_returns = train_df['market_forward_excess_returns']

# Risk-free rate over time, scaled by 100 for display as a percentage
ax_rate.plot(date_ids, train_df['risk_free_rate'] * 100, linewidth=1.2)
ax_rate.set_title('Risk-Free Rate Over Time', fontsize=12, fontweight='bold')
ax_rate.set_xlabel('date_id')
ax_rate.set_ylabel('Risk-Free Rate (%)')
ax_rate.grid(alpha=0.3)

# Excess returns over time with a zero reference line
ax_excess.plot(date_ids, excess_returns, linewidth=0.8, alpha=0.7)
ax_excess.axhline(0, color='red', linestyle='--', linewidth=1)
ax_excess.set_title('Market Forward Excess Returns Over Time', fontsize=12, fontweight='bold')
ax_excess.set_xlabel('date_id')
ax_excess.set_ylabel('Excess Returns')
ax_excess.grid(alpha=0.3)

# Overlaid histograms of raw and excess returns
ax_dist.hist(train_df['forward_returns'].dropna(), bins=50, alpha=0.5, label='Forward Returns', color='blue')
ax_dist.hist(excess_returns.dropna(), bins=50, alpha=0.5,
             label='Excess Returns', color='orange')
ax_dist.set_title('Returns Distribution Comparison', fontsize=12, fontweight='bold')
ax_dist.set_xlabel('Returns')
ax_dist.set_ylabel('Frequency')
ax_dist.legend()
ax_dist.grid(alpha=0.3)

# Rolling Sharpe ratio: 60-row mean over 60-row std, annualized with sqrt(252)
rolling_excess = excess_returns.rolling(window=60)
rolling_sharpe = (rolling_excess.mean() /
                  rolling_excess.std() * np.sqrt(252))
ax_sharpe.plot(date_ids, rolling_sharpe, linewidth=1.2)
ax_sharpe.axhline(0, color='red', linestyle='--', linewidth=1)
ax_sharpe.set_title('60-Day Rolling Sharpe Ratio', fontsize=12, fontweight='bold')
ax_sharpe.set_xlabel('date_id')
ax_sharpe.set_ylabel('Sharpe Ratio')
ax_sharpe.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# 10. Train vs Test Comparison

In [None]:
# date_id ranges covered by each dataset and the distance between them
train_first, train_last = train_df['date_id'].min(), train_df['date_id'].max()
test_first, test_last = test_df['date_id'].min(), test_df['date_id'].max()
print(f"\nTraining period: date_id {train_first} to {train_last}")
print(f"Test period:     date_id {test_first} to {test_last}")
# A value <= 0 means the test range starts inside the training range.
print(f"\nTime gap: {test_first - train_last} periods")

# Compare feature availability
print("\n--- Feature Availability Comparison ---")
train_cols = set(train_df.columns)
test_cols = set(test_df.columns)
common_cols = train_cols.intersection(test_cols)
train_only = train_cols - test_cols
test_only = test_cols - train_cols

# Sets have no stable iteration order, so the names are sorted to make the
# printed lists identical from one run to the next.
print(f"Common features: {len(common_cols)}")
print(f"Train only: {len(train_only)} - {sorted(train_only)}")
print(f"Test only: {len(test_only)} - {sorted(test_only)}")