# Exploratory Data Analysis for Quantitative Competition

This notebook performs comprehensive EDA on the competition data to understand:
- Data distributions and characteristics
- Target correlations
- Feature relationships
- Temporal patterns
- Spike features (features where a single value accounts for a large share of samples) and outliers

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: style sheet first, then the seaborn colour palette
# (same order as the calls have always been made in this cell).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# pandas display: show every column, cap printed rows at 100,
# and render floats with four decimal places.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.float_format = '{:.4f}'.format

## 1. Load and Inspect Data

In [None]:
# Load the training data; the path is relative to the working directory.
df = pd.read_csv('../data/train.csv')
print(f"Data shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
# deep=True makes pandas inspect the contents of object-dtype columns
# (e.g. strings); the default only measures the reference array and
# therefore under-reports their footprint. Numeric columns are unaffected.
print(f"\nMemory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

In [None]:
# Basic info: prints column names, dtypes, non-null counts and memory usage
df.info()

In [None]:
# Preview the first 10 rows; as the last expression it is rendered as the cell output
df.head(10)

In [None]:
# Column roles: fourteen single-letter features (A..N), two targets, one time column
feature_cols = list('ABCDEFGHIJKLMN')
target_cols = ['Y1', 'Y2']
time_col = 'time'

# Column subsets of the raw frame that the following cells work with
X = df.loc[:, feature_cols]
y = df.loc[:, target_cols]
time = df.loc[:, time_col]

# Quick sanity report on what was selected
print(f"Features shape: {X.shape}")
print(f"Targets shape: {y.shape}")
print(f"Time range: {time.min()} to {time.max()}")

## 2. Missing Values Analysis

In [None]:
# Per-column missing-value tally; the NaN mask is built once and reused for the
# counts, the percentages and the heatmap.
na_mask = df.isna()
na_counts = na_mask.sum()
missing_df = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': na_counts,
    'Missing_Percent': (na_counts / len(df)) * 100
})

# Keep only the columns that actually have gaps, worst offenders first
has_gaps = missing_df['Missing_Count'] > 0
missing_df = missing_df[has_gaps].sort_values('Missing_Percent', ascending=False)

if missing_df.empty:
    print("No missing values found in the dataset!")
else:
    print("Missing values found:")
    print(missing_df)

    # Row-by-column map of where the gaps are, to reveal structured missingness
    plt.figure(figsize=(12, 6))
    sns.heatmap(na_mask, yticklabels=False, cbar=True, cmap='viridis')
    plt.title('Missing Values Pattern')
    plt.tight_layout()
    plt.show()

## 3. Target Analysis

In [None]:
# Summary statistics for both targets
print("Target Statistics:")
print(y.describe())

# Correlation between the two targets, read from the 2x2 correlation matrix by label
target_corr = y.corr().loc['Y1', 'Y2']
print(f"\nY1-Y2 Correlation: {target_corr:.4f}")

# Turn the correlation strength into a modelling hint.
# The two extreme bands are mutually exclusive; everything else (including NaN)
# falls through to the "moderate" message.
corr_strength = abs(target_corr)
if corr_strength > 0.9:
    print("⚠️ Very high correlation - check for redundancy")
elif corr_strength < 0.3:
    print("⚠️ Low correlation - consider independent models")
else:
    print("✓ Moderate correlation - good for multi-target learning")

In [None]:
# Visualize target distributions on a 2x3 grid:
#   row 0: Y1 histogram, Y2 histogram, Y1-vs-Y2 scatter
#   row 1: Y1 Q-Q plot,  Y2 Q-Q plot,  first 1000 samples as a time series
fig, axes = plt.subplots(2, 3, figsize=(15, 8))

# Histogram + normal Q-Q plot per target.
# Missing values are dropped first, matching how the feature histograms are
# drawn: probplot does not filter NaNs itself, so any NaN would end up in the
# sorted sample and in the least-squares fit of the reference line.
# color=None keeps matplotlib's default colour for Y1.
for panel, (target, color) in enumerate([('Y1', None), ('Y2', 'orange')]):
    values = y[target].dropna()

    axes[0, panel].hist(values, bins=50, edgecolor='black', alpha=0.7, color=color)
    axes[0, panel].set_title(f'{target} Distribution')
    axes[0, panel].set_xlabel(target)
    axes[0, panel].set_ylabel('Frequency')

    stats.probplot(values, dist="norm", plot=axes[1, panel])
    axes[1, panel].set_title(f'{target} Q-Q Plot')

# Scatter plot of one target against the other (tiny markers: many points)
axes[0, 2].scatter(y['Y1'], y['Y2'], alpha=0.5, s=1)
axes[0, 2].set_title(f'Y1 vs Y2 (corr={target_corr:.3f})')
axes[0, 2].set_xlabel('Y1')
axes[0, 2].set_ylabel('Y2')

# Time series of targets, limited to the first 1000 rows to stay readable
axes[1, 2].plot(time[:1000], y['Y1'][:1000], label='Y1', alpha=0.7)
axes[1, 2].plot(time[:1000], y['Y2'][:1000], label='Y2', alpha=0.7)
axes[1, 2].set_title('Target Time Series (first 1000 samples)')
axes[1, 2].set_xlabel('Time')
axes[1, 2].set_ylabel('Target Value')
axes[1, 2].legend()

plt.tight_layout()
plt.show()

## 4. Feature Analysis

In [None]:
# Summary statistics for every feature column, printed under a heading
print("Feature Statistics:", X.describe(), sep="\n")

In [None]:
# Feature distributions: one histogram per feature on a 4-column grid
n_features = len(feature_cols)
n_cols = 4
n_rows = (n_features + n_cols - 1) // n_cols  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, n_rows * 3))
axes = axes.flatten()

# zip stops at the last feature, so surplus axes are left for the cleanup below
for ax, col in zip(axes, feature_cols):
    ax.hist(X[col].dropna(), bins=50, edgecolor='black', alpha=0.7)
    ax.set_title(f'Feature {col}')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

    # Mark the mean on the histogram (the previously computed but unused
    # standard deviation has been removed)
    mean_val = X[col].mean()
    ax.axvline(mean_val, color='red', linestyle='--', alpha=0.7, label=f'Mean: {mean_val:.2f}')
    ax.legend(fontsize=8)

# Hide the grid cells that have no feature to show
for ax in axes[n_features:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 5. Spike Features Detection

In [None]:
# Spike features: columns where one exact value covers more than
# spike_threshold of all rows.
spike_threshold = 0.1  # 10% of samples with same value
spike_features = []

print("Spike Features Analysis:")
print("=" * 60)

n_samples = len(X)
for col in feature_cols:
    value_counts = X[col].value_counts()
    if value_counts.empty:
        # nothing to count for this column
        continue

    # value_counts is sorted by frequency, so position 0 is the most common value
    top_share = value_counts.iloc[0] / n_samples
    if top_share <= spike_threshold:
        continue

    spike_features.append(col)
    print(f"Feature {col}:")
    print(f"  - {top_share:.1%} samples have value {value_counts.index[0]:.4f}")
    print(f"  - Top 5 values: {value_counts.head().to_dict()}")
    print()

if not spike_features:
    print("No spike features detected.")
else:
    print(f"\nFound {len(spike_features)} spike features: {spike_features}")
    print("These features may require special handling (e.g., categorical encoding)")

## 6. Feature-Target Correlations

In [None]:
# Correlation of every feature with each target, one column per target
corr_by_target = {
    f'Corr_{target}': [X[col].corr(y[target]) for col in feature_cols]
    for target in ('Y1', 'Y2')
}
correlations = pd.DataFrame({'Feature': feature_cols, **corr_by_target})

# Rank features by the average of their absolute correlations with both targets
abs_y1 = correlations['Corr_Y1'].abs()
abs_y2 = correlations['Corr_Y2'].abs()
correlations['Avg_Abs_Corr'] = (abs_y1 + abs_y2) / 2
correlations = correlations.sort_values('Avg_Abs_Corr', ascending=False)

print("Feature-Target Correlations:")
print(correlations)

# One horizontal bar chart per target, strongest features at the top
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

for ax, target in zip(axes, ('Y1', 'Y2')):
    ax.barh(correlations['Feature'], correlations[f'Corr_{target}'].abs())
    ax.set_xlabel('Absolute Correlation')
    ax.set_title(f'Feature Correlations with {target}')
    # barh draws the first row at the bottom; flip so the ranking reads top-down
    ax.invert_yaxis()

plt.tight_layout()
plt.show()

## 7. Feature Correlation Matrix

In [None]:
# Pairwise correlations between all features
feature_corr_matrix = X.corr()

# Heatmap of the lower triangle only: the mask hides the diagonal and the
# mirrored upper half.
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones(feature_corr_matrix.shape, dtype=bool))
heatmap_options = dict(
    mask=mask, annot=True, fmt='.2f',
    cmap='coolwarm', center=0, square=True,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(feature_corr_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Collect every unordered feature pair whose |correlation| exceeds the threshold.
# Pairs are visited row by row over the upper triangle (row < column).
high_corr_threshold = 0.8
n_feats = len(feature_cols)
upper_triangle_pairs = (
    (feature_cols[row], feature_cols[column], abs(feature_corr_matrix.iloc[row, column]))
    for row in range(n_feats)
    for column in range(row + 1, n_feats)
)
high_corr_pairs = [pair for pair in upper_triangle_pairs if pair[2] > high_corr_threshold]

if not high_corr_pairs:
    print(f"\nNo feature pairs with |correlation| > {high_corr_threshold}")
else:
    print(f"\nHighly correlated feature pairs (|corr| > {high_corr_threshold}):")
    for left, right, strength in high_corr_pairs:
        print(f"  {left} - {right}: {strength:.3f}")

## 8. Temporal Analysis

In [None]:
# Analyze temporal patterns over consecutive, equally sized windows of rows.
# NOTE(review): windows follow row order, so this assumes df is already in
# chronological order — TODO confirm against the data source.
window_size = 1000
# Always build at least one window: for a dataset shorter than window_size the
# floor division gives 0, which used to leave temporal_df empty and make the
# plotting code below fail with a KeyError. iloc slicing clips at the end of
# the frame, so that single window simply contains all rows.
n_windows = max(1, len(df) // window_size)

temporal_stats = []

for i in range(n_windows):
    start_idx = i * window_size
    end_idx = (i + 1) * window_size
    window_data = df.iloc[start_idx:end_idx]

    # Deliberately NOT named `stats`: that name is the scipy.stats module
    # imported at the top of the notebook, and overwriting it here breaks the
    # Q-Q plot cell when it is re-run after this one.
    window_stats = {
        'window': i,
        'time_start': window_data['time'].iloc[0],
        'time_end': window_data['time'].iloc[-1],
        'Y1_mean': window_data['Y1'].mean(),
        'Y1_std': window_data['Y1'].std(),
        'Y2_mean': window_data['Y2'].mean(),
        'Y2_std': window_data['Y2'].std()
    }

    # Add feature means
    for col in feature_cols[:5]:  # First 5 features for brevity
        window_stats[f'{col}_mean'] = window_data[col].mean()

    temporal_stats.append(window_stats)

# One row per window
temporal_df = pd.DataFrame(temporal_stats)

# Plot temporal patterns: target means, target volatility, feature means
fig, axes = plt.subplots(3, 1, figsize=(14, 10))

# Target means over time
axes[0].plot(temporal_df['window'], temporal_df['Y1_mean'], label='Y1 mean', alpha=0.7)
axes[0].plot(temporal_df['window'], temporal_df['Y2_mean'], label='Y2 mean', alpha=0.7)
axes[0].set_xlabel('Time Window')
axes[0].set_ylabel('Mean Value')
axes[0].set_title('Target Means Over Time')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Target volatility over time
axes[1].plot(temporal_df['window'], temporal_df['Y1_std'], label='Y1 std', alpha=0.7)
axes[1].plot(temporal_df['window'], temporal_df['Y2_std'], label='Y2 std', alpha=0.7)
axes[1].set_xlabel('Time Window')
axes[1].set_ylabel('Std Deviation')
axes[1].set_title('Target Volatility Over Time')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

# Feature means over time (first 5 features)
for col in feature_cols[:5]:
    axes[2].plot(temporal_df['window'], temporal_df[f'{col}_mean'], label=f'{col}', alpha=0.7)
axes[2].set_xlabel('Time Window')
axes[2].set_ylabel('Mean Value')
axes[2].set_title('Feature Means Over Time (First 5 Features)')
axes[2].legend()
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Check for regime changes
print("\nTemporal Statistics Summary:")
print(f"Y1 mean range: [{temporal_df['Y1_mean'].min():.4f}, {temporal_df['Y1_mean'].max():.4f}]")
print(f"Y2 mean range: [{temporal_df['Y2_mean'].min():.4f}, {temporal_df['Y2_mean'].max():.4f}]")
print(f"Y1 volatility range: [{temporal_df['Y1_std'].min():.4f}, {temporal_df['Y1_std'].max():.4f}]")
print(f"Y2 volatility range: [{temporal_df['Y2_std'].min():.4f}, {temporal_df['Y2_std'].max():.4f}]")

# Heuristic: flag a shift when the spread of the window means exceeds the
# average within-window standard deviation of that target.
y1_mean_change = temporal_df['Y1_mean'].max() - temporal_df['Y1_mean'].min()
y2_mean_change = temporal_df['Y2_mean'].max() - temporal_df['Y2_mean'].min()

if y1_mean_change > temporal_df['Y1_std'].mean() or y2_mean_change > temporal_df['Y2_std'].mean():
    print("\n⚠️ Warning: Significant temporal shifts detected. Consider time-aware modeling.")
else:
    print("\n✓ Temporal patterns appear relatively stable.")

## 9. Outlier Detection

In [None]:
# IQR outlier scan over every feature and target column:
# a value is an outlier when it lies more than 1.5 * IQR outside the quartiles.
outlier_summary = []
n_rows_total = len(df)

for col in feature_cols + target_cols:
    series = df[col]
    q25 = series.quantile(0.25)
    q75 = series.quantile(0.75)
    fence = 1.5 * (q75 - q25)

    lower_bound = q25 - fence
    upper_bound = q75 + fence

    # Boolean mask of rows outside the fences (NaN compares False on both sides)
    is_outlier = (series < lower_bound) | (series > upper_bound)
    n_outliers = int(is_outlier.sum())

    outlier_summary.append({
        'Column': col,
        'Outlier_Count': n_outliers,
        'Outlier_Percent': n_outliers / n_rows_total * 100,
        'Lower_Bound': lower_bound,
        'Upper_Bound': upper_bound
    })

# Most outlier-heavy columns first
outlier_df = pd.DataFrame(outlier_summary).sort_values('Outlier_Percent', ascending=False)

print("Outlier Analysis (IQR Method):")
print(outlier_df)

# Boxplots for the (up to) four columns with the highest outlier share
top_outlier_features = outlier_df.head(4)['Column'].tolist()

if top_outlier_features:
    n_panels = len(top_outlier_features)  # head(4) already caps this at four
    fig, axes = plt.subplots(1, n_panels, figsize=(15, 4))
    if n_panels == 1:
        # a single subplot comes back as a bare Axes, not an array
        axes = [axes]

    for ax, col in zip(axes, top_outlier_features):
        ax.boxplot(df[col].dropna())
        ax.set_title(f'{col} Boxplot')
        ax.set_ylabel('Value')

    plt.tight_layout()
    plt.show()

## 10. Summary and Recommendations

In [None]:
# Final report: recaps what the earlier cells computed (df, feature_cols,
# target_cols, target_corr, spike_features, correlations, outlier_df) and
# prints modelling recommendations.
print("=" * 60)
print("EDA SUMMARY AND RECOMMENDATIONS")
print("=" * 60)

# Data characteristics
print("\n📊 Data Characteristics:")
print(f"  - Total samples: {len(df):,}")
print(f"  - Features: {len(feature_cols)} ({', '.join(feature_cols)})")
print(f"  - Targets: {len(target_cols)} ({', '.join(target_cols)})")
print(f"  - Y1-Y2 correlation: {target_corr:.4f}")

# Key findings
print("\n🔍 Key Findings:")

# Missing values (df.size counts every cell, so the percentage is over all data)
total_missing = df.isna().sum().sum()
if total_missing > 0:
    print(f"  - Missing values: {total_missing} ({total_missing/df.size*100:.2f}% of all data)")
    print("    → Implement appropriate imputation strategy")
else:
    print("  - No missing values detected ✓")

# Spike features
if spike_features:
    print(f"  - Spike features found: {spike_features}")
    print("    → Consider categorical encoding or special handling")
else:
    print("  - No significant spike features ✓")

# Feature correlations (correlations is already sorted by average |corr|)
best_features = correlations.head(5)['Feature'].tolist()
print(f"  - Top correlated features: {best_features}")
print("    → Prioritize these in feature engineering")

# Outliers: report both outcomes, like the other findings
high_outlier_cols = outlier_df[outlier_df['Outlier_Percent'] > 5]['Column'].tolist()
if high_outlier_cols:
    print(f"  - High outlier columns (>5%): {high_outlier_cols}")
    print("    → Consider robust scaling or outlier treatment")
else:
    print("  - No columns with >5% outliers ✓")

# Model recommendations
print("\n🎯 Modeling Recommendations:")
# Recommendation 1 is derived from the measured target correlation rather than
# assumed, so the summary cannot contradict the hint printed in the target
# analysis. The 0.3 / 0.9 thresholds are the same ones used there; a NaN
# correlation falls through to the original multi-target wording.
abs_target_corr = abs(target_corr)
if abs_target_corr < 0.3:
    target_strategy = "Train independent models per target (low Y1-Y2 correlation)"
elif abs_target_corr > 0.9:
    target_strategy = "Check Y1/Y2 for redundancy before multi-target learning (very high correlation)"
else:
    target_strategy = "Use multi-target learning given Y1-Y2 correlation"
print(f"  1. {target_strategy}")
print("  2. Implement purged time-series cross-validation")
print("  3. Focus on top 30-40 interaction features")
print("  4. Apply regularization to prevent overfitting")
print("  5. Use ensemble methods for robustness")

# Next steps
print("\n📝 Next Steps:")
print("  1. Implement feature engineering pipeline")
print("  2. Set up purged cross-validation")
print("  3. Train LightGBM baseline")
print("  4. Validate CV score > 0.68")
print("  5. Build ensemble if baseline successful")