In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from pathlib import Path

# Root folder of the competition data inside the Kaggle input mount.
DIR: Path = Path('/kaggle/input/hull-tactical-market-prediction/')

# Read the training table and preview its first rows as the cell output.
train_csv = DIR.joinpath("train.csv")
df_train = pd.read_csv(train_csv)
df_train.head()

# Exploratory data analysis


## Attributes Description

- date_id - An identifier for a single trading day.
- M* - Market Dynamics/Technical features.
- E* - Macro Economic features.
- I* - Interest Rate features.
- P* - Price/Valuation features.
- V* - Volatility features.
- S* - Sentiment features.
- MOM* - Momentum features.
- D* - Dummy/Binary features.
- forward_returns - The returns from buying the S&P 500 and selling it a day later. Train set only.
- risk_free_rate - The federal funds rate. Train set only.
- market_forward_excess_returns - Forward returns relative to expectations. Computed by subtracting the rolling five-year mean forward returns and winsorizing the result using a median absolute deviation (MAD) with a criterion of 4. Train set only.


In [None]:
# Print pandas' built-in summary of the training frame.
print("\nData Info:")
df_train.info()

In [None]:
# (rows, columns) of the training frame, shown as the cell output.
df_train.shape

## Handling Missing Data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

# Share of null entries per column, expressed in percent.
missing_percentage = df_train.isnull().sum().div(len(df_train)).mul(100)
missing_df = pd.DataFrame(
    {
        'column_name': df_train.columns,
        'missing_percentage': missing_percentage,
    }
)

# Keep only the columns that have at least one gap, worst offenders first.
has_gaps = missing_df['missing_percentage'] > 0
missing_df = missing_df.loc[has_gaps].sort_values('missing_percentage', ascending=False)

print("Columns with Missing Data (%):")
print(missing_df)

# Horizontal bars for the 20 columns with the highest missing share.
fig, ax = plt.subplots()
sns.barplot(x='missing_percentage', y='column_name', data=missing_df.head(20), palette='plasma', ax=ax)
ax.set_title('20 Features by Percentage of Missing Values')
ax.set_xlabel('Percentage Missing (%)')
ax.set_ylabel('Feature Name')
plt.show()

## Target Variable Analysis (market_forward_excess_returns)

In [None]:
# Pull the target column once and reuse it for the plot, the markers and the summary.
excess_returns = df_train['market_forward_excess_returns']
mean_return = excess_returns.mean()
median_return = excess_returns.median()

# Histogram with a KDE overlay, plus vertical markers at the mean and the median.
fig, ax = plt.subplots(figsize=(12, 7))
sns.histplot(excess_returns, bins=100, kde=True, ax=ax)
ax.set_title('Distribution of "market_forward_excess_returns"', fontsize=16)
ax.set_xlabel('Excess Return', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.axvline(mean_return, color='red', linestyle='--', label=f"Mean: {mean_return:.4f}")
ax.axvline(median_return, color='green', linestyle='-', label=f"Median: {median_return:.4f}")
ax.legend()
plt.show()

print("\nDescriptive statistics of the target variable:")
print(excess_returns.describe())

## Time series analysis of the target variable

In [None]:
print("\nAnalyzing the Target Variable Over Time")

# Line plot of the target against date_id, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(18, 7))
ax.plot(
    df_train['date_id'],
    df_train['market_forward_excess_returns'],
    lw=0.8,
    alpha=0.9,
)
ax.set_title('Market Excess Return vs. Date ID (Time Series)', fontsize=16)
ax.set_xlabel('Date ID', fontsize=12)
ax.set_ylabel('Excess Return', fontsize=12)
ax.grid(True)
plt.show()

## Feature Analysis

### Grouping Features

In [None]:
# Map each single-letter family prefix to the training columns whose names begin with it.
# The match is case-sensitive, so lowercase names such as 'date_id' are not picked up by 'D'.
feature_groups = {
    prefix: [name for name in df_train.columns if name.startswith(prefix)]
    for prefix in ('D', 'E', 'I', 'M', 'P', 'S', 'V')
}

### Correlation analysis with the target variable

In [None]:
# Simple imputation used only for this correlation scan: carry the last observed value
# forward, then back-fill whatever is still missing at the start of each column.
# DataFrame.fillna(method=...) is deprecated (pandas 2.1+), so ffill()/bfill() are called directly.
df_imputed = df_train.ffill().bfill()

target = 'market_forward_excess_returns'

# Pearson correlation of every numeric column with the target, strongest positive first.
# numeric_only=True keeps this cell re-runnable once a text column such as 'day_name'
# has been added to df_train.
correlations = df_imputed.corr(numeric_only=True)[target].sort_values(ascending=False)

# Remove self-correlation and other target-related columns
correlations = correlations.drop([target, 'forward_returns', 'risk_free_rate'])

# Show both ends of the ranking: the 20 most positive and the 20 most negative correlations.
plt.figure(figsize=(10, 8))
top_corr = pd.concat([correlations.head(20), correlations.tail(20)])
sns.barplot(x=top_corr.values, y=top_corr.index, palette='coolwarm')
plt.title('20 Positive & Negative Feature Correlations with Target')
plt.xlabel('Correlation Coefficient')
plt.show()

In [None]:
# Forward-fill then back-fill so every plotted feature has a value on every row.
# DataFrame.fillna(method=...) is deprecated (pandas 2.1+), so ffill()/bfill() are called directly.
df_imputed = df_train.ffill().bfill()

# Hand-picked continuous features to inspect against the target.
features_to_plot = ['V13', 'S5', 'V7', 'M4', 'S2', 'E11']

fig, axes = plt.subplots(2, 3, figsize=(20, 12))
fig.suptitle('Revised Scatter Plots of Top Correlated Continuous Features vs. Target', fontsize=20, y=1.03)

# Flatten the 2x3 grid so the axes can be paired with the feature list one-to-one.
axes = axes.flatten()

# Plot a fixed-seed sample of at most 2000 rows to keep the scatter readable;
# the correlation in each title is still computed on the full imputed frame.
sample_df = df_imputed.sample(n=min(2000, len(df_imputed)), random_state=42)

for i, feature in enumerate(features_to_plot):
    # Pairwise Pearson correlation between this feature and the target.
    corr_value = df_imputed[feature].corr(df_imputed[target])
    sns.scatterplot(x=feature, y=target, data=sample_df, ax=axes[i], alpha=0.6)
    axes[i].set_title(f'{feature} (Correlation: {corr_value:.3f})', fontsize=14)
    axes[i].set_xlabel(f'Value of Feature {feature}')
    axes[i].set_ylabel('Excess Return (Target)')

plt.tight_layout()
plt.show()

In [None]:
# Box plot of the target split by the levels of the dummy feature D2.
fig, ax = plt.subplots()
sns.boxplot(x='D2', y=target, data=df_imputed, ax=ax)
ax.set_title('Distribution of Target Returns by D2', fontsize=16)
ax.set_xlabel('Category of D2', fontsize=12)
ax.set_ylabel('Excess Return (Target)', fontsize=12)
plt.show()

### Multicollinearity Check

In [None]:
# Forward-fill then back-fill so the within-group correlations are computed on complete columns.
# DataFrame.fillna(method=...) is deprecated (pandas 2.1+), so ffill()/bfill() are called directly.
df_imputed = df_train.ffill().bfill()

#Define the feature groups
feature_categories = ['D', 'E', 'I', 'M', 'P', 'S', 'V']
feature_groups = {cat: [col for col in df_imputed.columns if col.startswith(cat)] for cat in feature_categories}

#Loop to generate a heatmap for each category
for category, features in feature_groups.items():
    # A correlation matrix needs at least two columns; skip smaller groups.
    if len(features) < 2:
        print(f"\nGroup '{category}' has fewer than 2 features, skipping the heatmap.")
        continue

    print(f"\nGenerating heatmap for group '{category}'...")

    #Calculate the correlation matrix for the current group
    corr_matrix = df_imputed[features].corr()

    #Decide whether to show annotations based on the number of features
    show_annotations = len(features) < 15

    #heatmap
    plt.figure(figsize=(12, 9))
    sns.heatmap(
        corr_matrix,
        annot=show_annotations, # Show numbers only if the matrix isn't too large
        cmap='coolwarm',        # Color palette (red-blue)
        fmt='.2f',              # Format numbers to 2 decimal places
        linewidths=.5
    )
    plt.title(f'Correlation Heatmap for {category} Group Features', fontsize=16)
    plt.show()


## Time Series Deep Dive

### Seasonality Analysis (Day-of-the-Week Effect)

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.ensemble import RandomForestRegressor

print("\nSeasonality Analysis (Day of the Week)")

# Treat date_id as a gap-free trading-day counter so its remainder modulo 5 can stand in
# for a weekday. NOTE(review): market holidays would shift this mapping, so the labels
# are only a proxy for real weekdays — confirm before drawing conclusions.
day_names = dict(enumerate(['Mon', 'Tue', 'Wed', 'Thu', 'Fri']))
df_train['day_of_week'] = df_train['date_id'].mod(5)
df_train['day_name'] = df_train['day_of_week'].map(day_names)

# One box per pseudo-weekday, with a dashed zero line for reference.
weekday_order = list(day_names.values())
fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(data=df_train, x='day_name', y='market_forward_excess_returns', order=weekday_order, ax=ax)
ax.set_title('Distribution of Returns by Day of the Week', fontsize=16)
ax.set_xlabel('Day of the Week', fontsize=12)
ax.set_ylabel('Excess Return', fontsize=12)
ax.axhline(0, color='red', linestyle='--', alpha=0.7)
plt.show()

### Autocorrelation of Returns (ACF)

In [None]:
print("\nAutocorrelation Analysis (ACF)")

# Autocorrelation of the target series, drawn with up to 40 lags on an explicit Axes.
target_series = df_train['market_forward_excess_returns']
fig, ax = plt.subplots(figsize=(14, 6))
plot_acf(target_series, ax=ax, lags=40)

# Replace plot_acf's default title and add axis labels.
ax.set_title('Autocorrelation Function (ACF) for Excess Returns', fontsize=16)
ax.set_xlabel('Lag (in days)', fontsize=12)
ax.set_ylabel('Autocorrelation', fontsize=12)
plt.show()

### Rolling Statistics Analysis

In [None]:
print("\nRolling Statistics Analysis")

window_size = 50  # trailing window length in rows (approximately 2 trading months)

# Build the rolling window once and derive both statistics from it.
# The first window_size - 1 rows of each new column are NaN.
target_roll = df_train['market_forward_excess_returns'].rolling(window=window_size)
df_train['rolling_mean'] = target_roll.mean()
df_train['rolling_std'] = target_roll.std()

fig, axes = plt.subplots(2, 1, figsize=(18, 12), sharex=True)
ax_mean, ax_vol = axes
dates = df_train['date_id']

# Top panel: raw target with its rolling mean overlaid.
ax_mean.plot(dates, df_train['market_forward_excess_returns'], label='Original Return', alpha=0.5, lw=0.8)
ax_mean.plot(dates, df_train['rolling_mean'], label=f'Rolling Mean ({window_size} days)', color='red')
ax_mean.set_title('Excess Returns and Rolling Mean', fontsize=16)
ax_mean.set_ylabel('Value', fontsize=12)
ax_mean.legend()
ax_mean.grid(True)

# Bottom panel: rolling standard deviation (volatility).
ax_vol.plot(dates, df_train['rolling_std'], label=f'Rolling Standard Deviation ({window_size} days)', color='green')
ax_vol.set_title('Rolling Volatility (Risk)', fontsize=16)
ax_vol.set_xlabel('Date ID', fontsize=12)
ax_vol.set_ylabel('Standard Deviation', fontsize=12)
ax_vol.legend()
ax_vol.grid(True)

plt.tight_layout()
plt.show()

## Detailed Feature Analysis

### Distribution of Important Features

In [None]:
print("\nDistribution of Selected Features ---")

features_to_plot = ['M4', 'S5', 'V13']

# One histogram (with KDE overlay) per selected feature, side by side.
fig, axes = plt.subplots(1, len(features_to_plot), figsize=(18, 5))

for ax, feature in zip(axes, features_to_plot):
    sns.histplot(data=df_train, x=feature, kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}', fontsize=14)

plt.tight_layout()
plt.show()

##  Interaction Analysis

### Correlation Heatmap (Between Features)

In [None]:
print("\nCorrelation Heatmap")

# Target plus a hand-picked set of features to compare pairwise.
features_subset = [
    'market_forward_excess_returns',
    'M4', 'S2', 'E11', 'E12', 'P8', 'V13', 'S5', 'D2', 'D1',
]
correlation_matrix = df_train.loc[:, features_subset].corr()

# Annotated heatmap of the pairwise correlations.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=.5, ax=ax)
ax.set_title('Correlation Heatmap of Selected Features', fontsize=16)
plt.show()

## Advanced Analysis

### Feature Importance from a Baseline Model

In [None]:
# Baseline feature-importance pass: fit a random forest on the complete rows of the
# training frame and rank the predictors by impurity-based importance.
from sklearn.ensemble import RandomForestRegressor

print("\nPerforming Feature Engineering...")
# Pseudo-weekday derived from the trading-day counter (date_id modulo 5).
df_train['day_of_week'] = df_train['date_id'] % 5
day_names = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri'}
df_train['day_name'] = df_train['day_of_week'].map(day_names)
print("Columns after adding 'day_name':", df_train.columns.tolist())

target_variable = 'market_forward_excess_returns'

# Columns that must not be used as predictors:
#   - forward_returns / risk_free_rate are the target-related columns that the
#     correlation analysis in this notebook also excludes;
#   - rolling_mean / rolling_std, when present, were computed from the target itself,
#     so keeping them would leak the label into the features.
leakage_cols = ['forward_returns', 'risk_free_rate', 'rolling_mean', 'rolling_std']

#Handle Missing Values
# Drop the leakage columns *before* dropna(): the rolling columns start with NaNs and
# would otherwise discard rows for no reason. errors='ignore' lets the cell run whether
# or not the rolling-statistics cell has been executed.
df_processed = df_train.drop(columns=leakage_cols, errors='ignore').dropna()
print(f"\nHandled missing values. Shape is now: {df_processed.shape}")
if df_processed.empty:
    raise ValueError("No complete rows remain after dropna(); impute missing values before fitting.")

#Encode Categorical Features (like 'day_name')
print("\nApplying One-Hot Encoding...")
df_encoded = pd.get_dummies(df_processed, columns=['day_name'], prefix='day')
print("Columns after encoding:", df_encoded.columns.tolist())

y = df_encoded[target_variable]

# Everything that is not a predictor: the target, the leakage columns, the raw day
# counter and the integer weekday code (already represented by the one-hot columns).
cols_to_drop = [
    target_variable,
    *leakage_cols,
    'date_id',
    'day_of_week'
]

X = df_encoded.drop(columns=cols_to_drop, errors='ignore')

# Fixed seed so the importance ranking is reproducible between runs.
model = RandomForestRegressor(random_state=42)
model.fit(X, y)

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(18, 16))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Corrected Feature Importance from a Random Forest Model', fontsize=12)
plt.xlabel('Importance', fontsize=8)
plt.ylabel('Feature', fontsize=8)
plt.show()