In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the competition train/test CSV files from the local data directory
data_dir = './hull-tactical-market-prediction'
train_data = pd.read_csv(f'{data_dir}/train.csv')
test_data = pd.read_csv(f'{data_dir}/test.csv')

# Quick overview: dimensions and column names, then a preview of the top rows
print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
print(f"\nTrain data columns: {train_data.columns.tolist()}")
print("\nFirst few rows of train data:")
train_data.head()

In [None]:
# Total count of missing cells in each dataset
n_missing_train = train_data.isna().sum().sum()
n_missing_test = test_data.isna().sum().sum()

print("Missing values in train data:")
print(n_missing_train)
print("\nMissing values in test data:")
print(n_missing_test)

# Summary statistics of the prediction target
target_summary = train_data['market_forward_excess_returns'].describe()
print("\n=== Target Variable Statistics ===")
print(target_summary)

In [None]:
# Prepare features and target
# Feature columns are recognised by their group prefix (an uppercase letter).
feature_prefixes = ['D', 'E', 'I', 'M', 'P', 'S', 'V']

# str.startswith accepts a tuple, so a single call tests every prefix.
feature_cols = [col for col in train_data.columns
                if col.startswith(tuple(feature_prefixes))]

print(f"Number of features: {len(feature_cols)}")

# Target variable
target_col = 'market_forward_excess_returns'

# Separate features and target for training data
X_train_full = train_data[feature_cols].copy()
y_train_full = train_data[target_col]

# Percentage of missing values per feature (Series indexed by feature name)
missing_pct = X_train_full.isnull().sum() / len(X_train_full) * 100
print(f"\nFeatures with >50% missing values: {(missing_pct > 50).sum()}")
# Label now matches the `<= 50` condition that is actually counted.
print(f"Features with <=50% missing values: {(missing_pct <= 50).sum()}")

# Imputation rule: features that are >90% missing are filled with 0, every
# other feature with its own median (0 if the median is undefined).
#
# The values are computed once and applied with a single DataFrame.fillna
# call. The previous `X_train_full[col].fillna(..., inplace=True)` pattern is
# a chained in-place assignment on a column selection: it is deprecated in
# pandas 2.x and has no effect under copy-on-write, and the warning was hidden
# by the global warnings filter.
fill_values = X_train_full.median().where(missing_pct <= 90, 0).fillna(0)
X_train_full = X_train_full.fillna(fill_values)

print(f"\nX_train shape: {X_train_full.shape}")
print(f"y_train shape: {y_train_full.shape}")
print(f"Remaining missing values: {X_train_full.isnull().sum().sum()}")

In [None]:
# Split data into train and validation sets.
# The last 20% of rows are held out without shuffling, so validation rows come
# after all training rows. A random shuffle would mix later observations into
# the training set and make validation scores optimistic for a forecasting
# task.
# NOTE(review): assumes the rows of train.csv are in chronological order —
# confirm (e.g. that its date column is monotonically increasing).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, shuffle=False
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")

In [None]:
# Fit an XGBoost regressor on the training split
print("Training XGBoost model...")

# Hyperparameters passed to the XGBRegressor constructor
xgb_params = dict(
    objective='reg:squarederror',
    max_depth=5,
    learning_rate=0.05,
    n_estimators=1000,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=1,
    gamma=0,
    reg_alpha=0.01,
    reg_lambda=1.0,
    random_state=42,
    tree_method='hist',
)

model = xgb.XGBRegressor(**xgb_params)

# Both splits are passed as evaluation sets; verbose=100 controls how often
# the evaluation metric is logged during fitting.
evaluation_sets = [(X_train, y_train), (X_val, y_val)]
model.fit(X_train, y_train, eval_set=evaluation_sets, verbose=100)

print("\nModel training completed!")
print(f"Number of boosting rounds: {model.n_estimators}")

In [None]:
# Evaluate model on validation set
y_pred_val = model.predict(X_val)

# Standard regression metrics; RMSE is derived from MSE so both are reported
# on a consistent basis.
mse = mean_squared_error(y_val, y_pred_val)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_val, y_pred_val)
r2 = r2_score(y_val, y_pred_val)

print("=== Validation Set Performance ===")
print(f"MSE: {mse:.6f}")
print(f"RMSE: {rmse:.6f}")
print(f"MAE: {mae:.6f}")
# Label repaired: the superscript two was mis-encoded as "Â²".
print(f"R² Score: {r2:.6f}")

In [None]:
# Rank features by the fitted model's importance scores and keep the top 20
importance_df = (
    pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_
    })
    .sort_values('importance', ascending=False)
    .head(20)
)

# Horizontal bar chart; the y-axis is inverted so the top-ranked feature is
# drawn at the top of the figure.
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(importance_df['feature'], importance_df['importance'])
ax.set_xlabel('Importance')
ax.set_title('Top 20 Feature Importances')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

print("\nTop 10 most important features:")
print(importance_df.head(10))

In [None]:
# Scatter of validation targets against model predictions
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_val, y_pred_val, alpha=0.5)

# Dashed red y = x reference line spanning the observed target range
lo, hi = y_val.min(), y_val.max()
ax.plot([lo, hi], [lo, hi], 'r--', lw=2)

ax.set_xlabel('Actual Market Forward Excess Returns')
ax.set_ylabel('Predicted Market Forward Excess Returns')
ax.set_title('Actual vs Predicted Returns (Validation Set)')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Make predictions on test set
# Filter only rows that need to be scored
test_scored = test_data[test_data['is_scored'] == True].copy()

print(f"Number of test samples to score: {len(test_scored)}")

# Prepare test features with the same imputation rule as the training data:
# 0 for features that were >90% missing in training, otherwise the training
# median (0 if that median is undefined). Only training statistics are used,
# so nothing is estimated from the test set.
#
# The values are applied with one DataFrame.fillna call. The previous
# `X_test[col].fillna(..., inplace=True)` pattern is a chained in-place
# assignment on a column selection, which is deprecated in pandas 2.x and has
# no effect under copy-on-write.
test_fill_values = X_train_full.median().where(missing_pct <= 90, 0).fillna(0)
X_test = test_scored[feature_cols].fillna(test_fill_values)

print(f"Test missing values after imputation: {X_test.isnull().sum().sum()}")

# Make predictions
test_predictions = model.predict(X_test)

# Add predictions to test dataframe
test_scored['predicted_market_forward_excess_returns'] = test_predictions

print("\nTest predictions summary:")
print(pd.Series(test_predictions).describe())

# Display first few predictions
print("\nFirst 10 predictions:")
test_scored[['date_id', 'predicted_market_forward_excess_returns']].head(10)

In [2]:
# Write the scored predictions to submission.csv.
# This cell reads `test_scored`, which is created by the test-prediction cell,
# so that cell must have been executed in the current kernel first.
submission = (
    test_scored[['date_id', 'predicted_market_forward_excess_returns']]
    .rename(columns={
        'predicted_market_forward_excess_returns': 'market_forward_excess_returns'
    })
)
submission.to_csv('submission.csv', index=False)

print("Predictions saved to submission.csv")
print(f"\nSubmission file shape: {submission.shape}")
submission.head(10)

NameError: name 'test_scored' is not defined