### Description
Wisdom from most personal finance experts would suggest that it's irresponsible to try to time the market. The Efficient Market Hypothesis (EMH) would agree: everything knowable is already priced in, so don’t bother trying.

But in the age of machine learning, is it irresponsible not to try to time the market? Is the EMH an extreme oversimplification at best and possibly just…false?

* This competition is about more than predictive modeling. Predicting market returns challenges the assumptions of market efficiency. Your work could help reshape how investors and academics understand financial markets. Participants could uncover signals others overlook, develop innovative strategies, and contribute to a deeper understanding of market behavior—potentially rewriting a fundamental principle of modern finance. Most investors don’t beat the S&P 500. That failure has been used for decades to prop up the EMH: if even the professionals can’t win, it must be impossible. This observation has long been cited as evidence for the Efficient Market Hypothesis—the idea that prices already reflect all available information and no persistent edge is possible. This story is tidy, but reality is less so. Markets are noisy, messy, and full of behavioral quirks that don’t vanish just because academic orthodoxy says they should.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory (walked recursively), then print one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# -------------------------=-=-=-=-=-=-=-=-=-=-=
# Market Prediction Notebook 
# -------------------------=-=-=-=-=-=-=-=-=-=-=
# =-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=
# 0. Import libraries
# =-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import joblib   # ❌ commented out (no joblib save required)

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

from xgboost import XGBRegressor

# ---------- Safety: confirm network is off ----------
import socket
def can_connect(host="8.8.8.8", port=53, timeout=1):
    """Return True if a TCP connection to ``host:port`` can be opened.

    Used as a quick probe of whether outbound networking is available.

    Args:
        host: Hostname or IP address to connect to.
        port: TCP port to connect to.
        timeout: Seconds to wait for the connection before giving up.

    Returns:
        True if the connection was established; False if the attempt failed
        (refused, unreachable, timed out, unresolvable or invalid address).
    """
    try:
        # The context manager closes the probe socket immediately; without it
        # every successful check leaves an open socket behind until it is
        # garbage collected.
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except (OSError, OverflowError, ValueError):
        # OSError covers refused/unreachable/timeout/DNS failures; the other
        # two cover malformed ports or hostnames. All mean "not reachable".
        return False

# Probe outbound connectivity once and report it; the notebook is meant to
# run with networking disabled, so the expected answer is False.
network_reachable = can_connect()
print("Network reachable?", network_reachable)   # Expect False

# =-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=
# ----------------------------
# 1. Load Data
# ----------------------------
# =-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=

# Locations of the competition CSV files inside the Kaggle input directory.
TRAIN_PATH = "/kaggle/input/hull-tactical-market-prediction/train.csv"
TEST_PATH  = "/kaggle/input/hull-tactical-market-prediction/test.csv"

# Read both files with the same call (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Report (rows, columns) for each frame.
for label, frame in (("Train shape:", train), ("Test shape: ", test)):
    print(label, frame.shape)

# ----------------------------
# 2. Quick Data Check
# ----------------------------
# Name of the label column the model is trained to predict.
TARGET = "market_forward_excess_returns"

# Stop immediately when the label is missing: nothing downstream can work.
target_present = TARGET in train.columns
if not target_present:
    raise SystemExit(f"Target column '{TARGET}' not found in train!")

# Dump the column names of both frames for a visual sanity check.
for label, frame in (("Train columns:", train), ("Test columns:", test)):
    print(label, frame.columns.tolist())

# =-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=
# ----------------------------
# 3. Prepare features
# ----------------------------
# =-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=

# Separate the label from the predictors; work on copies so the raw frames
# loaded from disk stay untouched.
X = train.drop(columns=[TARGET]).copy()
y = train[TARGET].copy()
X_test = test.copy()

# Only columns that exist in BOTH frames can be used as features, because the
# model must see the same inputs at prediction time. Train column order is kept.
common_cols = [c for c in X.columns if c in X_test.columns]
print("Number of common columns:", len(common_cols))

if not common_cols:
    raise SystemExit("No overlapping feature columns between train and test.")

# Coerce every feature to numeric; values that cannot be parsed become NaN.
# DataFrame.apply returns new frames, so no explicit copy is needed.
X = X[common_cols].apply(pd.to_numeric, errors='coerce')
X_test = X_test[common_cols].apply(pd.to_numeric, errors='coerce')

# Impute with medians computed on TRAIN only, and reuse them for test.
# A column that is entirely NaN in train has a NaN median, which would leave
# the holes unfilled in both frames, so fall back to 0.0 for such columns.
medians = X.median().fillna(0.0)
X = X.fillna(medians)
X_test = X_test.fillna(medians)

print("Final feature count (train):", X.shape[1])
print("Any NaNs in X?", X.isnull().sum().sum())
print("Any NaNs in X_test?", X_test.isnull().sum().sum())

# =-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=
# ----------------------------
# 4. Scaling
# ----------------------------
# =-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=
# Fit the scaler on the training features only, then apply that same fitted
# transform to both train and test so they share one scale.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

# joblib.dump(scaler, "scaler.joblib")   # ❌ commented out

# =-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=
# ----------------------------
# 5. Model Training
# ----------------------------
# =-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=
# Fixed XGBoost settings shared by every candidate in the search.
params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=42,
    verbosity=0,
)
xgb_model = XGBRegressor(**params)

# Hyper-parameter candidates: 2 x 2 x 2 = 8 combinations.
grid = dict(
    max_depth=[3, 5],
    learning_rate=[0.01, 0.05],
    n_estimators=[100, 200],
)

# Time-ordered folds: each validation slice comes after its training slice.
# NOTE(review): this assumes the rows of train are in chronological order.
ts_split = TimeSeriesSplit(n_splits=5)

# Exhaustive search over the grid, scored by (negated) mean squared error.
search = GridSearchCV(
    estimator=xgb_model,
    param_grid=grid,
    scoring='neg_mean_squared_error',
    cv=ts_split,
    n_jobs=1,
    verbose=1,
)
search.fit(X_scaled, y)

# Keep the estimator refitted with the winning parameter combination.
best_model = search.best_estimator_
print("Best params:", search.best_params_)

# joblib.dump(best_model, "best_model.joblib")   # ❌ commented out

# =-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=
# ----------------------------
# 6. Evaluate on Train
# ----------------------------
# =-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=
# In-sample fit quality: these numbers are computed on the data the model was
# trained on, so they are not an estimate of out-of-sample performance.
preds_train = best_model.predict(X_scaled)
train_mse = mean_squared_error(y, preds_train)
rmse = np.sqrt(train_mse)
r2 = r2_score(y, preds_train)
print("Train RMSE:", rmse)
print("Train R2:", r2)

# Scatter of actual vs. predicted training returns.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y, preds_train, alpha=0.6, s=8)
ax.set_xlabel("Actual Returns")
ax.set_ylabel("Predicted Returns")
ax.set_title("Actual vs Predicted (Train)")
ax.grid(alpha=0.3)
plt.show()

# =-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=
# ----------------------------
# 7. Predict on Test & Save
# ----------------------------
# =-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=
# Score the scaled test features with the tuned model.
test_predictions = best_model.predict(X_test_scaled)

# Two-column submission frame: a 0-based running row id and the prediction.
n_rows = len(test_predictions)
submission = pd.DataFrame({'row_id': np.arange(n_rows)}).assign(
    prediction=test_predictions
)

print(submission.head())

# Only the parquet file is written; no CSV copy is produced.
submission.to_parquet("submission.parquet", index=False)

print("Saved: submission.parquet")
