# Load Data

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Read the two competition splits from the Kaggle input mount.
train = pd.read_csv(
    "/kaggle/input/hull-tactical-market-prediction/train.csv"
)
test = pd.read_csv(
    "/kaggle/input/hull-tactical-market-prediction/test.csv"
)

# Quick orientation: dimensions of both splits first, then their column names.
summary = [
    ("Train shape:", train.shape),
    ("Test shape:", test.shape),
    ("Train columns:", train.columns.tolist()),
    ("Test columns:", test.columns.tolist()),
]
for label, value in summary:
    print(label, value)


# EDA + Train Model

## EDA Notebook Template (clean + analytic)

In [None]:
%%time
# ==============================
# 1. Setup
# ==============================
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor

# Reload both splits (same files as the "Load Data" cell), train first.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/hull-tactical-market-prediction/train.csv",
        "/kaggle/input/hull-tactical-market-prediction/test.csv",
    )
)

# Shapes of both frames, then the first training rows as the cell's output.
print(train.shape, test.shape)
train.head()


In [None]:
%%time
# ==============================
# 2. Targets
# ==============================
# The three return-related columns examined in this section.
target_cols = ["forward_returns", "risk_free_rate", "market_forward_excess_returns"]

# One 30-bin histogram (with KDE overlay) per column, side by side.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))
for ax, col in zip(axes, target_cols):
    sns.histplot(train[col], bins=30, kde=True, ax=ax)
    ax.set_title(col)
plt.tight_layout()
plt.show()

# Pairwise correlations: printed as a table, then drawn as an annotated heatmap.
target_corr = train[target_cols].corr()
print(target_corr)
sns.heatmap(target_corr, annot=True, cmap="coolwarm")
plt.show()


In [None]:
%%time
# ==============================
# 2. Targets
# ==============================
# NOTE(review): this cell repeats the preceding "2. Targets" cell (same
# columns, same plots); it looks like a leftover copy and could be removed.
target_cols = ["forward_returns", "risk_free_rate", "market_forward_excess_returns"]

# Histogram + KDE for each column, drawn on its own panel of a 1x3 grid.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for name, panel in zip(target_cols, axes.ravel()):
    sns.histplot(train[name], bins=30, kde=True, ax=panel)
    panel.set_title(name)
plt.tight_layout()
plt.show()

# Correlation table of the same columns, printed and shown as a heatmap.
corr_matrix = train.loc[:, target_cols].corr()
print(corr_matrix)
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.show()


In [None]:
%%time
# ==============================
# 3. Feature Groups
# ==============================
# Number of numbered columns per one-letter prefix, e.g. "D" -> D1..D9.
group_sizes = {"D": 9, "E": 20, "I": 9, "M": 18, "P": 13, "S": 12, "V": 13}

# Expand each prefix into its list of column names (D1, D2, ...).
groups = {
    prefix: [f"{prefix}{idx}" for idx in range(1, size + 1)]
    for prefix, size in group_sizes.items()
}

# One within-group correlation heatmap per prefix, colour scale centred on 0.
for group_name, group_cols in groups.items():
    plt.figure(figsize=(10, 6))
    group_corr = train[group_cols].corr()
    sns.heatmap(group_corr, cmap="coolwarm", center=0)
    plt.title(f"Correlation Heatmap: Group {group_name}")
    plt.show()


In [None]:
%%time
# ==============================
# 4. Train vs Test Comparison
# ==============================
# Columns present in both splits (kept in train order), without the date key.
test_columns = set(test.columns)
shared_cols = [c for c in train.columns if c != "date_id" and c in test_columns]

# Overlay the train and test density of the first five shared columns only.
for col in shared_cols[:5]:
    plt.figure(figsize=(7, 4))
    for frame, split_label in ((train, "Train"), (test, "Test")):
        sns.kdeplot(frame[col], label=split_label, fill=True)
    plt.title(f"Train vs Test: {col}")
    plt.legend()
    plt.show()

In [None]:
%%time
# ==============================
# 4. Train vs Test Comparison
# ==============================
# NOTE(review): this cell repeats the preceding "4. Train vs Test Comparison"
# cell (same column selection, same plots); it looks like a leftover copy.
in_both_splits = train.columns[train.columns.isin(test.columns)]
shared_cols = [c for c in in_both_splits if c != "date_id"]

# Density of each previewed column in train versus test (first five only).
preview_cols = shared_cols[:5]
for col in preview_cols:
    plt.figure(figsize=(7, 4))
    train_values = train[col]
    test_values = test[col]
    sns.kdeplot(train_values, label="Train", fill=True)
    sns.kdeplot(test_values, label="Test", fill=True)
    plt.title(f"Train vs Test: {col}")
    plt.legend()
    plt.show()

In [None]:
%%time
# ==============================
# 6. Feature Importance
# ==============================
# Design matrix: the shared columns with missing values replaced by 0;
# response: the forward_returns column.
X = train.loc[:, shared_cols].fillna(0)
y = train.loc[:, "forward_returns"]

# Fixed seed so the importance ranking is reproducible between runs.
rf = RandomForestRegressor(n_estimators=200, random_state=42).fit(X, y)

# Importances paired with their feature names, highest first.
imp = (
    pd.DataFrame({"feature": shared_cols, "importance": rf.feature_importances_})
    .sort_values("importance", ascending=False)
)

# Horizontal bar chart of the 20 highest-ranked features.
top_features = imp.head(20)
plt.figure(figsize=(8, 6))
sns.barplot(data=top_features, x="importance", y="feature")
plt.title("Top 20 Feature Importances (RF baseline)")
plt.show()

In [None]:
%%time
# =========================================
# Hull Tactical - Market Prediction
# Baseline with EDA + LightGBM
# =========================================

import os
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb


# -------------------------
# 1. Load Data
# -------------------------
DATA_PATH = "/kaggle/input/hull-tactical-market-prediction/"

# Both splits live side by side under DATA_PATH; train is read first.
train, test = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ("train.csv", "test.csv")
)

# Dimensions of each split, then a plain-text preview of the training rows.
for label, shape in (("Train shape:", train.shape), ("Test shape:", test.shape)):
    print(label, shape)
print(train.head())

# -------------------------
# 2. EDA
# -------------------------
# Distribution of the forward_returns column: 50-bin histogram with KDE overlay.
forward_returns = train["forward_returns"]
plt.figure(figsize=(8, 4))
sns.histplot(forward_returns, bins=50, kde=True)
plt.title("Distribution of Forward Returns")
plt.show()

# Correlation between every pair of numeric columns, colour scale centred on 0.
corr = train.corr(numeric_only=True)
plt.figure(figsize=(10, 6))
sns.heatmap(corr, cmap="coolwarm", center=0)
plt.title("Correlation Heatmap")
plt.show()

# -------------------------
# 3. Feature Preparation
# -------------------------
target = "forward_returns"

# Columns that must not be fed to the model as predictors:
#   * the target itself;
#   * "risk_free_rate" and "market_forward_excess_returns", the other two
#     return columns handled as targets in the EDA above. The excess-return
#     column is, by its name, derived from forward returns, so keeping it as a
#     feature leaks the target into the inputs and makes the validation score
#     meaningless;
#   * "date_id", a row key rather than a signal (it is also excluded from the
#     feature list used by the prediction cell).
non_feature_cols = {
    target,
    "risk_free_rate",
    "market_forward_excess_returns",
    "date_id",
}
features = [c for c in train.columns if c not in non_feature_cols]

X = train[features]
y = train[target]

# time-based split: last 20% as validation
# (rows are taken in file order -- no shuffling -- so validation rows come
# after all training rows)
split_idx = int(len(train) * 0.8)
X_train, X_val = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_val = y.iloc[:split_idx], y.iloc[split_idx:]

# -------------------------
# 4. Baseline Model
# -------------------------
model = lgb.LGBMRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    random_state=42
)

# The validation split is only monitored during training; all 200 rounds are
# always fitted.
# NOTE(review): in LightGBM >= 4, early stopping and log verbosity are passed
# as `callbacks=[lgb.early_stopping(...), lgb.log_evaluation(...)]` rather than
# as fit() keyword arguments -- add them here if early stopping is wanted.
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="rmse",
)

val_preds = model.predict(X_val)

# Take the square root explicitly: the `squared=` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so `squared=False` raises a TypeError on current versions.
rmse = float(np.sqrt(mean_squared_error(y_val, val_preds)))
print(f"Validation RMSE: {rmse:.5f}")


# Prediction

In [None]:
%%time
import os
import numpy as np
import pandas as pd
import polars as pl

from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import kaggle_evaluation.default_inference_server


# =======================
# Load Training Data
# =======================
train = pd.read_csv("/kaggle/input/hull-tactical-market-prediction/train.csv")

target = "market_forward_excess_returns"

# Besides the row key and the target, drop the two other return columns.
# "forward_returns" and "risk_free_rate" are outcomes for the same row, so
# using them to predict the excess return is label leakage.
# NOTE(review): they are presumably not part of the rows handed to predict()
# either (confirm against test.csv), in which case predict() would have to
# substitute a placeholder for the very columns the model relies on most.
exclude_cols = ["date_id", target, "forward_returns", "risk_free_rate"]
features = [c for c in train.columns if c not in exclude_cols]

X = train[features]
y = train[target]

# =======================
# Train Model
# =======================
# Median-impute missing values -> standardise -> ridge regression (alpha=1.0),
# fitted on every training row.
pipeline_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("ridge", Ridge(alpha=1.0)),
]
model = Pipeline(steps=pipeline_steps).fit(X, y)

# =======================
# Predict Function
# =======================
previous_allocation = 1.0  # start neutral

def predict(test: pl.DataFrame) -> float:
    """Turn one batch of features into a smoothed allocation in [0, 2].

    The fitted ``model`` predicts a value for the first row of ``test``; the
    value is scaled around the neutral allocation 1.0, clipped to [0, 2] and
    blended with the allocation returned by the previous call.

    Parameters
    ----------
    test : pl.DataFrame
        Incoming feature frame. Only the prediction for its first row is used
        (presumably one row per call -- confirm against the gateway).

    Returns
    -------
    float
        The new allocation. It is also stored in the module-level
        ``previous_allocation`` for the next call.
    """
    global previous_allocation

    # Convert Polars -> Pandas
    row = test.to_pandas()

    # Keep exactly the training features, in training order. A feature that is
    # absent from the incoming frame is left as NaN (reindex's default) so the
    # pipeline's median imputer fills it just like any other missing value; a
    # hard-coded 0 would skip the imputer and could sit far from the column's
    # training distribution.
    row = row.reindex(columns=features)

    # Predict
    pred = float(model.predict(row)[0])

    # A NaN/inf prediction would propagate through the smoothing below and
    # stay in previous_allocation for every later call; treat it as
    # "no signal", which maps to the neutral allocation.
    if not np.isfinite(pred):
        pred = 0.0

    # Convert prediction into allocation
    allocation = 1.0 + 50 * pred  # scaling factor

    # Clip to [0, 2]
    allocation = float(np.clip(allocation, 0.0, 2.0))

    # Smooth with previous allocation (80% new signal, 20% carry-over); a
    # weighted average of two values in [0, 2] stays in [0, 2].
    allocation = 0.8 * allocation + 0.2 * previous_allocation

    previous_allocation = allocation
    return float(allocation)


# =======================
# Start Inference Server
# =======================
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# KAGGLE_IS_COMPETITION_RERUN unset/empty: drive predict() through the local
# gateway against the competition input directory. Otherwise call serve().
running_in_rerun = os.getenv("KAGGLE_IS_COMPETITION_RERUN")
if not running_in_rerun:
    inference_server.run_local_gateway(("/kaggle/input/hull-tactical-market-prediction/",))
else:
    inference_server.serve()


# Submit

In [None]:
%%time
import pandas as pd

def save_and_validate_submission(predictions, row_ids, filename="submission.parquet"):
    """
    Save predictions in Parquet format, reload the file and print a summary.

    Parameters
    ----------
    predictions : array-like
        Predicted values; written to the ``prediction`` column.
    row_ids : array-like
        Row identifiers; written to the ``row_id`` column.
    filename : str, default "submission.parquet"
        Path of the Parquet file to write.

    Returns
    -------
    str
        ``filename``, unchanged.

    Raises
    ------
    ValueError
        Raised by pandas when ``predictions`` and ``row_ids`` are sequences of
        different lengths (nothing is written in that case).
    """
    expected_columns = ["row_id", "prediction"]

    # Build dataframe
    submission = pd.DataFrame({
        "row_id": row_ids,
        "prediction": predictions
    })

    # Save to parquet
    submission.to_parquet(filename, index=False)

    # Reload to validate what actually landed on disk
    sub_check = pd.read_parquet(filename)
    missing_values = sub_check.isnull().sum().sum()

    # Summary of the reloaded file
    print("✅ Submission saved as Parquet")
    print("Columns:", sub_check.columns.tolist())
    print("Number of rows:", len(sub_check))
    print("Missing values:", missing_values)
    print(sub_check.head())

    # Flag problems explicitly instead of leaving them to be spotted in the
    # summary. They are printed rather than raised so existing callers keep
    # getting the filename back; print() is used because the notebook silences
    # the warnings module.
    if sub_check.columns.tolist() != expected_columns or len(sub_check) != len(submission):
        print("WARNING: reloaded file does not match the frame that was written")
    if missing_values > 0:
        print("WARNING: submission contains missing values")

    return filename
