In [1]:
import os
import sys
import warnings
import pickle
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore")
plt.rcParams['figure.dpi'] = 120

from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
import lightgbm as lgb
import optuna
import shap
import joblib
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam


In [2]:
# -------------------------
# Helpers & output folders
# -------------------------
OUT_DIR = "outputs"
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(os.path.join(OUT_DIR, "figs"), exist_ok=True)
os.makedirs(os.path.join(OUT_DIR, "models"), exist_ok=True)

def save_fig(name):
    path = os.path.join(OUT_DIR, "figs", name)
    plt.tight_layout()
    plt.savefig(path)
    print(f"Saved figure: {path}")

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [3]:
# -------------------------
# Data loading: flexible
# -------------------------
def load_merged_df():
    csv_path = "data/raw/data.csv"
    df = pd.read_csv(csv_path, parse_dates=['timestamp', 'date'], infer_datetime_format=True)
    return df


In [4]:
def preprocess_and_engineer(df, debug=False):
    """
    Input: merged_df (as produced by your EDA)
    Steps:
      - Ensure timestamp/date columns
      - Create target = next-day high
      - Lag features, rolling stats
      - Handle missing values
      - Standardize features (scaler fitted on training set later)
    Returns: processed dataframe (no scaling applied), feature list
    """
    df = df.copy()
    # ensure datetime
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])
    if 'date' in df.columns and not np.issubdtype(df['date'].dtype, np.datetime64):
        try:
            df['date'] = pd.to_datetime(df['date'])
        except:
            df['date'] = pd.to_datetime(df['timestamp'].dt.date)
    else:
        df['date'] = pd.to_datetime(df['timestamp'].dt.date)

    # Sort by time
    df = df.sort_values('timestamp').reset_index(drop=True)

    # Target: tomorrow's high
    df['target_high'] = df['high'].shift(-1)

    # lag features
    lags = [1,2,3]
    for lag in lags:
        df[f'close_lag_{lag}'] = df['close'].shift(lag)
        df[f'return_lag_{lag}'] = df['daily_return'].shift(lag)
        df[f'vol_lag_{lag}'] = df['volume'].shift(lag)

    # Rolling stats
    df['rolling_mean_5'] = df['close'].rolling(5).mean()
    df['rolling_std_5'] = df['close'].rolling(5).std()
    df['rolling_mean_10'] = df['close'].rolling(10).mean()
    df['rolling_std_10'] = df['close'].rolling(10).std()

    # Sentiment lags
    df['sentiment_lag_1'] = df['avg_sentiment'].shift(1)

    # Time features
    df['dayofweek'] = df['timestamp'].dt.dayofweek
    df['month'] = df['timestamp'].dt.month

    # Drop rows with NA in target
    df = df.dropna(subset=['target_high']).reset_index(drop=True)

    # Fill remaining missing feature values with median (simple imputer later)
    # Keep columns list
    feature_cols = [
        'open','high','low','close','volume','vwap','trade_count','daily_return',
        'rolling_mean_5','rolling_std_5','rolling_mean_10','rolling_std_10',
        'close_lag_1','close_lag_2','close_lag_3',
        'return_lag_1','return_lag_2','return_lag_3',
        'vol_lag_1','vol_lag_2','vol_lag_3',
        'sentiment_lag_1','dayofweek','month'
    ]
    # Some columns on user data may not exist (e.g., trade_count)
    feature_cols = [c for c in feature_cols if c in df.columns]
    if debug:
        print("Feature columns:", feature_cols)

    # We'll not scale here; scaling is part of a sklearn pipeline fitted on train.
    return df, feature_cols

In [5]:
def training_only_eda(X_train_df, y_train, out_prefix="train"):
    """
    Perform exploration ONLY on the training set per assignment.
    Save descriptive stats, correlation matrix, and plots.
    """
    print("=== Training-only EDA ===")
    stats = X_train_df.describe().T
    stats.to_csv(os.path.join(OUT_DIR, f"{out_prefix}_descriptive_stats.csv"))
    print(f"Saved descriptive stats to {OUT_DIR}/{out_prefix}_descriptive_stats.csv")

    # Correlation (features + target)
    corr = X_train_df.join(y_train).corr()
    plt.figure(figsize=(10,8))
    sns.heatmap(corr, annot=False, cmap='coolwarm')
    plt.title('Training-set Correlation Matrix (features + target)')
    save_fig(f"{out_prefix}_corr_matrix.png")
    plt.close()

    # Distribution plots for a handful of numeric features
    cols = X_train_df.select_dtypes(include=[np.number]).columns.tolist()[:8]
    for c in cols:
        plt.figure(figsize=(5,3))
        sns.histplot(X_train_df[c].dropna(), kde=True, bins=30)
        plt.title(f'{c} distribution (train)')
        save_fig(f"{out_prefix}_dist_{c}.png")
        plt.close()

    # Scatter: sentiment_lag_1 vs target (if present)
    if 'sentiment_lag_1' in X_train_df.columns:
        plt.figure(figsize=(5,4))
        sns.scatterplot(x=X_train_df['sentiment_lag_1'], y=y_train)
        plt.title('Sentiment (lag1) vs Target High (train)')
        save_fig(f"{out_prefix}_sentiment_vs_target.png")
        plt.close()

In [6]:
def evaluate_regression(y_true, y_pred):
    return {
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': rmse(y_true, y_pred),
        'R2': r2_score(y_true, y_pred)
    }

In [7]:
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    """
    Train baseline, LinearRegression, RandomForest, LightGBM, (optional) Keras.
    Return fitted models and metrics.
    """
    results = {}
    models = {}

    baseline_pred = X_test['high'].values
    results['baseline'] = evaluate_regression(y_test, baseline_pred)
    print("Baseline:", results['baseline'])

    # Pipeline to impute + scale
    imputer = SimpleImputer(strategy='median')
    scaler = StandardScaler()
    X_train_imputed = imputer.fit_transform(X_train)
    X_train_scaled = scaler.fit_transform(X_train_imputed)
    X_test_scaled = scaler.transform(imputer.transform(X_test))

    # Linear Regression
    lr = LinearRegression()
    lr.fit(X_train_scaled, y_train)
    lr_pred = lr.predict(X_test_scaled)
    results['linear_regression'] = evaluate_regression(y_test, lr_pred)
    models['linear_regression'] = ('lr', lr, imputer, scaler)
    print("Linear Regression:", results['linear_regression'])

    # Random Forest
    rf = RandomForestRegressor(n_estimators=300, max_depth=10, random_state=42, n_jobs=-1)
    rf.fit(X_train_scaled, y_train)
    rf_pred = rf.predict(X_test_scaled)
    results['random_forest'] = evaluate_regression(y_test, rf_pred)
    models['random_forest'] = ('rf', rf, imputer, scaler)
    print("Random Forest:", results['random_forest'])

    # LightGBM
    lgbm = lgb.LGBMRegressor(n_estimators=500, learning_rate=0.05, random_state=42)
    lgbm.fit(X_train_scaled, y_train)
    lgbm_pred = lgbm.predict(X_test_scaled)
    results['lightgbm'] = evaluate_regression(y_test, lgbm_pred)
    models['lightgbm'] = ('lgbm', lgbm, imputer, scaler)
    print("LightGBM:", results['lightgbm'])

    input_dim = X_train_scaled.shape[1]
    nn = Sequential([
        Dense(64, activation='relu', input_dim=input_dim),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1)  # regression output
    ])
    nn.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=[])

    nn.fit(X_train_scaled, y_train, validation_split=0.1, epochs=50, batch_size=32, verbose=0)
    nn_pred = nn.predict(X_test_scaled).flatten()
    results['keras_nn'] = evaluate_regression(y_test, nn_pred)
    models['keras_nn'] = ('keras_nn', nn, imputer, scaler)
    print("Keras NN results:", results['keras_nn'])

    # Save models
    for name, tup in models.items():
        tag, m, imputer_obj, scaler_obj = tup
        joblib.dump(tup, os.path.join(OUT_DIR, "models", f"{name}.joblib"))
    # also save results
    res_df = pd.DataFrame(results).T
    res_df.to_csv(os.path.join(OUT_DIR, "model_results.csv"))
    print("Saved model results to outputs/model_results.csv")
    return models, results

In [8]:
def unsupervised_analysis(df, feature_cols):
    """
    Run KMeans, PCA, Agglomerative, and IsolationForest
    """
    print("=== Unsupervised analysis ===")
    X_unsup = df[feature_cols].select_dtypes(include=[np.number]).fillna(0)
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X_unsup)

    # PCA (2 components for visualization)
    pca = PCA(n_components=2, random_state=0)
    pcs = pca.fit_transform(Xs)
    pca_df = pd.DataFrame(pcs, columns=['PC1','PC2'])
    pca_df.to_csv(os.path.join(OUT_DIR, "pca_components.csv"))
    plt.figure(figsize=(6,4))
    sns.scatterplot(x=pca_df['PC1'], y=pca_df['PC2'])
    plt.title('PCA (2 components)')
    save_fig("unsup_pca.png")
    plt.close()

    # KMeans
    kmeans = KMeans(n_clusters=3, random_state=0)
    labels = kmeans.fit_predict(Xs)
    plt.figure(figsize=(6,4))
    sns.scatterplot(x=pca_df['PC1'], y=pca_df['PC2'], hue=labels, palette='tab10')
    plt.title('KMeans clusters (k=3) on PCA')
    save_fig("unsup_kmeans_pca.png")
    plt.close()

    # Agglomerative
    agg = AgglomerativeClustering(n_clusters=3)
    agg_labels = agg.fit_predict(Xs)
    plt.figure(figsize=(6,4))
    sns.scatterplot(x=pca_df['PC1'], y=pca_df['PC2'], hue=agg_labels, palette='deep')
    plt.title('Agglomerative clusters on PCA')
    save_fig("unsup_agg_pca.png")
    plt.close()

    # IsolationForest for anomalies
    iso = IsolationForest(random_state=0)
    iso_pred = iso.fit_predict(Xs)
    plt.figure(figsize=(6,4))
    sns.scatterplot(x=pca_df['PC1'], y=pca_df['PC2'], hue=iso_pred, palette='Set1')
    plt.title('IsolationForest (anomaly detection)')
    save_fig("unsup_iso_pca.png")
    plt.close()

    # Save clustering models
    joblib.dump((kmeans, scaler), os.path.join(OUT_DIR, "models", "kmeans.joblib"))
    joblib.dump((agg, scaler), os.path.join(OUT_DIR, "models", "agg.joblib"))
    joblib.dump((iso, scaler), os.path.join(OUT_DIR, "models", "iso.joblib"))

    return {
        'pca': pca,
        'kmeans': kmeans,
        'agg': agg,
        'isolation_forest': iso
    }

In [9]:
def time_series_validation(model, X, y, n_splits=5):
    tscv = TimeSeriesSplit(n_splits=n_splits)
    scores = []
    for train_index, val_index in tscv.split(X):
        X_tr, X_val = X.iloc[train_index], X.iloc[val_index]
        y_tr, y_val = y.iloc[train_index], y.iloc[val_index]
        # simple pipeline: impute -> scale -> fit model
        imputer = SimpleImputer(strategy='median')
        scaler = StandardScaler()
        X_tr_scaled = scaler.fit_transform(imputer.fit_transform(X_tr))
        X_val_scaled = scaler.transform(imputer.transform(X_val))
        model.fit(X_tr_scaled, y_tr)
        pred = model.predict(X_val_scaled)
        scores.append(rmse(y_val, pred))
    return np.mean(scores), np.std(scores)

In [10]:
def optuna_tune_rf(X_train, y_train, n_trials=30):
    def objective(trial):
        n_estimators = trial.suggest_int('n_estimators', 50, 500)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        max_features = trial.suggest_categorical('max_features', ['sqrt','log2', 0.5, None])

        model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            max_features=max_features,
            random_state=42,
            n_jobs=-1
        )
        # cross-validate using simple 3-fold (not time series CV for speed)
        imputer = SimpleImputer(strategy='median')
        scaler = StandardScaler()
        Xs = scaler.fit_transform(imputer.fit_transform(X_train))
        scores = -cross_val_score(model, Xs, y_train, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
        return np.mean(scores)

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    print("RF best params:", study.best_params)
    return study.best_params

In [11]:
def train_keras_nn(X_train, y_train, X_test, y_test, 
                   hidden_layers=[64, 32], dropout=0.2, 
                   lr=0.001, batch_size=32, epochs=200, patience=20):
    """
    Train a simple feedforward neural network for regression.
    
    Returns:
        model: trained Keras model
        results: dict of metrics on test set
    """
    # Impute and scale
    imputer = SimpleImputer(strategy='median')
    scaler = StandardScaler()
    Xtr = scaler.fit_transform(imputer.fit_transform(X_train))
    Xte = scaler.transform(imputer.transform(X_test))
    
    # Build model
    model = Sequential()
    input_dim = Xtr.shape[1]
    for i, units in enumerate(hidden_layers):
        if i == 0:
            model.add(Dense(units, activation='relu', input_dim=input_dim))
        else:
            model.add(Dense(units, activation='relu'))
        if dropout > 0:
            model.add(Dropout(dropout))
    model.add(Dense(1, activation='linear'))  # regression output
    
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    
    # Early stopping
    es = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
    
    # Train
    history = model.fit(
        Xtr, y_train,
        validation_split=0.2,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[es],
        verbose=0  # change to 1 to see progress
    )
    
    # Evaluate
    y_pred = model.predict(Xte).flatten()
    results = evaluate_regression(y_test, y_pred)
    
    # Save model
    joblib.dump(('keras_nn', model, imputer, scaler), os.path.join(OUT_DIR, "models", "keras_nn.joblib"))
    print("Neural Network results:", results)
    
    return model, results

In [12]:
def optuna_tune_lgb(X_train, y_train, n_trials=30):
    def objective(trial):
        param = {
            'objective': 'regression',
            'metric': 'rmse',
            'verbosity': -1,
            'boosting_type': 'gbdt',
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
            'num_leaves': trial.suggest_int('num_leaves', 16, 256),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
            'bagging_freq': 1
        }
        imputer = SimpleImputer(strategy='median')
        scaler = StandardScaler()
        Xs = scaler.fit_transform(imputer.fit_transform(X_train))
        split_idx = int(0.8 * len(Xs))
        X_tr, X_val = Xs[:split_idx], Xs[split_idx:]
        y_tr, y_val = y_train.iloc[:split_idx], y_train.iloc[split_idx:]

        dtrain = lgb.Dataset(X_tr, label=y_tr)
        dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

        # Train with early stopping
        cvres = lgb.train(
            params=param,
            train_set=dtrain,
            valid_sets=[dval],
            num_boost_round=1000,
            callbacks=[
                lgb.early_stopping(stopping_rounds=20),   # early stopping
                lgb.log_evaluation(period=0)              # suppress printing
            ]
        )
        return cvres.best_score['valid_0']['rmse']

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    print("LightGBM best params:", study.best_params)
    return study.best_params

In [13]:
def shap_explain(model_tuple, X_train, X_test, feature_names, model_name="model"):
    """
    model_tuple: (tag, model, imputer, scaler) saved earlier
    """
    tag, model, imputer, scaler = model_tuple
    # preprocess
    X_train_pre = scaler.fit_transform(imputer.fit_transform(X_train))
    X_test_pre = scaler.transform(imputer.transform(X_test))

    # SHAP for tree models (RandomForest, LightGBM)
    model_type = str(type(model)).lower()
    explainer = None
    if hasattr(model, 'predict'):
        try:
            if 'lgbm' in model_type or 'lightgbm' in model_type:
                explainer = shap.TreeExplainer(model)
            elif 'randomforest' in model_type or 'forest' in model_type:
                explainer = shap.TreeExplainer(model)
            elif 'sequential' in model_type or 'keras' in model_type:
                background = X_train_pre[np.random.choice(X_train_pre.shape[0], min(100, X_train_pre.shape[0]), replace=False)]
                explainer = shap.KernelExplainer(model.predict, background)
                shap_values = explainer.shap_values(X_test_pre[:50], nsamples=100)
            else:
                explainer = shap.KernelExplainer(model.predict, X_train_pre[:100])
        except Exception as e:
            print("SHAP explainer construction failed:", e)
            return
    else:
        print("Model object not suitable for SHAP")
        return

    shap_values = explainer.shap_values(X_test_pre[:200])  # limit for speed
    # summary plot
    plt.figure(figsize=(6,4))
    shap.summary_plot(shap_values, X_test_pre[:200], feature_names=feature_names, show=False)
    save_fig(f"shap_summary_{model_name}.png")
    plt.close()

    # feature importance as table
    mean_abs = np.abs(shap_values).mean(axis=0)
    fi = pd.Series(mean_abs, index=feature_names).sort_values(ascending=False)
    fi.to_csv(os.path.join(OUT_DIR, f"shap_feature_importance_{model_name}.csv"))
    print(f"Saved SHAP feature importance for {model_name}")

In [14]:
def run_full_pipeline():
    df = load_merged_df()
    df, feature_cols = preprocess_and_engineer(df, debug=True)

    # train/test split: use temporal split (no shuffle)
    n = len(df)
    test_size = int(0.2 * n)
    train_df = df.iloc[:-test_size].reset_index(drop=True)
    test_df = df.iloc[-test_size:].reset_index(drop=True)

    # Training-only EDA
    X_train = train_df[feature_cols]
    y_train = train_df['target_high']
    training_only_eda(X_train, y_train, out_prefix="train")

    # Unsupervised analysis
    unsup_models = unsupervised_analysis(train_df, feature_cols)

    # Prepare test set
    X_test = test_df[feature_cols]
    y_test = test_df['target_high']

    # Train supervised models
    models, results = train_and_evaluate_models(X_train, X_test, y_train, y_test)

    # TimeSeries CV for naive RF
    try:
        mean_rmse, std_rmse = time_series_validation(
            RandomForestRegressor(n_estimators=100),
            train_df[feature_cols],
            train_df['target_high'],
            n_splits=5
        )
        print(f"TimeSeriesCV (naive RF) RMSE: mean {mean_rmse:.4f}, std {std_rmse:.4f}")
    except Exception as e:
        print("TimeSeries CV failed:", e)

    # Hyperparameter tuning with Optuna
    print("Starting Optuna tuning (this may take a while)...")
    try:
        best_rf_params = optuna_tune_rf(X_train, y_train, n_trials=20)
        best_lgb_params = optuna_tune_lgb(X_train, y_train, n_trials=20)
    except Exception as e:
        print("Optuna tuning failed or interrupted:", e)
        best_rf_params, best_lgb_params = None, None

    # Fit tuned models
    tuned_results = {}
    if best_rf_params:
        rf_tuned = RandomForestRegressor(**best_rf_params, random_state=42, n_jobs=-1)
        imputer = SimpleImputer(strategy='median')
        scaler = StandardScaler()
        Xtr = scaler.fit_transform(imputer.fit_transform(X_train))
        rf_tuned.fit(Xtr, y_train)
        Xte = scaler.transform(imputer.transform(X_test))
        rf_pred = rf_tuned.predict(Xte)
        tuned_results['rf_tuned'] = evaluate_regression(y_test, rf_pred)
        joblib.dump(('rf_tuned', rf_tuned, imputer, scaler), os.path.join(OUT_DIR, "models", "rf_tuned.joblib"))
        print("RF tuned results:", tuned_results['rf_tuned'])
    if best_lgb_params:
        lgb_tuned = lgb.LGBMRegressor(**best_lgb_params, random_state=42)
        imputer = SimpleImputer(strategy='median')
        scaler = StandardScaler()
        Xtr = scaler.fit_transform(imputer.fit_transform(X_train))
        lgb_tuned.fit(Xtr, y_train)
        Xte = scaler.transform(imputer.transform(X_test))
        lgb_pred = lgb_tuned.predict(Xte)
        tuned_results['lgb_tuned'] = evaluate_regression(y_test, lgb_pred)
        joblib.dump(('lgb_tuned', lgb_tuned, imputer, scaler), os.path.join(OUT_DIR, "models", "lgb_tuned.joblib"))
        print("LightGBM tuned results:", tuned_results['lgb_tuned'])

    # --- Neural Network ---
    try:
        nn_model, nn_results = train_keras_nn(X_train, y_train, X_test, y_test)
        tuned_results['keras_nn'] = nn_results
    except Exception as e:
        print("Neural network training failed:", e)

    # Final evaluation: combine all results
    all_results = []
    for name, metrics in results.items():
        row = {'model': name}
        row.update(metrics)
        all_results.append(row)
    for name, metrics in tuned_results.items():
        row = {'model': name}
        row.update(metrics)
        all_results.append(row)

    res_df = pd.DataFrame(all_results).set_index('model')
    res_df.to_csv(os.path.join(OUT_DIR, "final_evaluation_results.csv"))
    print("Saved final evaluation results to outputs/final_evaluation_results.csv")

    if not res_df.empty:
        best_model_name = res_df['RMSE'].idxmin()
        print("Best model by RMSE:", best_model_name)
        model_tuple = None
        # Try to load the model object
        try:
            model_tuple = joblib.load(os.path.join(OUT_DIR, "models", f"{best_model_name}.joblib"))
        except Exception:
            try:
                model_tuple = joblib.load(os.path.join(OUT_DIR, "models", f"{best_model_name}_tuned.joblib"))
            except Exception:
                if best_model_name in models:
                    model_tuple = models[best_model_name]
        if model_tuple:
            try:
                shap_explain(model_tuple, X_train, X_test, feature_cols, model_name=best_model_name)
            except Exception as e:
                print("SHAP explanation failed:", e)
        else:
            print("Could not find model object for SHAP explanation:", best_model_name)
    print("Pipeline complete.")


In [15]:
run_full_pipeline()

Feature columns: ['open', 'high', 'low', 'close', 'volume', 'vwap', 'trade_count', 'daily_return', 'rolling_mean_5', 'rolling_std_5', 'rolling_mean_10', 'rolling_std_10', 'close_lag_1', 'close_lag_2', 'close_lag_3', 'return_lag_1', 'return_lag_2', 'return_lag_3', 'vol_lag_1', 'vol_lag_2', 'vol_lag_3', 'sentiment_lag_1', 'dayofweek', 'month']
=== Training-only EDA ===
Saved descriptive stats to outputs/train_descriptive_stats.csv
Saved figure: outputs\figs\train_corr_matrix.png
Saved figure: outputs\figs\train_dist_open.png
Saved figure: outputs\figs\train_dist_high.png
Saved figure: outputs\figs\train_dist_low.png
Saved figure: outputs\figs\train_dist_close.png
Saved figure: outputs\figs\train_dist_volume.png
Saved figure: outputs\figs\train_dist_vwap.png
Saved figure: outputs\figs\train_dist_trade_count.png
Saved figure: outputs\figs\train_dist_daily_return.png
Saved figure: outputs\figs\train_sentiment_vs_target.png
=== Unsupervised analysis ===
Saved figure: outputs\figs\unsup_pca.p

[I 2025-12-03 15:46:29,138] A new study created in memory with name: no-name-bf99741b-92a8-47ad-999e-654ac3743430


TimeSeriesCV (naive RF) RMSE: mean 6.0770, std 4.1456
Starting Optuna tuning (this may take a while)...


Best trial: 0. Best value: 4.21716:   5%|▌         | 1/20 [00:03<01:04,  3.37s/it]

[I 2025-12-03 15:46:32,513] Trial 0 finished with value: 4.217164017071421 and parameters: {'n_estimators': 479, 'max_depth': 5, 'min_samples_split': 3, 'max_features': None}. Best is trial 0 with value: 4.217164017071421.


Best trial: 0. Best value: 4.21716:  10%|█         | 2/20 [00:05<00:49,  2.77s/it]

[I 2025-12-03 15:46:34,868] Trial 1 finished with value: 4.812751227214245 and parameters: {'n_estimators': 458, 'max_depth': 17, 'min_samples_split': 2, 'max_features': 'log2'}. Best is trial 0 with value: 4.217164017071421.


Best trial: 0. Best value: 4.21716:  15%|█▌        | 3/20 [00:07<00:38,  2.25s/it]

[I 2025-12-03 15:46:36,498] Trial 2 finished with value: 4.332436814091612 and parameters: {'n_estimators': 79, 'max_depth': 7, 'min_samples_split': 9, 'max_features': 0.5}. Best is trial 0 with value: 4.217164017071421.


Best trial: 0. Best value: 4.21716:  20%|██        | 4/20 [00:09<00:35,  2.23s/it]

[I 2025-12-03 15:46:38,695] Trial 3 finished with value: 4.980240867077739 and parameters: {'n_estimators': 453, 'max_depth': 5, 'min_samples_split': 7, 'max_features': 'sqrt'}. Best is trial 0 with value: 4.217164017071421.


Best trial: 0. Best value: 4.21716:  25%|██▌       | 5/20 [00:10<00:24,  1.61s/it]

[I 2025-12-03 15:46:39,209] Trial 4 finished with value: 5.8127476798319195 and parameters: {'n_estimators': 258, 'max_depth': 3, 'min_samples_split': 9, 'max_features': 'sqrt'}. Best is trial 0 with value: 4.217164017071421.


Best trial: 0. Best value: 4.21716:  30%|███       | 6/20 [00:10<00:18,  1.30s/it]

[I 2025-12-03 15:46:39,908] Trial 5 finished with value: 4.897576301576968 and parameters: {'n_estimators': 338, 'max_depth': 8, 'min_samples_split': 2, 'max_features': 'log2'}. Best is trial 0 with value: 4.217164017071421.


Best trial: 0. Best value: 4.21716:  35%|███▌      | 7/20 [00:11<00:15,  1.22s/it]

[I 2025-12-03 15:46:40,966] Trial 6 finished with value: 4.898853409582692 and parameters: {'n_estimators': 450, 'max_depth': 3, 'min_samples_split': 5, 'max_features': None}. Best is trial 0 with value: 4.217164017071421.


Best trial: 0. Best value: 4.21716:  40%|████      | 8/20 [00:12<00:13,  1.12s/it]

[I 2025-12-03 15:46:41,855] Trial 7 finished with value: 5.009796058862405 and parameters: {'n_estimators': 453, 'max_depth': 3, 'min_samples_split': 7, 'max_features': 0.5}. Best is trial 0 with value: 4.217164017071421.


Best trial: 0. Best value: 4.21716:  45%|████▌     | 9/20 [00:12<00:09,  1.19it/s]

[I 2025-12-03 15:46:42,088] Trial 8 finished with value: 5.01367662295862 and parameters: {'n_estimators': 67, 'max_depth': 11, 'min_samples_split': 9, 'max_features': 'log2'}. Best is trial 0 with value: 4.217164017071421.


Best trial: 0. Best value: 4.21716:  50%|█████     | 10/20 [00:13<00:06,  1.52it/s]

[I 2025-12-03 15:46:42,337] Trial 9 finished with value: 4.325379211583285 and parameters: {'n_estimators': 76, 'max_depth': 17, 'min_samples_split': 2, 'max_features': 0.5}. Best is trial 0 with value: 4.217164017071421.


Best trial: 0. Best value: 4.21716:  55%|█████▌    | 11/20 [00:14<00:06,  1.38it/s]

[I 2025-12-03 15:46:43,212] Trial 10 finished with value: 4.235991569296627 and parameters: {'n_estimators': 306, 'max_depth': 13, 'min_samples_split': 4, 'max_features': None}. Best is trial 0 with value: 4.217164017071421.


Best trial: 0. Best value: 4.21716:  60%|██████    | 12/20 [00:15<00:06,  1.25it/s]

[I 2025-12-03 15:46:44,176] Trial 11 finished with value: 4.235127818267906 and parameters: {'n_estimators': 308, 'max_depth': 13, 'min_samples_split': 4, 'max_features': None}. Best is trial 0 with value: 4.217164017071421.


Best trial: 0. Best value: 4.21716:  65%|██████▌   | 13/20 [00:15<00:05,  1.32it/s]

[I 2025-12-03 15:46:44,850] Trial 12 finished with value: 4.24357368131433 and parameters: {'n_estimators': 211, 'max_depth': 13, 'min_samples_split': 4, 'max_features': None}. Best is trial 0 with value: 4.217164017071421.


Best trial: 0. Best value: 4.21716:  70%|███████   | 14/20 [00:16<00:05,  1.16it/s]

[I 2025-12-03 15:46:45,940] Trial 13 finished with value: 4.239323554635833 and parameters: {'n_estimators': 365, 'max_depth': 20, 'min_samples_split': 4, 'max_features': None}. Best is trial 0 with value: 4.217164017071421.


Best trial: 14. Best value: 4.18403:  75%|███████▌  | 15/20 [00:17<00:03,  1.28it/s]

[I 2025-12-03 15:46:46,541] Trial 14 finished with value: 4.184028665254133 and parameters: {'n_estimators': 167, 'max_depth': 10, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 4.184028665254133.


Best trial: 14. Best value: 4.18403:  80%|████████  | 16/20 [00:17<00:02,  1.42it/s]

[I 2025-12-03 15:46:47,071] Trial 15 finished with value: 4.242081076214215 and parameters: {'n_estimators': 160, 'max_depth': 9, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 4.184028665254133.


Best trial: 14. Best value: 4.18403:  85%|████████▌ | 17/20 [00:18<00:02,  1.46it/s]

[I 2025-12-03 15:46:47,707] Trial 16 finished with value: 4.2431747989145565 and parameters: {'n_estimators': 163, 'max_depth': 6, 'min_samples_split': 6, 'max_features': None}. Best is trial 14 with value: 4.184028665254133.


Best trial: 14. Best value: 4.18403:  90%|█████████ | 18/20 [00:19<00:01,  1.20it/s]

[I 2025-12-03 15:46:48,883] Trial 17 finished with value: 4.208392489693545 and parameters: {'n_estimators': 393, 'max_depth': 10, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 4.184028665254133.


Best trial: 14. Best value: 4.18403:  95%|█████████▌| 19/20 [00:20<00:00,  1.21it/s]

[I 2025-12-03 15:46:49,694] Trial 18 finished with value: 4.94626463000239 and parameters: {'n_estimators': 395, 'max_depth': 10, 'min_samples_split': 6, 'max_features': 'sqrt'}. Best is trial 14 with value: 4.184028665254133.


Best trial: 14. Best value: 4.18403: 100%|██████████| 20/20 [00:21<00:00,  1.07s/it]
[I 2025-12-03 15:46:50,466] A new study created in memory with name: no-name-3d7c8998-6bb8-4e69-a3b7-0f4f49fafeca


[I 2025-12-03 15:46:50,460] Trial 19 finished with value: 4.2301917724544005 and parameters: {'n_estimators': 239, 'max_depth': 11, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 4.184028665254133.
RF best params: {'n_estimators': 167, 'max_depth': 10, 'min_samples_split': 3, 'max_features': None}


Best trial: 0. Best value: 11.7596:   5%|▌         | 1/20 [00:00<00:02,  6.54it/s]

Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[348]	valid_0's rmse: 11.7596
[I 2025-12-03 15:46:50,615] Trial 0 finished with value: 11.759587628649983 and parameters: {'learning_rate': 0.014833210898008, 'num_leaves': 208, 'feature_fraction': 0.5842170642189065, 'bagging_fraction': 0.823132055174568}. Best is trial 0 with value: 11.759587628649983.
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[45]	valid_0's rmse: 11.6912


Best trial: 4. Best value: 11.6226:  25%|██▌       | 5/20 [00:00<00:00, 15.14it/s]

[I 2025-12-03 15:46:50,664] Trial 1 finished with value: 11.691159878983198 and parameters: {'learning_rate': 0.12760544144808889, 'num_leaves': 95, 'feature_fraction': 0.9547247330066191, 'bagging_fraction': 0.9196261898092184}. Best is trial 1 with value: 11.691159878983198.
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[101]	valid_0's rmse: 11.6314
[I 2025-12-03 15:46:50,733] Trial 2 finished with value: 11.631408688108882 and parameters: {'learning_rate': 0.05000837757864231, 'num_leaves': 208, 'feature_fraction': 0.7649020397776838, 'bagging_fraction': 0.8787621717242314}. Best is trial 2 with value: 11.631408688108882.
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[27]	valid_0's rmse: 12.4557
[I 2025-12-03 15:46:50,766] Trial 3 finished with value: 12.455650295602114 and parameters: {'learning_rate': 0.18485893077405366, 'num_leaves': 216, 'feature_fraction': 0.5985166354333797, 'b

Best trial: 8. Best value: 11.6108:  40%|████      | 8/20 [00:00<00:00, 19.28it/s]

[I 2025-12-03 15:46:50,870] Trial 5 finished with value: 11.914584013878013 and parameters: {'learning_rate': 0.16306336761248597, 'num_leaves': 62, 'feature_fraction': 0.6354340300728443, 'bagging_fraction': 0.8448011587716066}. Best is trial 4 with value: 11.62256840050383.
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[70]	valid_0's rmse: 12.5411
[I 2025-12-03 15:46:50,919] Trial 6 finished with value: 12.541085309021138 and parameters: {'learning_rate': 0.08655141487269348, 'num_leaves': 40, 'feature_fraction': 0.5116368899609258, 'bagging_fraction': 0.5500913438197688}. Best is trial 4 with value: 11.62256840050383.
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[23]	valid_0's rmse: 12.5818
[I 2025-12-03 15:46:50,950] Trial 7 finished with value: 12.581833053511803 and parameters: {'learning_rate': 0.17911071914641044, 'num_leaves': 47, 'feature_fraction': 0.5591792336884926, 'baggin

Best trial: 8. Best value: 11.6108:  50%|█████     | 10/20 [00:00<00:00, 19.14it/s]

[I 2025-12-03 15:46:51,053] Trial 9 finished with value: 11.67768112279497 and parameters: {'learning_rate': 0.09459377174886474, 'num_leaves': 236, 'feature_fraction': 0.8551808827957355, 'bagging_fraction': 0.8203530350626582}. Best is trial 8 with value: 11.61083385860292.
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[41]	valid_0's rmse: 11.8236
[I 2025-12-03 15:46:51,188] Trial 10 finished with value: 11.823629749489815 and parameters: {'learning_rate': 0.12175613100630886, 'num_leaves': 146, 'feature_fraction': 0.9812108509675936, 'bagging_fraction': 0.6798303811630049}. Best is trial 8 with value: 11.61083385860292.
Training until validation scores don't improve for 20 rounds


Best trial: 8. Best value: 11.6108:  60%|██████    | 12/20 [00:00<00:00, 13.66it/s]

Early stopping, best iteration is:
[87]	valid_0's rmse: 11.6616
[I 2025-12-03 15:46:51,290] Trial 11 finished with value: 11.661575815011384 and parameters: {'learning_rate': 0.05850027986543628, 'num_leaves': 140, 'feature_fraction': 0.7178597024829546, 'bagging_fraction': 0.9959209852240273}. Best is trial 8 with value: 11.61083385860292.
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[390]	valid_0's rmse: 11.6277


Best trial: 8. Best value: 11.6108:  70%|███████   | 14/20 [00:01<00:00, 10.48it/s]

[I 2025-12-03 15:46:51,476] Trial 12 finished with value: 11.627665703336536 and parameters: {'learning_rate': 0.01389692483546474, 'num_leaves': 16, 'feature_fraction': 0.8598539296851507, 'bagging_fraction': 0.9624090182265651}. Best is trial 8 with value: 11.61083385860292.
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[79]	valid_0's rmse: 11.8122
[I 2025-12-03 15:46:51,576] Trial 13 finished with value: 11.81215116671338 and parameters: {'learning_rate': 0.05868682263397776, 'num_leaves': 109, 'feature_fraction': 0.7127557544218852, 'bagging_fraction': 0.724819321947803}. Best is trial 8 with value: 11.61083385860292.
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[30]	valid_0's rmse: 11.615
[I 2025-12-03 15:46:51,648] Trial 14 finished with value: 11.615030582421701 and parameters: {'learning_rate': 0.14744326393615814, 'num_leaves': 174, 'feature_fraction': 0.8060608318408653, 'bagg

Best trial: 15. Best value: 11.4957:  80%|████████  | 16/20 [00:01<00:00, 11.33it/s]

Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[33]	valid_0's rmse: 11.4957
[I 2025-12-03 15:46:51,720] Trial 15 finished with value: 11.495708281441878 and parameters: {'learning_rate': 0.14671088813212468, 'num_leaves': 176, 'feature_fraction': 0.8471791083088034, 'bagging_fraction': 0.9222101588020202}. Best is trial 15 with value: 11.495708281441878.
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[40]	valid_0's rmse: 11.8013
[I 2025-12-03 15:46:51,795] Trial 16 finished with value: 11.801295652954671 and parameters: {'learning_rate': 0.13641877032403416, 'num_leaves': 174, 'feature_fraction': 0.8868667096791779, 'bagging_fraction': 0.7653193514670557}. Best is trial 15 with value: 11.495708281441878.
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[91]	valid_0's rmse: 11.7311


Best trial: 19. Best value: 11.4527: 100%|██████████| 20/20 [00:01<00:00, 12.79it/s]


[I 2025-12-03 15:46:51,881] Trial 17 finished with value: 11.7310892870008 and parameters: {'learning_rate': 0.11104510414637303, 'num_leaves': 103, 'feature_fraction': 0.9272929012110327, 'bagging_fraction': 0.9015194659964182}. Best is trial 15 with value: 11.495708281441878.
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[49]	valid_0's rmse: 12.0967
[I 2025-12-03 15:46:51,951] Trial 18 finished with value: 12.09666220425218 and parameters: {'learning_rate': 0.07921844455643015, 'num_leaves': 169, 'feature_fraction': 0.7860190053111616, 'bagging_fraction': 0.6355859676534005}. Best is trial 15 with value: 11.495708281441878.
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[40]	valid_0's rmse: 11.4527
[I 2025-12-03 15:46:52,025] Trial 19 finished with value: 11.452659667789959 and parameters: {'learning_rate': 0.1528668920931259, 'num_leaves': 124, 'feature_fraction': 0.822836806554386, 'b

100%|██████████| 100/100 [00:11<00:00,  8.99it/s]


Saved figure: outputs\figs\shap_summary_linear_regression.png
Saved SHAP feature importance for linear_regression
Pipeline complete.
