# Baseline Models: Logistic Regression & XGBoost
Engineer features and benchmark classic ML models (logistic regression and XGBoost) at predicting the direction of the next day's return.

In [None]:
# Feature engineering
from src.features import add_features

# Columns of the engineered table that are fed to the models.
feature_cols = ['log_return', 'ma_10', 'ma_20', 'vol_20', 'rsi_14']

# Build the feature table and discard any row with a missing value.
features = add_features(df)
features = features.dropna()

# Target: 1 when the NEXT row's log return is positive, otherwise 0.
next_return = features['log_return'].shift(-1)

# The final row has no next-row return (shift(-1) yields NaN there), and
# `NaN > 0` evaluates to False -- which would silently label that row as a
# "down" day. Exclude rows without a known target from both X and y so the
# two stay aligned and no fabricated label reaches training or evaluation.
has_target = next_return.notna()
X = features.loc[has_target, feature_cols]
y = (next_return[has_target] > 0).astype(int)

In [None]:
# Walk-forward split and model training
from src.models import BaselineModels
from src.utils import time_series_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# The same BaselineModels instance has fit() called on it in every fold.
bm = BaselineModels()
splits = time_series_split(X, y, n_splits=5)
for fold, (train_idx, test_idx) in enumerate(splits):
    # Positional selection of this fold's train / test rows.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    bm.fit(X_train, y_train)
    lr_pred, xgb_pred = bm.predict(X_test)

    # Report the same three metrics for both models, logistic first.
    for label, pred in (("Logistic", lr_pred), ("XGBoost", xgb_pred)):
        acc = accuracy_score(y_test, pred)
        f1 = f1_score(y_test, pred)
        # NOTE(review): roc_auc_score takes scores/probabilities as y_score;
        # what bm.predict returns is not visible here -- if these are hard
        # class labels, the AUC reflects a single threshold only. Confirm.
        auc = roc_auc_score(y_test, pred)
        print(f"Fold {fold+1} {label}: Acc={acc:.3f}, F1={f1:.3f}, ROC-AUC={auc:.3f}")

## Assumptions
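- Features at row t are assumed to use only information available up to that row (i.e., computable in real time); the target is the sign of the next row's log return.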
- Only time-series splits are used (no random shuffle).
- No data snooping or lookahead bias.

# Baseline Model Comparison for Stock Prediction
This notebook demonstrates feature engineering, baseline model training, walk-forward validation, and realistic backtesting for a single stock (AAPL).