# Random Forest Model – Stock Price Prediction

## Why Random Forest as Baseline?
Random Forests aggregate hundreds of decision trees, each trained on a bootstrap
sample of the rows, with a random subset of features considered at each split.
The ensemble's majority vote (classifier) or average (regressor) substantially
reduces variance, which makes the model robust, and its **feature importances**
make it easy to interpret.

Unlike an LSTM, a Random Forest has no native notion of temporal ordering, so we
engineer lagged features and rolling statistics explicitly.

## Overfitting Prevention
- **max_depth** limits tree depth
- **min_samples_split** sets the minimum number of samples a node must hold before it may be split, which keeps trees from carving out leaves around single data points
- **TimeSeriesSplit** cross-validation avoids look-ahead bias during tuning


In [None]:
import sys
import warnings

# Put the parent directory first on the import path so the `src.*` imports
# below resolve (assumes the notebook runs one level below the project root).
sys.path.insert(0, '..')
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.data_loader import (
    fetch_stock_data, time_series_split, scale_features
)
from src.sentiment_analyzer import add_sentiment_to_df
from src.model_trainer import (
    train_random_forest_regressor,
    train_random_forest_classifier
)
from src.evaluator import (
    regression_metrics, classification_metrics,
    plot_predictions, plot_feature_importance,
    plot_confusion_matrix, sharpe_ratio, max_drawdown
)

# IPython magic: show figures in the cell output. Only valid inside an
# IPython/Jupyter session, not in a plain Python interpreter.
%matplotlib inline
# NOTE(review): the 'seaborn-v0_8-*' style names exist only in Matplotlib
# releases that ship them (3.6+, to our knowledge) — confirm the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
print('Setup complete')

In [None]:
# ── Config ────────────────────────────────────────────────────────────────────
# Symbol and date range handed to the data and sentiment loaders.
TICKER, START, END = 'AAPL', '2015-01-01', '2024-12-31'

# Candidate model inputs, grouped by column family. 'Close' is not listed; it
# is used as the target further down. Names that are absent from the loaded
# DataFrame are filtered out after loading, so this is an upper bound on the
# features actually used.
FEATURE_COLS = (
    ['Open', 'High', 'Low', 'Volume']                              # price bars and volume
    + ['SMA_10', 'SMA_20', 'SMA_50']                               # SMA_* columns
    + ['RSI_14', 'RSI_7', 'MACD', 'MACD_Signal', 'MACD_Hist']      # RSI_* / MACD* columns
    + ['BB_Width', 'ATR_14', 'Vol_Change', 'OBV']                  # range / volume-derived columns
    + ['Log_Return', 'Pct_Change', 'Sentiment']                    # return columns and sentiment
    + ['Close_Lag_1', 'Close_Lag_2', 'Close_Lag_3']                # short Close lags
    + ['Close_Lag_5', 'Close_Lag_10']                              # longer Close lags
)

In [None]:
# ── 1. Load & Split ───────────────────────────────────────────────────────────
# Load the data for TICKER over [START, END], then pass it through
# add_sentiment_to_df (presumably adds the 'Sentiment' column — confirm in
# src.sentiment_analyzer).
df = fetch_stock_data(TICKER, START, END)
df = add_sentiment_to_df(df, TICKER, START, END)

# Keep only the configured features that are present in the loaded frame, and
# report any that are missing so a silently shrunken feature set is visible.
feature_cols = [c for c in FEATURE_COLS if c in df.columns]
missing_cols = [c for c in FEATURE_COLS if c not in df.columns]
if missing_cols:
    print(f'Skipping {len(missing_cols)} feature(s) not found in data: {missing_cols}')

# 0.80 is presumably the fraction of rows assigned to the training split —
# confirm against time_series_split in src.data_loader.
train_df, test_df = time_series_split(df, 0.80)
print(f'Train: {len(train_df)} | Test: {len(test_df)}')

In [None]:
# ── 2. Prepare X / y (no sequences needed for RF) ────────────────────────────
# Regression inputs: every row's feature vector is paired with the 'Close'
# value from that same row.
# NOTE(review): columns such as 'Open'/'High'/'Low' share a row with the target
# — confirm upstream that this pairing is intended and does not leak the answer.
X_train, y_train = train_df[feature_cols].values, train_df['Close'].values
X_test,  y_test  = test_df[feature_cols].values,  test_df['Close'].values

# Classifier labels: row t gets 1 when Close rises from row t to row t+1, else 0.
# The last row has no successor, so labels and features are both one row shorter.
y_train_dir = (np.diff(y_train, axis=0) > 0).astype(int)
y_test_dir  = (np.diff(y_test, axis=0) > 0).astype(int)
X_train_dir, X_test_dir = X_train[:-1], X_test[:-1]

print(f'X_train: {X_train.shape} | X_test: {X_test.shape}')

In [None]:
# ── 3a. Regression RF ─────────────────────────────────────────────────────────
# Fit the regressor on the training split and persist it to ../results.
# tune=True runs GridSearchCV with TimeSeriesSplit (takes a few minutes).
reg_fit_options = {
    'tune': True,
    'save_path': '../results/rf_regressor.pkl',
}
rf_reg = train_random_forest_regressor(X_train, y_train, **reg_fit_options)

# Predict the test rows and compute the regression metrics under the
# 'RF Regressor' label.
y_pred_reg  = rf_reg.predict(X_test)
metrics_reg = regression_metrics(y_test, y_pred_reg, 'RF Regressor')

In [None]:
# ── 3b. Classification RF ─────────────────────────────────────────────────────
# Fit the up/down classifier on the direction labels; tuning is switched off
# here, unlike the regressor above.
clf_fit_options = {
    'tune': False,
    'save_path': '../results/rf_classifier.pkl',
}
rf_clf = train_random_forest_classifier(X_train_dir, y_train_dir, **clf_fit_options)

# Predict direction for the test rows and compute the classification metrics
# under the 'RF Classifier' label.
y_pred_clf  = rf_clf.predict(X_test_dir)
metrics_clf = classification_metrics(y_test_dir, y_pred_clf, 'RF Classifier')

In [None]:
# ── 4. Prediction Plot (Regression) ──────────────────────────────────────────
# Hand the actual and predicted test-set Close values to plot_predictions,
# with the test frame's index supplied as the dates.
plot_predictions(
    y_test,
    y_pred_reg,
    label='RF Regressor',
    dates=test_df.index,
)

In [None]:
# ── 5. Feature Importance ─────────────────────────────────────────────────────
# feature_cols is the same list (and order) used to build X_train, so it
# matches the columns the regressor was fitted on. top_n is set to 15.
plot_feature_importance(
    rf_reg,
    feature_cols,
    top_n=15,
)

In [None]:
# ── 6. Confusion Matrix (Classifier) ─────────────────────────────────────────
# Compare the true test-set direction labels with the classifier's predictions.
plot_confusion_matrix(
    y_test_dir,
    y_pred_clf,
    'RF Classifier',
)

In [None]:
# ── 7. Finance Metrics ────────────────────────────────────────────────────────
# Back-test a simple long/short rule driven by the regressor's predictions.
#
# actual_rets[i] : realised return from test row i to row i+1
#                  (the 1e-10 term guards against division by a zero price).
# signals[i]     : +1 when the prediction rises from row i to row i+1, else -1.
#                  It depends on the prediction for row i+1, which is made from
#                  row i+1's features, so it is only known once row i+1 exists.
actual_rets  = np.diff(y_test) / (y_test[:-1] + 1e-10)
signals      = np.where(np.diff(y_pred_reg) > 0, 1, -1)

# Apply each signal to the *next* interval's return. Multiplying signals[i] by
# actual_rets[i] would trade the i -> i+1 move using information from row i+1,
# i.e. look-ahead bias that inflates the Sharpe ratio.
n = max(min(len(signals), len(actual_rets)) - 1, 0)
strat_rets   = signals[:n] * actual_rets[1:n + 1]

print('\n── RF Strategy Finance Metrics ─────────────')
sr  = sharpe_ratio(strat_rets)
# max_drawdown receives the strategy's cumulative growth curve (starting from 1).
mdd = max_drawdown(np.cumprod(1 + strat_rets))

In [None]:
# ── 8. Save metrics ───────────────────────────────────────────────────────────
import json
import os

# Merge both metric dicts with the finance metrics into one flat record.
# On duplicate keys the later dict wins, so a key shared by metrics_reg and
# metrics_clf keeps the classifier's value.
# NOTE(review): confirm the two dicts use distinct key names.
combined = {**metrics_reg, **metrics_clf,
            'Sharpe': round(sr, 4), 'MaxDrawdown': round(mdd, 4)}

metrics_path = '../results/rf_metrics.json'
# Make sure the output directory exists so a fresh checkout does not fail here.
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump(combined, f, indent=2)
print(f'Saved → {metrics_path}')