In [None]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
import pandas as pd
import numpy as np
import pickle
import os
import matplotlib.pyplot as plt
from src.stock_features import apply_kalman_filter_with_lag, create_target_variable

In [None]:
# ================================================================
# Step 1: Define Parameters
# ================================================================
# Path to your processed data file
data_path = r'C:\Users\epoch_bpjmdqk\Documents\Code\data\processed\stock_and_macro.csv'

# Path to the saved model from the fine-tuning stage
model_path = r'C:\Users\epoch_bpjmdqk\Documents\Code\models\randomforest_w10_t0.005.pkl'

# Define the data preparation parameters that the model was trained on.
# NOTE(review): the model file name above (…_w10_t0.005) appears to encode
# window=10 and threshold=0.005. `threshold` was previously 0.05, which did
# not match the loaded model — confirm against the training run.
target_ticker = 'WMT'
window = 10
threshold = 0.005

# Define the backtesting period split (first date included in the backtest)
backtest_start_date = '2021-01-01'

# Define real-world trading costs and starting balance
transaction_cost_per_trade = 0.001  # 0.1% transaction cost
slippage_factor = 0.0005  # 0.05% slippage

# Starting capital for the backtest
starting_balance = 10000.0

# This is the target percentage of your portfolio to allocate to the stock when
# the model predicts a price increase (1).
allocation_percentage = 0.50  # 50% allocation

In [None]:
# ================================================================
# Step 2: Load the Data and the Trained Model
# ================================================================
# Reads the processed CSV (indexed by its 'Date' column, parsed as dates)
# and unpickles the fitted model. Both `data` and `best_model` are used by
# the cells below.
try:
    # Load the processed data
    data = pd.read_csv(data_path, index_col='Date', parse_dates=True)

    # Load the trained model from the pickle file.
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only point model_path at model files you produced or trust.
    with open(model_path, 'rb') as file:
        best_model = pickle.load(file)

    print("✅ Successfully loaded data and the best model.")

except FileNotFoundError:
    print("Error: Could not find one of the files. Please check paths:")
    print(f"Data path: {data_path}")
    print(f"Model path: {model_path}")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down (losing all state), whereas an exception simply
    # stops this cell / "Run All" and keeps the traceback visible.
    raise

In [None]:
# ================================================================
# Step 3: Data Preparation for Backtesting
# ================================================================
# Names of the label columns; they are excluded from the model inputs below.
target_col_name = f'{target_ticker}_Target'
target_return_col_name = f'{target_ticker}_target_return_{window}D_{threshold}'

# Run the Kalman-filter helper (from src.stock_features) for the target ticker
lags_to_use = [1, 5, 10]
tickers_to_filter = [target_ticker]
processed_data_with_kalman = apply_kalman_filter_with_lag(data, tickers_to_filter, lags_to_use)

# Build the target variable on a copy so the filtered frame stays untouched,
# then discard every row that still contains a NaN.
data_with_target = create_target_variable(
    processed_data_with_kalman.copy(),
    target_ticker,
    window=window,
    threshold=threshold,
)
data_with_target = data_with_target.dropna()

# Label columns plus the raw OHLC prices of the target ticker are not features
columns_to_drop = [target_col_name, target_return_col_name] + [
    f'{price_field}_{target_ticker}' for price_field in ('Open', 'High', 'Low', 'Close')
]

# Everything from the fixed start date onwards forms the backtest window
backtest_data = data_with_target.loc[backtest_start_date:].copy()
X_backtest = backtest_data.drop(columns=columns_to_drop, errors='ignore')
y_backtest = backtest_data[target_col_name]

# Feature names, in the column order handed to the model
features = X_backtest.columns

print(f"\n--- Backtesting on unseen data from {backtest_start_date} to {backtest_data.index[-1].strftime('%Y-%m-%d')} ---")

# Predict over the backtest window and attach the result as a 'Prediction' column
backtest_predictions = best_model.predict(X_backtest)
backtest_predictions_df = pd.Series(
    backtest_predictions,
    index=X_backtest.index,
    name='Prediction',
)
backtest_data['Prediction'] = backtest_predictions_df

In [None]:
# --- Step 4: Account Simulation ---
# Walks the backtest rows in order, trading whole shares at each row's close
# price so that the position moves toward `allocation_percentage` of the
# account when the prediction is 1 and toward zero otherwise. Slippage is
# applied to the execution price and a proportional transaction cost is
# charged on every executed trade. The account value after each row is
# collected in `account_history` and exposed as `account_df['Balance']`.
balance = starting_balance
shares_held = 0
account_history = []
buy_trades = 0
sell_trades = 0

# Count the number of signals for diagnostic purposes
buy_signals = (backtest_data['Prediction'] == 1).sum()
sell_signals = (backtest_data['Prediction'] == 0).sum()

for date, row in backtest_data.iterrows():
    # Get today's close price and model prediction
    current_price = row[f'Close_{target_ticker}']
    prediction = row['Prediction']

    # --- CONTINUOUS ALLOCATION TRADING LOGIC ---
    current_position_value = shares_held * current_price
    current_account_value = balance + current_position_value

    if prediction == 1:
        # If model predicts a rise, target is the defined allocation percentage
        target_position_value = current_account_value * allocation_percentage
    else:
        # If model predicts no rise, target is 0 (move to cash)
        target_position_value = 0

    # Execute trades to get to the target position
    if target_position_value > current_position_value:
        # We need to buy
        amount_to_buy = target_position_value - current_position_value
        shares_to_buy = amount_to_buy / (current_price * (1 + slippage_factor))

        # Only trade if we can buy at least one share
        if shares_to_buy >= 1:
            shares_bought = int(shares_to_buy)
            cost_of_trade = shares_bought * current_price * (1 + slippage_factor)
            commission = cost_of_trade * transaction_cost_per_trade

            # The cash check must cover the commission as well as the shares;
            # checking the share cost alone could leave a negative balance
            # when the allocation is close to 100%.
            if balance >= cost_of_trade + commission:
                balance -= cost_of_trade
                balance -= commission
                shares_held += shares_bought
                buy_trades += 1

    elif target_position_value < current_position_value:
        # We need to sell
        amount_to_sell = current_position_value - target_position_value
        shares_to_sell = amount_to_sell / (current_price * (1 - slippage_factor))

        # Only trade if we can sell at least one share
        if shares_to_sell >= 1:
            # Dividing by the slippage-reduced price can push the count above
            # the shares actually held (e.g. when exiting a large position).
            # Cap it at the holding instead of skipping the sale, otherwise
            # such a position could never be closed.
            shares_sold = min(int(shares_to_sell), shares_held)

            proceeds = shares_sold * current_price * (1 - slippage_factor)
            balance += proceeds
            balance -= proceeds * transaction_cost_per_trade
            shares_held -= shares_sold
            sell_trades += 1

    # Record the current account value (cash + position) for plotting
    current_account_value = balance + (shares_held * current_price)
    account_history.append(current_account_value)

# Create a DataFrame for the account history; despite the column name,
# 'Balance' holds the total account value (cash plus shares) per row.
account_df = pd.DataFrame({
    'Balance': account_history
}, index=backtest_data.index)

In [None]:
# ================================================================
# Step 5: Create a Buy-and-Hold Benchmark
# ================================================================
# Slice the raw data over the same date span the strategy was simulated on.
# `backtest_data` had its NaN rows dropped, so it can end before the raw
# data does; without the explicit end date the benchmark's final value
# would be taken on a later day than the strategy's final value.
backtest_end_date = backtest_data.index[-1]
buy_and_hold_data = data.loc[backtest_start_date:backtest_end_date].copy()

close_col = f'Close_{target_ticker}'
entry_price = buy_and_hold_data[close_col].iloc[0]

# Buy as many whole shares as possible at the first close of the window;
# whatever cannot be invested stays as cash for the entire period.
buy_and_hold_shares = int(starting_balance / entry_price)
buy_and_hold_initial_cash = starting_balance - (buy_and_hold_shares * entry_price)
# Calculate the daily balance by multiplying shares by the daily close price and adding initial cash
buy_and_hold_balance = buy_and_hold_data[close_col] * buy_and_hold_shares + buy_and_hold_initial_cash


In [None]:
# ================================================================
# Step 6: Calculate and Visualize Backtesting Performance
# ================================================================
# Calculate final metrics. Each series is annualized over its own number of
# observations (252 rows per year assumed), so the benchmark figure stays
# correct even if its length differs from the strategy's.
total_strategy_return = (account_df['Balance'].iloc[-1] / starting_balance) - 1
total_buy_and_hold_return = (buy_and_hold_balance.iloc[-1] / starting_balance) - 1
annualized_strategy_return = (1 + total_strategy_return)**(252 / len(account_df)) - 1
annualized_buy_and_hold_return = (1 + total_buy_and_hold_return)**(252 / len(buy_and_hold_balance)) - 1

# Display final results
print("\n--- Backtesting Results (with Real-World Costs & Continuous Allocation) ---")
print(f"Starting Balance: ${starting_balance:.2f}")

print("\n--- Buy-and-Hold Diagnostic ---")
print(f"Buy-and-Hold Total Return: {total_buy_and_hold_return:.2%}")

print("\n--- Strategy Results ---")
print(f"Final Strategy Balance: ${account_df['Balance'].iloc[-1]:.2f}")
print(f"Strategy Total Return: {total_strategy_return:.2%}")
print(f"Strategy Annualized Return: {annualized_strategy_return:.2%}")
print(f"Total Buy Trades: {buy_trades}")
print(f"Total Sell Trades: {sell_trades}")
print(f"Total Trades: {buy_trades + sell_trades}")

print("\n--- Overall Performance ---")
print(f"Buy-and-Hold Annualized Return: {annualized_buy_and_hold_return:.2%}")

# --- Extract and display top features ---
# Only models exposing `feature_importances_` are reported; the importances
# are labelled with the backtest feature columns.
if hasattr(best_model, 'feature_importances_'):
    importances = pd.Series(best_model.feature_importances_, index=features)
    top_features = importances.nlargest(10)
    print("\n--- Top 10 Most Important Features ---")
    print(top_features)

# Create the plot. The style sheet is selected before the figure is created
# so that its settings apply to the whole figure, not just later artists.
plt.style.use('seaborn-v0_8-darkgrid')
plt.figure(figsize=(12, 8))
plt.plot(buy_and_hold_balance, label='Buy and Hold', color='skyblue', linewidth=2)
plt.plot(account_df['Balance'], label='RandomForest Strategy', color='red', linewidth=2)
plt.title(f'Backtest: RandomForest Strategy vs. Buy and Hold ({target_ticker})', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Account Balance ($)', fontsize=12)
plt.legend(fontsize=10)
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()

