1. This program takes the combined trade and sentiments data as input:
a. aapl_trading_sentiment_data_all_days_counterfactual.csv and
b. aapl_trading_sentiment_data_all_days_RefPaper.csv
2. Sets up the stock trading environment using libraries from https://github.com/benstaf/FinRL_DeepSeek.git
3. Trains agents based on data from Counterfactual prompting approach and the Reference Paper's prompting approach
4. Peforms back testing and evaluates both the trading agents
5. Compares the performance of the agents     

In [1]:
# Step 0: Prerequisites & Setup
# -------------------------------------------------
from google.colab import drive
drive.mount('/content/drive')

!pip install finrl yfinance stockstats gymnasium stable_baselines3 alpaca-trade-api exchange_calendars wrds matplotlib pandas scikit-learn ta
%matplotlib inline

# Clone repo and set paths
!git clone https://github.com/benstaf/FinRL_DeepSeek.git
%cd /content/FinRL_DeepSeek
import sys
sys.path.append('/content/FinRL_DeepSeek')

Mounted at /content/drive
Collecting finrl
  Downloading FinRL-0.3.7-py3-none-any.whl.metadata (909 bytes)
Collecting stockstats
  Downloading stockstats-0.6.4-py2.py3-none-any.whl.metadata (39 kB)
Collecting stable_baselines3
  Downloading stable_baselines3-2.6.0-py3-none-any.whl.metadata (4.8 kB)
Collecting alpaca-trade-api
  Downloading alpaca_trade_api-3.2.0-py3-none-any.whl.metadata (29 kB)
Collecting exchange_calendars
  Downloading exchange_calendars-4.10-py3-none-any.whl.metadata (37 kB)
Collecting wrds
  Downloading wrds-3.3.0-py3-none-any.whl.metadata (5.7 kB)
Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting urllib3<2,>1.24 (from alpaca-trade-api)
  Downloading urllib3-1.26.20-py2.py3-none-any.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.1/50.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting websockets<11,>=9.0 (from alpaca-trade-api)
  Downloading websocket

In [2]:
# Step 1: Create directory to persist models
# -------------------------------------------------


import os

MODEL_DIR = "/content/drive/MyDrive/finrl_models"
os.makedirs(MODEL_DIR, exist_ok=True)


In [3]:
# -------------------------------------------------
# Step 2: Data Loading and Preparation
# -------------------------------------------------
import pandas as pd
import numpy as np
from finrl.meta.preprocessor.preprocessors import data_split
import itertools

def load_and_prepare_data(filepath):
    """Load and prepare dataset for training"""
    df = pd.read_csv(filepath)
    df['date'] = pd.to_datetime(df['date']).dt.normalize()

    # Drop unwanted columns
    df = df.drop(columns=[col for col in df.columns if 'Unnamed:' in col or col.endswith('_y')])
    df.columns = [col.replace('_x', '') for col in df.columns]

    # Forward fill missing values
    list_ticker = df["tic"].unique().tolist()
    list_date = pd.date_range(start=df['date'].min(), end=df['date'].max())
    combination = list(itertools.product(list_date, list_ticker))

    processed_full = pd.DataFrame(combination, columns=["date", "tic"])
    processed_full['date'] = pd.to_datetime(processed_full['date']).dt.normalize()
    processed_full = processed_full.merge(df, on=["date", "tic"], how="left")
    processed_full = processed_full.sort_values(by=["tic", "date"]).ffill()

    return processed_full

# Load both datasets
counterfactual_df = load_and_prepare_data('/content/Counterfactual_aapl_trading_sentiment_data_all_days.csv')
refpaper_df = load_and_prepare_data('/content/aapl_trading_sentiment_data_all_days_RefPaper.csv')

# Split into train/trade periods
TRAIN_START_DATE = '2022-06-03'
TRAIN_END_DATE = '2023-06-30'
TRADE_START_DATE = '2023-07-01'
TRADE_END_DATE = '2023-12-16'

def split_data(df):
    train_df = data_split(df, TRAIN_START_DATE, TRAIN_END_DATE)
    trade_df = data_split(df, TRADE_START_DATE, TRADE_END_DATE)
    return train_df, trade_df

counterfactual_train, counterfactual_trade = split_data(counterfactual_df)
refpaper_train, refpaper_trade = split_data(refpaper_df)

In [4]:
# -------------------------------------------------
# Step 3: Environment Setup
# -------------------------------------------------
from env_stocktrading import StockTradingEnv
from finrl.config import INDICATORS
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
import gymnasium as gym

def create_env(df, state_space=11):  # Correct observation space size
    """Create trading environment"""
    env = StockTradingEnv(
        df=df,
        stock_dim=1,
        num_stock_shares=[100],
        buy_cost_pct=[0.001],
        sell_cost_pct=[0.001],
        hmax=100,
        initial_amount=1_000_000,
        reward_scaling=1e-4,
        state_space=state_space,
        action_space=1,
        tech_indicator_list=INDICATORS,
        risk_indicator_col='sentiment'
    )
    print("lenght of INDICATORS" , len(INDICATORS))
    return DummyVecEnv([lambda: env])

# Create environments with correct observation size
counterfactual_train_env = create_env(counterfactual_train)
refpaper_train_env = create_env(refpaper_train)

from finrl.config import INDICATORS
print(f"Number of indicators: {len(INDICATORS)}")  # e.g., 5
state_space_size=11
#state_space_sizeprint(f"State space size : {len(state_space_size)}")
#state_space_size = 4 (price) + 1 (holdings) + len(INDICATORS) + 1 #(sentiment if used)
#print(f"Calculated state space size: {state_space_size}")"""

lenght of INDICATORS 8
lenght of INDICATORS 8
Number of indicators: 8


In [5]:
# -------------------------------------------------
# Step 4: Train Both Agents
# -------------------------------------------------
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback
import os

# Make sure your checkpoint folder exists
os.makedirs("/content/checkpoints/", exist_ok=True)

def train_agent(env, model_name="ppo", total_timesteps=200_000):
    """Train a trading agent with periodic checkpointing."""
    # 1. Create the checkpoint callback
    checkpoint_callback = CheckpointCallback(
        save_freq=50_000,
        save_path="/content/checkpoints/",
        name_prefix=model_name
    )

    # 2. Instantiate the PPO model
    model = PPO(
        "MlpPolicy",
        env=env,
        seed=42,
        verbose=1,
        policy_kwargs={
            "net_arch": [{"pi": [64, 64], "vf": [64, 64]}]
        }
    )

    # 3. Train, *passing* the callback into learn()
    model.learn(
        total_timesteps=total_timesteps,
        callback=checkpoint_callback
    )

    return model

print("Training Counterfactual Agent…")
counterfactual_model = train_agent(counterfactual_train_env, model_name="ppo_counterfactual")
counterfactual_model.save("/content/counterfactual_trading_model")

print("\nTraining RefPaper Agent…")
refpaper_model = train_agent(refpaper_train_env, model_name="ppo_refpaper")
refpaper_model.save("/content/refpaper_trading_model")


Training Counterfactual Agent…
Using cuda device




-----------------------------
| time/              |      |
|    fps             | 541  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
day: 391, episode: 10
begin_total_asset: 1014342.96
end_total_asset: 1047200.94
total_reward: 32857.98
total_cost: 3850.27
total_trades: 386
Sharpe: 0.517
-------------------------------------------
| time/                   |               |
|    fps                  | 414           |
|    iterations           | 2             |
|    time_elapsed         | 9             |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.00043162014 |
|    clip_fraction        | 0.0108        |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.42         |
|    explained_variance   | -0.0194       |
|    learning_rate        | 0.0003        |
|    loss                 | 0.244         |
|    n_updates     



-----------------------------
| time/              |      |
|    fps             | 630  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
day: 391, episode: 10
begin_total_asset: 1014342.96
end_total_asset: 1047200.94
total_reward: 32857.98
total_cost: 3850.27
total_trades: 386
Sharpe: 0.517
-------------------------------------------
| time/                   |               |
|    fps                  | 470           |
|    iterations           | 2             |
|    time_elapsed         | 8             |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.00043162014 |
|    clip_fraction        | 0.0108        |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.42         |
|    explained_variance   | -0.0194       |
|    learning_rate        | 0.0003        |
|    loss                 | 0.244         |
|    n_updates     

In [6]:
# -------------------------------------------------
# Step 5: Save Both Agents
# -------------------------------------------------
# After training Counterfactual
counterfactual_model.save(f"{MODEL_DIR}/ppo_counterfactual_latest.zip")

# After training RefPaper
refpaper_model.save(f"{MODEL_DIR}/ppo_refpaper_latest.zip")



In [7]:
# -------------------------------------------------
# Step 5 b: Save Both Agents as .pth files
# -------------------------------------------------
# After training Counterfactual
import torch

# Save Counterfactual model policy as .pth
torch.save(counterfactual_model.policy.state_dict(), f"{MODEL_DIR}/ppo_counterfactual_policy.pth")

# Save RefPaper model policy as .pth
torch.save(refpaper_model.policy.state_dict(), f"{MODEL_DIR}/ppo_refpaper_policy.pth")


In [8]:
 # -------------------------------------------------
# Step 6: Backtesting and Evaluation - Sharpe ratio,
# total return, annual return, annual volatility,
# maximum drawdown and win rate
# -------------------------------------------------
def calculate_metrics(df_account_value):
    """Calculate performance metrics"""
    df_account_value['daily_return'] = df_account_value['account_value'].pct_change(fill_method=None)
    daily_returns = df_account_value['daily_return'].dropna()

    # Basic metrics
    total_return = df_account_value['account_value'].iloc[-1] / df_account_value['account_value'].iloc[0] - 1
    annual_return = np.mean(daily_returns) * 252
    annual_volatility = np.std(daily_returns) * np.sqrt(252)
    sharpe_ratio = annual_return / annual_volatility if annual_volatility != 0 else 0

    # Drawdown calculations
    cumulative_returns = (1 + daily_returns).cumprod()
    peak = cumulative_returns.cummax()
    drawdown = (peak - cumulative_returns) / peak
    max_drawdown = drawdown.max()

    # Win rate
    win_rate = (daily_returns > 0).mean()

    return {
        'Total Return': total_return,
        'Annual Return': annual_return,
        'Annual Volatility': annual_volatility,
        'Sharpe Ratio': sharpe_ratio,
        'Max Drawdown': max_drawdown,
        'Win Rate': win_rate
    }

def evaluate_agent(model, trade_df):
    """Evaluate agent performance"""
    env = StockTradingEnv(
        df=trade_df,
        stock_dim=1,
        num_stock_shares=[100],
        buy_cost_pct=[0.001],
        sell_cost_pct=[0.001],
        hmax=100,
        initial_amount=1_000_000,
        reward_scaling=1e-4,
        state_space=11,
        action_space=1,
        tech_indicator_list=INDICATORS,
        risk_indicator_col='sentiment'
    )

    obs, _ = env.reset()
    account_values = [env.initial_amount]
    dates = [trade_df.iloc[0]['date']]

    for i in range(len(trade_df)-1):
        action, _ = model.predict(obs)
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Get current account value directly from the environment
#        current_account_value = env.total_asset
# replaced from below line
# Get current account value from the info dict (fallback to internal memory)
        current_account_value = info.get('total_asset', None)
        if current_account_value is None:
# as a backup, grab the last recorded value
          current_account_value = env.asset_memory[-1]  # or env.state[0]
# replaced till above line
        account_values.append(current_account_value)
        dates.append(trade_df.iloc[i+1]['date'])

        if done:
            break

    df_account_value = pd.DataFrame({'date': dates, 'account_value': account_values})
    perf_metrics = calculate_metrics(df_account_value)

    return df_account_value, perf_metrics

print("\nEvaluating Counterfactual Agent...")
cf_account_value, cf_metrics = evaluate_agent(counterfactual_model, counterfactual_trade)

print("\nEvaluating RefPaper Agent...")
rp_account_value, rp_metrics = evaluate_agent(refpaper_model, refpaper_trade)


Evaluating Counterfactual Agent...

Evaluating RefPaper Agent...


In [9]:
# -------------------------------------------------
# Step 7: Backtesting and Evaluation - Rachev Ratio Calculation
# -------------------------------------------------
import numpy as np

# Choose your tail‐probability (e.g. 5%)
alpha = 0.05

# Assuming you already have cf_account_value and rp_account_value from evaluate_agent()
for name, df in [("Counterfactual", cf_account_value), ("RefPaper", rp_account_value)]:
    # 1) Compute daily returns
    daily_returns = df['account_value'].pct_change().dropna()

    # 2) Compute cutoffs
    q_low  = daily_returns.quantile(alpha)
    q_high = daily_returns.quantile(1 - alpha)

    # 3) Extract tails
    lower_tail = daily_returns[daily_returns <= q_low]
    upper_tail = daily_returns[daily_returns >= q_high]

    # 4) Expected Tail Loss (ETL) and Expected Tail Gain (ETG)
    etl = abs(lower_tail.mean())
    etg = upper_tail.mean()

    # 5) Rachev ratio (guarding against zero ETL)
    rachev = etg / etl if etl != 0 else np.nan

    print(f"Rachev Ratio (α={alpha}) for {name} Agent: {rachev:.4f}")


Rachev Ratio (α=0.05) for Counterfactual Agent: 0.9316
Rachev Ratio (α=0.05) for RefPaper Agent: 0.9042


In [10]:


# -------------------------------------------------
# Step 8: Comparison and Visualization
# -------------------------------------------------
import matplotlib.pyplot as plt

# Combine results for comparison
results_df = pd.DataFrame({
    'Metric': list(cf_metrics.keys()),
    'Counterfactual': list(cf_metrics.values()),
    'RefPaper': list(rp_metrics.values())
})

print("\n=== Performance Comparison ===")
print(results_df.to_string(index=False))

# Plot account value growth
plt.figure(figsize=(12, 6))
plt.plot(cf_account_value['date'], cf_account_value['account_value'], label='Counterfactual Agent')
plt.plot(rp_account_value['date'], rp_account_value['account_value'], label='RefPaper Agent')
plt.title('Account Value Growth Comparison')
plt.xlabel('Date')
plt.ylabel('Account Value ($)')
plt.legend()
plt.grid()
plt.show()


=== Performance Comparison ===
           Metric  Counterfactual  RefPaper
     Total Return        0.078088  0.072370
    Annual Return        0.120755  0.113106
Annual Volatility        0.120535  0.123613
     Sharpe Ratio        1.001830  0.915002
     Max Drawdown        0.098987  0.101649
         Win Rate        0.395210  0.395210


In [11]:

# -------------------------------------------------
# Load PPO model from .pth and evaluate Counterfactual Agent
# -------------------------------------------------
import torch
from stable_baselines3 import PPO


# Rebuild the environment used during training
env = StockTradingEnv(
    df=counterfactual_trade,
    stock_dim=1,
    num_stock_shares=[100],
    buy_cost_pct=[0.001],
    sell_cost_pct=[0.001],
    hmax=100,
    initial_amount=1_000_000,
    reward_scaling=1e-4,
    state_space=11,
    action_space=1,
    tech_indicator_list=INDICATORS,
    risk_indicator_col='sentiment'
)

# Recreate the PPO model (same architecture as training)
model_from_pth = PPO("MlpPolicy", env=env, verbose=0, seed=42)

# Load policy weights from .pth file
model_from_pth.policy.load_state_dict(torch.load("/content/ppo_counterfactual_policy.pth"))

# Evaluate Counterfactual agent loaded from .pth
print("\nEvaluating Counterfactual Agent (from .pth)...")
cf_account_value, cf_metrics = evaluate_agent(model_from_pth, counterfactual_trade)

# Display performance metrics
import pandas as pd
results_df = pd.DataFrame({
    'Metric': list(cf_metrics.keys()),
    'Counterfactual (.pth)': list(cf_metrics.values())
})
print("\n=== Performance Metrics from .pth ===")
print(results_df.to_string(index=False))

# Plot performance
import matplotlib.pyplot as plt
cf_account_value['date'] = pd.to_datetime(cf_account_value['date'])

plt.figure(figsize=(12, 6))
plt.plot(cf_account_value['date'], cf_account_value['account_value'], label='Counterfactual Agent (.pth)', linewidth=2)
plt.title('Counterfactual Agent Performance (Loaded from .pth)')
plt.xlabel('Date')
plt.ylabel('Account Value ($)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()





Evaluating Counterfactual Agent (from .pth)...

=== Performance Metrics from .pth ===
           Metric  Counterfactual (.pth)
     Total Return               0.072264
    Annual Return               0.113120
Annual Volatility               0.124904
     Sharpe Ratio               0.905658
     Max Drawdown               0.103105
         Win Rate               0.395210
