In [19]:
import numpy as np
import optuna as optuna
import pandas as pd
from gym import spaces
from stable_baselines3 import PPO
from pykalman import KalmanFilter
from scipy.signal import periodogram
from scipy.stats import norm
import gym
from stable_baselines3.common.vec_env import DummyVecEnv

In [20]:

class StockTradingEnv(gym.Env):
    """Single-asset trading environment with engineered price features.

    Actions are a one-element array in ``[-1, 1]``: a positive value buys as
    many whole shares as the cash balance affords, a negative value sells the
    oldest open lot, and zero holds.

    Each observation is a ``(window_size, n_features)`` float32 array whose
    columns are, in order: the Kalman-smoothed close, a sum of sinusoids at
    the dominant periodogram frequencies, one simulated Geometric Brownian
    Motion path, and every column of ``data`` except ``'Close'``.

    NOTE(review): the window covers rows ``current_step`` through
    ``current_step + window_size - 1``, i.e. rows *after* the row being
    traded. Confirm this look-ahead is intended.

    Args:
        data: DataFrame with a ``'Close'`` column; the remaining columns are
            passed through to the observation unchanged.
        window_size: Number of rows per observation (at least 2, so that
            returns can be computed inside the window).
        initial_balance: Cash available at the start of each episode.

    Raises:
        ValueError: If ``data`` has no ``'Close'`` column, ``window_size`` is
            smaller than 2, or ``data`` has fewer rows than ``window_size``.
    """

    # Engineered columns that precede the pass-through data columns:
    # smoothed price, cyclic component, simulated GBM path.
    _N_ENGINEERED_FEATURES = 3

    def __init__(self, data, window_size=10, initial_balance=10000):
        super(StockTradingEnv, self).__init__()

        if 'Close' not in data.columns:
            raise ValueError("data must contain a 'Close' column")
        if window_size < 2:
            raise ValueError("window_size must be at least 2")
        if len(data) < window_size:
            raise ValueError("data must contain at least window_size rows")

        self.data = data
        self.window_size = window_size
        self.initial_balance = initial_balance

        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)

        # Observations carry raw (un-normalised) prices, so the space is
        # unbounded, and its width follows the number of columns in `data`
        # rather than a hard-coded constant.
        n_features = self._N_ENGINEERED_FEATURES + data.shape[1] - 1
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, n_features), dtype=np.float32
        )

        self.reset()

    def reset(self):
        """Restore the starting balance and return the first observation."""
        self.balance = self.initial_balance
        self.current_step = 0
        self.done = False
        self.positions = []

        return self._next_observation()

    def step(self, action):
        """Execute one trade decision and advance one row.

        Returns:
            ``(obs, reward, done, info)`` where ``reward`` is the
            mark-to-market profit relative to ``initial_balance``.
        """
        current_price = self.data.iloc[self.current_step]['Close']
        action = action[0]

        if action > 0:
            shares_to_buy = self.balance // current_price
            # Skip empty lots so a later sell does not pop a zero-share entry.
            if shares_to_buy > 0:
                self.positions.append(shares_to_buy)
                self.balance -= shares_to_buy * current_price
        elif action < 0:
            if self.positions:
                shares_to_sell = self.positions.pop(0)
                self.balance += shares_to_sell * current_price

        self.current_step += 1

        # End the episode on the last step for which a full window exists.
        if self.current_step >= len(self.data) - self.window_size:
            self.done = True

        obs = self._next_observation()

        # Value open positions at the latest available close. Rewarding cash
        # alone would charge the full purchase cost as a loss on every buy.
        mark_index = min(self.current_step, len(self.data) - 1)
        mark_price = self.data.iloc[mark_index]['Close']
        net_worth = self.balance + sum(self.positions) * mark_price
        reward = net_worth - self.initial_balance
        done = self.done
        info = {}

        return obs, reward, done, info

    def _next_observation(self):
        """Build the feature window for the current step."""
        # Clamp the start so the window always has exactly `window_size` rows,
        # even if the caller keeps stepping after `done`.
        start = min(self.current_step, len(self.data) - self.window_size)
        window = self.data.iloc[start:start + self.window_size]
        price_data = window['Close'].values
        n_points = len(price_data)

        # Estimate the underlying asset price dynamics using the Kalman filter
        kf = KalmanFilter(initial_state_mean=price_data[0], n_dim_obs=1)
        smoothed_prices, _ = kf.smooth(price_data)

        # Isolate cyclic components using Fourier-based spectral estimation
        freqs, psd = periodogram(price_data)
        significant_freqs = freqs[np.argsort(psd)[-5:]]  # Top 5 frequencies with highest power spectral density
        time_index = np.arange(n_points)
        cyclic_components = np.sum(
            [np.sin(2 * np.pi * f * time_index) for f in significant_freqs], axis=0
        )

        # Model the underlying asset price dynamics using Geometric Brownian Motion (GBM)
        returns = np.diff(price_data) / price_data[:-1]
        mu = np.mean(returns)
        sigma = np.std(returns)
        dt = 1
        Z = norm.ppf(np.random.rand(n_points - 1))
        log_increments = (mu - 0.5 * sigma**2) * dt + sigma * np.sqrt(dt) * Z
        # Prepend a zero log-increment so the path starts at the first price
        # and has one value per window row; without it the path is one row
        # short and cannot be stacked with the other features.
        price_dynamics = price_data[0] * np.exp(
            np.concatenate(([0.0], np.cumsum(log_increments)))
        )

        other_columns = window.drop('Close', axis=1).values
        obs = np.hstack([
            smoothed_prices,
            cyclic_components.reshape(-1, 1),
            price_dynamics.reshape(-1, 1),
            other_columns,
        ])
        # Match the dtype declared by `observation_space`.
        return obs.astype(np.float32)

# Define the optimization function
def optimize_agent(trial):
    """Optuna objective: train PPO with sampled hyperparameters and score it.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The annualized return reported by ``evaluate_agent`` as a plain
        float, which is the value the study optimizes.
    """
    # Define hyperparameters to optimize
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99, 0.999])
    n_epochs = trial.suggest_int("n_epochs", 2, 10)
    n_steps = trial.suggest_int("n_steps", 16, 2048)

    # Train the agent using PPO algorithm
    model = PPO("MlpPolicy", env, learning_rate=learning_rate, gamma=gamma, n_epochs=n_epochs, n_steps=n_steps, verbose=0)
    model.learn(total_timesteps=100000)

    # evaluate_agent returns (annualized_return, metrics_dict). A
    # single-objective study needs one float, so only the return is passed on.
    annualized_return, _metrics = evaluate_agent(model)
    return float(annualized_return)

# Evaluate the agent
def evaluate_agent(model):
    """Run one evaluation episode and summarise the agent's performance.

    Builds a fresh ``"stocks-v0"`` environment over the module-level
    ``ohlcv_data``, rolls the model through it until the episode ends, and
    derives per-step portfolio returns from the ``'total_profit'`` value the
    environment reports in ``info``.

    NOTE(review): ``"stocks-v0"`` and the ``'total_profit'`` info key are
    presumably provided by an environment package registered elsewhere;
    nothing in this notebook registers that id. Confirm before running.

    Args:
        model: A trained agent exposing ``predict(obs, deterministic=...)``.

    Returns:
        ``(annualized_return, metrics)`` where ``metrics`` is the dictionary
        produced by ``calculate_performance_metrics``.
    """
    # Test the agent's performance
    # Use a separate name for the raw env so the lambda does not close over
    # the variable that is rebound to the vectorised wrapper.
    base_env = gym.make("stocks-v0", df=ohlcv_data)
    test_env = DummyVecEnv([lambda: base_env])

    obs = test_env.reset()
    episode_done = False
    total_profits = []

    while not episode_done:
        # Deterministic actions keep the evaluation (and any objective built
        # on it) free of action-sampling noise.
        action, _states = model.predict(obs, deterministic=True)
        obs, _rewards, dones, infos = test_env.step(action)
        total_profits.append(infos[0]['total_profit'])
        # The vectorised env returns one flag per sub-env; there is only one.
        episode_done = bool(dones[0])

    # Calculate portfolio returns
    total_profits = np.asarray(total_profits, dtype=float)
    portfolio_returns = np.diff(total_profits) / total_profits[:-1]
    annualized_return = np.mean(portfolio_returns) * 252

    #TODO change to use the actual snp500 pct change
    # Calculate benchmark returns (S&P 500 Index)
    benchmark_returns = ohlcv_data['Close'].pct_change().dropna().values

    return annualized_return, calculate_performance_metrics(portfolio_returns, benchmark_returns)

def calculate_performance_metrics(portfolio_returns, benchmark_returns, risk_free_rate=0.02, sortino_target=0.0, periods_per_year=252):
    """Compute annualized risk/return statistics for a return series.

    Args:
        portfolio_returns: Per-period simple returns of the strategy
            (array-like).
        benchmark_returns: Per-period simple returns of the benchmark
            (array-like).
        risk_free_rate: Annual risk-free rate subtracted in the Sharpe and
            Sortino numerators.
        sortino_target: Per-period target return; only shortfalls below it
            count towards the downside deviation.
        periods_per_year: Number of return periods in a year, used to
            annualize means and volatilities.

    Returns:
        dict with keys ``annualized_return``, ``max_drawdown``,
        ``sharpe_ratio``, ``benchmark_sharpe_ratio``, ``sortino_ratio`` and
        ``risk_adjusted_performance``. Any ratio whose denominator is zero or
        undefined is reported as ``nan`` rather than ``inf``.
    """
    portfolio_returns = np.asarray(portfolio_returns, dtype=float)
    benchmark_returns = np.asarray(benchmark_returns, dtype=float)
    annualization = np.sqrt(periods_per_year)

    def _mean(values):
        # Empty input has no mean; avoid numpy's warning and return nan.
        return np.mean(values) if values.size else np.nan

    def _std(values):
        return np.std(values) if values.size else np.nan

    def _ratio(numerator, denominator):
        # A zero or undefined denominator makes the ratio meaningless.
        if denominator == 0 or not np.isfinite(denominator):
            return float('nan')
        return numerator / denominator

    # Annualize returns
    annualized_return = _mean(portfolio_returns) * periods_per_year
    benchmark_annualized_return = _mean(benchmark_returns) * periods_per_year

    # Max drawdown
    # Start the wealth curve at 1.0 so a loss in the very first period is
    # measured against the initial capital instead of being ignored.
    wealth = np.concatenate(([1.0], (1 + portfolio_returns).cumprod()))
    running_max = np.maximum.accumulate(wealth)
    drawdown = (wealth / running_max) - 1
    max_drawdown = np.min(drawdown)

    # Sharpe ratio
    excess_return = annualized_return - risk_free_rate
    sharpe_ratio = _ratio(excess_return, _std(portfolio_returns) * annualization)
    benchmark_sharpe_ratio = _ratio(
        benchmark_annualized_return - risk_free_rate,
        _std(benchmark_returns) * annualization,
    )

    # Sortino ratio
    # Downside deviation is the root-mean-square shortfall below the target,
    # not the mean-centred standard deviation of the clipped returns.
    shortfall = np.minimum(portfolio_returns - sortino_target, 0.0)
    downside_deviation = np.sqrt(_mean(shortfall ** 2))
    sortino_ratio = _ratio(excess_return, downside_deviation * annualization)

    # Performance relative to the S&P 500 Index on a risk-adjusted basis
    risk_adjusted_performance = _ratio(sharpe_ratio, benchmark_sharpe_ratio)

    return {
        'annualized_return': annualized_return,
        'max_drawdown': max_drawdown,
        'sharpe_ratio': sharpe_ratio,
        'benchmark_sharpe_ratio': benchmark_sharpe_ratio,
        'sortino_ratio': sortino_ratio,
        'risk_adjusted_performance': risk_adjusted_performance
    }

In [None]:
# Switch for the Optuna hyperparameter search; when False, PPO runs with its
# default hyperparameters.
OPTIMIZATION_ENABLED = False

# Price history shared by the training and evaluation environments.
ohlcv_data = pd.read_csv("sp500_ohlcv_data.csv", index_col=0, parse_dates=True)

# Build the training environment and wrap it for stable-baselines3.
# NOTE(review): "stocks-v0" is presumably registered by an environment package
# that this notebook does not import; confirm it is available on a fresh kernel.
base_env = gym.make("stocks-v0", df=ohlcv_data)
env = DummyVecEnv([lambda: base_env])

# Pick the hyperparameters: either search for them or fall back to defaults.
if not OPTIMIZATION_ENABLED:
    best_params = {}
else:
    study = optuna.create_study(direction="maximize")
    study.optimize(optimize_agent, n_trials=100)
    best_params = study.best_params
    print("Best hyperparameters:", best_params)

# Final training run with the chosen hyperparameters.
best_model = PPO("MlpPolicy", env, **best_params, verbose=1)
best_model.learn(total_timesteps=200000)

# Score the trained agent and report the results.
annualized_return, metrics = evaluate_agent(best_model)
print("Annualized return:", annualized_return)
print(metrics)