In [119]:
import pandas as pd
import numpy as np
import gymnasium as gym
import sys
import torch
from gymnasium import spaces

#from arguments import get_args
from ppo import PPO
from network import FeedForwardNN
from eval_policy import eval_policy
from stable_baselines3.common.env_checker import check_env

class StockHedgingEnv(gym.Env):
    def __init__(self, df, window_size = 1, render_mode='human'):
        super(StockHedgingEnv, self).__init__()
        self.render_mode = render_mode
        self.df = df  # The dataframe contains historical stock prices
        self.window_size = window_size  # Number of previous days to observe
        self.current_step = 0
        self.max_steps = len(df)  # Length of the data
        self.action_space = spaces.Box(low = 1, high = 100) # 
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.window_size, len(df.columns)), dtype=np.float64)
        self.performance = []
        self.stock_held = 1
        self.option_held = 0
        self.balance = 0
        self.vol = 0

    def reset(self, seed = None, options = None):
        super().reset(seed=seed, options=options)
        self.current_step = self.window_size
        self.stock_held = 1
        self.option_held = 0
        self.balance = 0
        self.performance = []
        obs = self.df.iloc[self.current_step - self.window_size:self.current_step].values
        return obs, {}

    def step(self, action):
        p_now = self.df.iloc[self.current_step]['price']
        p_prev = self.df.iloc[self.current_step - 1]['price']
        old_o_p_prev = self.df.iloc[self.current_step - 1]['option_p']
        old_o_p_now = self.df.iloc[self.current_step]['option_p_next']
        #print(self.option_held)
        port_value_change = p_now - p_prev - self.option_held * (old_o_p_now - old_o_p_prev)
        port_value_change = float(port_value_change)
        self.balance += port_value_change
        if isinstance(self.balance, np.ndarray):
            self.balance = self.balance[0]
        elif isinstance(self.balance, np.float64):
            self.balance = self.balance
        #print(action)
        self.option_held = action
        self.performance.append(self.balance)
        self.vol = np.std(self.performance)
        #print(self.vol)
        reward = port_value_change - 0.5 * self.vol ** 2
        #reward = -np.abs(port_value_change)
        #reward = 1 / (0.0001 if port_value_change == 0 else port_value_change)
        transaction_cost = 0  # in %
        self.current_step += 1

        # Check if we are done (i.e., if we've reached the end of the data)
        done = self.current_step >= self.max_steps
        truncated = False
        # Return the next observation, reward, and done flag
        next_state = self.df.iloc[self.current_step - self.window_size:self.current_step].values
        return next_state, reward, done, truncated, {}

    def render(self):
        print(f"Step: {self.current_step}, Balance: {self.balance}, Option held: {self.option_held}")
        #return(self.balance)




In [121]:
df = pd.read_csv("AAPL_RL_test.csv", index_col = 0)
df = df.drop(['ticker', 'Ticker', 'expiry', 'option_type', 'BarDateTime'], axis = 1)
df = df.dropna()


d_train = df.iloc[:int(len(df) * 0.8),:]
d_test = df.iloc[int(len(df) * 0.8):,:]
#df = df.drop(['OpenAskPrice', 'OpenBidPrice', 'HighAskPrice', 'HighBidPrice', 'LowAskPrice', 'LowBidPrice', 'CloseAskPrice', 'CloseBidPrice'], axis = 1)


In [133]:
hyperparameters = {
				'timesteps_per_batch': 2048, 
				'max_timesteps_per_episode': 2000, 
				'gamma': 0.95, 
				'n_updates_per_iteration': 10,
				'lr': 1e-3, 
				'clip': 0.2,
				'render': True,
				'render_every_i': 10
			  }

env_train = StockHedgingEnv(d_train)
# It will check your custom environment and output additional warnings if needed
check_env(env_train)

#print(f"Training", flush=True)

# Create a model for PPO.
model = PPO(policy_class=FeedForwardNN, env=env_train, **hyperparameters)

print(f"Training from scratch.", flush=True)

model.learn(total_timesteps=2000)

Training from scratch.


  port_value_change = float(port_value_change)


Learning... Running 2000 timesteps per episode, 2048 timesteps per batch for a total of 2000 timesteps


Training Progress:   0%|                                                   | 0/2000 [00:00<?, ?it/s]

Step: 1, Balance: 0, Option held: 0
Step: 2, Balance: -0.9000000000000057, Option held: [[9.117597]]
Step: 3, Balance: -7.805673980712896, Option held: [[8.849117]]
Step: 4, Balance: -6.889944577217108, Option held: [[9.491797]]
Step: 5, Balance: -6.379554712772375, Option held: [[8.625011]]
Step: 6, Balance: -2.736150467395788, Option held: [[8.689749]]
Step: 7, Balance: -4.0638690829277095, Option held: [[20.10902]]
Step: 8, Balance: -2.9479180216789302, Option held: [[16.600946]]
Step: 9, Balance: -5.904069268703466, Option held: [[9.581128]]
Step: 10, Balance: -5.492229783535009, Option held: [[114.11209]]
Step: 11, Balance: -8.396532142162329, Option held: [[7.621492]]
Step: 12, Balance: -11.654054248332983, Option held: [[12.017821]]
Step: 13, Balance: -13.216014587879187, Option held: [[12.22682]]
Step: 14, Balance: -12.921478176116949, Option held: [[9.45943]]
Step: 15, Balance: -12.911478176340466, Option held: [[8.848699]]
Step: 16, Balance: -14.54378308318556, Option held: [

Training Progress: 2326it [00:01, 1734.63it/s]                                                      


In [135]:
env_test = StockHedgingEnv(d_test)
actor_model = model.actor
obs_dim = env_test.observation_space.shape[1]
act_dim = env_test.action_space.shape[0]

policy = actor_model
	# Load in the actor model saved by the PPO algorithm
#policy.load_state_dict(torch.load(actor_model))

	# Evaluate our policy with a separate module, eval_policy, to demonstrate
	# that once we are done training the model/policy with ppo.py, we no longer need
	# ppo.py since it only contains the training algorithm. The model/policy itself exists
	# independently as a binary file that can be loaded in with torch.
eval_policy(policy = policy, env = env_test, n_episode = 1, render = True)

Step: 1, Balance: 0, Option held: 0
Step: 2, Balance: 0.044999999999987494, Option held: [[351.19836]]
Step: 3, Balance: -1.7056917715072757, Option held: [[63.15986]]
Step: 4, Balance: -1.6646917703747874, Option held: [[37.726868]]
Step: 5, Balance: -5.538678491413606, Option held: [[112.98124]]
Step: 6, Balance: -10.547834718525422, Option held: [[83.91502]]
Step: 7, Balance: -14.494011009037507, Option held: [[33.759113]]
Step: 8, Balance: -10.586613023579133, Option held: [[33.924957]]
Step: 9, Balance: -2.5693477037549144, Option held: [[135.6402]]
Step: 10, Balance: 28.077794706523406, Option held: [[242.63882]]
Step: 11, Balance: 46.54090658396481, Option held: [[56.24429]]
Step: 12, Balance: 46.31648514240979, Option held: [[76.8838]]
Step: 13, Balance: 48.09348008602856, Option held: [[101.32708]]
Step: 14, Balance: 62.87470624417065, Option held: [[54.67942]]
Step: 15, Balance: 66.54006860226391, Option held: [[59.867985]]
Step: 16, Balance: 66.0601887461543, Option held: [[

  port_value_change = float(port_value_change)


In [13]:
df.drop(['OpenAskPrice', 'OpenBidPrice', 'HighAskPrice', 'HighBidPrice', 'LowAskPrice', 'LowBidPrice', 'CloseAskPrice', 'CloseBidPrice'], axis = 1)


Unnamed: 0_level_0,price,strike,option_p,high,low,ttm,moneyness,riskfree,Volume,option_p_next
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-01-02 10:35:00,188.0300,187.5,1.305,1.460,1.280,0.009751,0.5300,5.28,188,1.305
2024-01-02 10:40:00,187.1300,187.5,2.540,2.585,2.330,0.028919,-0.3700,5.28,5,1.740
2024-01-02 10:45:00,186.0140,185.0,1.165,1.165,1.055,0.009732,1.0140,5.28,386,3.175
2024-01-02 10:50:00,186.2218,185.0,2.855,2.900,2.760,0.067256,1.2218,5.28,17,1.085
2024-01-02 10:55:00,186.2576,187.5,3.020,3.025,2.940,0.028891,-1.2424,5.28,377,2.805
...,...,...,...,...,...,...,...,...,...,...
2024-01-26 16:40:00,192.0850,192.5,0.455,0.485,0.420,0.000837,-0.4150,5.27,50763,0.395
2024-01-26 16:45:00,192.0499,192.5,0.435,0.490,0.365,0.000828,-0.4501,5.27,38496,0.215
2024-01-26 16:50:00,192.1250,192.5,0.395,0.435,0.375,0.000818,-0.3750,5.27,21638,0.195
2024-01-26 16:55:00,192.2983,192.5,0.215,0.375,0.140,0.000809,-0.2017,5.27,27395,0.195
