# Using Gymnasium to build a simple trading algorithm

In [None]:
!pip install gymnasium 
!pip install yfinance
!pip install "stable-baselines3[extra]>=2.0.0a4"
!pip install ipywidgets
!pip install pandas
!pip install matplotlib
!pip install torch
!pip install tqdm

In [1]:
import gymnasium as gym
import torch
import numpy as np
import yfinance as yf
import datetime
import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import configure

In [2]:
class TensorboardCallback(BaseCallback):
    """Record per-environment trading metrics and episode rewards on the SB3 logger.

    On every call to ``_on_step`` the callback walks ``self.locals["infos"]``
    and records:

    * ``networth/env_<i>`` from ``info["current_networth"]``
    * ``gain/env_<i>`` and ``gain/average`` from ``info["diff_norm"]``
    * ``reward/env_<i>`` from ``info["episode"]["r"]`` when an ``"episode"``
      entry is present, and ``reward/average`` as the mean of every episode
      reward collected so far.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        # Every episode reward seen since the callback was created;
        # "reward/average" is the mean over this whole history.
        self.episode_rewards = []

    def _on_step(self) -> bool:
        """Log the custom metrics for the current step; always returns True."""
        infos = self.locals.get("infos") or []
        gains = []

        for i, info in enumerate(infos):
            if "current_networth" in info:
                self.logger.record(f"networth/env_{i}", info["current_networth"])

            # Guard on the key that is actually read ("diff_norm").
            if "diff_norm" in info:
                gain = info["diff_norm"]
                gains.append(gain)
                self.logger.record(f"gain/env_{i}", gain)

            # Present only on steps where the info dict carries an "episode" entry.
            if "episode" in info:
                episode_reward = info["episode"]["r"]
                self.episode_rewards.append(episode_reward)
                self.logger.record(f"reward/env_{i}", episode_reward)

        # Average reward over all episodes recorded so far.
        if self.episode_rewards:
            self.logger.record("reward/average", np.mean(self.episode_rewards))

        # Average normalised gain over the environments that reported one.
        if gains:
            self.logger.record("gain/average", np.mean(gains))

        return True

    def _on_training_step(self) -> bool:
        """Record PPO loss terms found in ``self.locals["loss"]`` (0 when absent).

        NOTE(review): nothing in this notebook calls this method and it is not
        one of the hooks listed in the SB3 callback documentation, so it is
        presumably never invoked -- confirm before relying on these metrics.
        """
        loss_info = self.locals.get("loss", {})
        if not isinstance(loss_info, dict):
            # Anything other than a mapping cannot be queried by key.
            loss_info = {}
        self.logger.record("loss/policy_loss", loss_info.get("policy_loss", 0))
        self.logger.record("loss/value_loss", loss_info.get("value_loss", 0))
        self.logger.record("loss/entropy_loss", loss_info.get("entropy_loss", 0))
        return True

In [3]:

class TradingEnv(gym.Env):
    """Single-stock trading environment backed by yfinance price history.

    Each episode starts with ``shares`` shares valued at the close of the
    ``boughtat`` row and advances one data row per step.

    Actions (``Discrete(2)``):
        * ``0`` -- hold (keep the current state, in or out of the market).
        * ``1`` -- toggle the position: sell when holding, buy when flat.

    Observation (``Box`` of shape ``(1, 48)``, float32), in order: open, close,
    high, low and volume of the current row (5 values), the 7 preceding closes,
    the 30 preceding closes, 7-row momentum, 30-row momentum, last trade price,
    share count, ``relneworth`` and the last action. The observation returned
    by ``reset`` is all zeros. The ``boughtat`` row therefore needs at least
    30 earlier rows in the data for observations to have the declared size.

    Args:
        stock: Ticker symbol used when ``randomize`` is False.
        minbuy: Number of consecutive hold steps required before a trade
            (action 1) stops being penalised by the streak reward.
        gran: yfinance ``interval`` of the price data.
        period: yfinance ``period`` of the price data.
        shares: Number of shares held at the start of an episode.
        boughtat: Index label of the row the episode starts from.
        randomize: When True, shares, stock and start row are re-sampled on
            construction and on every ``reset``.
        seed: Stored on the instance only; pass ``seed`` to ``reset`` to seed
            the sampling.
        verbose: When True, ``step`` prints a summary of every step.
    """

    WEEK_WINDOW = 7           # rows in the short close/momentum window
    MONTH_WINDOW = 30         # rows in the long close/momentum window
    OBS_SIZE = 48             # 5 + 7 + 30 + 6, see the class docstring
    MAX_FETCH_ATTEMPTS = 20   # give up on randomised downloads after this many failures

    def __init__(self, stock='AAPL', minbuy=10, gran='1d', period='max', shares=3, boughtat="2023-12-01", randomize=True, seed=0, verbose=False):
        super().__init__()
        # Normalisation parameters; no method of this class reads them.
        self.running_mean = 0
        self.running_std = 1
        self.decay_factor = 0.99
        self.epsilon = 1e-7

        self.stock = stock
        self.minbuy = minbuy
        self.gran = gran
        self.period = period
        self.shares = shares
        self.boughtat = boughtat
        self.seed = seed
        self.randomize = randomize
        self.max_networth = 0
        self.print = verbose
        self.streak = 0
        self.is_init = True

        if self.gran in ['5m', '15m', '30m']:
            print("[WARNING] Data granularity can only go back to 60 Days Maxiumum")
        if self.gran in ['1h', '60m']:
            print("[WARNING] Data granularity can only go back to 730 Days Maxiumum")
        # NOTE(review): "NVDA" is listed twice, so it is sampled twice as often
        # as the other symbols -- confirm whether that is intended.
        self.random_symbols = [
            "NVDA" , "AMZN", "GOOGL", "MSFT", "AAPL", "META", "NVDA", "ADBE", "NFLX", "NANC", "KRUZ", "VOO", "FTEC", "TSLA"
        ]
        # Maps each granularity to the periods that may be sampled with it.
        self.random_gran_period_pairs = {
            '1d' : ['max']
        }
        if self.randomize:
            self._randomize()
        else:
            self.data = self._download_history()

        self._init_obs()

        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(1, self.OBS_SIZE), dtype=np.float32
        )
        self.action_space = gym.spaces.Discrete(2)
        self.networth_rewd = 0

    def _download_history(self):
        """Download price history for the current stock, period and granularity."""
        return yf.Ticker(self.stock).history(period=self.period, interval=self.gran, auto_adjust=True)

    def _init_obs(self):
        """Reset all per-episode state to the situation at the ``boughtat`` row."""
        self.current_day = self.boughtat
        self.cur_indx = self.data.index.get_loc(self.current_day)
        self.init_networth = self.shares * self.data.loc[self.boughtat, 'Close']
        self.networth = self.init_networth
        self.previous_open_close = self.data.loc[self.boughtat, 'Open'], self.data.loc[self.boughtat, 'Close']
        self.previous_high_low = self.data.loc[self.boughtat, 'High'], self.data.loc[self.boughtat, 'Low']
        self.previous_volume = self.data.loc[self.boughtat, 'Volume']
        self.previous_momentum_wk = self._calculate_momentum(self.WEEK_WINDOW)
        self.previous_momentum_mon = self._calculate_momentum(self.MONTH_WINDOW)
        self.previous_wk_close = np.zeros((1, self.WEEK_WINDOW))
        self.previous_mo_close = np.zeros((1, self.MONTH_WINDOW))
        self.bought_price = self.data.loc[self.boughtat, 'Close']
        self.current_shares = self.shares
        self.relneworth = 0
        self.current_action = 0
        self.max_networth = 0
        self.streak = 0
        self.is_init = True

        self.has_position = True
        self.previous_obs = np.zeros((1, self.OBS_SIZE)).astype(np.float32)
        # `step` returns `prev_obs` when the data runs out; initialise it here so
        # that branch never sees a missing attribute or a stale observation from
        # an earlier episode.
        self.prev_obs = self.previous_obs

    def _randomize(self):
        """Sample shares, granularity, period, stock and start row for a new episode.

        Sampling uses the environment's own ``np_random`` generator, so it is
        controlled by ``reset(seed=...)`` and leaves NumPy's global RNG alone.

        Raises:
            RuntimeError: If no usable price history could be obtained after
                ``MAX_FETCH_ATTEMPTS`` attempts.
        """
        rng = self.np_random
        # Number of shares, 1 to 9 inclusive (the upper bound is exclusive).
        self.shares = int(rng.integers(1, 10))

        # First row that has a full month of history behind it.
        first_start = self.MONTH_WINDOW + 1
        last_error = None
        for _ in range(self.MAX_FETCH_ATTEMPTS):
            self.gran = str(rng.choice(list(self.random_gran_period_pairs.keys())))
            self.period = str(rng.choice(self.random_gran_period_pairs[self.gran]))
            self.stock = str(rng.choice(self.random_symbols))

            try:
                self.data = self._download_history()
                n_rows = len(self.data.index)
                # Leave at least one row after the start so the episode can step.
                if n_rows < first_start + 2:
                    raise ValueError(
                        f"{self.stock}: only {n_rows} rows of history, need at least {first_start + 2}"
                    )
                start_pos = int(rng.integers(first_start, n_rows - 1))
                self.boughtat = self.data.index[start_pos]
                return
            except Exception as err:  # download/parsing failure: try another draw
                last_error = err

        raise RuntimeError(
            f"Could not fetch usable price history after {self.MAX_FETCH_ATTEMPTS} attempts"
        ) from last_error

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` to seed
                ``self.np_random``; ``None`` keeps the current generator state.
            options: Accepted for Gymnasium API compatibility; unused.

        Returns:
            Tuple of the all-zero initial observation and the info dict.
        """
        super().reset(seed=seed)
        if self.randomize:
            self._randomize()
        else:
            self.data = self._download_history()
        self.seed = seed
        self._init_obs()
        return self.previous_obs, self._get_info()

    def _get_info(self):
        """Build the info dict from the current state (shared by ``reset`` and ``step``)."""
        return {
            "init_networth": self.init_networth,
            "current_networth": self.networth,
            "current_action": self.current_action,
            "current_gain": self.relneworth,
            "diff": self.networth - self.init_networth,
            "diff_norm" : (self.networth - self.init_networth) / self.init_networth,
            "reward": self._reward(self.current_action),
        }

    def _calculate_momentum(self, interval):
        """Return the relative close-price change over the last ``interval`` rows (0 if too early)."""
        if self.cur_indx - interval < 0:
            return 0  # Avoid accessing invalid index
        past_price = self.data.iloc[self.cur_indx - interval]['Close']
        current_price = self.data.iloc[self.cur_indx]['Close']
        return (current_price - past_price) / past_price

    def _get_previous_close(self, interval):
        """Return the ``interval`` closes preceding the current row as a ``(1, interval)`` array."""
        return self.data.loc[self.data.index[self.cur_indx-interval:self.cur_indx], 'Close'].to_numpy().reshape(1, interval)

    def step(self, action):
        """Apply ``action`` at the last seen close, then advance one row.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``terminated`` is True once net worth falls below 90% of its
            initial value; ``truncated`` is True when the data is exhausted.
        """
        last_close = self.previous_open_close[1]
        if action == 1:
            if self.has_position:
                # Sell: realise the gain relative to the last trade price.
                self.relneworth = self.current_shares * (last_close - self.bought_price)
                self.has_position = False
                self.networth += self.relneworth
            else:
                # Buy back in at the last close.
                self.has_position = True
                self.relneworth = 0
            self.streak = 0
            self.bought_price = last_close
            self.current_action = 1
            self.max_networth = 0
        else:
            self.streak += 1
            self.current_action = 0
            self.relneworth = self.current_shares * (last_close - self.bought_price)
            # Track the best gain seen since the last trade.
            if self.relneworth > self.max_networth:
                self.max_networth = self.relneworth

        self.cur_indx += 1
        info = self._get_info()

        if self.cur_indx >= len(self.data.index):
            # No row left to observe: end the episode by truncation, not termination.
            print("[INFO] Data Exhausted")
            self.current_day = self.data.index[-1]
            return (
                self.prev_obs,
                self._reward(action),
                False,
                True,
                info
            )

        self.current_day = self.data.index[self.cur_indx]
        row = self.data.loc[self.current_day]
        self.previous_open_close = np.array([row['Open'], row['Close']])
        self.previous_high_low = np.array([row['High'], row['Low']])
        self.previous_volume = np.array([row['Volume']])
        self.previous_momentum_wk = self._calculate_momentum(self.WEEK_WINDOW)
        self.previous_momentum_mon = self._calculate_momentum(self.MONTH_WINDOW)
        self.previous_wk_close = self.data.loc[self.data.index[self.cur_indx-self.WEEK_WINDOW:self.cur_indx], 'Close'].to_numpy()
        self.previous_mo_close = self.data.loc[self.data.index[self.cur_indx-self.MONTH_WINDOW:self.cur_indx], 'Close'].to_numpy()

        # Flatten and construct the observation array
        observation = np.concatenate([
            self.previous_open_close.flatten(), # 2
            self.previous_high_low.flatten(), # 2
            self.previous_volume.flatten(), # 1
            self.previous_wk_close.flatten(), # 7
            self.previous_mo_close.flatten(), # 30
            np.array([
                self.previous_momentum_wk, # 1
                self.previous_momentum_mon, # 1
                self.bought_price, # 1
                self.current_shares, # 1
                self.relneworth, # 1
                self.current_action # 1
            ])
        ]).astype(np.float32)

        observation = observation.reshape(1, -1)
        self.prev_obs = observation

        # Termination terms: stop on a 10% loss, truncate on the last data row.
        terminated = bool(self.networth < (self.init_networth * 0.90))
        truncated = self.cur_indx >= len(self.data.index) - 1

        # Reward (computed before `is_init` is cleared, so the first step is
        # never hit by the streak penalty).
        reward = self._reward(action)

        self.is_init = False

        # Additional Info
        if self.print:
            print(" ")
            print("---------------------------------------------------------")
            print(f"[INFO] Current Stock: {self.stock}")
            print(f"[INFO] Current Day: {self.current_day}")
            print(f"[INFO] Current Open: {self.previous_open_close[0]}")
            print(f"[INFO] Current Net Worth: {self.networth}")
            print(f"[INFO] Holding Share: {self.has_position}")
            print(f"[INFO] Current Share Net Worth: {self.relneworth}")
            print("[INFO] Current Action: " + ("Hold" if self.current_action == 0 else "Sell" if self.current_action == 1 and not self.has_position else "Buy"))
            print("---------------------------------------------------------")

        return (
            observation,
            reward,
            terminated,
            truncated,
            info
        )

    def _reward(self, action):
        """Return the clipped weighted sum of the gain term and the streak term.

        Gain term: while holding, the drawdown from the best gain since the last
        trade (when below a positive peak), otherwise ``relneworth`` divided by
        the last trade price; while flat, the negative of that ratio.
        Streak term: -10 for trading (action 1) before ``minbuy`` hold steps
        have passed (never on the first step), otherwise +2.
        The result is clipped to ``[-7, 7]``.
        """
        reward_weights = {
            "relnetworth": 0.7,
            "streak": 0.3,
        }

        self.networth_rewd, self.streak_rewd = 0, 0

        if self.has_position:
            if self.relneworth < self.max_networth and self.max_networth > 0:
                self.networth_rewd = (self.relneworth - self.max_networth)/self.max_networth
            else:
                self.networth_rewd = self.relneworth / self.bought_price
        else:
            self.networth_rewd = (self.relneworth / self.bought_price) * -1

        if (self.streak < self.minbuy and not self.is_init) and action == 1:
            self.streak_rewd = -10
        else:
            self.streak_rewd = 2

        # Clip the weighted reward to avoid very large values
        reward_min = -7
        reward_max = 7

        normalized_reward = reward_weights["relnetworth"] * self.networth_rewd + reward_weights["streak"] * self.streak_rewd
        normalized_reward = np.clip(normalized_reward, reward_min, reward_max)

        return normalized_reward

    def render(self):
        """Not implemented (placeholder for plotting prices with buy/sell points)."""
        pass

    def close(self):
        """No resources to release."""
        pass

## Training

In [4]:
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env

In [5]:
# Build one randomised environment; with randomize=True the constructor
# downloads price history for a randomly chosen symbol.
env = TradingEnv(verbose=False, randomize=True)

In [9]:
def linear_schedule(initial_value):
    """Return a schedule that scales ``initial_value`` by the remaining progress.

    The returned callable takes ``progress_remaining`` and returns
    ``initial_value * progress_remaining``, so the value falls linearly to 0
    as ``progress_remaining`` goes from 1 to 0.
    """
    def schedule(progress_remaining):
        current_value = initial_value * progress_remaining
        return current_value

    return schedule

In [None]:
# Training configuration
log_dir = "./tensorboard_logs/"
n_envs = 10
n_steps = 1000     # rollout length per environment between PPO updates
batch_size = 500   # divides the n_envs * n_steps = 10,000 samples of a rollout evenly
ent_coef = 0.02    # Default: 0.0 (increase to encourage exploration)

vec_env = make_vec_env(TradingEnv, n_envs=n_envs)

# PPO builds its own stdout + TensorBoard logger from `verbose` and
# `tensorboard_log`, so no separate logger is configured here.
model = PPO(
    "MlpPolicy",
    vec_env,
    verbose=1,
    device="cpu",
    n_steps=n_steps,
    tensorboard_log=log_dir,
    batch_size=batch_size,
    learning_rate=linear_schedule(3e-4),
    ent_coef=ent_coef,
)
model.learn(
    total_timesteps=4000000,
    progress_bar=True,
    tb_log_name="stock_ppo",
    callback=TensorboardCallback(),
    log_interval=1,
)

Logging to ./tensorboard_logs/
Using cpu device
Logging to ./tensorboard_logs/stock_ppo_1


Output()

----------------------------------
| gain/              |           |
|    average         | 1.01      |
|    env_0           | 0.938     |
|    env_1           | 0.563     |
|    env_2           | 0.494     |
|    env_3           | 3.13      |
|    env_4           | 0.129     |
|    env_5           | 0.256     |
|    env_6           | 0.216     |
|    env_7           | 3.86      |
|    env_8           | 0.0256    |
|    env_9           | 0.504     |
| networth/          |           |
|    env_0           | 4.31      |
|    env_1           | 52.6      |
|    env_2           | 2.1e+03   |
|    env_3           | 9.18      |
|    env_4           | 1.58e+03  |
|    env_5           | 226       |
|    env_6           | 66.8      |
|    env_7           | 53.1      |
|    env_8           | 185       |
|    env_9           | 220       |
| reward/            |           |
|    average         | -326      |
|    env_0           | -534      |
|    env_1           | -615      |
|    env_2          

----------------------------------------
| gain/                   |            |
|    average              | 3.24       |
|    env_0                | 11.1       |
|    env_1                | 1.3        |
|    env_2                | 0.341      |
|    env_3                | 0.527      |
|    env_4                | 0.229      |
|    env_5                | 0.782      |
|    env_6                | 0.959      |
|    env_7                | 13.8       |
|    env_8                | 0.367      |
|    env_9                | 3.11       |
| networth/               |            |
|    env_0                | 26.8       |
|    env_1                | 77.3       |
|    env_2                | 2.98       |
|    env_3                | 2.14e+03   |
|    env_4                | 1.73e+03   |
|    env_5                | 321        |
|    env_6                | 108        |
|    env_7                | 161        |
|    env_8                | 246        |
|    env_9                | 600        |
| reward/       

-----------------------------------------
| gain/                   |             |
|    average              | 19.3        |
|    env_0                | 104         |
|    env_1                | 0.0894      |
|    env_2                | 3.17        |
|    env_3                | 1.8         |
|    env_4                | 3.77        |
|    env_5                | 2.11        |
|    env_6                | 1.5         |
|    env_7                | 76.5        |
|    env_8                | 0.314       |
|    env_9                | 0.066       |
| networth/               |             |
|    env_0                | 232         |
|    env_1                | 340         |
|    env_2                | 9.27        |
|    env_3                | 3.93e+03    |
|    env_4                | 70.5        |
|    env_5                | 561         |
|    env_6                | 137         |
|    env_7                | 847         |
|    env_8                | 237         |
|    env_9                | 2.37  

-----------------------------------------
| gain/                   |             |
|    average              | 10          |
|    env_0                | 0.591       |
|    env_1                | 1.36        |
|    env_2                | 0.632       |
|    env_3                | 0.0988      |
|    env_4                | 3.42        |
|    env_5                | 0.105       |
|    env_6                | 1.65        |
|    env_7                | 80.3        |
|    env_8                | 2.64        |
|    env_9                | 9.67        |
| networth/               |             |
|    env_0                | 232         |
|    env_1                | 342         |
|    env_2                | 54.9        |
|    env_3                | 2.44        |
|    env_4                | 65.2        |
|    env_5                | 1.55e+03    |
|    env_6                | 146         |
|    env_7                | 888         |
|    env_8                | 656         |
|    env_9                | 23.7  

-----------------------------------------
| gain/                   |             |
|    average              | 3.74        |
|    env_0                | 3           |
|    env_1                | 0.349       |
|    env_2                | 1.41        |
|    env_3                | 6.22        |
|    env_4                | 3.89        |
|    env_5                | 0.91        |
|    env_6                | 3.15        |
|    env_7                | 0.107       |
|    env_8                | -0.0947     |
|    env_9                | 18.4        |
| networth/               |             |
|    env_0                | 584         |
|    env_1                | 1.89e+03    |
|    env_2                | 81.1        |
|    env_3                | 16          |
|    env_4                | 72.3        |
|    env_5                | 2.68e+03    |
|    env_6                | 228         |
|    env_7                | 2.46        |
|    env_8                | 30.4        |
|    env_9                | 43.1  

------------------------------------------
| gain/                   |              |
|    average              | 4.69         |
|    env_0                | 0.202        |
|    env_1                | 0.939        |
|    env_2                | 4.35         |
|    env_3                | 26.6         |
|    env_4                | 0.968        |
|    env_5                | 3.19         |
|    env_6                | 9.88         |
|    env_7                | 0.62         |
|    env_8                | -0.00331     |
|    env_9                | 0.17         |
| networth/               |              |
|    env_0                | 2.67         |
|    env_1                | 2.72e+03     |
|    env_2                | 180          |
|    env_3                | 61.3         |
|    env_4                | 422          |
|    env_5                | 608          |
|    env_6                | 598          |
|    env_7                | 54.5         |
|    env_8                | 311          |
|    env_9 

In [None]:
# Roll the trained policy out on the training environments and report the
# mean per-step reward of each epoch.
n_eval_epochs = 50
n_eval_steps = 1000
for k in range(n_eval_epochs):
    print(f"Epoch {k+1}")
    obs = vec_env.reset()
    # One accumulator per environment; sized from the vec env so it always
    # matches the shape of `rewards`.
    total_rewards = np.zeros(vec_env.num_envs)
    for i in tqdm(range(n_eval_steps)):
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, done, info = vec_env.step(action)
        total_rewards += rewards
    # Average reward per step, averaged over all environments
    print(f"Average reward: {np.mean(total_rewards) / n_eval_steps}")
    

## Evaluation of the model

In [None]:

# Evaluate the trained model on a fixed, non-randomised AAPL scenario.
# Requires `model` from the training cell above.
env = make_vec_env(TradingEnv, n_envs=1, env_kwargs={"randomize": False, "verbose": True, "stock": "AAPL", "shares": 3, "boughtat": "2023-11-13"})
obs = env.reset()
for j in range(300):
    # Deterministic actions, matching the rollout cell, so the evaluation is repeatable.
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, done, info = env.step(action)
    print(f"Reward {rewards}")
    # `done` is an array with one entry per environment.
    if done.any():
        break

NameError: name 'model' is not defined