# Install Dependencies

In [22]:
# !pip install tensorflow-gpu==1.15.0 tensorflow==1.15.0 stable-baselines gym-anytrading gym
# !pip install "gym==0.19.0"
# !pip install gym[all]
# # %conda install -c conda-forge ta-lib
# !pip install yfinance
# !pip install protobuf==3.20.0
# !pip install stable-baselines3
# !pip install shimmy
# !pip install gymnasium

In [23]:
# #!pip install stable-baselines3 gym-anytrading gym
# !pip install finta
# !pip install quantstats

In [24]:
import warnings

# Suppress the specific DeprecationWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [25]:
# Gym stuff
import gym
import gym_anytrading
from gym_anytrading.envs import StocksEnv

# Stable baselines - rl stuff
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from stable_baselines3 import A2C, PPO, DQN

#Quant Finance
from finta import TA
import quantstats as qs

# Processing libraries
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Source Trading Data

In [26]:
data = pd.read_csv("tickers.csv")
data = data[data.tic == "^TNX"]
data.head()

Unnamed: 0.1,Unnamed: 0,date,open,high,low,close,volume,tic,day
1,2,2000-01-04,6.53,6.548,6.485,6.485,0,^TNX,1
3,4,2000-01-05,6.521,6.599,6.508,6.599,0,^TNX,2
5,6,2000-01-06,6.558,6.585,6.54,6.549,0,^TNX,3
7,8,2000-01-07,6.545,6.595,6.504,6.504,0,^TNX,4
9,11,2000-01-11,6.6,6.664,6.595,6.664,0,^TNX,6


In [27]:
data = data.drop(["Unnamed: 0", "tic", "day"], axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5727 entries, 1 to 11453
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    5727 non-null   object 
 1   open    5727 non-null   float64
 2   high    5727 non-null   float64
 3   low     5727 non-null   float64
 4   close   5727 non-null   float64
 5   volume  5727 non-null   int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 313.2+ KB


In [28]:
#Change the Time to proper type and set as index
data['date'] = pd.to_datetime(data['date'])
data.set_index('date', inplace=True)
data.head()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-04,6.53,6.548,6.485,6.485,0
2000-01-05,6.521,6.599,6.508,6.599,0
2000-01-06,6.558,6.585,6.54,6.549,0
2000-01-07,6.545,6.595,6.504,6.504,0
2000-01-11,6.6,6.664,6.595,6.664,0


In [29]:
#Set the values so the earliest time is first and the latest is last in the dataframe
data.sort_values('date', ascending=True, inplace=True)
data.head()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-04,6.53,6.548,6.485,6.485,0
2000-01-05,6.521,6.599,6.508,6.599,0
2000-01-06,6.558,6.585,6.54,6.549,0
2000-01-07,6.545,6.595,6.504,6.504,0
2000-01-11,6.6,6.664,6.595,6.664,0


In [30]:
#Title the columns for gym formatting reasons
data = data.rename(columns = {'open': 'Open', 'high': 'High', 'low': 'Low', 'close': 'Close', 'volume': 'Volume'})

In [31]:
data.shape

(5727, 5)

In [32]:
#Data tyeps neet to be numbers not objects
data.dtypes

Open      float64
High      float64
Low       float64
Close     float64
Volume      int64
dtype: object

# Add Custom Indicators

### SMA RSI OBV

In [33]:
#Create columns for technical indicators & add them to the dataframe
data['RSI'] = TA.RSI(data,16)
data['SMA'] = TA.SMA(data)
data['OBV'] = TA.OBV(data)
data.fillna(0, inplace=True)

# Build Gym

In [34]:
#Create a function to properly format data frame to be passed through environment
def signals(env):
    start = env.frame_bound[0] - env.window_size
    end = env.frame_bound[1]
    prices = env.df.loc[:,'Low'].to_numpy()[start:end]
    signal_features = env.df.loc[:, ['Low','Open','High','Volume','RSI','SMA','OBV',]].to_numpy()[start:end]
    return prices, signal_features

In [35]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,RSI,SMA,OBV
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-01-04,6.53,6.548,6.485,6.485,0,0.0,0.0,0.0
2000-01-05,6.521,6.599,6.508,6.599,0,100.0,0.0,0.0
2000-01-06,6.558,6.585,6.54,6.549,0,68.127377,0.0,0.0
2000-01-07,6.545,6.595,6.504,6.504,0,52.165981,0.0,0.0
2000-01-11,6.6,6.664,6.595,6.664,0,74.671755,0.0,0.0


In [36]:
#Replace default data process with custom function from above
class MyCustomEnv(StocksEnv):
    _process_data = signals

#Initialize an environment setting the window size and train data
env2 = MyCustomEnv(df=data, window_size=100, frame_bound=(100, 9000))

In [37]:
#Create a Dummy Vector of our environment
env_maker = lambda: env2
env = DummyVecEnv([env_maker])

# Train Agent

In [38]:
# Create log dir to save path to store our callback and model
log_dir = "reinforcement_learning/tmp/"
save_path = os.path.join('reinforcement_learning','tmp')

In [39]:
#Stop training when the model reaches a stable training threshold
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

#Check training & evaluate performance unitl threshold has been met
eval_callback = EvalCallback(env,
                            callback_on_new_best=stop_callback,
                            eval_freq=10000,
                            best_model_save_path=save_path,
                            verbose=1)

In [40]:
#initialize our model and train
actor_critic = A2C('MlpPolicy', env, verbose=1)
actor_critic.learn(total_timesteps=100000, callback=eval_callback)

Using cpu device
------------------------------------
| time/                 |          |
|    fps                | 468      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.148   |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.000366 |
|    value_loss         | 0.000137 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 353      |
|    iterations         | 200      |
|    time_elapsed       | 2        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -0.094   |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 199      |
|    policy_loss        | 2.48e-05 |
|    value_loss      



Eval num_timesteps=10000, episode_reward=-2.02 +/- 0.00
Episode length: 5626.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 5.63e+03 |
|    mean_reward        | -2.02    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -0.572   |
|    explained_variance | -0.0461  |
|    learning_rate      | 0.0007   |
|    n_updates          | 1999     |
|    policy_loss        | 0.0207   |
|    value_loss         | 0.00165  |
------------------------------------
New best mean reward!
------------------------------
| time/              |       |
|    fps             | 182   |
|    iterations      | 2000  |
|    time_elapsed    | 54    |
|    total_timesteps | 10000 |
------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 187       |
|    iterations         | 2100      |
| 

<stable_baselines3.a2c.a2c.A2C at 0x7be57d0eaef0>

In [41]:
# PPO = PPO('MlpPolicy', env, verbose=1)
# PPO.learn(total_timesteps=1000000, callback=eval_callback)

In [42]:
# DQN = DQN('MlpPolicy', env, verbose=1)
# DQN.learn(total_timesteps=1000000, callback=eval_callback)

# Test

In [43]:
#Load our trained model
model = A2C.load("reinforcement_learning/tmp/best_model.zip")

In [44]:
#Create a new environment with validation data
env = MyCustomEnv(df=data, window_size=100, frame_bound=(8900,11243))
obs = env.reset()

In [45]:
while True:
    obs = obs[np.newaxis, ...]
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    if done:
        print("info", info)
        break

TypeError: ignored

In [None]:
#Plot the results
plt.figure(figsize=(15,6),facecolor='w')
plt.cla()
env.render_all()
plt.show()

# Quant Reports

In [None]:
qs.extend_pandas()

net_worth = pd.Series(env.history['total_profit'], index=data.index[8900+1:])
returns = net_worth.pct_change().iloc[1:]

qs.reports.full(returns)

# Future Improvements

* ~~Create custom indicators for actions~~
* ~~Create a callback function to stop and save best training weights~~
* ~~Import a larger dataset for a longer period of time~~
* Try different RL Models like DQN or PPO
* ~~Bring in some quantstats to evaluate the performance better~~