# Bot Trader Using Reinforcement Learning

## Imports needed

In [2]:
import gym
import numpy as np
import pandas as pd
import yfinance as yf
import seaborn as sns
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from gym import spaces
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Create Stock Environment

In [51]:
class StockTradingEnv(gym.Env):
    def __init__(self, df):
        super(StockTradingEnv, self).__init__()
        self.df = df
        self.n_steps = len(df)
        self.current_step = 0

        self.initial_balance = 600.0
        self.balance = self.initial_balance
        self.shares_held = 0

        # Observation space: [price, balance, shares held]
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(3,), dtype=np.float32)
        # Action space: 0 = hold, 1 = buy, 2 = sell
        self.action_space = spaces.Discrete(3)

    def reset(self):
        self.current_step = 0
        self.balance = self.initial_balance
        self.shares_held = 0
        return self._get_obs()

    def _get_obs(self):
        price = float(self.df.iloc[self.current_step]['Close'])
        return np.array([price, self.balance, self.shares_held], dtype=np.float32)

    def step(self, action):
        price = float(self.df.iloc[self.current_step]['Close'])

        # Simple buy/sell logic
        if action == 1 and self.balance >= price: # Buy Share
            self.shares_held += 1
            self.balance -= price
        elif action == 2 and self.shares_held > 0: # Sell Share
            self.shares_held -= 1
            self.balance += price

        self.current_step += 1
        done = self.current_step >= self.n_steps - 1
        obs = self._get_obs()

        # Reward = total assets gain/loss
        total_assets = self.balance + self.shares_held * price
        reward = total_assets - self.initial_balance

        return obs, reward, done, {}



## Prepare Data

In [52]:
# Data split
df = yf.download("RIVN", start="2022-01-01", end="2024-01-01")
df = df.reset_index()
train_data, test_data = train_test_split(df, test_size=0.2, shuffle=False)

[*********************100%***********************]  1 of 1 completed


## Train Model

In [53]:
env = StockTradingEnv(df)
env_train=StockTradingEnv(train_data)


#Training the model
model = PPO("MlpPolicy", env_train, verbose=1)
model.learn(total_timesteps=10000)


obs = env.reset()
done = False
i=0;
action_hist=[];
states_hist=[];
obs_hist=[];
reward_hist=[];
while not done:
    action, states = model.predict(obs)
    states_hist.append(states)
    action_hist.append(action)
    obs, reward, done, _ = env.step(action)
    obs_hist.append(obs)
    reward_hist.append(reward)
    
    i+=1

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  price = float(self.df.iloc[self.current_step]['Close'])
  price = float(self.df.iloc[self.current_step]['Close'])


----------------------------------
| rollout/           |           |
|    ep_len_mean     | 399       |
|    ep_rew_mean     | -6.23e+04 |
| time/              |           |
|    fps             | 479       |
|    iterations      | 1         |
|    time_elapsed    | 4         |
|    total_timesteps | 2048      |
----------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 399          |
|    ep_rew_mean          | -6.5e+04     |
| time/                   |              |
|    fps                  | 413          |
|    iterations           | 2            |
|    time_elapsed         | 9            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 6.800535e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.1         |
|    explained_variance   | 4.51e-05     |
|    

In [59]:
# Use Seaborn's style for a cleaner look
sns.set(style="whitegrid")

# Example data
x = np.linspace(0, len(reward_hist), len(reward_hist))

reward_hist_array=np.array(reward_hist)

plt.figure(figsize=(10,5), tight_layout="tight")
plt.subplot(2,2,1)
plt.plot(x, reward_hist_array+600, color='tab:blue', linewidth=2)


# Add title and labels
plt.title('Net worth over time', fontsize=16)
plt.xlabel('Days', fontsize=14)
plt.ylabel('Reward', fontsize=14)

# Show legend


plt.subplot(2,2,2)
plt.plot(range(len(df['Open'])),df['Open'], label= "Opening Price")

plt.plot(range(len(df['High'])),df['High']+30, label="High Price+30")

plt.plot(range(len(df['Low'])),df['Low']-30,label="Low Price-30")

plt.title('price vs time', fontsize=16)
plt.xlabel('Days', fontsize=14)
plt.ylabel('Price', fontsize=14)
plt.legend()


plt.subplot(2,2,4)

n=np.array(obs_hist)
plt.plot(range(len(action_hist)), n[:,2], color="r")
plt.title("Number of Share Over Time")
plt.xlabel('Days', fontsize=14)
plt.ylabel('Share', fontsize=14)

plt.subplot(2,2,3)
df_array=np.array(df)
plt.plot(range(len(df_array[:,4])),600/np.array(df_array[0,4])*df_array[:,4])
plt.title("Net Worth Of Hold over time")


# Show the plot
plt.show()


In [58]:
%matplotlib qt