In [1]:
import gymnasium as gym
import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_checker import check_env

In [2]:

# Download Apple stock hourly data
data = yf.download('AAPL', interval='1h', period='1mo')
data.to_csv('AAPL_hourly.csv')

[*********************100%***********************]  1 of 1 completed


In [8]:
data = pd.read_csv('AAPL_hourly.csv')
data.head()

Unnamed: 0,Price,Close,High,Low,Open,Volume
0,Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
1,Datetime,,,,,
2,2024-12-04 14:30:00+00:00,243.4499969482422,244.10499572753906,242.50999450683594,242.8800048828125,8409865
3,2024-12-04 15:30:00+00:00,242.90499877929688,243.82000732421875,242.85000610351562,243.44000244140625,3476264
4,2024-12-04 16:30:00+00:00,242.03990173339844,242.90989685058594,241.25,242.90499877929688,7183414


In [14]:
# Load Apple stock data
df = pd.read_csv('AAPL_hourly.csv')
# Drop the 'data' column
df = df.drop(columns=['Price'])

# Delete the first two rows
df = df.iloc[2:].reset_index(drop=True)

df.head()


Unnamed: 0,Close,High,Low,Open,Volume
0,243.4499969482422,244.10499572753903,242.50999450683597,242.8800048828125,8409865
1,242.90499877929688,243.82000732421875,242.8500061035156,243.44000244140625,3476264
2,242.03990173339844,242.90989685058597,241.25,242.90499877929688,7183414
3,242.2586975097656,242.50999450683597,241.63999938964844,242.0399932861328,2189900
4,243.07000732421875,243.1764068603516,241.92999267578125,242.2400054931641,3055452


In [15]:

df.head()

Unnamed: 0,Close,High,Low,Open,Volume
0,243.4499969482422,244.10499572753903,242.50999450683597,242.8800048828125,8409865
1,242.90499877929688,243.82000732421875,242.8500061035156,243.44000244140625,3476264
2,242.03990173339844,242.90989685058597,241.25,242.90499877929688,7183414
3,242.2586975097656,242.50999450683597,241.63999938964844,242.0399932861328,2189900
4,243.07000732421875,243.1764068603516,241.92999267578125,242.2400054931641,3055452


In [27]:
class StockTradingEnv(gym.Env):
    # ...existing code...
    def __init__(self, df):
        super(StockTradingEnv, self).__init__()
        self.df = df
        self.action_space = gym.spaces.Discrete(3)  # Buy, Hold, Sell
        self.observation_space = gym.spaces.Box(low=0, high=np.inf, shape=(df.shape[1],), dtype=np.float32)
        self.current_step = 0
        self.balance = 10000
        self.shares_held = 0
        self.net_worth = 10000

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.balance = 10000
        self.shares_held = 0
        self.net_worth = 10000
        return self._next_observation().astype(np.float32), {}

    def _next_observation(self):
        return self.df.iloc[self.current_step].values

    def step(self, action):
        # ...existing code...
        self.current_step += 1
        reward = self.net_worth - 10000
        done = self.current_step >= len(self.df) - 1
        obs = self._next_observation().astype(np.float32)
        terminated = done
        truncated = False
        info = {}
        return obs, reward, terminated, truncated, info

    def render(self, mode='human'):
        # ...existing code...
        print(f'Step: {self.current_step}')
        print(f'Balance: {self.balance}')
        print(f'Shares held: {self.shares_held}')
        print(f'Net worth: {self.net_worth}')


In [28]:
# Create and check environment
env = StockTradingEnv(df)
check_env(env)


In [29]:
# Vectorize environment
vec_env = DummyVecEnv([lambda: env])

In [30]:
# Train model
model = PPO('MlpPolicy', vec_env, verbose=1)
model.learn(total_timesteps=10000)

Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1176 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 768        |
|    iterations           | 2          |
|    time_elapsed         | 5          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00883119 |
|    clip_fraction        | 0.0323     |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.09      |
|    explained_variance   | 0          |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0167     |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.00136   |
|    value_loss           | 0.00104    |
----------------------------------------
-----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x2b259485400>

In [31]:
# Save model
model.save("ppo_stock_trading")


In [32]:
# Load model
model = PPO.load("ppo_stock_trading")

In [38]:
# Test model
obs = vec_env.reset()
net_worths = []
for i in range(len(df)):
    action, _states = model.predict(obs)
    obs, rewards, terminated, truncated = vec_env.step(action)
    net_worths.append(vec_env.envs[0].net_worth)
    vec_env.render()


