In [None]:
import numpy as np
import pandas as pd
import gym
from gym import spaces
from stable_baselines3 import PPO

class StockTradingEnv(gym.Env):
    """Single-asset, all-in/all-out trading environment over a price table.

    The action is a one-element continuous value in ``[-1, 1]``:

    * ``> 0``  -- spend as much cash as possible on whole shares at the
      current ``Close`` price (recorded as one lot in ``positions``);
    * ``< 0``  -- sell the oldest open lot at the current ``Close`` price;
    * ``== 0`` -- hold.

    Each observation is a ``(window_size, n_features)`` float32 array holding
    the numeric columns of the most recent ``window_size`` rows up to and
    including the current step.  Only past and present rows are exposed, so
    the policy never sees prices it could not know yet.  While fewer than
    ``window_size`` rows of history exist, the earliest row is repeated so the
    shape always matches ``observation_space``.

    The per-step reward is the change in net worth (cash plus the market
    value of open lots) since the previous step.

    Args:
        data: ``pandas.DataFrame`` with a numeric ``Close`` column and at
            least one row.  All numeric columns are used as features.
        window_size: Number of rows in each observation (>= 1).
        initial_balance: Starting cash for every episode.

    Raises:
        ValueError: If ``data`` is empty or ``window_size`` is less than 1.
        KeyError: If ``data`` has no ``Close`` column.
    """

    def __init__(self, data, window_size=10, initial_balance=10000):
        super().__init__()

        if len(data) == 0:
            raise ValueError("data must contain at least one row")
        if window_size < 1:
            raise ValueError("window_size must be at least 1")

        self.data = data
        self.window_size = window_size
        self.initial_balance = initial_balance

        # Convert once up front: per-step DataFrame indexing is slow, and
        # restricting features to numeric columns keeps e.g. a date column
        # from turning observations into object arrays.
        self._close = data['Close'].to_numpy(dtype=np.float64)
        self._features = data.select_dtypes(include='number').to_numpy(dtype=np.float32)

        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)
        # Observations are raw (unnormalised) values, so the bounds are open.
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(window_size, self._features.shape[1]),
            dtype=np.float32,
        )

        self.reset()

    def reset(self):
        """Start a new episode and return the first observation."""
        self.balance = self.initial_balance
        self.current_step = 0
        self.done = False
        self.positions = []
        # Net worth at the previous step; the reward is the delta against it.
        self._net_worth = float(self.initial_balance)

        return self._next_observation()

    def step(self, action):
        """Apply ``action`` at the current price and advance one row.

        Args:
            action: Array-like (or scalar) whose first element is the signal.

        Returns:
            Tuple ``(obs, reward, done, info)``; ``info['net_worth']`` holds
            cash plus the market value of open lots after the step.
        """
        current_price = self._price_at(self.current_step)
        # Accept both the usual shape-(1,) array and a bare scalar.
        signal = float(np.asarray(action).reshape(-1)[0])

        if signal > 0:
            # A zero/negative/NaN price cannot be traded against (NaN fails
            # the comparison), and would otherwise divide by zero.
            if current_price > 0:
                shares_to_buy = self.balance // current_price
                # Skip empty lots: queuing them would make later sell
                # actions pop nothing-lots before reaching real holdings.
                if shares_to_buy > 0:
                    self.positions.append(shares_to_buy)
                    self.balance -= shares_to_buy * current_price
        elif signal < 0 and self.positions:
            shares_to_sell = self.positions.pop(0)
            self.balance += shares_to_sell * current_price

        self.current_step += 1
        if self.current_step >= len(self.data) - 1:
            self.done = True

        # Mark open lots to market at the new step's price so that buying is
        # not scored as an immediate loss of the cash that was spent.
        mark_price = self._price_at(self.current_step)
        net_worth = float(self.balance + sum(self.positions) * mark_price)
        reward = net_worth - self._net_worth
        self._net_worth = net_worth

        obs = self._next_observation()
        info = {'net_worth': net_worth}

        return obs, reward, self.done, info

    def _price_at(self, step):
        """Return the ``Close`` price at ``step``, clamped to the last row."""
        return self._close[min(step, len(self._close) - 1)]

    def _next_observation(self):
        """Return the trailing feature window ending at the current step."""
        last = min(self.current_step, len(self._features) - 1)
        first = max(0, last - self.window_size + 1)
        window = self._features[first:last + 1]

        missing = self.window_size - len(window)
        if missing > 0:
            # Not enough history yet: left-pad by repeating the oldest row.
            padding = np.repeat(window[:1], missing, axis=0)
            window = np.concatenate([padding, window], axis=0)

        # Always hand back a fresh array so callers cannot mutate the cache.
        return window.astype(np.float32, copy=True)


# Load the price history and wrap it in the trading environment.
data = pd.read_csv(filepath_or_buffer="your_data_file.csv")
env = StockTradingEnv(data)

# Train a PPO agent with an MLP policy on the environment.
model = PPO(policy="MlpPolicy", env=env, verbose=1)
model.learn(total_timesteps=100_000)

# Roll the trained policy through one full episode.
obs = env.reset()
done = env.done
while not done:
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)

# Report the cash balance the environment holds at the end of the episode.
print(f"Final balance: {env.balance}")