In [None]:
import pandas as pd

# Read the indicator-enriched dataset; the first CSV column is used as the index
data = pd.read_csv('preprocessed_NQ_data_with_indicators.csv', index_col=0)

# Report how many missing values each column contains before cleaning
print("Checking for NaN values in the dataset:")
print(data.isna().sum())

# Remove, in place, every row that has at least one missing value
data.dropna(axis=0, how='any', inplace=True)

# Print the per-column counts again to confirm nothing is missing anymore
print("NaN values after cleaning:")
print(data.isna().sum())


Checking for NaN values in the dataset:
rtype                0
publisher_id         0
instrument_id        0
open                 0
high                 0
low                  0
close                0
volume               0
symbol               0
RSI                152
ATR                 13
Fib_0.236           19
Fib_0.382           19
Fib_0.500           19
Fib_0.618           19
Fib_0.786           19
Fib_0.236_upper     19
Fib_0.236_lower     19
Fib_0.382_upper     19
Fib_0.382_lower     19
Fib_0.500_upper     19
Fib_0.500_lower     19
Fib_0.618_upper     19
Fib_0.618_lower     19
Fib_0.786_upper     19
Fib_0.786_lower     19
dtype: int64
NaN values after cleaning:
rtype              0
publisher_id       0
instrument_id      0
open               0
high               0
low                0
close              0
volume             0
symbol             0
RSI                0
ATR                0
Fib_0.236          0
Fib_0.382          0
Fib_0.500          0
Fib_0.618          0
Fib_0.7

In [None]:
import gym
from gym import spaces
import numpy as np
from stable_baselines3 import PPO

class TradingEnv(gym.Env):
    """Single-asset trading environment over a table of prices and indicators.

    On every step the agent picks one of three actions: 0 = Hold, 1 = Buy,
    2 = Sell. A buy spends ``RISK_FRACTION`` of the current cash balance at the
    current ``close`` value. A sell liquidates at most that same cash amount
    worth of units (or the whole position if it is smaller). The reward is
    non-zero only on sells: units sold multiplied by the change in ``close``
    since the previous row, clipped to ``[-REWARD_CLIP, REWARD_CLIP]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with one row per time step. It must contain a ``close`` column.
        A ``symbol`` column, if present, is dropped; every remaining column is
        part of the observation and is cast to float32.
    initial_balance : float, default 10000
        Cash available at the start of every episode.
    verbose : bool, default False
        When True, print the chosen action and the account state on every
        step. Off by default because the trace is one line per step.

    Raises
    ------
    ValueError
        If ``data`` has no rows.

    Notes
    -----
    NOTE(review): unit counts are computed directly from the ``close`` column.
    If that column is scaled rather than a raw price, very small values yield
    very large position sizes -- confirm which price series should drive the
    accounting.
    """

    # Fraction of the current cash balance committed to a single buy or sell.
    RISK_FRACTION = 0.005
    # Rewards are clipped to [-REWARD_CLIP, REWARD_CLIP].
    REWARD_CLIP = 1000

    def __init__(self, data, initial_balance=10000, verbose=False):
        super().__init__()

        # drop() returns a new frame, so the caller's DataFrame is untouched.
        # errors='ignore' lets tables without a 'symbol' column through.
        self.data = data.drop(columns=['symbol'], errors='ignore')
        if len(self.data) == 0:
            raise ValueError("TradingEnv requires at least one row of data")

        self.verbose = verbose
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.positions = 0  # Number of units currently held
        self.net_worth = initial_balance
        self.done = False

        # Actions: 0 = Hold, 1 = Buy, 2 = Sell
        self.action_space = spaces.Box if False else spaces.Discrete(3)

        # One float32 value per remaining column. The bounds are left open
        # because nothing in this class constrains the column values to a
        # fixed range.
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.data.shape[1],), dtype=np.float32)

    def reset(self):
        """Restore the starting account state and return the first observation."""
        self.current_step = 0
        self.balance = self.initial_balance
        self.positions = 0
        self.net_worth = self.initial_balance
        self.done = False
        return self._next_observation()

    def _next_observation(self):
        """Return the row at ``current_step`` as a float32 vector.

        The row index is clamped to the last row so that the observation
        returned together with ``done=True`` is always valid, even for a
        single-row table.
        """
        row = min(self.current_step, len(self.data) - 1)
        obs = self.data.iloc[row].to_numpy(dtype=np.float32)

        # Debugging aid: missing values in an observation would reach the policy
        if np.isnan(obs).any():
            print(f"Warning: NaN values detected in observation at step {self.current_step}")
            print(obs)

        return obs

    def step(self, action):
        """Apply ``action`` at the current row and advance one row.

        Returns
        -------
        tuple
            ``(observation, reward, done, info)`` where ``info`` is an empty dict.
        """
        if self.verbose:
            print(f"Action taken: {action}")

        closes = self.data['close']
        current_price = closes.iloc[self.current_step]
        reward = 0

        # A non-finite price cannot be used for accounting: end the episode
        # instead of letting NaN/inf leak into balance and positions.
        price_is_finite = bool(np.isfinite(current_price))
        if not price_is_finite:
            print(f"Warning: non-finite current_price ({current_price}) at step {self.current_step}")
            self.done = True

        # Trading additionally needs a strictly positive price, otherwise the
        # unit count below would divide by zero or flip sign.
        can_trade = price_is_finite and current_price > 0

        # Cash amount committed to a single trade
        risk_amount = self.balance * self.RISK_FRACTION

        if can_trade and action == 1:  # Buy
            shares_to_buy = risk_amount / current_price
            self.positions += shares_to_buy
            self.balance -= shares_to_buy * current_price

        elif can_trade and action == 2 and self.positions > 0:  # Sell
            # Sell the whole position or the risk amount's worth, whichever is smaller
            shares_to_sell = min(self.positions, risk_amount / current_price)
            self.positions -= shares_to_sell
            self.balance += shares_to_sell * current_price
            # On the first row there is no previous close; index -1 would
            # silently wrap around to the last row of the table.
            if self.current_step > 0:
                previous_price = closes.iloc[self.current_step - 1]
                reward = shares_to_sell * (current_price - previous_price)

        # Mark the account to market only with a usable price
        if price_is_finite:
            self.net_worth = self.balance + self.positions * current_price

        # Advance to the next step
        self.current_step += 1

        # Clip the reward to keep it within a bounded range
        reward = np.clip(reward, -self.REWARD_CLIP, self.REWARD_CLIP)

        if self.verbose:
            print(f"Step: {self.current_step}, Reward: {reward}, Balance: {self.balance}, Net Worth: {self.net_worth}, Positions: {self.positions}")

        # Check for NaN in any of the critical variables
        if np.isnan(reward) or np.isnan(self.balance) or np.isnan(self.net_worth):
            print("Critical NaN detected in environment variables.")
            print(f"reward: {reward}, balance: {self.balance}, net_worth: {self.net_worth}")
            self.done = True  # Force end the episode if NaN is detected
            if np.isnan(reward):
                # Never hand a NaN reward to the learner
                reward = 0.0

        if self.current_step >= len(self.data) - 1:
            self.done = True

        return self._next_observation(), reward, self.done, {}

    def render(self, mode='human'):
        """Print the current step and account state; ``mode`` is accepted but not used."""
        print(f'Step: {self.current_step}, Balance: {self.balance}, Net Worth: {self.net_worth}, Positions: {self.positions}')


In [None]:
# Assuming your data is preprocessed and loaded as 'data'
# Example: data = pd.read_csv('preprocessed_NQ_data_with_indicators.csv', index_col=0)

# Tunable settings for this training run
SEED = 42                # fixed seed so reruns on the same setup are repeatable
TOTAL_TIMESTEPS = 10000  # number of environment steps PPO is trained for

# Create the environment
env = TradingEnv(data)

# Instantiate the PPO model; the seed fixes policy initialisation and action sampling
model = PPO("MlpPolicy", env, verbose=1, seed=SEED)

# Train the model for the configured number of timesteps
model.learn(total_timesteps=TOTAL_TIMESTEPS)

# Save the trained model
model.save("ppo_trading_model")

print("Model training complete. The model has been saved as 'ppo_trading_model'.")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Action taken: 0
Step: 1, Reward: 0, Balance: 10000, Net Worth: 10000.0, Positions: 0
Action taken: 2
Step: 2, Reward: 0, Balance: 10000, Net Worth: 10000.0, Positions: 0
Action taken: 0
Step: 3, Reward: 0, Balance: 10000, Net Worth: 10000.0, Positions: 0
Action taken: 2
Step: 4, Reward: 0, Balance: 10000, Net Worth: 10000.0, Positions: 0
Action taken: 1
Step: 5, Reward: 0, Balance: 9950.0, Net Worth: 10000.0, Positions: 187.5783548115983
Action taken: 2
Step: 6, Reward: 0.04351667191490066, Balance: 9999.75, Net Worth: 10000.043773637764, Positions: 1.1011474885514758
Action taken: 2
Step: 7, Reward: -0.00024411755701001344, Balance: 10000.043529520208, Net Worth: 10000.043529520208, Positions: 0.0
Action taken: 2
Step: 8, Reward: 0, Balance: 10000.043529520208, Net Worth: 10000.043529520208, Positions: 0.0
Action taken: 1
Step: 9, Reward: 0, Balance: 9950.043311872607, Net Worth: 10000.043529



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Step: 7759, Reward: -0.03577926541391562, Balance: 7008.7850957417795, Net Worth: 24687.691309380905, Positions: 64778.21282578241
Action taken: 2
Step: 7760, Reward: 0.034426052672495794, Balance: 7043.829021220488, Net Worth: 24705.075589832803, Positions: 64649.93267217233
Action taken: 0
Step: 7761, Reward: 0, Balance: 7043.829021220488, Net Worth: 24688.480076929194, Positions: 64649.93267217233
Action taken: 0
Step: 7762, Reward: 0, Balance: 7043.829021220488, Net Worth: 24705.829931328422, Positions: 64649.93267217233
Action taken: 0
Step: 7763, Reward: 0, Balance: 7043.829021220488, Net Worth: 24689.234418424818, Positions: 64649.93267217233
Action taken: 0
Step: 7764, Reward: 0, Balance: 7043.829021220488, Net Worth: 24708.092955815275, Positions: 64649.93267217233
Action taken: 1
Step: 7765, Reward: 0, Balance: 7008.609876114386, Net Worth: 24691.49744291167, Positions: 64778.95341901968
Action taken: 2
Step: 77

In [None]:
from stable_baselines3 import PPO

# Load the trained model
model = PPO.load("ppo_trading_model")

# Reset the environment to start evaluation
obs = env.reset()

# Initialize variables to track performance
total_reward = 0

# Print the account state only every LOG_EVERY steps (and on the final step);
# one line per step floods the cell output on long datasets.
LOG_EVERY = 1000

# Run through the environment using the trained model
for step in range(len(data) - 1):
    # deterministic=True picks the policy's preferred action instead of
    # sampling one, so the evaluation measures the learned policy itself.
    action, _states = model.predict(obs, deterministic=True)

    # Take the action in the environment
    obs, reward, done, info = env.step(action)

    # Accumulate the reward
    total_reward += reward

    # Periodic progress report, plus the state at the end of the episode
    if done or (step + 1) % LOG_EVERY == 0:
        print(f"Step: {step + 1}, Action: {action}, Reward: {reward}, Balance: {env.balance}, Net Worth: {env.net_worth}, Positions: {env.positions}")

    # If the environment signals done, break the loop
    if done:
        break

# Print the total accumulated reward after evaluation
print(f"Total Reward: {total_reward}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Step: 1311475, Reward: 0, Balance: 1.5868128545092127e+137, Net Worth: 4.353089714794866e+139, Positions: 1.0902435895902994e+140
Step: 1311475, Action: 1, Reward: 0, Balance: 1.5868128545092127e+137, Net Worth: 4.353089714794866e+139, Positions: 1.0902435895902994e+140
Action taken: 2
Step: 1311476, Reward: -1000.0, Balance: 1.5947469187817588e+137, Net Worth: 3.988549880342895e+137, Positions: 1.0866420034801223e+140
Step: 1311476, Action: 2, Reward: -1000.0, Balance: 1.5947469187817588e+137, Net Worth: 3.988549880342895e+137, Positions: 1.0866420034801223e+140
Action taken: 0
Step: 1311477, Reward: 0, Balance: 1.5947469187817588e+137, Net Worth: 4.3437860035806674e+139, Positions: 1.0866420034801223e+140
Step: 1311477, Action: 0, Reward: 0, Balance: 1.5947469187817588e+137, Net Worth: 4.3437860035806674e+139, Positions: 1.0866420034801223e+140
Action taken: 2
Step: 1311478, Reward: -1000.0, Balance: 1.6027206533756676e

  shares_to_buy = risk_amount / current_price
  self.balance -= shares_to_buy * current_price
  self.net_worth = self.balance + self.positions * current_price
