In [58]:
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box


# Create a sample DataFrame
df = pd.read_csv("/Users/larka/GitHub/RLmodel/MGOL.csv")  # Replace with actual file
df = df.drop(columns=['datetime','symbol', 'frame'])
#df['datetime'] = pd.to_datetime(df['datetime'], format='%m/%d/%y %H:%M')
#df.set_index('datetime', inplace=True)
#df = pd.DataFrame(data)

In [84]:
class TradingEnv(Env):
    def __init__(self, df):
        super(TradingEnv, self).__init__()
        self.df = df
        self.current_step = 0
        self.balance = 10000  # Starting cash
        self.shares = 0
        self.buy_price = 0
        self.sell_price = 0
        self.total_reward = 0
        self.position_open = False  # Track open positions
        self.round_trip_trades = 0  # Counter for buy-sell cycles
        reward = 0

        # Action Space: 0 = Hold, 1 = Buy, 2 = Sell
        self.action_space = Discrete(3)

        # Observation Space: Open, High, Low, Close, Volume
        #self.observation_space = Box(
        #    low=-np.inf, high=np.inf, shape=(5,), dtype=np.float32
        #)
        self.observation_space = Box(
            low=-np.inf, high=np.inf, shape=(9,), dtype=np.float32
        )
    
    def get_valid_actions(self):
        valid_actions = [0]  # Hold is always valid
        if not self.position_open:  # Allow Buy only if no position is open
            valid_actions.append(1)
        if self.position_open:  # Allow Sell only if a position is open
            valid_actions.append(2)
        return valid_actions

    def get_obs(self):
        # Include the current position and valid actions in the observation
        position = [1] if self.position_open else [0]  # Shape: (1,)
        valid_actions = self.get_valid_actions()
        action_mask = [1 if i in valid_actions else 0 for i in range(self.action_space.n)]  # Shape: (3,)
        
        # Combine all features into a single observation array
        obs = np.concatenate([
            self.df.iloc[self.current_step][["open", "high", "low", "close", "volume"]].values,  # Shape: (5,)
            position,  # Shape: (1,)
            action_mask  # Shape: (3,)
        ])
        
        return obs
    
    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.balance = 10000
        self.shares = 0
        self.position_open = False
        self.round_trip_trades = 0
        self.buy_price = 0
        self.sell_price = 0
        self.net_profit = 0
        reward = 0
        obs = self.get_obs()  # Use the updated observation method
        return obs, {}

    def step(self, action):
        if self.current_step >= len(self.df) - 1:
            return np.zeros(self.observation_space.shape), 0, True, False, {}
    
        current_price = self.df.iloc[self.current_step]["close"]
        reward = 0
        done = False
        truncated = False
    
        valid_actions = self.get_valid_actions()
        if action not in valid_actions:
            #print('convert action',action,'to hold.')
            action = 0  # Force to hold if invalid action
            reward -= 0.1  # Small penalty for holding
        else:
            if action == 1:  # Buy
                self.shares = 1
                self.balance -= current_price
                self.position_open = True
                self.buy_price = current_price  # Store the buy price
                reward += 10#.001  # Small penalty to avoid random buying
    
            elif action == 2:  # Sell
                if self.position_open:  # Ensure position is open before selling
                    self.shares = 0
                    self.balance += current_price
                    self.sell_price = current_price
                    reward = abs(current_price - self.buy_price) * 1000  # Reward profit
                    self.net_profit = self.sell_price - self.buy_price
                    self.buy_price = 0
                    self.position_open = False
                    self.round_trip_trades += 1  # Only increment after completing a round trip
                    #print(f"Round trip trade completed. Total round trips: {self.round_trip_trades}")

                else:
                    reward -= 0.1  # Small penalty for trying to sell without position
            else:
                reward -= 0.1  # Small penalty for trying to sell without position

        # Update the total reward for the episode
        self.total_reward += reward
    
        # Terminate after 10 round-trip trades or end of data
        if self.round_trip_trades >= 10:
            done = True
        if self.current_step >= len(self.df) - 1:
            done = True
    
        self.current_step += 1
        obs = self.get_obs()  # Use the updated observation method
        info = {"valid_actions": valid_actions}  # Include valid actions in info for debugging
    
        return obs, reward, done, truncated, info

In [85]:
# Initialize the environment
env = TradingEnv(df)

In [11]:
def simple_policy(env):
    # Alternate between buying and selling
    if env.position_open:
        return 2  # Sell
    else:
        return 1  # Buy

In [64]:
import unittest

class TestTradingEnv(unittest.TestCase):

    def setUp(self):
        # Create a sample DataFrame
        data = {
            'open': [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121],
            'high': [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122],
            'low': [99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120],
            'close': [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121],
            'volume': [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 22000]
        }
        data = df


        self.df = pd.DataFrame(data)
        self.env = TradingEnv(self.df)

    def test_initial_state(self):
        obs, _ = self.env.reset()
        self.assertEqual(self.env.current_step, 0)
        self.assertEqual(self.env.balance, 10000)
        self.assertEqual(self.env.shares, 0)
        self.assertFalse(self.env.position_open)
        self.assertEqual(self.env.round_trip_trades, 0)
        self.assertEqual(self.env.buy_price, 0)
        self.assertEqual(obs.shape, (9,))  # Check observation shape

    def test_buy_action(self):
        self.env.reset()
        action = 1  # Buy
        obs, reward, done, truncated, info = self.env.step(action)
        self.assertTrue(self.env.position_open)
        self.assertEqual(self.env.shares, 1)
        self.assertEqual(self.env.balance, 10000 - self.df.iloc[0]["close"])
        self.assertEqual(self.env.buy_price, self.df.iloc[0]["close"])
        self.assertEqual(obs.shape, (9,))  # Check observation shape

    def test_sell_action(self):
        print('test buy, then sell')
        self.env.reset()
        self.env.step(1)  # Buy first
        action = 2  # Sell
        obs, reward, done, truncated, info = self.env.step(action)
        print(obs)
        print(reward)
        print(done)
        print(truncated)
        print(info)
        self.assertFalse(self.env.position_open)
        self.assertEqual(self.env.shares, 0)
        self.assertEqual(obs.shape, (9,))  # Check observation shape
        print(self.env.buy_price)
        print(self.env.sell_price)
        print(self.env.net_profit)

    def test_hold_action(self):
        self.env.reset()
        action = 0  # Hold
        obs, reward, done, truncated, info = self.env.step(action)
        self.assertFalse(self.env.position_open)
        self.assertEqual(self.env.shares, 0)
        self.assertEqual(self.env.balance, 10000)
        self.assertEqual(obs.shape, (9,))  # Check observation shape

    def test_invalid_action(self):
        print('test sell first')
        self.env.reset()
        action = 2  # Sell (invalid since no position is open)
        obs, reward, done, truncated, info = self.env.step(action)
        print(obs)
        print(reward)
        print(done)
        print(truncated)
        print(info)
        self.assertFalse(self.env.position_open)
        self.assertEqual(self.env.shares, 0)
        self.assertEqual(self.env.balance, 10000)
        self.assertEqual(obs.shape, (9,))  # Check observation shape

    def test_round_trip_trades(self):
        self.env.reset()
        for _ in range(10):
            print(f"Counter: {_}")
            print(f"Step number: {self.env.current_step}")
            
            # Step 1: Buy action
            obs, reward, done, truncated, info = self.env.step(1)  # Buy
            
            # Step 2: Sell action
            obs, reward, done, truncated, info = self.env.step(2)  # Sell
            
            print(f"Round trip trades: {self.env.round_trip_trades}")
        self.assertTrue(self.env.round_trip_trades >= 10)
        self.assertTrue(done)

# Create a test suite
suite = unittest.TestLoader().loadTestsFromTestCase(TestTradingEnv)

# Run the tests
unittest.TextTestRunner().run(suite)


......
----------------------------------------------------------------------
Ran 6 tests in 0.039s

OK


test sell first
[3.3860e-01 3.3880e-01 3.2650e-01 3.2650e-01 1.1337e+04 0.0000e+00
 1.0000e+00 1.0000e+00 0.0000e+00]
0
False
False
{'valid_actions': [0, 1]}
Counter: 0
Step number: 0
Round trip trades: 1
Counter: 1
Step number: 2
Round trip trades: 2
Counter: 2
Step number: 4
Round trip trades: 3
Counter: 3
Step number: 6
Round trip trades: 4
Counter: 4
Step number: 8
Round trip trades: 5
Counter: 5
Step number: 10
Round trip trades: 6
Counter: 6
Step number: 12
Round trip trades: 7
Counter: 7
Step number: 14
Round trip trades: 8
Counter: 8
Step number: 16
Round trip trades: 9
Counter: 9
Step number: 18
Round trip trades: 10
test buy, then sell
[3.290e-01 3.352e-01 3.240e-01 3.288e-01 8.203e+03 0.000e+00 1.000e+00
 1.000e+00 0.000e+00]
-12.1
False
False
{'valid_actions': [0, 2]}
0
0.3265
-0.0121


<unittest.runner.TextTestResult run=6 errors=0 failures=0>

In [86]:
from stable_baselines3 import A2C

# Create a model using Stable Baselines3
model_a2c = A2C('MlpPolicy', env, tensorboard_log="./tensorboard_logs/", ent_coef=0.1)

# Train the model for 10000 timesteps
model_a2c.learn(total_timesteps=50000)

# Evaluate the model
total_reward = 0
episodes = 10
for episode in range(episodes):
    obs, info = env.reset()  # Corrected: Unpack reset() correctly
    done = False
    episode_reward = 0

    while not done:
        action, _ = model_a2c.predict(obs, deterministic=True)
        obs, reward, done, truncated, info = env.step(action)  # Correct unpacking
        episode_reward += reward

    total_reward += episode_reward
    print(f"Episode {episode + 1}: Total Reward = {episode_reward}")

print(f"Average reward over {episodes} episodes: {total_reward / episodes}")

Episode 1: Total Reward = -85.79999999999937
Episode 2: Total Reward = -85.79999999999937
Episode 3: Total Reward = -85.79999999999937
Episode 4: Total Reward = -85.79999999999937
Episode 5: Total Reward = -85.79999999999937
Episode 6: Total Reward = -85.79999999999937
Episode 7: Total Reward = -85.79999999999937
Episode 8: Total Reward = -85.79999999999937
Episode 9: Total Reward = -85.79999999999937
Episode 10: Total Reward = -85.79999999999937
Average reward over 10 episodes: -85.79999999999937


In [83]:
from stable_baselines3 import PPO

# Create a model using Stable Baselines3 PPO
model_ppo = PPO('MlpPolicy', env, tensorboard_log="./tensorboard_logs/", ent_coef=0.1)

# Train the model for 50000 timesteps
model_ppo.learn(total_timesteps=50000)

# Evaluate the model
total_reward = 0
episodes = 10
for episode in range(episodes):
    obs, info = env.reset()  # Corrected: Unpack reset() correctly
    done = False
    episode_reward = 0

    while not done:
        action, _ = model_ppo.predict(obs, deterministic=True)
        obs, reward, done, truncated, info = env.step(action)  # Correct unpacking
        episode_reward += reward

    total_reward += episode_reward
    print(f"Episode {episode + 1}: Total Reward = {episode_reward}")

print(f"Average reward over {episodes} episodes: {total_reward / episodes}")

Episode 1: Total Reward = -95.89999999999883
Episode 2: Total Reward = -95.89999999999883
Episode 3: Total Reward = -95.89999999999883
Episode 4: Total Reward = -95.89999999999883
Episode 5: Total Reward = -95.89999999999883
Episode 6: Total Reward = -95.89999999999883
Episode 7: Total Reward = -95.89999999999883
Episode 8: Total Reward = -95.89999999999883
Episode 9: Total Reward = -95.89999999999883
Episode 10: Total Reward = -95.89999999999883
Average reward over 10 episodes: -95.89999999999883
