In [1]:
!pip install gymnasium



In [2]:
import gymnasium as gym
import numpy as np
import pandas as pd
from gymnasium import spaces

class BTCTradingEnv(gym.Env):
    def __init__(self, df, initial_balance=100000, window_size=50, render_mode=None):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.initial_balance = initial_balance
        self.window_size = window_size
        self.render_mode = render_mode

        # Action space: 0 = Hold, 1 = Buy, 2 = Sell
        self.action_space = spaces.Discrete(3)

        # Observation: OHLCV + indicators + balance + holdings
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(26,), dtype=np.float32
        )

    def reset(self, seed=None, options=None):
        self.balance = self.initial_balance
        self.holdings = 0
        self.current_step = self.window_size
        self.net_worth = self.initial_balance
        self.prev_net_worth = self.net_worth
        self.net_worth_history = [self.net_worth]
        self.return_history = []
        self.prev_action = 0
        return self._next_observation(), {}

    def _next_observation(self):
        data = self.df.loc[self.current_step, [
            'open', 'high', 'low', 'close', 'volume', 'quote_asset_volume', 'RSI14', 'RSI30', 'RSI200',
            'EMA10', 'EMA30', 'EMA200', 'MOM10', 'MOM30', 'EMA12', 'EMA26', 'MACD', 'PROC9', 'Low14',
            'High14', '%K', '%K10', '%K30', '%K200'
        ]].values
        obs = np.array(list(data) + [self.balance, self.holdings], dtype=np.float32)
        return obs

    def step(self, action):
        price = self.df.loc[self.current_step, 'close']

        # Execute trade
        if action == 1 and self.balance >= price:  # Buy
            self.holdings += 1
            self.balance -= price
        elif action == 2 and self.holdings > 0:  # Sell
            self.holdings -= 1
            self.balance += price
        # else: Hold

        self.current_step += 1
        done = self.current_step >= len(self.df) - 1

        # Update net worth
        self.net_worth = self.balance + self.holdings * price
        self.net_worth_history.append(self.net_worth)

        # --- Compute Reward ---
        profit_reward = (self.net_worth - self.prev_net_worth) / self.initial_balance  # Scale to initial capital
        profit_reward *= 100  # Amplify the signal

        self.prev_net_worth = self.net_worth

        # Daily return
        if len(self.net_worth_history) > 1:
            daily_return = self.net_worth_history[-1] / self.net_worth_history[-2] - 1
            self.return_history.append(daily_return)

        volatility_penalty = np.std(self.return_history[-self.window_size:]) if len(self.return_history) >= self.window_size else 0
        volatility_penalty *= 10  # Amplify to be meaningful

        # Drawdown penalty
        peak = max(self.net_worth_history)
        drawdown = (self.net_worth - peak) / (peak + 1e-9)
        drawdown_penalty = abs(drawdown) if drawdown < 0 else 0
        drawdown_penalty *= 50  # Strong penalty for large drops

        # Trade penalty
        trade_penalty = 1.0 if action != self.prev_action else 0  # Now actually meaningful
        self.prev_action = action

        # Final reward (tuned weights)
        reward = (
            + 1.0 * profit_reward         # main driver
            - 0.3 * volatility_penalty    # mild penalty
            - 0.5 * drawdown_penalty      # stronger penalty
            - 0.05 * trade_penalty        # discourage flip-flopping
        )

        return self._next_observation(), reward, done, False, {}

    def render(self):
        if self.render_mode == "human":
            print(f"Step: {self.current_step}")
            print(f"Balance: {self.balance}, Holdings: {self.holdings}, Net Worth: {self.net_worth}")

In [3]:
gym.register(
    id="gymnasium_env/BTCTradingEnv",
    entry_point=BTCTradingEnv,
)


In [4]:
!pip install pandas numpy gym



In [5]:
import pandas as pd
df = pd.read_csv("/content/BTC_DATA.csv")
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
print (df.columns)
env = BTCTradingEnv(df)


Index(['timestamp', 'open', 'high', 'low', 'close', 'volume',
       'quote_asset_volume', 'RSI14', 'RSI30', 'RSI200', 'EMA10', 'EMA30',
       'EMA200', 'MOM10', 'MOM30', 'EMA12', 'EMA26', 'MACD', 'PROC9', 'Low14',
       'High14', '%K', '%K10', '%K30', '%K200', 'SMA_10', 'SMA_20', 'SMA_30',
       'BB_upper', 'BB_middle', 'BB_lower', 'ROC_14', 'ROC_30'],
      dtype='object')


In [6]:
#env = BTCTradingEnv(df)
obs = env.reset()

for i in range (0,10):
    #print("Initial observation:", obs)
    print(i+1)

    # Run one step with a random action
    action = env.action_space.sample()
    print ("action", action)
    next_obs, reward, terminated, truncated, _ = env.step(action)

    #print("Next observation:", next_obs)
    print("Reward:", reward)
    print("Terminated:", terminated)
    env.render()
    obs= next_obs


1
action 1
Reward: -0.05
Terminated: False
2
action 0
Reward: -0.12820000000001514
Terminated: False
3
action 0
Reward: 0.035460000000002816
Terminated: False
4
action 0
Reward: 0.0003799999999974232
Terminated: False
5
action 1
Reward: -0.020529999999998837
Terminated: False
6
action 0
Reward: -0.05917477193784375
Terminated: False
7
action 2
Reward: -0.09568368196235148
Terminated: False
8
action 0
Reward: -0.020279684317230907
Terminated: False
9
action 2
Reward: -0.08030149422727635
Terminated: False
10
action 0
Reward: -0.058091494227269946
Terminated: False


In [7]:
env.action_space.sample()

np.int64(0)

In [8]:
!pip install stable-baselines3[extra] --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m105.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m94.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [9]:
!pip install 'shimmy>=2.0'

Collecting shimmy>=2.0
  Downloading Shimmy-2.0.0-py3-none-any.whl.metadata (3.5 kB)
Downloading Shimmy-2.0.0-py3-none-any.whl (30 kB)
Installing collected packages: shimmy
Successfully installed shimmy-2.0.0


In [10]:
!pip install sb3-contrib

Collecting sb3-contrib
  Downloading sb3_contrib-2.6.0-py3-none-any.whl.metadata (4.1 kB)
Downloading sb3_contrib-2.6.0-py3-none-any.whl (92 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.8/92.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sb3-contrib
Successfully installed sb3-contrib-2.6.0


In [11]:
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.monitor import Monitor
from sb3_contrib import QRDQN

In [None]:
!pip install tensorboard



In [12]:
from gymnasium.wrappers import TimeLimit
vec_env = DummyVecEnv([lambda: Monitor(TimeLimit(BTCTradingEnv(df), max_episode_steps=1000))])
env= VecNormalize(vec_env, norm_obs=True, norm_reward=True)

In [17]:
model1 = A2C("MlpPolicy", env, verbose=1)
model1.learn(total_timesteps=50000)

Using cuda device




------------------------------------
| time/                 |          |
|    fps                | 325      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.09    |
|    explained_variance | -3.13    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0717  |
|    value_loss         | 0.0313   |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 1e+03     |
|    ep_rew_mean        | -1.13e+03 |
| time/                 |           |
|    fps                | 324       |
|    iterations         | 200       |
|    time_elapsed       | 3         |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -1        |
|    explained_variance | 0.0205    |
|    learning_rate      | 

<stable_baselines3.a2c.a2c.A2C at 0x7af5b2029350>

In [16]:
model2 = PPO("MlpPolicy", env, verbose=1)
model2.learn(total_timesteps=50000)

Using cuda device




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -574     |
| time/              |          |
|    fps             | 416      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | -476        |
| time/                   |             |
|    fps                  | 379         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009143602 |
|    clip_fraction        | 0.0757      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 0.264       |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x7af5b3e3fb90>

In [None]:
model3 = DQN("MlpPolicy", env, verbose=1)
model3.learn(total_timesteps=50000)

In [13]:
from sb3_contrib import RecurrentPPO
from sb3_contrib.ppo_recurrent.policies import RecurrentActorCriticPolicy

model4 = RecurrentPPO(
    RecurrentActorCriticPolicy,
    env,
    verbose=1,
)
model4.learn(total_timesteps=50000)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|    value_loss           | 0.000527    |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1e+03        |
|    ep_rew_mean          | -423         |
| time/                   |              |
|    fps                  | 164          |
|    iterations           | 154          |
|    time_elapsed         | 119          |
|    total_timesteps      | 19712        |
| train/                  |              |
|    approx_kl            | 0.0027849474 |
|    clip_fraction        | 0.0742       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.326       |
|    explained_variance   | -2.36        |
|    learning_rate        | 0.0003       |
|    loss                 | -0.0147      |
|    n_updates            | 1530         |
|    policy_gradient_loss | -0.0081      |
|    value_loss           | 0.0006

<sb3_contrib.ppo_recurrent.ppo_recurrent.RecurrentPPO at 0x7af5bcd77a50>

In [18]:
model2.save("ppo_btc_trading")
model1.save("a2c_btc_trading")


In [15]:
# Save
model4.save("recurrent_ppo_btc_trading_2")

# Load (later, if needed)
# model = PPO.load("ppo_btc_trading")
