In [None]:
def choose_market_type(choice=None):
    """Pick the simulated market regime and return the key the envs expect.

    Args:
        choice: Optional menu number (1-5) as an int or string.  When omitted
            the menu is printed and the user is prompted until a valid number
            is entered.

    Returns:
        None for the standard market, otherwise one of "high_volatility",
        "bull_market", "bear_market" or "sideways" -- the exact strings that
        the environments' ``_init_data`` compares ``market_condition`` with.

    Raises:
        ValueError: if an explicit ``choice`` is not a number from 1 to 5.
    """
    # (menu label, key understood by the environments)
    market_types = [
        ("Standard Market", None),
        ("High Volatility Market", "high_volatility"),
        ("Bull Market", "bull_market"),
        ("Bear Market", "bear_market"),
        ("Sideways Market", "sideways"),
    ]

    interactive = choice is None
    if interactive:
        print("Which trading environment do you want to use?\n Enter the corresponding number.")
        for number, (label, _) in enumerate(market_types, start=1):
            print(f"{number}: {label}")

    while True:
        if interactive:
            choice = input("Your choice: ")
        try:
            index = int(str(choice).strip())
        except ValueError:
            index = 0  # falls through to the range check below
        if 1 <= index <= len(market_types):
            break
        if not interactive:
            raise ValueError(
                f"choice must be a number from 1 to {len(market_types)}, got {choice!r}"
            )
        print(f"Please enter a number from 1 to {len(market_types)}.")

    label, condition = market_types[index - 1]
    # Echo the selection (the standard market is echoed as "standard").
    print("standard" if condition is None else label)
    return condition

# Prompt for the market regime to simulate; the result is None when option 1
# (the standard market) is selected.
market_condition= choose_market_type()

Which trading environment do you want to use?
 Enter the corresponding number.
1: Standard Market
2: High Volatility Market
3: Bull Market
4: Bear Market
5: Sideways Market
Your choice: 1
standard


In [None]:
import numpy as np
import gymnasium as gym
from enum import Enum
import math
import matplotlib.pyplot as plt

class Actions(Enum):
    """Trade decisions available to the agent.

    The environments map the discrete action indices 0, 1 and 2 to the values
    of Hold, Sell and Buy respectively through their ``action_map``.
    """
    Hold = 0
    Sell = -1
    Buy = 1

class Positions(Enum):
    """Side of the market the environment currently reports."""

    Short = -1
    Long = 1

    def opposite(self):
        """Return Short for Long, and Long for anything else."""
        if self == Positions.Long:
            return Positions.Short
        return Positions.Long

class CryptoEnv3(gym.Env):
    """Single-asset trading environment with a configurable, weighted reward.

    Observation: the last ``window_size`` rows of every ``df`` column after the
    first, flattened, followed by cash / ``initial_amount`` and the raw number
    of units held.  Action space: ``Discrete(3)`` with 0 = hold, 1 = sell,
    2 = buy.  Episodes never terminate; they are truncated at the last tick.

    Args:
        df: DataFrame with a ``"close"`` column; columns from index 1 onwards
            are used as features and must be castable to float32.
        window_size: Number of rows in each observation window.
        frame_bound: ``(start, end)`` row positions; data is sliced from
            ``start - window_size`` to ``end``.
        initial_amount: Starting cash.
        render_mode: Stored only; ``render`` always draws with matplotlib.
        market_condition: None, one of "high_volatility", "bull_market",
            "bear_market", "sideways", or the matching menu label
            (e.g. "Bull Market").  Only the traded prices are altered, the
            observed features are not.
        profit_weight, volatility_weight, drawdown_weight,
        trade_penalty_weight: Weights of the reward terms.
        reward_up_limit, reward_low_limit: Reward clipping bounds.
        risk_percentage: Fraction of account equity risked per trade.
        stop_loss: Assumed stop distance used for position sizing; the traded
            value is ``risk_percentage * equity / stop_loss``.

    Raises:
        ValueError: if ``stop_loss`` is not positive.
    """

    metadata = {"render_modes": ["human"], "render_fps": 3}

    # Menu labels (lower-cased) mapped to the keys _init_data() branches on,
    # so a label such as "Bull Market" is not silently ignored.
    _MARKET_ALIASES = {
        "standard market": None,
        "high volatility market": "high_volatility",
        "bull market": "bull_market",
        "bear market": "bear_market",
        "sideways market": "sideways",
    }

    def __init__(
        self,
        df,
        window_size,
        frame_bound,
        initial_amount=100000,
        render_mode=None,
        market_condition=None,
        profit_weight=300,
        volatility_weight=0.1,
        drawdown_weight=0.1,
        trade_penalty_weight=0.0005,
        reward_up_limit=30,
        reward_low_limit=-30,
        risk_percentage=0.05,
        stop_loss=0.05,
    ):
        super().__init__()
        if stop_loss <= 0:
            raise ValueError(f"stop_loss must be positive, got {stop_loss!r}")

        self.df = df
        self.window_size = window_size
        self.frame_bound = frame_bound
        self.initial_amount = initial_amount
        self.render_mode = render_mode
        self.trade_fee_percent = 0.0015  # 0.15%
        self.market_condition = self._normalize_market_condition(market_condition)
        self.profit_weight = profit_weight
        self.volatility_weight = volatility_weight
        self.drawdown_weight = drawdown_weight
        self.trade_penalty_weight = trade_penalty_weight
        self.reward_up_limit = reward_up_limit
        self.reward_low_limit = reward_low_limit
        self.risk_percentage = risk_percentage
        self.stop_loss = stop_loss

        self._init_data()
        self._init_spaces()

        # Discrete action index -> signed Actions value.
        self.action_map = {
            0: Actions.Hold.value,
            1: Actions.Sell.value,
            2: Actions.Buy.value,
        }
        self.action_space = gym.spaces.Discrete(len(self.action_map))

    @classmethod
    def _normalize_market_condition(cls, market_condition):
        """Translate a menu label to its key; pass anything else through."""
        if isinstance(market_condition, str):
            label = market_condition.strip().lower()
            if label in cls._MARKET_ALIASES:
                return cls._MARKET_ALIASES[label]
        return market_condition

    def _init_data(self):
        """Slice prices and features and apply the selected market scenario."""
        first_row = self.frame_bound[0] - self.window_size
        last_row = self.frame_bound[1]
        prices = self.df["close"].values[first_row:last_row].astype(np.float32)

        # Scenario simulation (uses NumPy's global RNG, so it is not affected
        # by the seed passed to reset()).
        if self.market_condition == "high_volatility":
            noise = np.random.normal(0, 0.05, size=prices.shape)
            prices = prices * (1 + noise)
        elif self.market_condition == "bull_market":
            trend = np.linspace(1, 1.2, num=len(prices))  # 20% upward drift
            prices = prices * trend
        elif self.market_condition == "bear_market":
            trend = np.linspace(1, 0.8, num=len(prices))  # 20% downward drift
            prices = prices * trend
        elif self.market_condition == "sideways":
            mean_price = np.mean(prices)
            # ~0.2% standard deviation around the mean price
            prices = mean_price + np.random.normal(0, 0.002 * mean_price, size=prices.shape)

        self.prices = prices
        self.signal_features = self.df.iloc[first_row:last_row, 1:].values.astype(np.float32)

        self.shape = (self.window_size, self.signal_features.shape[1])
        self._start_tick = self.window_size
        self._end_tick = len(self.prices) - 1

    def _init_spaces(self):
        """Create the flat observation space and the position space."""
        self.feature_dim = self.signal_features.shape[1]
        # +2 for the cash balance and the holdings appended to each window
        self.observation_space = gym.spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(self.window_size * self.feature_dim + 2,),
            dtype=np.float32,
        )

        self.position_map = {
            0: Positions.Short.value,
            1: Positions.Long.value,
        }
        self.position_space = gym.spaces.Discrete(len(self.position_map))

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``."""
        super().reset(seed=seed)
        self._current_tick = self._start_tick
        self._last_trade_tick = self._current_tick - 1
        self._position = Positions.Short
        self._position_history = [None] * self._start_tick + [self._position]
        self.trailing_peak = None  # highest price seen while Long
        self.trailing_trough = None  # lowest price seen while Short

        # Capital tracking
        self.balance = self.initial_amount
        self.holdings = 0
        self.net_worth = self.initial_amount
        self.prev_net_worth = self.net_worth
        self._peak_net_worth = self.net_worth  # running maximum for drawdown
        self._total_profit = self.initial_amount
        self._total_reward = 0.0

        self.return_history = []
        self.reward_history = []
        self.net_worth_history = [self.net_worth]
        self.prev_action = Actions.Hold.value

        self._truncated = False
        return self._get_observation(), self._get_info()

    def _get_observation(self):
        """Return the flattened feature window plus balance and holdings."""
        window = self.signal_features[
            self._current_tick - self.window_size + 1 : self._current_tick + 1
        ].flatten()
        norm_balance = self.balance / self.initial_amount
        norm_holdings = self.holdings  # raw unit count, not normalized
        return np.concatenate([window, [norm_balance, norm_holdings]]).astype(np.float32)

    def step(self, action_index):
        """Advance one tick, trade, and return the Gymnasium 5-tuple."""
        action = self.action_map[action_index]
        self._current_tick += 1
        self._truncated = self._current_tick >= self._end_tick

        # _calculate_reward() also executes the trade for this step.
        reward = self._calculate_reward(action)
        self.reward_history.append(reward)
        self._total_reward += reward

        self._update_profit(action)
        self._handle_position_change()
        self._position_history.append(self._position)

        return (
            self._get_observation(),
            reward,
            False,  # no terminal condition, only truncation
            self._truncated,
            self._get_info(),
        )

    def _handle_position_change(self):
        """Derive the position from holdings and update the trailing extreme."""
        price = self.prices[self._current_tick]
        if self.holdings > 0:
            if self._position != Positions.Long or self.trailing_peak is None:
                self.trailing_peak = price  # a new long starts a new peak
            else:
                self.trailing_peak = max(self.trailing_peak, price)
            self._position = Positions.Long
        else:
            if self._position != Positions.Short or self.trailing_trough is None:
                self.trailing_trough = price
            else:
                self.trailing_trough = min(self.trailing_trough, price)
            self._position = Positions.Short

    def _price_at(self, tick):
        """Return the price at ``tick`` as a float, or 1.0 when it is unusable."""
        price = float(self.prices[tick])
        if not math.isfinite(price) or price < 1e-8:
            return 1.0
        return price

    def _execute_trade(self, action, price):
        """Apply a buy or sell of the risk-sized number of units at ``price``."""
        # Size from account equity rather than idle cash: cash is close to
        # zero exactly when there is a position to sell, and a negative cash
        # balance used to turn a "sell" into a purchase.
        equity = self.balance + self.holdings * price
        target_value = self.risk_percentage * equity / self.stop_loss

        if action == Actions.Buy.value and self.balance >= price:
            unit_cost = price * (1 + self.trade_fee_percent)
            # Never buy more than the cash covers once the fee is included.
            shares = min(target_value // price, self.balance // unit_cost)
            shares = float(np.clip(shares, 0, 1e6)) if math.isfinite(shares) else 0.0
            self.holdings += shares  # previously the bought units were never credited
            self.balance -= unit_cost * shares
        elif action == Actions.Sell.value:
            # No holdings check: selling while flat opens a short position.
            shares = target_value // price
            if math.isnan(shares):
                print("NAN VALUE. current price is ", price)
                shares = 0
            shares = float(np.clip(shares, 0, 1e6))
            self.holdings -= shares
            # The fee reduces the proceeds of a sale.
            self.balance += price * (1 - self.trade_fee_percent) * shares

    def _calculate_reward(self, action):
        """Execute ``action`` at the current price and return the clipped reward.

        The reward is the weighted sum of the change in net worth (relative to
        ``initial_amount``) minus penalties for recent return volatility,
        drawdown from the running peak, and a per-step price-proportional cost.
        """
        current_price = self._price_at(self._current_tick)
        self._execute_trade(action, current_price)

        # Update net worth; fall back to the previous value if it is not a number.
        self.net_worth = float(
            np.clip(self.balance + self.holdings * current_price, 0, 1e10)
        )
        if not math.isfinite(self.net_worth):
            self.net_worth = self.prev_net_worth
        self.net_worth_history.append(self.net_worth)

        profit_reward = (self.net_worth - self.prev_net_worth) / self.initial_amount
        self.prev_net_worth = self.net_worth

        # Per-step return, used for the volatility penalty.
        previous = self.net_worth_history[-2]
        if previous and math.isfinite(previous):
            step_return = self.net_worth / previous - 1
        else:
            step_return = 0.0
        self.return_history.append(step_return)

        if len(self.return_history) >= self.window_size:
            volatility_penalty = np.std(self.return_history[-self.window_size:])
        else:
            volatility_penalty = 0

        # Drawdown from the running peak (same value as max() over the history).
        self._peak_net_worth = max(self._peak_net_worth, self.net_worth)
        drawdown = (self.net_worth - self._peak_net_worth) / (self._peak_net_worth + 1e-8)
        drawdown_penalty = abs(drawdown) if drawdown < 0 else 0

        # NOTE(review): this cost is charged on every step, including Hold.
        trade_penalty = 0.001 * current_price
        self.prev_action = action

        reward = (
            self.profit_weight * profit_reward
            - self.volatility_weight * volatility_penalty
            - self.drawdown_weight * drawdown_penalty
            - self.trade_penalty_weight * trade_penalty
        )
        reward = float(np.clip(reward, self.reward_low_limit, self.reward_up_limit))
        if not math.isfinite(reward):
            reward = 0.0
        return reward

    def _update_profit(self, action):
        """Update ``_total_profit`` for a non-hold action.

        The figure assumes the whole amount was invested at the previous trade
        tick in the direction of the current position; ``net_worth`` is the
        account's actual value.
        """
        if action == Actions.Hold.value:
            return

        fee = self.trade_fee_percent
        current_price = self._price_at(self._current_tick)
        last_trade_price = self._price_at(self._last_trade_tick)
        shares = (self._total_profit * (1 - fee)) / last_trade_price
        shares = float(np.clip(shares, -1e6, 1e6))

        if self._position == Positions.Long:
            self._total_profit = shares * current_price * (1 - fee)
        elif self._position == Positions.Short:
            price_diff = last_trade_price - current_price
            self._total_profit = shares * (last_trade_price + price_diff) * (1 - fee)
            self._total_profit = float(np.clip(self._total_profit, -1e9, 1e9))

        # Remember this trade so the next update is measured from here rather
        # than from the tick before the episode started.
        self._last_trade_tick = self._current_tick

    def _get_info(self):
        """Return the diagnostic dictionary attached to every step."""
        return {
            "total_reward": self._total_reward,
            "total_profit": self._total_profit,
            "net_worth": self.net_worth,
            "balance": self.balance,
            "holdings": self.holdings,
            "initial_amount": self.initial_amount,
            "current_step": self._current_tick,
            "position": self._position.name,
        }

    def render(self):
        """Plot the net-worth and reward histories of the current episode."""
        if not self.net_worth_history or not self.reward_history:
            print("Nothing to render yet.")
            return

        fig, axs = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

        axs[0].plot(self.net_worth_history, label="Net Worth", color="blue")
        axs[0].set_ylabel("Net Worth")
        axs[0].legend()
        axs[0].grid(True)

        axs[1].plot(self.reward_history, label="Reward", color="green")
        axs[1].set_ylabel("Reward")
        axs[1].set_xlabel("Time Step")
        axs[1].legend()
        axs[1].grid(True)

        plt.tight_layout()
        plt.show()

    def close(self):
        """Nothing to release."""
        pass


In [None]:
import numpy as np
import gymnasium as gym
from enum import Enum

class Actions(Enum):
    """Trade decisions available to the agent.

    The environment maps the discrete action indices 0, 1 and 2 to the values
    of Hold, Sell and Buy respectively through its ``action_map``.
    """
    Hold = 0
    Sell = -1
    Buy = 1

class Positions(Enum):
    """Market side reported by the environment (values are -1 and +1)."""

    Short = -1
    Long = 1

    def opposite(self):
        """Return the member with the negated value (Long <-> Short)."""
        flipped = -self.value
        return Positions(flipped)

class CryptoEnv4(gym.Env):
    """Single-asset, long-only trading environment with a trailing-stop exit.

    Observation: the last ``window_size`` rows of every ``df`` column after the
    first, flattened, followed by cash / ``initial_amount`` and the raw number
    of units held.  Action space: ``Discrete(3)`` with 0 = hold, 1 = sell,
    2 = buy.  After every step ``exit_pos`` liquidates an open position whose
    price has retraced ``TRAILING_STOP`` from its trailing extreme.  Episodes
    never terminate; they are truncated at the last tick.

    Args:
        df: DataFrame with a ``"close"`` column; columns from index 1 onwards
            are used as features and must be castable to float32.
        window_size: Number of rows in each observation window.
        frame_bound: ``(start, end)`` row positions; data is sliced from
            ``start - window_size`` to ``end``.
        initial_amount: Starting cash.
        render_mode: Stored only; ``render`` always draws with matplotlib.
        market_condition: None, one of "high_volatility", "bull_market",
            "bear_market", "sideways", or the matching menu label
            (e.g. "Bull Market").  Only the traded prices are altered, the
            observed features are not.
    """

    metadata = {"render_modes": ["human"], "render_fps": 3}

    RISK_PERCENTAGE = 0.05  # fraction of equity risked per trade
    SIZING_STOP_LOSS = 0.05  # stop distance assumed when sizing a trade
    TRAILING_STOP = 0.1  # retracement from the trailing extreme that forces an exit

    # Menu labels (lower-cased) mapped to the keys _init_data() branches on,
    # so a label such as "Bull Market" is not silently ignored.
    _MARKET_ALIASES = {
        "standard market": None,
        "high volatility market": "high_volatility",
        "bull market": "bull_market",
        "bear market": "bear_market",
        "sideways market": "sideways",
    }

    def __init__(self, df, window_size, frame_bound, initial_amount=100000, render_mode=None, market_condition= None):
        super().__init__()
        self.df = df
        self.window_size = window_size
        self.frame_bound = frame_bound
        self.initial_amount = initial_amount
        self.render_mode = render_mode
        self.trade_fee_percent = 0.001  # 0.1%
        self.market_condition = self._normalize_market_condition(market_condition)

        self._init_data()
        self._init_spaces()

        # Discrete action index -> signed Actions value.
        self.action_map = {
            0: Actions.Hold.value,
            1: Actions.Sell.value,
            2: Actions.Buy.value,
        }
        self.action_space = gym.spaces.Discrete(len(self.action_map))

    @classmethod
    def _normalize_market_condition(cls, market_condition):
        """Translate a menu label to its key; pass anything else through."""
        if isinstance(market_condition, str):
            label = market_condition.strip().lower()
            if label in cls._MARKET_ALIASES:
                return cls._MARKET_ALIASES[label]
        return market_condition

    def _init_data(self):
        """Slice prices and features and apply the selected market scenario."""
        first_row = self.frame_bound[0] - self.window_size
        last_row = self.frame_bound[1]
        prices = self.df["close"].values[first_row:last_row].astype(np.float32)

        # Scenario simulation (uses NumPy's global RNG, so it is not affected
        # by the seed passed to reset()).
        if self.market_condition == "high_volatility":
            noise = np.random.normal(0, 0.05, size=prices.shape)
            prices = prices * (1 + noise)
        elif self.market_condition == "bull_market":
            trend = np.linspace(1, 1.2, num=len(prices))  # 20% upward drift
            prices = prices * trend
        elif self.market_condition == "bear_market":
            trend = np.linspace(1, 0.8, num=len(prices))  # 20% downward drift
            prices = prices * trend
        elif self.market_condition == "sideways":
            mean_price = np.mean(prices)
            # ~0.2% standard deviation around the mean price
            prices = mean_price + np.random.normal(0, 0.002 * mean_price, size=prices.shape)

        self.prices = prices
        self.signal_features = self.df.iloc[first_row:last_row, 1:].values.astype(np.float32)

        self.shape = (self.window_size, self.signal_features.shape[1])
        self._start_tick = self.window_size
        self._end_tick = len(self.prices) - 1

    def _init_spaces(self):
        """Create the flat observation space and the position space."""
        self.feature_dim = self.signal_features.shape[1]
        # +2 for the cash balance and the holdings appended to each window
        self.observation_space = gym.spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(self.window_size * self.feature_dim + 2,),
            dtype=np.float32,
        )

        self.position_map = {
            0: Positions.Short.value,
            1: Positions.Long.value,
        }
        self.position_space = gym.spaces.Discrete(len(self.position_map))

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``."""
        super().reset(seed=seed)
        self._current_tick = self._start_tick
        self._last_trade_tick = self._current_tick - 1
        self._position = Positions.Short
        self._position_history = [None] * self._start_tick + [self._position]
        self.trailing_peak = self.prices[self._start_tick]
        self.trailing_trough = self.prices[self._start_tick]

        # Capital tracking
        self.balance = self.initial_amount
        self.holdings = 0
        self.net_worth = self.initial_amount
        self.prev_net_worth = self.net_worth
        self._peak_net_worth = self.net_worth  # running maximum for drawdown
        self._total_profit = self.initial_amount
        self._total_reward = 0.0

        self.return_history = []
        self.reward_history = []  # read by render(); was never created before
        self.net_worth_history = [self.net_worth]
        self.prev_action = Actions.Hold.value

        self._truncated = False
        return self._get_observation(), self._get_info()

    def _get_observation(self):
        """Return the flattened feature window plus balance and holdings."""
        window = self.signal_features[
            self._current_tick - self.window_size + 1 : self._current_tick + 1
        ].flatten()
        norm_balance = self.balance / self.initial_amount
        norm_holdings = self.holdings  # raw unit count, not normalized
        return np.concatenate([window, [norm_balance, norm_holdings]]).astype(np.float32)

    def step(self, action_index):
        """Advance one tick, trade, apply the trailing stop, return the 5-tuple."""
        action = self.action_map[action_index]
        self._current_tick += 1
        self._truncated = self._current_tick >= self._end_tick

        # _calculate_reward() also executes the trade for this step.
        reward = self._calculate_reward(action)
        self.reward_history.append(reward)
        self._total_reward += reward

        self._update_profit(action)
        self._handle_position_change()
        self._position_history.append(self._position)
        self.exit_pos()

        return (
            self._get_observation(),
            reward,
            False,  # no terminal condition, only truncation
            self._truncated,
            self._get_info(),
        )

    def _handle_position_change(self):
        """Derive the position from holdings and update the trailing extreme."""
        price = self.prices[self._current_tick]
        if self.holdings > 0:
            if self._position != Positions.Long or self.trailing_peak is None:
                self.trailing_peak = price  # a new long starts a new peak
            else:
                self.trailing_peak = max(self.trailing_peak, price)
            self._position = Positions.Long
        else:
            if self._position != Positions.Short or self.trailing_trough is None:
                self.trailing_trough = price
            else:
                self.trailing_trough = min(self.trailing_trough, price)
            self._position = Positions.Short

    def exit_pos(self):
        """Liquidate the open position when the trailing stop is hit.

        Does nothing while flat, so the exit message is printed only when
        units actually change hands.  No fee is charged on a forced exit.
        """
        if self.holdings == 0:
            return
        current_price = float(self.prices[self._current_tick])
        if not np.isfinite(current_price):
            return

        stop = self.TRAILING_STOP
        if self._position == Positions.Long and current_price <= self.trailing_peak * (1 - stop):
            self.balance += self.holdings * current_price
            self.holdings = 0
            print("exited long position")
        elif self._position == Positions.Short and current_price >= self.trailing_trough * (1 + stop):
            # holdings is negative here; buy the units back
            self.balance -= (-self.holdings) * current_price
            self.holdings = 0
            print("exited short position")

    def _execute_trade(self, action, price):
        """Apply a buy or sell of the risk-sized number of units at ``price``."""
        if not np.isfinite(price) or price <= 0:
            return  # cannot trade on an unusable price

        fee = self.trade_fee_percent
        # Size from account equity rather than idle cash: cash is close to
        # zero exactly when there is a position to sell, and a negative cash
        # balance used to turn a "sell" into a purchase.
        equity = self.balance + self.holdings * price
        target_value = self.RISK_PERCENTAGE * equity / self.SIZING_STOP_LOSS

        if action == Actions.Buy.value and self.balance >= price:
            unit_cost = price * (1 + fee)
            # Never buy more than the cash covers once the fee is included.
            shares = max(min(target_value // price, self.balance // unit_cost), 0)
            self.holdings += shares
            self.balance -= unit_cost * shares
        elif action == Actions.Sell.value and self.holdings > 0:
            # Never sell more than is held.
            shares = max(min(target_value // price, self.holdings), 0)
            self.holdings -= shares
            # The fee reduces the proceeds of a sale.
            self.balance += price * (1 - fee) * shares

    def _calculate_reward(self, action):
        """Execute ``action`` at the current price and return the clipped reward.

        The reward is 300 x the change in net worth (relative to
        ``initial_amount``) minus small penalties for recent return
        volatility, drawdown from the running peak, and a per-step
        price-proportional cost, clipped to [-30, 30].
        """
        current_price = float(self.prices[self._current_tick])
        self._execute_trade(action, current_price)

        # Update net worth
        self.net_worth = self.balance + self.holdings * current_price
        self.net_worth_history.append(self.net_worth)

        profit_reward = (self.net_worth - self.prev_net_worth) / self.initial_amount
        self.prev_net_worth = self.net_worth

        # Per-step return, used for the volatility penalty (0 when the
        # previous net worth is zero, to avoid dividing by it).
        previous = self.net_worth_history[-2]
        step_return = self.net_worth / previous - 1 if previous else 0.0
        self.return_history.append(step_return)

        if len(self.return_history) >= self.window_size:
            volatility_penalty = np.std(self.return_history[-self.window_size:])
        else:
            volatility_penalty = 0

        # Drawdown from the running peak (same value as max() over the history).
        self._peak_net_worth = max(self._peak_net_worth, self.net_worth)
        drawdown = (self.net_worth - self._peak_net_worth) / (self._peak_net_worth + 1e-8)
        drawdown_penalty = abs(drawdown) if drawdown < 0 else 0

        # NOTE(review): this cost is charged on every step, including Hold.
        trade_penalty = 0.001 * current_price
        self.prev_action = action

        reward = (
            300 * profit_reward
            - 0.1 * volatility_penalty
            - 0.1 * drawdown_penalty
            - 0.0005 * trade_penalty
        )
        return float(np.clip(reward, -30, 30))

    def _update_profit(self, action):
        """Update ``_total_profit`` for a non-hold action.

        The figure assumes the whole amount was invested at the previous trade
        tick in the direction of the current position; ``net_worth`` is the
        account's actual value.
        """
        if action == Actions.Hold.value:
            return

        fee = self.trade_fee_percent
        current_price = float(self.prices[self._current_tick])
        last_trade_price = float(self.prices[self._last_trade_tick])
        if np.isfinite(last_trade_price) and last_trade_price > 0:
            shares = (self._total_profit * (1 - fee)) / last_trade_price
            if self._position == Positions.Long:
                self._total_profit = shares * current_price * (1 - fee)
            elif self._position == Positions.Short:
                price_diff = last_trade_price - current_price
                self._total_profit = shares * (last_trade_price + price_diff) * (1 - fee)

        # Remember this trade so the next update is measured from here rather
        # than from the tick before the episode started.
        self._last_trade_tick = self._current_tick

    def _get_info(self):
        """Return the diagnostic dictionary attached to every step."""
        return {
            "total_reward": self._total_reward,
            "total_profit": self._total_profit,
            "net_worth": self.net_worth,
            "balance": self.balance,
            "holdings": self.holdings,
            "initial_amount": self.initial_amount,
            "current_step": self._current_tick,
            "position": self._position.name,
        }

    def render(self):
        """Plot the net-worth and reward histories of the current episode."""
        if not self.net_worth_history or not self.reward_history:
            print("Nothing to render yet.")
            return

        fig, axs = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

        axs[0].plot(self.net_worth_history, label="Net Worth", color="blue")
        axs[0].set_ylabel("Net Worth")
        axs[0].legend()
        axs[0].grid(True)

        axs[1].plot(self.reward_history, label="Reward", color="green")
        axs[1].set_ylabel("Reward")
        axs[1].set_xlabel("Time Step")
        axs[1].legend()
        axs[1].grid(True)

        plt.tight_layout()
        plt.show()

    def close(self):
        """Nothing to release."""
        pass


In [None]:
import pandas as pd

# Read the full-featured BTC table and discard rows with missing values.
btc_df = pd.read_csv("/content/BTC_DATA.csv").dropna().reset_index(drop=True)

# 80% train / 20% test split by row position
# (assumes the rows are already in chronological order -- TODO confirm).
split_index = int(len(btc_df) * 0.8)
df_train, df_test = btc_df.iloc[:split_index].copy(), btc_df.iloc[split_index:].copy()

# Smoke test: drive the environment with at most ten random actions.
btc_env = CryptoEnv4(df=df_train, window_size=30, frame_bound=(30, 1000), initial_amount=100000)
btc_env.reset()

obs, info = btc_env.reset()
done = False
i = 0
while i < 10 and not done:
    action = btc_env.action_space.sample()  # random index in {0, 1, 2}
    print("action", action)
    # The fourth value returned by step() is the truncation flag.
    obs, reward, _, done, info = btc_env.step(action)
    print("Reward:", reward)
    i += 1

action 1
Reward: -0.0040792455
action 0
Reward: -0.00405891
action 0
Reward: -0.0040417053
action 0
Reward: -0.004062495
action 2
Reward: -0.29482928
action 0
Reward: -0.7683233
action 1
Reward: 2.2013178
action 0
Reward: -2.1715906
action 1
Reward: -0.29569846
action 1
Reward: 0.57134414


In [None]:
!pip install stable-baselines3

Collecting stable-baselines3
  Downloading stable_baselines3-2.7.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (

In [None]:
!pip install sb3-contrib

Collecting sb3-contrib
  Downloading sb3_contrib-2.7.0-py3-none-any.whl.metadata (4.1 kB)
Downloading sb3_contrib-2.7.0-py3-none-any.whl (93 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.2/93.2 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sb3-contrib
Successfully installed sb3-contrib-2.7.0


In [None]:
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.monitor import Monitor
from sb3_contrib import QRDQN

In [None]:
from gymnasium.wrappers import TimeLimit
# Training environment: one monitored CryptoEnv4 inside a DummyVecEnv, with
# running normalization of observations and rewards on top.
vec_env = DummyVecEnv(
    [
        lambda: Monitor(
            CryptoEnv4(
                df=btc_df,
                window_size=60,
                frame_bound=(60, 1000),
                market_condition=None,
            )
        )
    ]
)
env = VecNormalize(venv=vec_env, norm_obs=True, norm_reward=True)

In [None]:
# Feed-forward PPO agent trained on the normalized environment.
model2 = PPO(policy="MlpPolicy", env=env, verbose=1)
model2.learn(total_timesteps=150_000)

Using cpu device


  self._total_profit = shares * current_price * (1 - self.trade_fee_percent)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 929      |
|    ep_rew_mean     | 41.2     |
| time/              |          |
|    fps             | 505      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 929          |
|    ep_rew_mean          | 40.9         |
| time/                   |              |
|    fps                  | 341          |
|    iterations           | 2            |
|    time_elapsed         | 12           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0066870702 |
|    clip_fraction        | 0.0458       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.09        |
|    explained_variance   | 0.0393       |
|    learning_r

<stable_baselines3.ppo.ppo.PPO at 0x7bcb562f37d0>

In [None]:
from sb3_contrib import RecurrentPPO
from sb3_contrib.ppo_recurrent.policies import RecurrentActorCriticPolicy

# Recurrent (LSTM) PPO agent trained on the same normalized environment.
model4 = RecurrentPPO(policy=RecurrentActorCriticPolicy, env=env, verbose=1)
model4.learn(total_timesteps=150_000)

Using cpu device
----------------------------
| time/              |     |
|    fps             | 74  |
|    iterations      | 1   |
|    time_elapsed    | 1   |
|    total_timesteps | 128 |
----------------------------


KeyboardInterrupt: 

In [None]:
# Persist the trained PPO policy (written to modelPPO.zip).
model2.save("modelPPO")
# The policy was trained on observations scaled by the VecNormalize wrapper
# `env`; its running statistics are not stored in the model file, so save
# them alongside it for use when the policy is evaluated.
env.save("vecnormalize.pkl")
#model4.save("modelRPPO")

NameError: name 'model2' is not defined

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# === Initialize Environment ===
window_size = 70
initial_amount = 1_000_000

test_env = DummyVecEnv([
    lambda: CryptoEnv4(
        df=df_test,
        window_size=window_size,
        frame_bound=(1500, len(df_test)),
        initial_amount=initial_amount
    )
])

# === Load Trained Model ===
model = PPO.load("/content/modelPPO.zip")  # Replace with actual path

# === Run Backtest ===
obs = test_env.reset()
done = False
net_worths, balances, holdings, actions, rewards, dates = [], [], [], [], [], []

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = test_env.step(action)

    env_info = info[0]
    current_step = env_info["current_step"]
    if 'timestamp' in df_test.columns and 0 <= current_step < len(df_test):
      date = df_test.iloc[current_step]['timestamp']
    else:
      date = current_step  # fallback

    net_worths.append(env_info["net_worth"])
    balances.append(env_info["balance"])
    holdings.append(env_info["holdings"])
    actions.append(action[0])
    rewards.append(reward[0])
    dates.append(date)

# === Create DataFrame ===
# One row per recorded environment step, indexed by the value collected in
# `dates` for that step (parsed with pd.to_datetime).
results = (
    pd.DataFrame({
        "net_worth": net_worths,
        "balance": balances,
        "holdings": holdings,
        "action": actions,
        "reward": rewards,
    })
    .assign(date=pd.to_datetime(dates))
    .set_index("date")
)

# === Financial Metrics ===
# Step-over-step return of the portfolio value. The first row has no
# predecessor, so its return is NaN and the row is removed by dropna()
# (as is any other row containing a NaN).
results["returns"] = results["net_worth"].pct_change()
results = results.dropna()

net_worth = results["net_worth"]
step_returns = results["returns"]

initial_capital = net_worth.iloc[0]
final_capital = net_worth.iloc[-1]
growth = final_capital / initial_capital
total_return = growth - 1

# Duration calculations (8760 = hours in a 365-day year)
elapsed = results.index[-1] - results.index[0]
duration_hours = elapsed.total_seconds() / 3600
duration_years = duration_hours / 8760

if duration_years > 0:
    cagr = growth ** (1 / duration_years) - 1
else:
    cagr = 0

# Risk metrics, annualized with sqrt(365 * 24), i.e. one row is treated as
# one hour.
annualizer = np.sqrt(365 * 24)
returns_std = step_returns.std()
volatility = returns_std * annualizer
sharpe = step_returns.mean() / (returns_std + 1e-8) * annualizer

# Drawdown relative to the running peak; the small constant guards the division.
rolling_max = net_worth.cummax()
drawdown = (net_worth - rolling_max) / (rolling_max + 1e-8)
max_drawdown = drawdown.min()

# Exposure time: share of steps with a non-zero action code, and share of
# steps with positive holdings (both in percent).
exposure_time_by_action = results["action"].ne(0).mean() * 100
exposure_time_by_holdings = results["holdings"].gt(0).mean() * 100

# === Buy & Hold Benchmark ===
# NOTE(review): the 1500 offset is not explained in this cell; presumably it
# mirrors the start of the backtest frame. Confirm against the env setup.
start_idx = 1500 - window_size
end_idx = len(df_test) - 1
close_prices = df_test["close"]
buy_price = close_prices.iloc[start_idx]
sell_price = close_prices.iloc[end_idx]
buy_and_hold_return = (sell_price - buy_price) / buy_price

# Timestamps are parsed so the holding period can be measured in years.
df_test['timestamp'] = pd.to_datetime(df_test['timestamp'], format="%Y-%m-%d %H:%M:%S")
holding_period = df_test["timestamp"].iloc[end_idx] - df_test["timestamp"].iloc[start_idx]
buy_and_hold_years = holding_period.total_seconds() / (3600 * 24 * 365)
if buy_and_hold_years > 0:
    cagr_bh = (1 + buy_and_hold_return) ** (1 / buy_and_hold_years) - 1
else:
    cagr_bh = 0

# === Print Summary ===
actions_series = results["action"]

# The report is a sequence of sections; each section is a heading followed by
# pre-formatted rows. Rows are printed one per line, in order.
report_sections = (
    (
        "\n📈 Buy & Hold Benchmark",
        f"Buy Price:           {buy_price:.2f}",
        f"Sell Price:          {sell_price:.2f}",
        f"Total Return:        {buy_and_hold_return * 100:.2f}%",
        f"CAGR:                {cagr_bh * 100:.2f}%",
    ),
    (
        "\n📊 RL Strategy Backtest",
        f"Initial Capital:     {initial_capital:.2f}",
        f"Final Capital:       {final_capital:.2f}",
        f"Total Return:        {total_return * 100:.2f}%",
        f"CAGR:                {cagr * 100:.2f}%",
        f"Exposure (Actions):  {exposure_time_by_action:.2f}%",
        f"Exposure (Holdings): {exposure_time_by_holdings:.2f}%",
        f"Annual Volatility:   {volatility * 100:.2f}%",
        f"Sharpe Ratio:        {sharpe:.2f}",
        f"Max Drawdown:        {max_drawdown * 100:.2f}%",
    ),
    (
        # Counts of the raw action codes 2 / 1 / 0, labelled Buy / Sell / Hold.
        "\n🪙 Trade Actions",
        f"Buy:                 {(actions_series == 2).sum()}",
        f"Sell:                {(actions_series == 1).sum()}",
        f"Hold:                {(actions_series == 0).sum()}",
    ),
)
for section in report_sections:
    for row in section:
        print(row)

# === Plot Net Worth ===
# Portfolio value over the backtest, drawn on an explicit Axes object.
net_worth_fig, net_worth_ax = plt.subplots(figsize=(14, 6))
results["net_worth"].plot(ax=net_worth_ax, label="RL Net Worth")
net_worth_ax.set_ylabel("Portfolio Value")
net_worth_ax.set_title("RL Portfolio Performance")
net_worth_ax.grid(True)
net_worth_ax.legend()
plt.show()

# === Plot Actions ===
# Raw action code per step; the y axis is labelled 0 = Hold, 1 = Sell, 2 = Buy.
actions_fig, actions_ax = plt.subplots(figsize=(14, 3))
actions_ax.plot(results.index, results["action"], marker='o', linestyle='-', markersize=2)
actions_ax.set_yticks([0, 1, 2])
actions_ax.set_yticklabels(['Hold', 'Sell', 'Buy'])
actions_ax.set_title("Agent Actions Over Time")
actions_ax.grid(True)
actions_fig.tight_layout()
plt.show()


In [None]:
!pip install optuna[visualization]

Collecting optuna[visualization]
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
[0mCollecting alembic>=1.5.0 (from optuna[visualization])
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna[visualization])
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.4.0


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from gymnasium.wrappers import TimeLimit
import optuna

# === Load Data ===
# Read the CSV, discard every row that contains a missing value, and renumber
# the remaining rows from 0.
btc_df = (
    pd.read_csv("/content/BTC_DATA.csv")
    .dropna()
    .reset_index(drop=True)
)

# === Train-Test Split ===
# No shuffling: the first 80% of the rows form the training set and the
# remainder the test set. Both are independent copies of the source frame.
split_index = int(len(btc_df) * 0.8)
df_train, df_test = (
    part.copy()
    for part in (btc_df.iloc[:split_index], btc_df.iloc[split_index:])
)


In [None]:
# === NaN-safe vectorized env ===
class NanHandlingVecEnv(DummyVecEnv):
    """DummyVecEnv that replaces NaN observations and rewards with zeros.

    It is used as the innermost vectorized env so that a ``VecNormalize``
    stacked on top of it never folds NaNs into its running statistics.
    Observations are assumed to be NumPy arrays (boolean-mask assignment).
    """

    def step_wait(self):
        obs, rews, dones, infos = super().step_wait()
        obs[np.isnan(obs)] = 0.0
        rews[np.isnan(rews)] = 0.0
        return obs, rews, dones, infos

    def reset(self):
        obs = super().reset()
        obs[np.isnan(obs)] = 0.0
        return obs


def _make_normalized_train_env(window_size, **env_kwargs):
    """Build the training env: Monitor(CryptoEnv4) -> NaN guard -> VecNormalize.

    The NaN guard wraps the env factory directly and VecNormalize wraps the
    guard. Re-wrapping ``vec_normalize.envs[0]`` must be avoided: attribute
    access on a VecNormalize is forwarded to the wrapped venv, so that
    expression returns the raw env and the normalizer is silently dropped.

    Args:
        window_size: Observation window; also used as the first index of
            ``frame_bound`` over ``df_train``.
        **env_kwargs: Extra keyword arguments forwarded to ``CryptoEnv4``.

    Returns:
        A ``VecNormalize`` env normalizing observations and rewards.
    """
    def make_env():
        return Monitor(CryptoEnv4(
            df=df_train,
            window_size=window_size,
            frame_bound=(window_size, len(df_train)),
            initial_amount=100000,
            **env_kwargs,
        ))

    vec_env = NanHandlingVecEnv([make_env])
    return VecNormalize(vec_env, norm_obs=True, norm_reward=True, epsilon=1e-8)


# === Objective Function for Optuna ===
def optuna_objective(trial):
    """Train PPO for one trial and return its mean evaluation reward.

    Only ``window_size`` is searched. The reward-shaping parameters
    (profit_weight, volatility_weight, drawdown_weight, trade_penalty_weight,
    reward_up_limit, reward_low_limit, risk_percentage, stop_loss) are left at
    the environment defaults.

    Args:
        trial: The Optuna trial used to sample ``window_size`` in [24, 70].

    Returns:
        Mean episode reward over 5 deterministic evaluation episodes on the
        training environment.
    """
    window_size = trial.suggest_int("window_size", 24, 70)

    env = _make_normalized_train_env(
        window_size,
        render_mode=None,
        market_condition=None,
    )

    model2 = PPO("MlpPolicy", env, verbose=1)
    model2.learn(total_timesteps=150000)
    # Every trial overwrites the same files. The normalization statistics are
    # saved next to the model because the policy expects normalized inputs.
    model2.save("/content/model2_3")
    env.save("/content/model2_3_vecnormalize.pkl")

    # Freeze the running statistics and stop scaling rewards during evaluation
    # so trials are compared on the same reward scale.
    env.training = False
    env.norm_reward = False
    mean_reward, _ = evaluate_policy(model2, env, n_eval_episodes=5, deterministic=True)
    return mean_reward


# === Run Optuna Study ===
study = optuna.create_study(direction="maximize")
study.optimize(optuna_objective, n_trials=10)

print("Best trial:")
print(study.best_trial)

# === Train Final Model ===
best_params = study.best_trial.params

best_env = _make_normalized_train_env(best_params["window_size"])
best_vec_env = best_env.venv  # the un-normalized vec env, kept under its former name

model2 = PPO("MlpPolicy", best_env, verbose=1)
model2.learn(total_timesteps=150000)
model2.save("/content/modelfinal")
# Persist the normalization statistics; the backtest below reloads them.
best_env.save("/content/modelfinal_vecnormalize.pkl")

# === Backtesting ===
initial_amount = 1_000_000
# The test env must use the window size the final model was trained with,
# otherwise the observation shape would not match the policy.
window_size = best_params["window_size"]

# NOTE(review): training uses CryptoEnv4 while the backtest uses CryptoEnv3;
# confirm that both expose the same observation and action spaces.
test_vec_env = NanHandlingVecEnv([
    lambda: CryptoEnv3(
        df=df_test,
        window_size=window_size,
        frame_bound=(window_size, len(df_test)),
        initial_amount=initial_amount
    )
])
# Reuse the training statistics without updating them, and report raw
# (un-normalized) rewards so the reward-based metrics keep their meaning.
test_env = VecNormalize.load("/content/modelfinal_vecnormalize.pkl", test_vec_env)
test_env.training = False
test_env.norm_reward = False

# Evaluate the final model trained above, not the last Optuna trial's model.
model = PPO.load("/content/modelfinal.zip")
obs = test_env.reset()
done = False
net_worths, balances, holdings, actions, rewards, dates = [], [], [], [], [], []

has_timestamp = 'timestamp' in df_test.columns

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, dones, info = test_env.step(action)
    done = bool(dones[0])  # single-env VecEnv: index 0 is the only env

    env_info = info[0]
    current_step = env_info["current_step"]
    # Use the bar's timestamp when available; otherwise fall back to the
    # raw step number.
    if has_timestamp and 0 <= current_step < len(df_test):
        date = df_test.iloc[current_step]['timestamp']
    else:
        date = current_step

    net_worths.append(env_info["net_worth"])
    balances.append(env_info["balance"])
    holdings.append(env_info["holdings"])
    actions.append(action[0])
    rewards.append(reward[0])
    dates.append(date)

# === Backtest Results ===
# All annualization below treats one row as one hour (365 * 24 rows per year),
# matching the hour-based duration computation.
PERIODS_PER_YEAR = 365 * 24
annualization = np.sqrt(PERIODS_PER_YEAR)

results = pd.DataFrame({
    "date": pd.to_datetime(dates),
    "net_worth": net_worths,
    "balance": balances,
    "holdings": holdings,
    "action": actions,
    "reward": rewards,
})
results.set_index("date", inplace=True)

# Step-over-step portfolio return; the first row has no predecessor, so its
# NaN return (and therefore the row) is dropped.
results["returns"] = results["net_worth"].pct_change()
results.dropna(inplace=True)

initial_capital = results["net_worth"].iloc[0]
final_capital = results["net_worth"].iloc[-1]
total_return = final_capital / initial_capital - 1

# Duration
duration_hours = (results.index[-1] - results.index[0]).total_seconds() / 3600
duration_years = duration_hours / 8760
cagr = (final_capital / initial_capital) ** (1 / duration_years) - 1 if duration_years > 0 else 0

# Risk Metrics
returns_mean = results["returns"].mean()
returns_std = results["returns"].std()
volatility = returns_std * annualization
sharpe = returns_mean / (returns_std + 1e-8) * annualization

# Max drawdown relative to the running peak (a non-positive number).
rolling_max = results["net_worth"].cummax()
drawdown = (results["net_worth"] - rolling_max) / (rolling_max + 1e-8)
max_drawdown = drawdown.min()

# Calmar Ratio: the epsilon is added to the magnitude of the drawdown, so it
# can only guard the division and never cancel the drawdown itself.
calmar = cagr / (abs(max_drawdown) + 1e-8)

# Win Percentage and Profit Ratio, based on the sign of the per-step reward.
positive_trades = results[results["reward"] > 0]
negative_trades = results[results["reward"] < 0]
win_percentage = len(positive_trades) / (len(positive_trades) + len(negative_trades) + 1e-8) * 100
profit_ratio = positive_trades["reward"].mean() / (abs(negative_trades["reward"].mean()) + 1e-8)

# Alpha and Beta against buy & hold of the traded asset.
# The benchmark is the bar-over-bar return of df_test["close"], looked up at
# the timestamps recorded for each backtest step.
# NOTE(review): assumes df_test is in chronological order and that the env
# values net worth at the close of the bar indexed by `current_step`; if it
# uses a different bar the two series are offset by one step.
df_test['timestamp'] = pd.to_datetime(df_test['timestamp'], format="%Y-%m-%d %H:%M:%S")
close_by_time = pd.Series(
    df_test["close"].to_numpy(),
    index=pd.DatetimeIndex(df_test["timestamp"]),
)
# reindex needs unique source labels; keep the last bar of any duplicate.
close_by_time = close_by_time[~close_by_time.index.duplicated(keep="last")]
benchmark_returns = close_by_time.pct_change().reindex(results.index)
strategy_returns = results["returns"]

# Use only the steps where both returns are known.
paired_mask = benchmark_returns.notna().to_numpy() & strategy_returns.notna().to_numpy()
paired_strategy = strategy_returns.to_numpy()[paired_mask]
paired_benchmark = benchmark_returns.to_numpy()[paired_mask]

if len(paired_benchmark) > 1:
    # Covariance and variance both use ddof=1 so beta is a consistent estimate.
    beta = np.cov(paired_strategy, paired_benchmark)[0, 1] / (np.var(paired_benchmark, ddof=1) + 1e-8)
    alpha = (paired_strategy.mean() - beta * paired_benchmark.mean()) * PERIODS_PER_YEAR
else:
    alpha, beta = np.nan, np.nan  # not enough overlapping data

# Sortino Ratio: annualized mean return over annualized downside deviation
# (the same annualization convention as the Sharpe ratio above).
downside_returns = results["returns"].copy()
downside_returns[downside_returns > 0] = 0
downside_std = downside_returns.std() * annualization
sortino = returns_mean * PERIODS_PER_YEAR / (downside_std + 1e-8)


# Exposure: share of steps with a non-zero action code / positive holdings.
exposure_time_by_action = (results["action"] != 0).mean() * 100
exposure_time_by_holdings = (results["holdings"] > 0).mean() * 100

# Buy & Hold Benchmark
# df_test is indexed positionally, so no offset for the training rows is
# needed. The benchmark starts at `window_size`, the first index of the
# backtest env's frame_bound, clamped to the available rows.
# NOTE(review): confirm the env takes its first action at frame_bound[0].
start_idx = min(max(window_size, 0), len(df_test) - 1)
end_idx = len(df_test) - 1
buy_price = df_test["close"].iloc[start_idx]
sell_price = df_test["close"].iloc[end_idx]
buy_and_hold_return = (sell_price - buy_price) / buy_price
buy_and_hold_years = (df_test["timestamp"].iloc[end_idx] - df_test["timestamp"].iloc[start_idx]).total_seconds() / (3600 * 24 * 365)
cagr_bh = (1 + buy_and_hold_return) ** (1 / buy_and_hold_years) - 1 if buy_and_hold_years > 0 else 0

# === Summary ===
actions_series = results["action"]

# The report is a sequence of sections; each section is a heading followed by
# pre-formatted rows. Rows are printed one per line, in order.
report_sections = (
    (
        "\n📈 Buy & Hold Benchmark",
        f"Buy Price:           {buy_price:.2f}",
        f"Sell Price:          {sell_price:.2f}",
        f"Total Return:        {buy_and_hold_return * 100:.2f}%",
        f"CAGR:                {cagr_bh * 100:.2f}%",
    ),
    (
        "\n📊 RL Strategy Backtest",
        f"Initial Capital:     {initial_capital:.2f}",
        f"Final Capital:       {final_capital:.2f}",
        f"Total Return:        {total_return * 100:.2f}%",
        f"CAGR:                {cagr * 100:.2f}%",
        f"Exposure (Actions):  {exposure_time_by_action:.2f}%",
        f"Exposure (Holdings): {exposure_time_by_holdings:.2f}%",
        f"Annual Volatility:   {volatility * 100:.2f}%",
        f"Sharpe Ratio:        {sharpe:.2f}",
        f"Max Drawdown:        {max_drawdown * 100:.2f}%",
        f"Sortino Ratio:       {sortino:.2f}",
        f"Calmar Ratio:        {calmar:.2f}",
        f"Win Percentage:      {win_percentage:.2f}%",
        f"Profit Ratio:        {profit_ratio:.2f}",
        f"Alpha:               {alpha:.4f}",
        f"Beta:                {beta:.4f}",
    ),
    (
        # Counts of the raw action codes 2 / 1 / 0, labelled Buy / Sell / Hold.
        "\n🪙 Trade Actions",
        f"Buy:                 {(actions_series == 2).sum()}",
        f"Sell:                {(actions_series == 1).sum()}",
        f"Hold:                {(actions_series == 0).sum()}",
    ),
)
for section in report_sections:
    for row in section:
        print(row)

# === Plot Net Worth ===
# Portfolio value over the backtest, drawn on an explicit Axes object.
net_worth_fig, net_worth_ax = plt.subplots(figsize=(14, 6))
results["net_worth"].plot(ax=net_worth_ax, label="RL Net Worth")
net_worth_ax.set_ylabel("Portfolio Value")
net_worth_ax.set_title("RL Portfolio Performance")
net_worth_ax.grid(True)
net_worth_ax.legend()
plt.show()

# === Plot Actions ===
# Raw action code per step; the y axis is labelled 0 = Hold, 1 = Sell, 2 = Buy.
actions_fig, actions_ax = plt.subplots(figsize=(14, 3))
actions_ax.plot(results.index, results["action"], marker='o', linestyle='-', markersize=2)
actions_ax.set_yticks([0, 1, 2])
actions_ax.set_yticklabels(['Hold', 'Sell', 'Buy'])
actions_ax.set_title("Agent Actions Over Time")
actions_ax.grid(True)
actions_fig.tight_layout()
plt.show()

[I 2025-07-31 13:05:43,890] A new study created in memory with name: no-name-d15a95aa-d8b5-4a89-b67a-d1f728885ff5


Using cpu device


  self._total_profit = shares * current_price * (1 - self.trade_fee_percent)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited shor

[I 2025-07-31 13:16:38,376] Trial 0 finished with value: -2810.957765 and parameters: {'window_size': 68}. Best is trial 0 with value: -2810.957765.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited short position
exited shor