<a href="https://colab.research.google.com/github/Niraj-K-fin/Regime-Switching-Reinforcement-Learning-Portfolio-Manager/blob/main/project_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**" Regime-Switching Reinforcement Learning Portfolio Manager "**

~ Niraj Kumar (*nirajjuly12@gmail.com*)

[LinkedIn](https://www.linkedin.com/in/nirajkofficial/) | [Instagram](https://www.instagram.com/nirajkumar_real/) | [GitHub](https://github.com/Niraj-K-fin)


*Installing Dependencies...*

In [None]:
!pip install gymnasium==0.29.1
!pip install stable-baselines3==2.1.0
!pip install yfinance
!pip install ta
!pip install tensorboard
!pip install matplotlib

*Importing Modules...*

In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pandas as pd
import yfinance as yf
from ta.volatility import BollingerBands, AverageTrueRange
from ta.momentum import RSIIndicator
from ta.trend import MACD, EMAIndicator
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
import torch
import matplotlib.pyplot as plt

# Single seed shared by the notebook: it seeds NumPy's global RNG here and is
# passed to PPO as `seed=SEED` in the training cell.
SEED = 42
np.random.seed(SEED)

**Enhanced Custom Portfolio Environment** (*Highly Robust*)

In [None]:
class EnhancedPortfolioEnv(gym.Env):
    """Multi-asset portfolio environment built on daily prices from yfinance.

    Action: one value in [-1, 1] per ticker. The vector is normalised to unit
    gross exposure, re-weighted by inverse 20-step volatility, and turned into
    target dollar allocations; negative values request short positions.

    Observation (flat float32 vector), per ticker:
      * ``window_size`` most recent rows of the returns table,
      * a 3-way one-hot regime flag (0 = bear, 1 = neutral, 2 = bull),
      * ATR(14), EMA(50) and RSI(14) computed from closing prices,
    followed by cash ratio, portfolio-value ratio, the volatility estimate and
    the macro indicator row (VIX, 10Y, SPY_RSI, SPY_MACD).

    Reward: one-step portfolio return divided by the volatility estimate.

    Args:
        tickers: list of ticker symbols to trade.
        initial_balance: starting cash.
        window_size: number of return rows exposed per asset in the observation.
        transaction_cost: proportional cost charged on every traded notional.
        stop_loss_pct: adverse move from the average entry price that forces a
            position to be closed.
        take_profit_pct: favourable move from the average entry price that
            forces a position to be closed.
        risk_target: stored on the instance; not used by the logic in this class.

    Raises:
        ValueError: if the downloaded history is empty or too short to run a
            single step.
    """

    # Gymnasium looks up ``render_modes``; the legacy ``render.modes`` key is
    # kept so any code still reading the old key keeps working.
    metadata = {'render_modes': ['human'], 'render.modes': ['human']}

    # Look-back (in rows) needed by the EMA feature; also the extra warm-up
    # added on top of ``window_size`` when an episode starts.
    _EMA_WINDOW = 50

    def __init__(self, tickers, initial_balance=100000, window_size=30, transaction_cost=0.001,
                 stop_loss_pct=0.07, take_profit_pct=0.15, risk_target=0.01):
        super().__init__()
        self.tickers = tickers
        self.n_assets = len(tickers)
        self.initial_balance = initial_balance
        self.window_size = window_size
        self.transaction_cost = transaction_cost
        self.stop_loss_pct = stop_loss_pct
        self.take_profit_pct = take_profit_pct
        self.risk_target = risk_target

        self._download_data()

        self.action_space = spaces.Box(low=-1, high=1, shape=(self.n_assets,), dtype=np.float32)

        # Per asset: window_size returns + 3 regime flags + 3 indicators.
        # Then 3 account-level scalars and one value per macro column.
        obs_len = self.n_assets * (self.window_size + 6) + 3 + len(self.macro_indicators.columns)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_len,), dtype=np.float32)
        self.reset()

    def _download_data(self):
        """Download prices for the tickers plus ^VIX/^TNX and derive all feature tables."""
        macro_tickers = ['^VIX', '^TNX']
        # sorted() makes the request order deterministic (set order is not).
        all_tickers = sorted(set(list(self.tickers) + macro_tickers))
        try:
            df = yf.download(all_tickers, period="4y")
            self.price_data = df['Adj Close'] if 'Adj Close' in df else df['Close']
            # Keep only rows where every requested symbol has a price.
            self.price_data = self.price_data.dropna()
        except Exception as e:
            print(f"Error downloading data: {e}")
            raise

        # An episode starts at window_size + _EMA_WINDOW and must be able to
        # advance at least one row; fail early with a clear message otherwise.
        min_rows = self.window_size + self._EMA_WINDOW + 2
        if len(self.price_data) < min_rows:
            raise ValueError(
                f"Need at least {min_rows} complete price rows for {all_tickers}, "
                f"got {len(self.price_data)}"
            )

        self.dates = self.price_data.index

        # NOTE(review): dropna() removes the first row, so returns.iloc[k]
        # lines up with price_data.iloc[k + 1]. Positional slices below rely on
        # this layout; confirm the one-row offset is intended.
        self.returns = self.price_data[self.tickers].pct_change().dropna()

        self.regimes = self._calculate_regimes()

        self.macro_indicators = self._calculate_macro_indicators()

    def _calculate_regimes(self):
        """Label each (date, ticker) as 0 = bear, 1 = neutral or 2 = bull.

        Bull requires price above the upper Bollinger band, RSI > 60 and a
        positive MACD histogram; bear is the mirror image with RSI < 40.
        """
        # Everything starts neutral; warm-up rows where indicators are NaN
        # compare False and therefore stay neutral.
        regimes = pd.DataFrame(1, index=self.price_data.index, columns=self.tickers)
        for ticker in self.tickers:
            price = self.price_data[ticker]
            bb = BollingerBands(close=price, window=20, window_dev=2)
            rsi_values = RSIIndicator(close=price, window=14).rsi()
            macd_hist = MACD(close=price).macd_diff()
            bull = (price > bb.bollinger_hband()) & (rsi_values > 60) & (macd_hist > 0)
            bear = (price < bb.bollinger_lband()) & (rsi_values < 40) & (macd_hist < 0)
            regimes.loc[bull, ticker] = 2
            regimes.loc[bear, ticker] = 0
        return regimes.fillna(1).astype(int)

    def _calculate_macro_indicators(self):
        """Build the macro feature table: VIX, 10Y, SPY_RSI and SPY_MACD."""
        macro_df = pd.DataFrame(index=self.price_data.index)
        macro_df['VIX'] = self.price_data['^VIX'] if '^VIX' in self.price_data else 0
        macro_df['10Y'] = self.price_data['^TNX'] if '^TNX' in self.price_data else 0
        # Falls back to the first price column when SPY is not among the tickers.
        spy = self.price_data['SPY'] if 'SPY' in self.price_data else self.price_data.iloc[:, 0]
        macro_df['SPY_RSI'] = RSIIndicator(close=spy, window=14).rsi()
        macro_df['SPY_MACD'] = MACD(close=spy).macd_diff()
        # Back-fill the indicator warm-up NaNs. DataFrame.bfill() replaces the
        # deprecated fillna(method='bfill').
        macro_df = macro_df.bfill()
        return macro_df

    def _get_obs(self):
        """Assemble the flat float32 observation for ``self.current_step``."""
        obs = []
        step = self.current_step
        window_start = step - self.window_size
        # The EMA needs _EMA_WINDOW rows, which can be more than window_size.
        ema_start = max(0, step - max(self.window_size, self._EMA_WINDOW))

        for ticker in self.tickers:
            ret_window = self.returns[ticker].iloc[window_start:step].values
            if len(ret_window) < self.window_size:
                # Left-pad with zeros so the observation length stays fixed.
                ret_window = np.pad(ret_window, (self.window_size - len(ret_window), 0), 'constant')
            obs.extend(ret_window)

            regime = self.regimes[ticker].iloc[step]
            regime_one_hot = [0, 0, 0]
            regime_one_hot[regime] = 1
            obs.extend(regime_one_hot)

            prices = self.price_data[ticker]
            price = prices.iloc[window_start:step]
            ema_prices = prices.iloc[ema_start:step]
            if len(ema_prices) >= self._EMA_WINDOW:
                ema50 = EMAIndicator(close=ema_prices, window=self._EMA_WINDOW).ema_indicator().iloc[-1]
            else:
                # Not enough history yet: use the latest price instead.
                ema50 = price.iloc[-1]
            if len(price) >= 14:
                # Only closes are available, so the same series is passed as
                # high, low and close.
                atr = AverageTrueRange(high=price, low=price, close=price, window=14).average_true_range().iloc[-1]
                rsi = RSIIndicator(close=price, window=14).rsi().iloc[-1]
            else:
                atr = 0
                rsi = 50
            obs.extend([atr, ema50, rsi])

        obs.append(self.cash / self.initial_balance)
        obs.append(self.total_portfolio_value / self.initial_balance)
        obs.append(self.portfolio_volatility)

        obs.extend(self.macro_indicators.iloc[step].values)
        return np.array(obs, dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        The episode begins at row ``window_size + 50`` so every look-back
        window is fully populated.
        """
        # Seeds the environment's own RNG as the Gymnasium API expects.
        super().reset(seed=seed)
        if seed is not None:
            # Kept from the original: also seed NumPy's global RNG.
            np.random.seed(seed)
        self.current_step = self.window_size + self._EMA_WINDOW
        self.cash = self.initial_balance
        self.shares_held = np.zeros(self.n_assets, dtype=np.float32)
        self.avg_price = np.zeros(self.n_assets, dtype=np.float32)
        self.total_portfolio_value = self.initial_balance
        self.portfolio_volatility = 0.01
        self.prev_value = self.initial_balance
        self.stop_prices = np.zeros(self.n_assets, dtype=np.float32)
        self.take_profit_prices = np.zeros(self.n_assets, dtype=np.float32)
        self.trade_history = []
        return self._get_obs(), {}

    def _update_entry_levels(self, i, prev_shares, new_shares, price):
        """Maintain the average entry price and stop/take-profit levels of asset ``i``."""
        flat_before = abs(prev_shares) < 1e-8
        if abs(new_shares) < 1e-8:
            self.avg_price[i] = 0
        elif flat_before or (prev_shares > 0) != (new_shares > 0):
            # Fresh position, or the trade flipped direction: basis is this fill.
            self.avg_price[i] = price
        elif abs(new_shares) > abs(prev_shares):
            # Adding to an existing position: size-weighted average of fills.
            added = abs(new_shares) - abs(prev_shares)
            self.avg_price[i] = (self.avg_price[i] * abs(prev_shares) + price * added) / abs(new_shares)
        # Reducing a position leaves the entry price unchanged.

        if new_shares >= 0:
            self.stop_prices[i] = self.avg_price[i] * (1 - self.stop_loss_pct)
            self.take_profit_prices[i] = self.avg_price[i] * (1 + self.take_profit_pct)
        else:
            # Shorts lose when the price rises, so the levels are mirrored.
            self.stop_prices[i] = self.avg_price[i] * (1 + self.stop_loss_pct)
            self.take_profit_prices[i] = self.avg_price[i] * (1 - self.take_profit_pct)

    def step(self, action):
        """Rebalance towards ``action``, apply stops, and advance one row.

        Returns:
            ``(obs, reward, done, False, info)``. ``done`` is True when the
            data is exhausted or the portfolio falls below half of the initial
            balance. The truncated flag is always False.
        """
        done = False
        info = {}
        action = np.asarray(action, dtype=np.float32)
        current_prices = self.price_data[self.tickers].iloc[self.current_step].values

        # Normalise the raw action to unit gross exposure.
        sum_abs = np.sum(np.abs(action))
        if sum_abs < 1e-8:
            target_weights = np.zeros_like(action)
        else:
            target_weights = action / sum_abs

        # Tilt towards low-volatility assets, then renormalise.
        recent_vol = np.std(self.returns.iloc[self.current_step-20:self.current_step].values, axis=0)
        inv_vol = 1 / (recent_vol + 1e-4)
        scaled_weights = target_weights * inv_vol
        scaled_weights /= (np.sum(np.abs(scaled_weights)) + 1e-6)

        # Size targets off the portfolio marked at today's prices, not the
        # value stored at the end of the previous step.
        marked_value = self.cash + float(np.sum(self.shares_held * current_prices))
        target_alloc = scaled_weights * marked_value

        for i in range(self.n_assets):
            price = current_prices[i]
            prev_shares = float(self.shares_held[i])
            desired_shares = target_alloc[i] / price
            delta_shares = desired_shares - prev_shares
            trade_value = delta_shares * price
            cost = abs(trade_value) * self.transaction_cost
            # Purchases must be fully funded (notional + cost); skip otherwise.
            if trade_value > 0 and self.cash - cost - trade_value < 0:
                continue
            # trade_value is negative for sells/shorts, so this credits the
            # proceeds to cash. The original only ever debited cash.
            self.cash -= cost + trade_value
            self.shares_held[i] = desired_shares
            self._update_entry_levels(i, prev_shares, desired_shares, price)

        # Forced exits when a stop-loss or take-profit level is breached.
        for i in range(self.n_assets):
            price = current_prices[i]
            if self.shares_held[i] > 0:
                if price < self.stop_prices[i] or price > self.take_profit_prices[i]:
                    self.cash += self.shares_held[i] * price * (1 - self.transaction_cost)
                    self.shares_held[i] = 0
                    self.avg_price[i] = 0
            elif self.shares_held[i] < 0:
                if price > self.stop_prices[i] or price < self.take_profit_prices[i]:
                    self.cash -= abs(self.shares_held[i]) * price * (1 + self.transaction_cost)
                    self.shares_held[i] = 0
                    self.avg_price[i] = 0

        asset_values = self.shares_held * current_prices
        self.total_portfolio_value = self.cash + asset_values.sum()
        # Std of the last 20 return rows pooled across all assets (not weighted
        # by holdings).
        self.portfolio_volatility = np.std(self.returns.iloc[self.current_step-20:self.current_step].values) if self.current_step > 20 else 0.01

        ret = (self.total_portfolio_value - self.prev_value) / self.prev_value
        vol = max(self.portfolio_volatility, 1e-3)
        reward = float(ret / vol)
        self.prev_value = self.total_portfolio_value

        self.current_step += 1
        if self.current_step >= len(self.price_data) - 1 or self.total_portfolio_value < self.initial_balance * 0.5:
            done = True

        obs = self._get_obs()
        self.trade_history.append({
            'step': self.current_step,
            'portfolio_value': self.total_portfolio_value,
            'cash': self.cash,
            'shares_held': self.shares_held.copy(),
            'prices': current_prices.copy(),
            'action': action.copy(),
            'reward': reward
        })
        return obs, reward, done, False, info

    def render(self, mode='human'):
        """Print the current step, portfolio value, cash, holdings and entry prices."""
        print(f"Step: {self.current_step}")
        print(f"Portfolio Value: {self.total_portfolio_value:.2f}")
        print(f"Cash: {self.cash:.2f}")
        print(f"Shares Held: {self.shares_held}")
        print(f"Avg Prices: {self.avg_price}")

**Environment Helper**

In [None]:
def make_env(tickers):
    """Return a zero-argument factory that builds a monitored portfolio env.

    The environment is not created here. It is created each time the returned
    callable is invoked, which is the form ``DummyVecEnv`` takes in its list
    of environment constructors.
    """
    def _build_monitored_env():
        # Wrap the raw environment in Monitor before handing it back.
        return Monitor(EnhancedPortfolioEnv(tickers))

    return _build_monitored_env

**RL Model Training**

In [None]:
def train_and_save_model(tickers, total_timesteps=300_000, log_dir="/content/tb_logs/",
                         save_path="/content/ppo_regime_portfolio_best.zip"):
    """Train a PPO agent on the portfolio environment and save it to disk.

    Args:
        tickers: ticker symbols handed to the environment factory.
        total_timesteps: number of environment steps to train for.
        log_dir: directory PPO writes its TensorBoard logs to.
        save_path: where the trained model is written. Defaults to the Colab
            path that was previously hard-coded, so existing callers see no
            change.

    Returns:
        The trained PPO model.
    """
    env = DummyVecEnv([make_env(tickers)])
    model = PPO(
        "MlpPolicy",
        env,
        verbose=1,
        learning_rate=3e-4,
        n_steps=2048,
        batch_size=64,
        n_epochs=10,
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.2,
        ent_coef=0.01,
        seed=SEED,
        tensorboard_log=log_dir,
        # Use the GPU when one is visible to torch, otherwise train on CPU.
        device='cuda' if torch.cuda.is_available() else 'cpu'
    )
    model.learn(total_timesteps=total_timesteps)
    model.save(save_path)
    print(f"Model saved to {save_path}")
    return model

**Loading Model**...

In [None]:
def load_model(path="/content/ppo_regime_portfolio_best.zip"):
    """Load a saved PPO model from ``path``, announce it, and return it."""
    restored = PPO.load(path)
    print("Model loaded.")
    return restored

**Back-Testing and Visualization**

In [None]:
def backtest(model, tickers):
    """Run ``model`` deterministically through one episode and report performance.

    Prints the annualised Sharpe ratio (252 periods per year), maximum
    drawdown and final portfolio value, then plots the equity curve.

    Args:
        model: object exposing ``predict(obs, deterministic=True)``.
        tickers: ticker symbols used to build the evaluation environment.

    Returns:
        pandas Series of the portfolio value after each step.
    """
    env = EnhancedPortfolioEnv(tickers)
    obs, _ = env.reset()
    portfolio_values = []
    done = False
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, _ = env.step(action)
        # An episode ends on either flag; ignoring `truncated` can loop past
        # the end of an episode.
        done = terminated or truncated
        portfolio_values.append(env.total_portfolio_value)

    portfolio_values = pd.Series(portfolio_values)
    returns = portfolio_values.pct_change().dropna()

    # The Sharpe ratio is undefined with fewer than two returns or with zero
    # dispersion; report NaN instead of dividing by zero.
    volatility = returns.std()
    if len(returns) > 1 and volatility > 0:
        sharpe_ratio = returns.mean() / volatility * np.sqrt(252)
    else:
        sharpe_ratio = float('nan')
    max_drawdown = ((portfolio_values / portfolio_values.cummax()) - 1).min()

    print(f"Sharpe Ratio: {sharpe_ratio:.4f}")
    print(f"Max Drawdown: {max_drawdown:.2%}")
    print(f"Final Portfolio Value: ${portfolio_values.iloc[-1]:,.2f}")

    plt.figure(figsize=(12,6))
    plt.plot(portfolio_values, label='Portfolio Value')
    plt.title('Portfolio Value Over Time')
    plt.xlabel('Time Step')
    plt.ylabel('Portfolio Value ($)')
    plt.legend()
    plt.grid(True)
    plt.show()
    return portfolio_values

**Model Execution (Training + Learning + Results)** - *training may take a few hours because of the 300,000 (3 lakh) timesteps and the complexity of the environment*
~ *Just be patient* **(RUN)**

In [None]:
# Symbols the agent allocates across; the environment also downloads ^VIX and
# ^TNX for its macro features.
tickers = ['SPY', 'TLT', 'GLD', 'QQQ']
# Trains PPO for 300,000 timesteps and saves the resulting model to disk.
model = train_and_save_model(tickers, total_timesteps=300_000)
# Evaluates the trained policy on a freshly built environment.
# NOTE(review): both environments request the same "4y" history, so this looks
# like an in-sample backtest — confirm whether a held-out period is intended.
backtest(model, tickers)

**Launch TensorBoard >>>**

In [None]:
%load_ext tensorboard
%tensorboard --logdir /content/tb_logs/