# DQN-Based Hierarchical RL Model

## Imports

In [None]:
import os
import pandas as pd
import gym
from gym import spaces
import numpy as np
import random
import pickle
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.optimizers import Adam


## Setting up the DQN

In [None]:
class DQNAgent:
    """Epsilon-greedy Deep Q-Network agent backed by a small Keras MLP.

    Transitions are kept in a bounded replay buffer and replayed in random
    minibatches; the exploration rate decays after every successful replay.
    """

    def __init__(self, state_size, action_size):
        """Create the agent and build its Q-network.

        Args:
            state_size: Number of features in one state vector.
            action_size: Number of discrete actions (network outputs).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # oldest transitions are evicted first
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network: two 24-unit ReLU layers, linear output."""
        model = Sequential()
        model.add(Input(shape=(self.state_size,)))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for `state` with an epsilon-greedy policy.

        Args:
            state: A state batch of shape (1, state_size); a 1-D vector is
                also accepted and promoted to a batch of one.

        Returns:
            int: A random action with probability `epsilon`, otherwise the
            action with the highest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = np.asarray(state)
        if state.ndim == 1:
            state = state[np.newaxis, :]
        # verbose=0 keeps Keras from printing a progress bar on every call.
        act_values = self.model.predict(state, verbose=0)
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size):
        """Fit the Q-network on a random minibatch and decay epsilon.

        Does nothing when the buffer holds fewer than `batch_size`
        transitions (sampling would otherwise raise ValueError).

        Args:
            batch_size: Number of stored transitions to sample.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                # Bellman target: bootstrap from the best next-state Q-value.
                next_q = self.model.predict(next_state, verbose=0)[0]
                target = reward + self.gamma * np.amax(next_q)
            # Only the taken action's Q-value is moved towards the target.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

## Data Loaders

In [None]:
def load_data(tickers, daily_folder='data/processed', intraday_folder='data/processed_intra_day', synthetic_data_dir='data/synthetic_data/'):
    """Load daily, intraday and synthetic price frames from CSV files.

    Rows containing any NaN value are dropped from every frame.

    Args:
        tickers: Currently unused; every matching file found on disk is
            loaded regardless of this list.
        daily_folder: Folder scanned for ``<ticker>_final.csv`` files.
        intraday_folder: Folder scanned for
            ``<ticker>_intraday_processed.csv`` files.
        synthetic_data_dir: Folder whose sub-folders are scanned for
            ``*_synthetic.csv`` files; the ticker is the file-name part
            before the first underscore.

    Returns:
        tuple: ``(processed_data, intraday_data, synthetic_data)``, three
        dicts mapping ticker to DataFrame. When several synthetic files
        share a ticker, the last one read wins.
    """

    def _read_clean(path):
        # Read one CSV and drop incomplete rows.
        frame = pd.read_csv(path)
        frame.dropna(inplace=True)
        return frame

    def _load_by_suffix(folder, suffix):
        # Map ticker -> cleaned frame for every file in `folder` ending with `suffix`.
        frames = {}
        for file_name in os.listdir(folder):
            if file_name.endswith(suffix):
                ticker = file_name.replace(suffix, '')
                frames[ticker] = _read_clean(os.path.join(folder, file_name))
        return frames

    processed_data = _load_by_suffix(daily_folder, '_final.csv')
    intraday_data = _load_by_suffix(intraday_folder, '_intraday_processed.csv')

    synthetic_data = {}
    for ticker_folder in os.listdir(synthetic_data_dir):
        ticker_folder_path = os.path.join(synthetic_data_dir, ticker_folder)
        if not os.path.isdir(ticker_folder_path):
            continue
        for file_name in os.listdir(ticker_folder_path):
            if file_name.endswith('_synthetic.csv'):
                ticker = file_name.split('_')[0]
                # Clean the frame that was just read (not a stale one from an earlier loop).
                synthetic_data[ticker] = _read_clean(os.path.join(ticker_folder_path, file_name))

    return processed_data, intraday_data, synthetic_data


In [None]:
# Ticker universe passed to the data loader (symbols carry the ".NS" suffix).
# Each symbol appears exactly once; the repeated "OBEROIRLTY.NS" and
# "M&MFIN.NS" entries were removed, keeping the original order.
midcap_stocks = [
    "MSUMI.NS", "TORNTPOWER.NS", "GODREJPROP.NS", "SRF.NS",
    "APLAPOLLO.NS", "TVSMOTOR.NS", "LTIM.NS", "PAGEIND.NS",
    "AUROPHARMA.NS", "JINDALSTEL.NS", "BAJAJHLDNG.NS", "BATAINDIA.NS",
    "BHEL.NS", "CANBK.NS", "CHOLAFIN.NS", "CUB.NS", "DALMIASUG.NS",
    "ESCORTS.NS", "FEDERALBNK.NS", "FORTIS.NS", "GICRE.NS",
    "GMRINFRA.NS", "GNFC.NS", "GODREJAGRO.NS", "GRASIM.NS", "HAVELLS.NS",
    "HINDPETRO.NS", "INDHOTEL.NS", "JUBLFOOD.NS", "LICHSGFIN.NS",
    "M&MFIN.NS", "MANAPPURAM.NS", "MRF.NS", "NATCOPHARM.NS",
    "NCC.NS", "NMDC.NS", "OBEROIRLTY.NS", "PERSISTENT.NS", "PETRONET.NS",
    "RAMCOCEM.NS", "RBLBANK.NS", "SAIL.NS", "SUNTV.NS", "TATACOMM.NS",
    "TATAPOWER.NS", "THYROCARE.NS", "TORNTPHARM.NS", "TRENT.NS", "VOLTAS.NS",
    "WHIRLPOOL.NS", "YESBANK.NS", "ZEEL.NS", "ZYDUSWELL.NS",
    "ABBOTINDIA.NS", "ASHOKLEY.NS", "BALKRISIND.NS", "BEL.NS", "CONCOR.NS",
    "CROMPTON.NS", "DEEPAKNTR.NS", "DIXON.NS", "EMAMILTD.NS",
    "INDIAMART.NS", "IRCTC.NS", "JUBLPHARMA.NS", "LTTS.NS", "MFSL.NS",
    "METROPOLIS.NS", "PIIND.NS", "POLYCAB.NS", "RECLTD.NS",
    "SUPREMEIND.NS", "TATACONSUM.NS", "TV18BRDCST.NS", "VGUARD.NS",
    "VBL.NS", "VINATIORGA.NS", "ZENSARTECH.NS", "IDFCFIRSTB.NS",
    "SONACOMS.NS", "AMBUJACEM.NS", "GAIL.NS", "TATAELXSI.NS", "MAXHEALTH.NS",
    "LALPATHLAB.NS", "JSWENERGY.NS", "AARTIIND.NS", "ADANIGREEN.NS",
    "ABFRL.NS", "BANDHANBNK.NS", "BANKINDIA.NS", "BERGEPAINT.NS", "BOSCHLTD.NS",
    "CUMMINSIND.NS", "DMART.NS", "GLENMARK.NS", "GUJGASLTD.NS",
    "HAL.NS", "IIFLWAM.NS", "LICI.NS", "LUXIND.NS",
    "NAUKRI.NS", "PHOENIXLTD.NS", "RAJESHEXPO.NS", "SHREECEM.NS",
    "TATACHEM.NS", "THERMAX.NS", "TTKPRESTIG.NS", "UJJIVANSFB.NS", "VAKRANGEE.NS",
]


In [None]:
# Read the daily, intraday and synthetic CSV frames from disk into three
# ticker -> DataFrame dicts. NOTE(review): load_data does not appear to filter
# by the ticker list it is given -- confirm whether that is intended.
processed_data, intraday_data, synthetic_data = load_data(midcap_stocks)

In [None]:
# Bare expression: as the last line of a notebook cell this displays the loaded daily frames.
processed_data

## Defining the Environment

In [None]:
class TradingEnv(gym.Env):
    """Gym environment that walks one ticker's price frame row by row.

    Actions are 0 = hold, 1 = buy, 2 = sell. Every step's reward is the
    total value (cash plus shares held at the current close) minus the
    initial portfolio value.
    """

    def __init__(self, data, ticker_list):
        """Store the price frames and declare the observation/action spaces.

        Args:
            data: Mapping of ticker to DataFrame; a 'Close' column is read
                on every step.
            ticker_list: Tickers available to `reset`; the first one is the
                initial ticker.
        """
        super().__init__()
        self.data = data
        self.ticker_list = ticker_list
        self.current_step = 0
        self.current_ticker = ticker_list[0]
        self.initial_portfolio_value = 1000000  # starting cash
        self.portfolio_value = self.initial_portfolio_value  # cash currently held
        self.position = 0  # number of shares currently held

        # One observation is one full row of the current ticker's frame.
        feature_count = len(data[self.current_ticker].columns)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(feature_count,), dtype=np.float32
        )
        self.action_space = spaces.Discrete(3)  # 0 = Hold, 1 = Buy, 2 = Sell

    def reset(self):
        """Pick a random ticker, reset cash and position, return the first row."""
        self.current_step = 0
        self.current_ticker = np.random.choice(self.ticker_list)
        self.position = 0
        self.portfolio_value = self.initial_portfolio_value
        first_row = self.data[self.current_ticker].iloc[self.current_step]
        return first_row.values

    def step(self, action):
        """Apply `action` at the current close and advance one row.

        Returns:
            tuple: ``(next_state, reward, done, info)``; when the frame is
            exhausted `done` is True and the last row is returned again.
        """
        frame = self.data[self.current_ticker]
        current_price = frame.iloc[self.current_step]['Close']

        if action == 1:
            reward = self._execute_buy(current_price)
        elif action == 2:
            reward = self._execute_sell(current_price)
        else:
            reward = self._hold_position(current_price)

        self.current_step += 1
        last_index = len(frame) - 1
        done = self.current_step > last_index
        if done:
            # Clamp to the last valid row so the returned state is well defined.
            self.current_step = last_index

        return frame.iloc[self.current_step].values, reward, done, {}

    def _execute_buy(self, current_price):
        """Spend all available cash on whole shares if flat; return the profit reward."""
        if self.position == 0:
            shares = self.portfolio_value // current_price
            self.position = shares
            self.portfolio_value -= shares * current_price
        return self._calculate_profit(current_price)

    def _execute_sell(self, current_price):
        """Liquidate the whole position if any is held; return the profit reward."""
        if self.position > 0:
            self.portfolio_value += self.position * current_price
            self.position = 0
        return self._calculate_profit(current_price)

    def _hold_position(self, current_price):
        """Change nothing; return the profit reward."""
        return self._calculate_profit(current_price)

    def _calculate_profit(self, current_price):
        """Return cash plus marked-to-market shares minus the initial value."""
        total_value = self.portfolio_value
        if self.position > 0:
            total_value = total_value + self.position * current_price
        return total_value - self.initial_portfolio_value

    def get_state(self):
        """Return selected fields of the current row as a dict.

        OHLCV fields are required; the indicator fields default to 0 when
        the row does not contain them.
        """
        market_data = self.data[self.current_ticker].iloc[self.current_step]
        required_fields = ('Open', 'High', 'Low', 'Close', 'Volume')
        optional_fields = (
            'SMA_50', 'SMA_200', 'RSI', 'MACD', 'MACD_Signal',
            'Bollinger_Upper', 'Bollinger_Lower',
        )
        state = {}
        for field in required_fields:
            state[field] = market_data[field]
        for field in optional_fields:
            state[field] = market_data.get(field, 0)
        return state


## High Level Agent

In [None]:
class HighLevelAgent:
    """Portfolio-level agent: ranks tickers, splits a budget, scores outcomes.

    Tickers are ranked by Sharpe ratio minus maximum drawdown computed on the
    environment's price frames, and the budget is split in proportion to the
    (non-negative) Sharpe ratios of the selected tickers.
    """

    def __init__(self, env, total_budget=100000):
        """Bind the agent to an environment.

        Args:
            env: Environment exposing `ticker_list`, `data` (ticker ->
                DataFrame with a 'Close' column) and `portfolio_value`.
            total_budget: Amount distributed by `allocate_budget`.
        """
        self.env = env
        self.tickers = env.ticker_list
        self.total_budget = total_budget
        self.initial_portfolio_value = 1000000  # Initial portfolio value
        self.portfolio_value = self.initial_portfolio_value
        self.profit_target = 0.05  # 5% profit target
        self.take_profit_threshold = 0.03  # Take profit at 3% gain

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return numerator / denominator as a float, or 0.0 when undefined."""
        if not np.isfinite(denominator) or denominator == 0:
            return 0.0
        ratio = numerator / denominator
        return float(ratio) if np.isfinite(ratio) else 0.0

    @staticmethod
    def _max_drawdown(close):
        """Return the largest peak-to-trough decline of `close` as a fraction."""
        rolling_max = close.cummax()
        drawdown = (rolling_max - close) / rolling_max
        worst = drawdown.max()
        return float(worst) if np.isfinite(worst) else 0.0

    def choose_equities_and_allocate_budget(self):
        """Select tickers, allocate the budget and build agents for each.

        Returns:
            list: ``(ticker, budget, mid_agent, low_agent)`` tuples. New
            agents are constructed on every call.
        """
        selected_tickers = self.select_tickers()
        allocated_budgets = self.allocate_budget(selected_tickers)

        agents = []
        for ticker, budget in zip(selected_tickers, allocated_budgets):
            mid_agent = MidLevelAgent(66, 20)
            low_agent = LowLevelAgent(20, 3)
            agents.append((ticker, budget, mid_agent, low_agent))

        return agents

    def select_tickers(self, num_tickers=3):
        """Return the `num_tickers` best tickers by `evaluate_ticker` score."""
        scores = {ticker: self.evaluate_ticker(ticker) for ticker in self.tickers}
        sorted_tickers = sorted(scores, key=scores.get, reverse=True)
        return sorted_tickers[:num_tickers]

    def evaluate_ticker(self, ticker):
        """Score a ticker as Sharpe ratio minus maximum drawdown.

        The drawdown is the peak-to-trough decline (the same definition
        `calculate_reward` uses), not the overall high-low price range.
        """
        close = self.env.data[ticker]['Close']
        return self.calculate_sharpe_ratio(ticker) - self._max_drawdown(close)

    def allocate_budget(self, selected_tickers):
        """Split `total_budget` in proportion to each ticker's Sharpe ratio.

        Negative Sharpe ratios are treated as zero weight so that no ticker
        receives a negative budget; when no ticker has a positive Sharpe
        ratio the budget is split equally.

        Returns:
            list: One budget per ticker, in the order of `selected_tickers`.
        """
        if not selected_tickers:
            return []

        weights = [max(self.calculate_sharpe_ratio(ticker), 0.0) for ticker in selected_tickers]
        total_weight = sum(weights)
        if total_weight <= 0:
            equal_share = self.total_budget / len(selected_tickers)
            return [equal_share] * len(selected_tickers)

        return [(weight / total_weight) * self.total_budget for weight in weights]

    def calculate_sharpe_ratio(self, ticker):
        """Return mean / std of the ticker's close-to-close returns.

        A risk-free rate of 0 is assumed. Returns 0.0 when the ratio is
        undefined (zero volatility or too few observations).
        """
        returns = self.env.data[ticker]['Close'].pct_change().dropna()
        return self._safe_ratio(returns.mean(), returns.std())

    def calculate_reward(self, current_ticker):
        """Combine return, drawdown and risk-adjusted metrics into one reward.

        Refreshes `portfolio_value` from the environment first. Undefined
        ratios (e.g. a Sortino ratio with fewer than two negative returns)
        contribute 0 instead of turning the whole reward into NaN.
        """
        self.portfolio_value = self.get_portfolio_value()
        portfolio_return = (
            (self.portfolio_value - self.initial_portfolio_value)
            / self.initial_portfolio_value
        )

        close = self.env.data[current_ticker]['Close']
        max_drawdown = self._max_drawdown(close)

        returns = close.pct_change().dropna()
        volatility = returns.std()
        sharpe_ratio = self._safe_ratio(returns.mean(), volatility)
        sortino_ratio = self._safe_ratio(returns.mean(), returns[returns < 0].std())

        # Negative because lower volatility is preferred.
        consistency = -float(volatility) if np.isfinite(volatility) else 0.0

        profit_bonus = 1 if portfolio_return >= self.profit_target else -0.5

        return (portfolio_return
                - 0.7 * max_drawdown
                + 0.4 * sharpe_ratio
                + 0.3 * sortino_ratio
                + 0.3 * consistency
                + profit_bonus)

    def take_profit(self, current_price, entry_price):
        """Return True when the gain over `entry_price` reaches the take-profit threshold."""
        gain = (current_price - entry_price) / entry_price
        return bool(gain >= self.take_profit_threshold)

    def get_portfolio_value(self):
        """Return the environment's current `portfolio_value`."""
        return self.env.portfolio_value


## Mid Level Agent

In [None]:
class MidLevelAgent:
    """Strategy-selection agent: a DQN picks one of 20 rule-based strategies.

    Each strategy reads fixed positions of a feature row and returns a
    signal: 1 (buy), -1 (sell) or 0 (hold).

    NOTE(review): the feature indices are hard-coded and some are read under
    different names by different strategies (index 8 as both RSI and EMA,
    index 9 as both Stochastic RSI and RSI, index 17 as both momentum and
    ROC) -- confirm them against the dataset's column order.
    """

    def __init__(self, state_size, action_size):
        """Register the strategies and build the underlying DQN.

        Args:
            state_size: Number of features in one state vector.
            action_size: Accepted for interface compatibility; the DQN's
                action count is always the number of registered strategies.
        """
        self.strategies = {
            0: self.moving_average_crossover,
            1: self.rsi_reversion,
            2: self.macd_trend_following,
            3: self.bollinger_bands,
            4: self.adx_trend_strength,
            5: self.stochastic_oscillator,
            6: self.volume_price_trend,
            7: self.cci_correction,
            8: self.ema_rsi_combo,
            9: self.ichimoku_cloud,
            10: self.parabolic_sar,
            11: self.momentum,
            12: self.roc_trend,
            13: self.williams_percent_r,
            14: self.keltner_channel,
            15: self.atr_volatility,
            16: self.vwap_mean_reversion,
            17: self.trix_trend_following,
            18: self.donchian_channel,
            19: self.pivot_point_support_resistance,
        }

        self.strategy_performance = {name: 0 for name in self.strategies}
        self.epsilon = 0.5
        self.min_epsilon = 0.1
        self.decay_rate = 0.995
        self.recent_rewards = []
        self.state_size = state_size
        self.action_size = len(self.strategies)
        self.dqn = DQNAgent(state_size, self.action_size)
        self.learning_rate = 0.0001
        self.max_steps = 1000  # Max steps per episode
        self.step_count = 0

    # ------------------------------------------------------------------
    # Signal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _trend_signal(value, reference):
        """Return 1 if value > reference, -1 if value < reference, else 0."""
        if value > reference:
            return 1
        if value < reference:
            return -1
        return 0

    @staticmethod
    def _reversion_signal(value, lower, upper):
        """Return 1 if value < lower, -1 if value > upper, else 0."""
        if value < lower:
            return 1
        if value > upper:
            return -1
        return 0

    # ------------------------------------------------------------------
    # Strategy selection and learning
    # ------------------------------------------------------------------
    def select_strategy(self, state, epsilon):
        """Return the strategy index chosen by the DQN's own epsilon-greedy policy.

        `epsilon` is accepted for interface compatibility but not used; the
        DQN applies its internal exploration rate.
        """
        return self.dqn.act(state)

    def take_action(self, action, state):
        """Run the selected strategy on `state` and count one step.

        Returns:
            tuple: ``(reward, done)`` where the reward is derived from the
            strategy signal with a transaction cost of 1, and `done` is True
            once `max_steps` steps have been taken.
        """
        strategy_signal = self.strategies[action](state)
        reward = self.calculate_reward(strategy_signal, 1)
        self.step_count += 1
        done = self.step_count >= self.max_steps
        return reward, done

    def update_strategy_performance(self, strategy_name, reward, decay=0.99):
        """Update a strategy's score as ``old_score * decay + reward``."""
        self.strategy_performance[strategy_name] = (
            self.strategy_performance[strategy_name] * decay + reward
        )

    def choose_strategy(self, state):
        """Choose a strategy key with this agent's own epsilon-greedy schedule.

        With probability `epsilon` a random key is returned; otherwise the
        key with the highest Q-value predicted by the DQN. `epsilon` is
        decayed towards `min_epsilon` on every call.
        """
        strategy_keys = list(self.strategies.keys())
        if random.random() < self.epsilon:
            chosen_strategy = random.choice(strategy_keys)
        else:
            state_array = np.array(state).reshape(1, -1)  # batch of one for the network
            q_values = self.dqn.model.predict(state_array, verbose=0)
            chosen_strategy = strategy_keys[int(np.argmax(q_values))]
        self.epsilon = max(self.min_epsilon, self.epsilon * self.decay_rate)
        return chosen_strategy

    def learn(self, state, action, reward, next_state, done):
        """Store one transition and replay a minibatch once enough are stored."""
        state = np.reshape(state, [1, self.state_size])
        next_state = np.reshape(next_state, [1, self.state_size])
        self.dqn.remember(state, action, reward, next_state, done)
        if len(self.dqn.memory) > 32:
            self.dqn.replay(32)

    # ------------------------------------------------------------------
    # Strategies: each returns 1 (buy), -1 (sell) or 0 (hold)
    # ------------------------------------------------------------------
    def moving_average_crossover(self, state):
        """Buy when the short average (index 6) is above the long one (index 45)."""
        return self._trend_signal(state[6], state[45])

    def rsi_reversion(self, state):
        """Buy when RSI (index 8) is below 30, sell when above 70."""
        return self._reversion_signal(state[8], 30, 70)

    def macd_trend_following(self, state):
        """Buy when MACD (index 12) is above its signal line (index 13)."""
        return self._trend_signal(state[12], state[13])

    def bollinger_bands(self, state):
        """Buy below the lower band (index 30), sell above the upper band (index 29)."""
        upper_band = state[29]
        lower_band = state[30]
        return self._reversion_signal(state[3], lower_band, upper_band)

    def adx_trend_strength(self, state):
        """Signal a trade (1) when ADX (index 47) exceeds 25, else hold."""
        return 1 if state[47] > 25 else 0

    def stochastic_oscillator(self, state):
        """Buy when Stochastic RSI (index 9) is below 0.2, sell when above 0.8."""
        return self._reversion_signal(state[9], 0.2, 0.8)

    def volume_price_trend(self, state):
        """Buy on positive volume-price trend (index 43), sell on negative."""
        return self._trend_signal(state[43], 0)

    def cci_correction(self, state):
        """Buy when CCI (index 21) is below -100, sell when above 100."""
        return self._reversion_signal(state[21], -100, 100)

    def ema_rsi_combo(self, state):
        """Buy on low RSI with price above the EMA; sell on high RSI with price below."""
        ema = state[8]
        rsi = state[9]
        close = state[3]
        if rsi < 30 and close > ema:
            return 1
        if rsi > 70 and close < ema:
            return -1
        return 0

    def ichimoku_cloud(self, state):
        """Buy above both cloud spans (indices 23, 24), sell below both."""
        span_a = state[23]
        span_b = state[24]
        close = state[3]
        if close > span_a and close > span_b:
            return 1
        if close < span_a and close < span_b:
            return -1
        return 0

    def parabolic_sar(self, state):
        """Buy when the close is above index 16 (KAMA used as a SAR substitute)."""
        sar = state[16]
        return self._trend_signal(state[3], sar)

    def momentum(self, state):
        """Buy on positive momentum (index 17), sell on negative."""
        return self._trend_signal(state[17], 0)

    def roc_trend(self, state):
        """Buy on positive rate of change (index 17), sell on negative."""
        return self._trend_signal(state[17], 0)

    def williams_percent_r(self, state):
        """Buy when Williams %R (index 10) is below -80, sell when above -20."""
        return self._reversion_signal(state[10], -80, -20)

    def keltner_channel(self, state):
        """Sell above the upper channel, buy below the lower channel.

        NOTE(review): the bands are derived from index 0 alone (price +/- 2x
        price), so for positive prices neither branch can trigger -- this
        looks like a placeholder; confirm the intended band columns.
        """
        close_prices = state[0]
        middle_band = close_prices
        upper_band = middle_band + 2 * close_prices
        lower_band = middle_band - 2 * close_prices

        if close_prices > upper_band:
            return -1
        if close_prices < lower_band:
            return 1
        return 0

    def atr_volatility(self, state):
        """Hold; the volatility-breakout rule has no reference value yet.

        TODO(review): the previous rule compared ATR (index 38) with itself,
        which can never be true. A breakout needs a baseline (e.g. an earlier
        or averaged ATR) that the single-row state does not provide.
        """
        atr = state[38]  # still read so a too-short state fails loudly
        del atr
        return 0

    def vwap_mean_reversion(self, state):
        """Buy when the close is below VWAP (index 44), sell when above."""
        vwap = state[44]
        return self._trend_signal(vwap, state[3])

    def trix_trend_following(self, state):
        """Buy on positive TRIX (index 19), sell on negative."""
        return self._trend_signal(state[19], 0)

    def donchian_channel(self, state):
        """Buy on a break above the upper band (index 36), sell below the lower (index 37)."""
        upper_band = state[36]
        lower_band = state[37]
        close = state[3]
        if close > upper_band:
            return 1
        if close < lower_band:
            return -1
        return 0

    def pivot_point_support_resistance(self, state):
        """Buy above the pivot (index 28, Bollinger mid as a substitute), sell below."""
        pivot = state[28]
        return self._trend_signal(state[3], pivot)

    # ------------------------------------------------------------------
    # Reward and bookkeeping
    # ------------------------------------------------------------------
    def calculate_reward(self, profit, transaction_costs, duration_penalty=0.01):
        """Return ``profit - transaction_costs - duration_penalty``."""
        net_profit = profit - transaction_costs
        return net_profit - duration_penalty

    def reset(self):
        """Reset the per-episode step counter."""
        self.step_count = 0


## Low Level Agent

In [None]:
class LowLevelAgent:
    """Execution agent: a DQN chooses hold / buy / sell on a single position.

    Action 1 buys, action 2 sells and any other action holds. Every call to
    `take_action` counts towards `max_trades`, after which the episode is
    reported as done.
    """

    def __init__(self, state_size, action_size):
        """Build the underlying DQN and clear the position state.

        Args:
            state_size: Number of features in one state vector.
            action_size: Number of DQN outputs.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.dqn = DQNAgent(state_size, action_size)
        self.max_trades = 10
        self.trade_count = 0
        self.current_position = None
        self.current_price = None
        self.entry_price = None  # price paid for the open position, if any

    def select_action(self, state):
        """Return the action chosen by the DQN's epsilon-greedy policy."""
        return self.dqn.act(state)

    def take_action(self, action, state, price):
        """Execute `action` at `price` and count one step.

        Returns:
            tuple: ``(reward, done)``; `done` is True once `max_trades`
            calls have been made since the last reset.
        """
        self.current_price = price

        if action == 1:  # Buy
            reward = self.buy(state)
        elif action == 2:  # Sell
            reward = self.sell(state)
        else:  # Hold
            reward = self.hold(state)

        self.trade_count += 1
        done = self.trade_count >= self.max_trades
        return reward, done

    def buy(self, state):
        """Open a long position at the current price.

        Returns -10 as a penalty when a position is already open.
        """
        if self.current_position is not None:
            return -10  # Penalty for invalid action

        self.current_position = 'long'
        self.entry_price = self.current_price
        return self.calculate_trade_reward(self.current_price, 'buy')

    def sell(self, state):
        """Close the long position at the current price.

        Returns -10 as a penalty when no long position is open; otherwise the
        realised profit against the entry price, scaled by 100.
        """
        if self.current_position != 'long':
            return -10  # Penalty for invalid action

        # Compute the reward before clearing the entry price it depends on.
        reward = self.calculate_trade_reward(self.current_price, 'sell')
        self.current_position = None
        self.entry_price = None
        return reward

    def hold(self, state):
        """Do nothing; holding carries no immediate reward."""
        return 0

    def calculate_trade_reward(self, trade_price, action_type):
        """Return the reward for a trade executed at `trade_price`.

        Args:
            trade_price: Price at which the trade was executed.
            action_type: 'buy' or 'sell'; anything else yields 0.
        """
        if action_type == 'buy':
            # NOTE(review): the future price is still a simulated placeholder.
            reward = (self.get_future_price() - trade_price) * 100
        elif action_type == 'sell':
            reward = (trade_price - self.get_past_price()) * 100
        else:
            reward = 0

        return reward

    def get_future_price(self):
        """Return a simulated future price within +/-1% of the current price."""
        return self.current_price * (1 + np.random.uniform(-0.01, 0.01))  # Random price change

    def get_past_price(self):
        """Return the entry price of the open position.

        Falls back to a simulated price within +/-1% of the current price
        when no entry price has been recorded.
        """
        if self.entry_price is not None:
            return self.entry_price
        return self.current_price * (1 + np.random.uniform(-0.01, 0.01))  # Random price change

    def reset(self):
        """Clear the step counter and any open position."""
        self.trade_count = 0
        self.current_position = None
        self.current_price = None
        self.entry_price = None

## Initializing the Environment and Agents

In [None]:
def get_mid_level_state(raw_state):
    """Label the first 46 positions of a raw feature row.

    Args:
        raw_state: Indexable sequence with at least 46 entries.

    Returns:
        dict: Feature name -> value, in positional order ('Open' is
        position 0, 'Vortex_Neg' is position 45).
    """
    feature_names = (
        'Open', 'High', 'Low', 'Close', 'Adjusted_Close', 'Volume',
        'Williams_R', 'Awesome_Oscillator',
        'MACD', 'MACD_Signal', 'MACD_Diff',
        'TSI', 'KAMA', 'ROC', 'Vortex_Diff', 'TRIX', 'Mass_Index', 'CCI', 'DPO',
        'Ichimoku_A', 'Ichimoku_B',
        'Aroon_Up', 'Aroon_Down', 'Aroon_Indicator',
        'Bollinger_Mid', 'Bollinger_Upper', 'Bollinger_Lower',
        'Bollinger_PBand', 'Bollinger_WBand',
        'Keltner_Channel_Center', 'Keltner_Channel_Upper', 'Keltner_Channel_Lower',
        'Donchian_Channel_Upper', 'Donchian_Channel_Lower',
        'ATR', 'OBV', 'Chaikin_MF', 'Force_Index', 'Ease_of_Movement',
        'Volume_Price_Trend', 'VWAP',
        'SMA_200', 'EMA_200', 'ADX',
        'Vortex_Pos', 'Vortex_Neg',
    )
    # Index by position (not zip) so a too-short row still raises.
    return {name: raw_state[position] for position, name in enumerate(feature_names)}

In [None]:
# Initialize the environment over every ticker that has daily data
env = TradingEnv(data=processed_data, ticker_list=list(processed_data.keys()))

state_size = 66       # NOTE(review): presumably the width of one feature row -- confirm against the data
action_size = 20      # number of mid-level strategies
low_action_size = 3   # the low-level agent only distinguishes hold / buy / sell

# Initialize the High-, Mid- and Low-Level Agents
high_agent = HighLevelAgent(env)
mid_agent = MidLevelAgent(state_size, action_size)
# The low-level DQN gets three outputs; with 20, every action above 2 would
# silently be treated as "hold".
low_agent = LowLevelAgent(state_size, low_action_size)

In [None]:
# Learning rate shared by both Q-network optimizers
learning_rate = 0.001

# Standalone replay buffer (each DQNAgent also keeps its own `memory`)
replay_memory = deque(maxlen=2000)

# Re-compile the mid-level and low-level Q-networks with fresh Adam optimizers
mid_agent.dqn.model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
low_agent.dqn.model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))

## Training and Model Saving Function

In [None]:
# Training hyperparameters
num_episodes = 1000       # Total number of episodes for training
batch_size = 32           # Size of minibatch for DQN updates
gamma = 0.99              # Discount factor for future rewards
learning_rate = 0.001     # Optimizer learning rate

# Epsilon-greedy exploration schedule
epsilon_start = 1.0       # Starting value of epsilon (exploration rate)
epsilon_min = 0.01        # Minimum value of epsilon
epsilon_decay = 0.995     # Decay rate for epsilon after each episode


In [None]:
def _network_input(state, width):
    """Return `state` as a float32 batch of shape (1, width) for a Q-network.

    NOTE(review): the flattened state is truncated or zero-padded to `width`
    so that networks built with different input widths can all be fed from
    the same feature row. This is a stopgap until each agent's feature set is
    defined explicitly.
    """
    flat = np.asarray(state, dtype=np.float32).ravel()
    fitted = np.zeros(width, dtype=np.float32)
    count = min(width, flat.size)
    fitted[:count] = flat[:count]
    return fitted.reshape(1, width)


def _save_agent_weights(agents, tag):
    """Save the mid- and low-level Q-network weights of every agent tuple."""
    for ticker, _budget, mid_agent, low_agent in agents:
        mid_agent.dqn.model.save_weights(f'mid_level_dqn_{ticker}_{tag}.weights.h5')
        low_agent.dqn.model.save_weights(f'low_level_dqn_{ticker}_{tag}.weights.h5')


def train_dqn():
    """Train the mid- and low-level DQN agents against the shared environment.

    Uses the module-level `env`, `high_agent` and hyperparameters. The
    per-ticker agents are built once, so their replay buffers and networks
    persist across steps and episodes, and their counters are reset at the
    start of every episode. Each agent's low-level action is applied to the
    environment, so the stored transitions are real (state, action, reward,
    next_state) steps. An episode ends when a low-level agent reports done or
    the environment runs out of data.

    Weights are written with Keras `save_weights` every 100 episodes and once
    more after the last episode, one file per ticker and agent level.

    Raises:
        RuntimeError: If the high-level agent selects no tickers.
    """
    epsilon = epsilon_start

    # Build the agents once: rebuilding them on every step would discard all
    # learning and keep their step counters from ever reaching `done`.
    agents = high_agent.choose_equities_and_allocate_budget()
    if not agents:
        raise RuntimeError("No tickers were selected; nothing to train.")

    for episode in range(num_episodes):
        state = env.reset()
        for _ticker, _budget, mid_agent, low_agent in agents:
            mid_agent.reset()
            low_agent.reset()
        total_reward = 0
        done = False

        while not done:
            for ticker, budget, mid_agent, low_agent in agents:
                mid_state = _network_input(state, mid_agent.state_size)
                low_state = _network_input(state, low_agent.state_size)

                # Mid-Level Agent: select a strategy and evaluate it on the raw row
                mid_action = mid_agent.select_strategy(mid_state, epsilon)
                mid_reward, mid_done = mid_agent.take_action(mid_action, state)

                # Low-Level Agent: choose hold / buy / sell
                # NOTE(review): position 0 of the row is used as the trade
                # price, as before -- confirm it is the intended price column.
                low_action = low_agent.select_action(low_state)
                low_reward, low_done = low_agent.take_action(low_action, state, state[0])

                # Advance the environment with the low-level action
                next_state, _env_reward, env_done, _info = env.step(low_action)

                # Store transitions and learn from replay memory
                mid_agent.dqn.remember(
                    mid_state, mid_action, mid_reward,
                    _network_input(next_state, mid_agent.state_size),
                    mid_done or env_done,
                )
                if len(mid_agent.dqn.memory) > batch_size:
                    mid_agent.dqn.replay(batch_size)

                low_agent.dqn.remember(
                    low_state, low_action, low_reward,
                    _network_input(next_state, low_agent.state_size),
                    low_done or env_done,
                )
                if len(low_agent.dqn.memory) > batch_size:
                    low_agent.dqn.replay(batch_size)

                total_reward += mid_reward + low_reward
                state = next_state
                done = done or low_done or env_done
                if env_done:
                    break  # no more rows to trade on in this episode

            epsilon = max(epsilon_min, epsilon * epsilon_decay)

        print(f"Episode {episode} - Total Reward: {total_reward}")

        if episode % 100 == 0:
            _save_agent_weights(agents, episode)

    _save_agent_weights(agents, 'final')


In [None]:
# Run the full training loop (num_episodes episodes; prints the total reward per episode).
train_dqn()