# Imports

In [1]:
import torch
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import yfinance as yf
import pandas as pd
import torch
import matplotlib.pyplot as plt
import gym
from gym import spaces
import random


# Data Processing

In [2]:
def fetch_data(stocks, start_date, end_date):
    data = {}
    for stock in stocks:
        df = yf.download(stock, start=start_date, end=end_date)
        df['10D_MA'] = df['Close'].rolling(10).mean()
        df['20D_MA'] = df['Close'].rolling(20).mean()
        df['50D_MA'] = df['Close'].rolling(50).mean()
        
        # Drop rows with NaN values in the MA columns
        df = df.dropna(subset=['10D_MA', '20D_MA', '50D_MA'])
        
        data[stock] = df
    return data

stocks = [
    "AAPL", 
    "MSFT", 
    "GOOGL", 
    "AMZN", 
    "BRK",
    "JPM",
    "NVDA",
    "GS"
] 

data = fetch_data(stocks, "2010-01-01", "2023-01-01")


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [4]:
data


{'AAPL':                   Open        High         Low       Close   Adj Close  \
 Date                                                                     
 2010-03-16    8.006429    8.035000    7.946786    8.016071    6.804013   
 2010-03-17    8.032143    8.087500    7.973929    8.004286    6.794009   
 2010-03-18    8.003571    8.035714    7.950357    8.023214    6.810077   
 2010-03-19    8.028214    8.044286    7.901071    7.937500    6.737322   
 2010-03-22    7.873929    8.071429    7.862500    8.026786    6.813107   
 ...                ...         ...         ...         ...         ...   
 2022-12-23  130.919998  132.419998  129.639999  131.860001  131.299820   
 2022-12-27  131.380005  131.410004  128.720001  130.029999  129.477585   
 2022-12-28  129.669998  131.029999  125.870003  126.040001  125.504539   
 2022-12-29  127.989998  130.479996  127.730003  129.610001  129.059372   
 2022-12-30  128.410004  129.949997  127.430000  129.929993  129.378006   
 
               

# Environment

In [3]:
import numpy as np
import gym
from gym import spaces

class StockTradingEnvironment(gym.Env):
    def __init__(self, stock_data, initial_cash=100000):
        super(StockTradingEnvironment, self).__init__()
        
        # Stock data
        self.stock_data = stock_data
        self.stocks = list(stock_data.keys())
        self.num_stocks = len(self.stocks)
        
        # Initial cash and current step
        self.cash = initial_cash
        self.initial_cash = initial_cash
        self.current_step = 0
        
        # Initialize the portfolio
        self.portfolio = {stock: 0 for stock in self.stocks}
        
        # Action and observation spaces
        self.action_space = spaces.Box(low=-1, high=1, shape=(self.num_stocks,), dtype=np.float32)
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(self.num_stocks*10 + 1 + self.num_stocks,), dtype=np.float32)
        
        # Calculate max steps
        self.max_steps = min([len(self.stock_data[stock]) for stock in self.stocks]) - 1

    def reset(self):
        # Only reset if the portfolio value is below zero
        if self._get_portfolio_value() < 0:
            self.current_step = 0
            self.cash = self.initial_cash
            self.portfolio = {stock: 0.0 for stock in self.stocks}
        return self._get_state()

    def _get_state(self):
        # Get stock data for the current step
        stock_data = []
        for stock in self.stocks:
            stock_data.extend(self.stock_data[stock].iloc[self.current_step].values)
        
        # Append cash and portfolio details
        stock_data.append(self.cash)
        for stock in self.stocks:
            stock_data.append(self.portfolio[stock])
        
        return np.array(stock_data)

    def step(self, actions):
        if self.current_step >= self.max_steps:
            return self._get_state(), 0, True, {}

        prev_value = self._get_portfolio_value()

        # Update portfolio based on actions
        for idx, action in enumerate(actions):
            stock = self.stocks[idx]
            price = self.stock_data[stock].iloc[self.current_step]['Close']

            if action > 0:  # Buy
                max_affordable = self.cash // price
                stocks_to_buy = int(min(max_affordable, action))
                self.cash -= price * stocks_to_buy
                self.portfolio[stock] += stocks_to_buy

            elif action < 0:  # Sell
                stocks_to_sell = min(self.portfolio[stock], abs(action))
                self.cash += price * stocks_to_sell
                self.portfolio[stock] -= stocks_to_sell

        reward = self._get_portfolio_value() - prev_value
        self.current_step += 1
        
        done = self.current_step >= self.max_steps or self._get_portfolio_value() < 0
        if self._get_portfolio_value() < 0:
            reward -= 1000

        return self._get_state(), reward, done, {}

    def _get_portfolio_value(self):
        value = self.cash
        for stock in self.stocks:
            value += self.portfolio[stock] * self.stock_data[stock].iloc[self.current_step]['Close']
        return value

    def get_stock_proportions(self):
        total_portfolio_value = self._get_portfolio_value()
        if total_portfolio_value == 0:
            return {stock: 0 for stock in self.stocks}
        
        proportions = {}
        for stock in self.stocks:
            stock_value = self.portfolio[stock] * self.stock_data[stock].iloc[self.current_step]['Close']
            proportions[stock] = stock_value / total_portfolio_value
        return proportions


In [4]:
class ModelUtilities:
    def __init__(self, input_range=(-1, 1), weight_clip_value=0.1):
        self.input_range = input_range
        self.weight_clip_value = weight_clip_value

    def scale_inputs(self, inputs):

        min_vals = np.min(inputs, axis=0)
        max_vals = np.max(inputs, axis=0)
        scale = max_vals - min_vals
        # Avoid division by zero by setting scale to 1 where min and max values are the same
        scale = np.where(scale == 0, 1, scale)
        inputs_std = (inputs - min_vals) / scale
        inputs_scaled = inputs_std * (self.input_range[1] - self.input_range[0]) + self.input_range[0]
        return torch.tensor(inputs_scaled, dtype=torch.float32)

    def clip_weights(self, parameters):
        with torch.no_grad():
            for p in parameters:
                p.data.clamp_(-self.weight_clip_value, self.weight_clip_value)

    def clip_inputs(self, inputs):
        return torch.clamp(inputs, self.input_range[0], self.input_range[1])

    @staticmethod
    def apply_gradient_scaling(loss, optimizer, scale_factor):
        loss = loss * scale_factor
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Unscales the gradients of optimizer's assigned params in-place
        torch.nn.utils.clip_grad_norm_(optimizer.param_groups[0]['params'], max_norm=1)


    @staticmethod
    def init_weights_kaiming(m):
        if isinstance(m, nn.Linear):
            torch.nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
            if m.bias is not None:
                m.bias.data.fill_(0.01)

    @staticmethod
    def init_weights_xavier(m):
        if isinstance(m, nn.Linear):
            torch.nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                m.bias.data.fill_(0.01)



# Time Series Transformer

In [5]:
import torch.nn as nn

class TransformerLSTM(nn.Module):
    def __init__(self, input_dim, lstm_hidden_dim, nhead, num_layers, num_lstm_layers):
        super(TransformerLSTM, self).__init__()
        
        # Transformer Encoder layers
        transformer_layers = nn.TransformerEncoderLayer(
            d_model=input_dim, 
            nhead=nhead, 
            dim_feedforward=2*input_dim  
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer=transformer_layers, 
            num_layers=num_layers
        )
        
        # Layer Normalization after Transformer Encoder
        self.ln_after_transformer = nn.LayerNorm(input_dim)
        
        # LSTM layers
        self.lstm = nn.LSTM(
            input_size=input_dim, 
            hidden_size=lstm_hidden_dim, 
            num_layers=num_lstm_layers, 
            batch_first=True
        )
        
        # Fully connected layer to output Q-value for DDPG
        self.fc = nn.Linear(lstm_hidden_dim, 1)

        # Layer Normalization after LSTM
        self.ln_after_lstm = nn.LayerNorm(lstm_hidden_dim)
        
    def forward(self, x):
        # x.shape should be (batch_size, sequence_length, input_dim)
        #print("TransformerLSTM Forward Method")
        #print(f"input x in forward method: {x}")
        # Apply transformer encoder
        x = self.transformer_encoder(x)
        #print(f"self.transformer_encoder(x) in forward method: {x}")

        # Apply layer normalization
        x = self.ln_after_transformer(x)
        #print(f"self.ln_after_transformer(x) in forward method: {x}")
      
        # Reshape x to add an extra dimension for LSTM (batch_first=True)
        x = x.unsqueeze(1)  # Now shape is (batch_size, 1, sequence_length, input_dim)
        #print(f" x.unsqueeze(1) : in forward method: {x}")
        # Apply LSTM
        lstm_out, _ = self.lstm(x)
        #print(f" self.lstm(x) : in forward method: {lstm_out}")
        
        # Apply layer normalization
        lstm_out = self.ln_after_lstm(lstm_out)
        #print(f" self.ln_after_lstm(lstm_out) : in forward method: {lstm_out}")
        
        # Take the last output for each sequence
        lstm_out_last = lstm_out[:, -1, :]  # Shape is (batch_size, lstm_hidden_dim)
        #print(f" lstm_out[:, -1, :]  : in forward method: {lstm_out_last}")
        
        # Pass through fully connected layer to get Q-value
        q_val = self.fc(lstm_out_last)
        #print(f"q_val: output of TransformerLSTM forward method: {q_val}")
        
        return q_val


# GAN

In [6]:
def gradient_penalty(real_data, fake_data, discriminator):
    alpha = torch.rand(real_data.size(0), 1).expand_as(real_data)
    interpolated = alpha * real_data + (1 - alpha) * fake_data
    interpolated.requires_grad_()
    disc_interpolated = discriminator(interpolated)
    gradients = torch.autograd.grad(outputs=disc_interpolated, 
                                    inputs=interpolated, 
                                    grad_outputs=torch.ones(disc_interpolated.size()), 
                                    create_graph=True, 
                                    retain_graph=True)[0]
    gradients = gradients.view(real_data.size(0), -1)
    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean()
    return gradient_penalty


class Generator(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Generator, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ELU(),
            nn.Linear(hidden_dim, hidden_dim),  
            nn.ELU(),
            nn.Linear(hidden_dim, input_dim)  
        )

    def forward(self, x):
        return self.net(x)
    
class Discriminator(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(Discriminator, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ELU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ELU(),
            nn.Linear(hidden_dim, 1)
        )
        
    def forward(self, x):
        return self.net(x)


'''
class Generator(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Generator, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),  
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim)  
        )

    def forward(self, x):
        return self.net(x)
    
class Discriminator(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(Discriminator, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
        
    def forward(self, x):
        return self.net(x)

'''      


class GAN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GAN, self).__init__()
        
        self.generator = Generator(input_dim, hidden_dim, output_dim)        
        self.discriminator = Discriminator(input_dim, hidden_dim)  

        self.optimizer_g = torch.optim.Adam(self.generator.parameters(), lr=0.0001)
        self.optimizer_d = torch.optim.Adam(self.discriminator.parameters(), lr=0.0001)
        self.criterion = nn.BCELoss()

    def train_gan(self, states, actions, rewards, next_states, lambda_gp=10):
        batch_size = states.size(0)
        # Concatenate states, actions, rewards, and next_states
        real_data = torch.cat([states, actions, rewards, next_states], dim=1)
        #print("real_data shape:", real_data.shape)
        self.optimizer_d.zero_grad()

        labels_real = torch.ones(batch_size, 1)
        predictions_real = self.discriminator(real_data)

        # Update here
        noise = torch.randn(batch_size, self.generator.net[0].in_features)
        generated_data = self.generator(noise)
        #print(f"generated_data shape in train_gan method: {generated_data.shape}")

        predictions_fake = self.discriminator(generated_data.detach())
        gp = gradient_penalty(real_data, generated_data, self.discriminator)
        loss_d = -(torch.mean(predictions_real) - torch.mean(predictions_fake)) + lambda_gp * gp
        loss_d.backward(retain_graph=True)
        nn.utils.clip_grad_norm_(self.discriminator.parameters(), max_norm=1.0)

        self.optimizer_d.step()
        
        self.optimizer_g.zero_grad()
        predictions = self.discriminator(generated_data)
        loss_g = -torch.mean(predictions)
        loss_g.backward()
        nn.utils.clip_grad_norm_(self.generator.parameters(), max_norm=1.0)
        self.optimizer_g.step()

        return loss_g.item(), loss_d.item()





# DDPG

In [7]:
import pickle
class Actor(nn.Module):
    def __init__(self, input_dim, hidden_dim, action_dim):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, action_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.bn2 = nn.BatchNorm1d(action_dim)
        self.dropout = nn.Dropout(p=0.1)
        
    def forward(self, x):
        print(f"x input to Actor forward method: {x}")
        x = F.relu(self.bn1(self.fc1(x)))
        print(f" F.relu(self.bn1(self.fc1(x))) Actor forward method: {x}")       
        x = self.dropout(x)
        x = torch.tanh(self.bn2(self.fc2(x)))
        print(f"output from Actor: {x}")
        return x


# 1. Complete the Critic using TransformerLSTM
class Critic(nn.Module):
    def __init__(self, input_dim, lstm_hidden_dim, nhead, num_layers, num_lstm_layers):
        super(Critic, self).__init__()
        self.transformer_lstm = TransformerLSTM(input_dim, lstm_hidden_dim, nhead, num_layers, num_lstm_layers)

    def forward(self, state, action):
        print(f"state shape passed to Critic forward {state}")
        print(f"action shape passed to Critic forward {action}")
        x = torch.cat([state, action], 1)
        print(f" x = torch.cat([state, action], 1) = {x}")
        return self.transformer_lstm(x)


# 2. Ornstein-Uhlenbeck Noise
class OUNoise:
    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = np.random.seed(seed)
        self.size = size
        self.reset()

    def reset(self):
        self.state = np.copy(self.mu)

    def sample(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.size)
        self.state = x + dx
        return self.state


# 3. Memory Replay
import random

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def add(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, indices):
        # This new sample method will return experiences at the given indices
        return [self.buffer[i] for i in indices]

    def __len__(self):
        return len(self.buffer)



# 4. DDPG Agent
class DDPGAgent:
    def __init__(self, state_dim, action_dim, actor_hidden_dim=256, critic_lstm_hidden_dim=256, nhead=1, num_layers=2, num_lstm_layers=1):
        self.actor = Actor(state_dim, actor_hidden_dim, action_dim)
        self.critic = Critic(state_dim + action_dim, critic_lstm_hidden_dim, nhead, num_layers, num_lstm_layers)
        
        self.target_actor = Actor(state_dim, actor_hidden_dim, action_dim)
        self.target_critic = Critic(state_dim + action_dim, critic_lstm_hidden_dim, nhead, num_layers, num_lstm_layers)
        
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4, weight_decay=1e-5)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3, weight_decay=1e-5)
        self.noise = OUNoise(action_dim, seed=0)
        self.memory = ReplayBuffer(10000)
        self.gamma = 0.99
        self.tau = 1e-3
        self.utils = ModelUtilities(input_range=(-1, 1), weight_clip_value=0.1)

        # Load ReplayBuffer instance
        #with open('Replay_buffer.pkl', 'rb') as file:
            #self.memory = pickle.load(file)



        # Load ReplayBuffer instance
        #with open('replay_buffer.pkl', 'rb') as file:
            #self.memory = pickle.load(file)

    def soft_update(self, local_model, target_model):
        for target_param, param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)

    def train(self, batch_size):
        # Ensure there are enough experiences in the memory for sampling
        if len(self.memory) < batch_size:
            # Not enough experiences to sample from
            return None, None

        # Sample a batch of experiences
        sample_indices = np.random.choice(len(self.memory), batch_size, replace=False)
        experiences_batch = self.memory.sample(sample_indices)

        # Separate the data
        states, actions, rewards, next_states, dones = zip(*experiences_batch)

        # Convert the tuple of experiences into tensors
        states = torch.tensor(states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.float32)
        rewards = torch.tensor(rewards, dtype=torch.float32).view(-1, 1)
        next_states = torch.tensor(next_states, dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32).view(-1, 1)
        
        print(f"states in train method of DDPG post tensor: {states}")
        print(f"actions in train method of DDPG post tensor: {actions}")
        print(f"rewards in train method of DDPG post tensor: {rewards}")
        print(f"next_states in train method of DDPG post tensor: {next_states}")
        print(f"dones in train method of DDPG post tensor: {dones}")

        # Update critic
        next_actions = self.target_actor(next_states)
        print(f"next_actions : {next_actions}")

        next_Q = self.target_critic(next_states, next_actions)
        print(f"next_Q : {next_Q}")
        expected_rewards = rewards + (1.0 - dones) * self.gamma * next_Q
        print(f"expected_rewards : {expected_rewards}")       
        predicted_rewards = self.critic(states, actions)
        print(f"predicted_rewards : {predicted_rewards}")
        critic_loss = F.mse_loss(predicted_rewards, expected_rewards.detach())

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip the gradient for the critic
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1.0)
        self.critic_optimizer.step()

        # Update actor
        policy_loss = -self.critic(states, self.actor(states)).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        # Clip the gradient for the actor
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1.0)
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.actor, self.target_actor)
        self.soft_update(self.critic, self.target_critic)

        return policy_loss, critic_loss

    def act(self, state, add_noise=True):
        #if state.shape[0] != 89:
            #print("Incorrect state shape detected:", state.shape)
        #print("Input shape to the actor network:", state.shape)  
        state = torch.FloatTensor(state).unsqueeze(0)
        print("state = torch.FloatTensor(state).unsqueeze(0):", state)  

        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().numpy()
            print(f"action = self.actor(state).cpu().numpy() {action}")
        self.actor.train()
        if add_noise:
            action += self.noise.sample()
        print(f"output of act method: np.clip(action, -1, 1).flatten() = {np.clip(action, -1, 1).flatten()}")
        return np.clip(action, -1, 1).flatten()  

    def reset(self):
        self.noise.reset()

    def add_memory(self, state, action, reward, next_state, done):
        # Make sure the reward and done are wrapped in lists
        self.memory.add(state, action, [reward], next_state, [done])





In [94]:
import pickle
class Actor(nn.Module):
    def __init__(self, input_dim, hidden_dim, action_dim):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, action_dim)
        self.ln1 = nn.LayerNorm(hidden_dim)
        self.ln2 = nn.LayerNorm(action_dim)
        self.dropout = nn.Dropout(p=0.1)
        
    def forward(self, x):
        print(f"x input to Actor forward method: {x}")
        
        x = self.fc1(x)
        print(f"self.fc1(x): {x}")
        x = self.ln1(x)
        print(f"self.ln1(x): {x}")
        x = F.elu(x)
        print(f"After ELU: {x}")
        
        x = self.dropout(x)
        x = self.fc2(x)
        print(f"self.fc2(x): {x}")
        x = self.ln2(x)
        print(f"self.ln2(x): {x}")
        x = torch.tanh(x)
        print(f"output from Actor: {x}")
        
        return x

# 1. Complete the Critic using TransformerLSTM
class Critic(nn.Module):
    def __init__(self, input_dim, lstm_hidden_dim, nhead, num_layers, num_lstm_layers):
        super(Critic, self).__init__()
        self.transformer_lstm = TransformerLSTM(input_dim, lstm_hidden_dim, nhead, num_layers, num_lstm_layers)

    def forward(self, state, action):
        #print(f"state shape passed to Critic forward {state}")
        #print(f"action shape passed to Critic forward {action}")
        x = torch.cat([state, action], 1)
        #print(f" x = torch.cat([state, action], 1) = {x}")
        return self.transformer_lstm(x)


# 2. Ornstein-Uhlenbeck Noise
class OUNoise:
    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = np.random.seed(seed)
        self.size = size
        self.reset()

    def reset(self):
        self.state = np.copy(self.mu)

    def sample(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.size)
        self.state = x + dx
        return self.state


# 3. Memory Replay
import random

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def add(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, indices):
        # This new sample method will return experiences at the given indices
        return [self.buffer[i] for i in indices]

    def __len__(self):
        return len(self.buffer)


import torch
import torch.optim as optim
import numpy as np
from torch.nn import functional as F
# 4. DDPG Agent
class DDPGAgent:
    def __init__(self, state_dim, action_dim, actor_hidden_dim=256, critic_lstm_hidden_dim=256, nhead=1, num_layers=2, num_lstm_layers=1):
        self.actor = Actor(state_dim, actor_hidden_dim, action_dim)
        self.critic = Critic(state_dim + action_dim, critic_lstm_hidden_dim, nhead, num_layers, num_lstm_layers)
        
        self.target_actor = Actor(state_dim, actor_hidden_dim, action_dim)
        self.target_critic = Critic(state_dim + action_dim, critic_lstm_hidden_dim, nhead, num_layers, num_lstm_layers)
        
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4, weight_decay=1e-5)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3, weight_decay=1e-5)
        # Learning rate schedulers for both actor and critic
        self.actor_scheduler = optim.lr_scheduler.StepLR(self.actor_optimizer, step_size=100, gamma=0.9)
        self.critic_scheduler = optim.lr_scheduler.StepLR(self.critic_optimizer, step_size=100, gamma=0.9)

        self.noise = OUNoise(action_dim, seed=0)
        self.memory = ReplayBuffer(10000)
        self.gamma = 0.99
        self.tau = 1e-3
        self.utils = ModelUtilities(input_range=(-1, 1), weight_clip_value=0.1)

        # Load ReplayBuffer instance
        #with open('Replay_buffer.pkl', 'rb') as file:
            #self.memory = pickle.load(file)

    def soft_update(self, local_model, target_model):
        for target_param, param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)

    def train(self, batch_size):
        if len(self.memory) < batch_size:
            return None, None

        sample_indices = np.random.choice(len(self.memory), batch_size, replace=False)
        experiences_batch = self.memory.sample(sample_indices)
        
        states, actions, rewards, next_states, dones = zip(*experiences_batch)

        # Convert to numpy arrays
        states_np, next_states_np = np.array(states), np.array(next_states)

        # Scale and clip inputs
        states = self.utils.scale_inputs(states_np)
        print(f"states after scale_inputs {states}")
        next_states = self.utils.scale_inputs(next_states_np)
        print(f"next_states after scale_inputs {next_states}")
        states = self.utils.clip_inputs(states)
        next_states = self.utils.clip_inputs(next_states)
        print(f"states after clip_inputs {states}")

        print(f"next_states after clip_inputs {next_states}")

        # Convert to tensors
        actions = torch.tensor(actions, dtype=torch.float32)
        rewards = torch.tensor(rewards, dtype=torch.float32).view(-1, 1)
        dones = torch.tensor(dones, dtype=torch.float32).view(-1, 1)


        
        #print(f"states in train method of DDPG post tensor: {states}")
        #print(f"actions in train method of DDPG post tensor: {actions}")
        #print(f"rewards in train method of DDPG post tensor: {rewards}")
        #print(f"next_states in train method of DDPG post tensor: {next_states}")
        #print(f"dones in train method of DDPG post tensor: {dones}")

        # Update critic
        next_actions = self.target_actor(next_states)
        #print(f"next_actions : {next_actions}")

        next_Q = self.target_critic(next_states, next_actions)
        #print(f"next_Q : {next_Q}")
        expected_rewards = rewards + (1.0 - dones) * self.gamma * next_Q
        #print(f"expected_rewards : {expected_rewards}")       
        predicted_rewards = self.critic(states, actions)
        #print(f"predicted_rewards : {predicted_rewards}")
        critic_loss = F.mse_loss(predicted_rewards, expected_rewards.detach())

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip the gradient for the critic
        self.utils.clip_weights(self.critic.parameters())  # Clip weights after loss.backward() and before optimizer.step()
        self.critic_optimizer.step()

        # Update actor
        policy_loss = -self.critic(states, self.actor(states)).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        # Clip the gradient for the actor
        self.utils.clip_weights(self.actor.parameters())  # Clip weights for actor as well
        self.actor_optimizer.step()
        # Update learning rate
        self.actor_scheduler.step()
        self.critic_scheduler.step()

        # Update target networks
        self.soft_update(self.actor, self.target_actor)
        self.soft_update(self.critic, self.target_critic)

        return policy_loss, critic_loss
    
    def act(self, state, add_noise=True):
        # Ensure state is a numpy array
        state = np.array(state).astype(np.float32)
        state = self.utils.scale_inputs(state)  # Scale inputs
        state = self.utils.clip_inputs(state)  # Clip inputs

        state_tensor = torch.FloatTensor(state).unsqueeze(0)  # Convert to tensor and add batch dimension
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state_tensor).cpu().numpy().flatten()
        self.actor.train()

        if add_noise:
            noise = self.noise.sample()
            action += noise

        action = np.clip(action, -1, 1)
        return action

    def reset(self):
        self.noise.reset()

    def add_memory(self, state, action, reward, next_state, done):
        # Make sure the reward and done are wrapped in lists
        self.memory.add(state, action, [reward], next_state, [done])





# Training

In [8]:

import sys

# Hyperparameters
EPOCHS = 1000
BATCH_SIZE = 128
GAN_TRAIN_INTERVAL = 50
DDPG_TRAIN_INTERVAL = 10
MAX_TIMESTEPS = 50

env = StockTradingEnvironment(data) 
state_dim = env._get_state().shape[0]
#print(f"state dim used in gan and DDPGAgent : {state_dim}")
action_dim = env.action_space.shape[0]
#print(f"action dim used in gan and DDPGAgent: {action_dim}")
#assert state_dim + action_dim + 1 == 181, f"Dimensions do not match. Got: {state_dim + action_dim + 1}, Expected: 181"

# Calculate the total dimension of the data to be generated
total_output_dim = state_dim + action_dim + 1 + state_dim  # State + Action + Reward + Next State
gan = GAN(171, 256, total_output_dim)
agent = DDPGAgent(state_dim, action_dim, actor_hidden_dim=256, critic_lstm_hidden_dim=256, nhead=1, num_layers=2, num_lstm_layers=1)

gan.generator.apply(ModelUtilities.init_weights_kaiming)
gan.discriminator.apply(ModelUtilities.init_weights_kaiming)
agent.actor.apply(ModelUtilities.init_weights_kaiming)
agent.critic.apply(ModelUtilities.init_weights_xavier) 
agent.critic.transformer_lstm.apply(ModelUtilities.init_weights_kaiming)


'''# Load TransformerLSTM weights
agent.critic.transformer_lstm.load_state_dict(torch.load('Transformer_model.pth'))
agent.critic.transformer_lstm.eval()  # Set to evaluation mode if you're not planning on training further

# Load GAN weights
gan.load_state_dict(torch.load('Gan_weights.pth'))
gan.eval()  # Set to evaluation mode

# Load Actor and Critic weights
agent.actor.load_state_dict(torch.load('Actor_weights.pth'))
agent.actor.eval()  # Set to evaluation mode

agent.critic.load_state_dict(torch.load('Critic_weights.pth'))
agent.critic.eval()  # Set to evaluation mode'''



# Metric storage
ddpg_losses = []
transformer_losses = []
gan_losses_g = []
gan_losses_d = []
portfolio_values = []
returns = []
stock_proportions = []
INITIAL_PORTFOLIO_VALUE = 100000 # start with $100,000
# Initialization of experience buffer
experiences = {
    'state': [],
    'action': [],
    'reward': [],
    'next_state': [],
    'done': []
}

# Reset the environment at the beginning of training
#current_state = env.reset()

# Training loop
for epoch in range(EPOCHS):
    total_reward = 0 
    initial_state = env.reset()

    
    current_state = initial_state
    #print(f"current_state being passed into the act method: {current_state}")
    #portfolio_value = INITIAL_PORTFOLIO_VALUE  # Reset portfolio value at the start of each epoch
    ddpg_loss = transformer_loss = loss_g = loss_d = 0  # Initialize losses

    for t in range(MAX_TIMESTEPS):
        #print(f"Time Step: {t}")

        action = agent.act(current_state)
        #print(f"agent.act(current_state): {action}")

        action = np.clip(action, env.action_space.low, env.action_space.high)
        #print(f"actions after np.clip(action, env.action_space.low, env.action_space.high):  {action}")

        next_state, reward, done, _ = env.step(action)
        #print(f"next_state: {next_state} - reward: {reward} - done: {done}")
        total_reward += reward
        #episode_reward.append(reward)
       # Add the experience to the memory
        agent.add_memory(current_state, action, reward, next_state, done)


        current_state = next_state

        # Check if enough samples are available for training
        # Check if it's time to train
        if t % 10 == 0 and len(agent.memory) >= BATCH_SIZE:
            # Sample a batch of experiences
            sample_indices = np.random.choice(len(agent.memory), BATCH_SIZE, replace=False)
            experiences_batch = agent.memory.sample(sample_indices)
            # Convert batch to tensors
            states_batch = torch.tensor([exp[0] for exp in experiences_batch], dtype=torch.float)
            actions_batch = torch.tensor([exp[1] for exp in experiences_batch], dtype=torch.float)
            rewards_batch = torch.tensor([exp[2] for exp in experiences_batch], dtype=torch.float)
            next_states_batch = torch.tensor([exp[3] for exp in experiences_batch], dtype=torch.float)
            dones_batch = torch.tensor([exp[4] for exp in experiences_batch], dtype=torch.float)
            # Train GAN
            loss_g, loss_d = gan.train_gan(states_batch, actions_batch, rewards_batch, next_states_batch)
            gan_losses_g.append(loss_g)
            gan_losses_d.append(loss_d)
            # Generate fake experiences
            combined_fake = gan.generator(torch.cat([states_batch, actions_batch, rewards_batch, next_states_batch], dim=1))
            # Separate the fake experiences
            fake_states = combined_fake[:, :state_dim]
            fake_actions = combined_fake[:, state_dim:state_dim+action_dim]
            fake_rewards = combined_fake[:, -1].view(-1, 1)

            # Add fake experiences to the memory
            for i in range(BATCH_SIZE):
                fake_reward = fake_rewards[i].item()  # Get the scalar value
                fake_done = bool(dones_batch[i].item())  # Convert to boolean
                agent.add_memory(fake_states[i].tolist(), fake_actions[i].tolist(), fake_reward, next_states_batch[i].tolist(), fake_done)

            # Train DDPG with real experiences
            ddpg_loss, transformer_loss = agent.train(BATCH_SIZE)

            # Check if training was successful
            if ddpg_loss is not None and transformer_loss is not None:
                ddpg_losses.append(ddpg_loss.item())
                transformer_losses.append(transformer_loss.item())
           

            # Capture portfolio value and other metrics
            portfolio_value = env._get_portfolio_value()
            proportions = env.get_stock_proportions()

            if done:
                current_state = env.reset()  # Reset only if portfolio value is below zero
                if env._get_portfolio_value() >= 0:
                    # Don't reset the portfolio value if it's non-negative
                    env.cash = portfolio_value  # Carry over the last portfolio value to the new episode

    # Logging the information at the end of the epoch
    portfolio_value = env._get_portfolio_value()  # Get the final portfolio value
    print(f"Epoch {epoch}/{EPOCHS} - Total Reward: {total_reward}- Portfolio Value: {portfolio_value} - Transformer Loss: {transformer_loss} - DDPG Loss: {ddpg_loss} - Generator Loss: {loss_g} - Discriminator Loss: {loss_d}")
    sys.stdout.flush()


  from .autonotebook import tqdm as notebook_tqdm


state = torch.FloatTensor(state).unsqueeze(0): tensor([[8.0064e+00, 8.0350e+00, 7.9468e+00, 8.0161e+00, 6.8040e+00, 4.4691e+08,
         7.8797e+00, 7.5644e+00, 7.3913e+00, 2.9420e+01, 2.9490e+01, 2.9200e+01,
         2.9370e+01, 2.2426e+01, 3.6724e+07, 2.8919e+01, 2.8798e+01, 2.9165e+01,
         1.4060e+01, 1.4225e+01, 1.4033e+01, 1.4144e+01, 1.4144e+01, 1.3712e+08,
         1.4145e+01, 1.3777e+01, 1.3955e+01, 6.5620e+00, 6.6145e+00, 6.5250e+00,
         6.5895e+00, 6.5895e+00, 8.2650e+07, 6.5054e+00, 6.2366e+00, 6.2146e+00,
         3.4940e+00, 3.5027e+00, 3.3900e+00, 3.4853e+00, 3.4853e+00, 6.8297e+04,
         3.2330e+00, 3.1411e+00, 3.1756e+00, 4.3350e+01, 4.3380e+01, 4.2690e+01,
         4.3240e+01, 3.0125e+01, 2.8554e+07, 4.2684e+01, 4.1748e+01, 4.1348e+01,
         4.2875e+00, 4.4650e+00, 4.2850e+00, 4.4400e+00, 4.0734e+00, 6.6499e+07,
         4.2970e+00, 4.2329e+00, 4.2644e+00, 1.7433e+02, 1.7642e+02, 1.7356e+02,
         1.7619e+02, 1.4045e+02, 1.1688e+07, 1.6973e+02, 1.633

NameError: name 'proportions' is not defined

# Saving Weights

In [96]:
# Saving the weights of TransformerLSTM model
torch.save(agent.critic.transformer_lstm.state_dict(), 'Transformer_model.pth')

# Saving the weights of GAN model
torch.save(gan.state_dict(), 'Gan_weights.pth')

# Saving the weights of DDPG models
torch.save(agent.actor.state_dict(), 'Actor_weights.pth')
torch.save(agent.critic.state_dict(), 'Critic_weights.pth')


import pickle

# Save ReplayBuffer instance
with open('Replay_buffer.pkl', 'wb') as file:
    pickle.dump(agent.memory, file)
