In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Define the LSTM-DQN network
class LSTM_DQN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM_DQN, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden=None):
        lstm_out, hidden = self.lstm(x, hidden)
        q_values = self.fc(lstm_out[:, -1, :])  # Use only the last LSTM output
        return q_values, hidden

# Define the stock trading environment
class StockEnvironment:
    def __init__(self, prices):
        self.prices = prices
        self.current_step = 0
        self.initial_balance = 10000  # Initial balance in dollars
        self.balance = self.initial_balance
        self.shares_owned = 0
        self.max_steps = len(prices)

    def reset(self):
        self.current_step = 0
        self.balance = self.initial_balance
        self.shares_owned = 0

    def step(self, action):
        current_price = self.prices[self.current_step]
        reward = 0

        if action == 0:  # Buy
            if self.balance >= current_price:
                self.shares_owned += 1
                self.balance -= current_price
        elif action == 1:  # Sell
            if self.shares_owned > 0:
                self.shares_owned -= 1
                self.balance += current_price

        # Calculate profit
        next_price = self.prices[self.current_step + 1] if self.current_step < self.max_steps - 1 else current_price
        profit = self.balance + (self.shares_owned * next_price) - self.initial_balance

        self.current_step += 1
        done = self.current_step >= self.max_steps - 1

        return profit, done

# Define the environment parameters
prices = np.random.randint(50, 150, size=100)  # Random stock prices

input_size = 1  # Size of input observations (single price)
hidden_size = 32  # Size of LSTM hidden state
output_size = 3  # Number of possible actions (buy, sell, hold)
lr = 0.001  # Learning rate
gamma = 0.99  # Discount factor
epsilon_start = 1.0  # Initial epsilon for epsilon-greedy exploration
epsilon_end = 0.01  # Final epsilon
epsilon_decay = 0.995  # Epsilon decay rate
num_episodes = 1000  # Number of episodes

# Initialize the LSTM-DQN network
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTM_DQN(input_size, hidden_size, output_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

# Epsilon-greedy exploration
def epsilon_greedy(q_values, epsilon):
    if np.random.rand() < epsilon:
        return np.random.randint(output_size)
    else:
        return torch.argmax(q_values).item()

# Define the loss function
loss_fn = nn.MSELoss()

# Initialize the environment
env = StockEnvironment(prices)

# Training loop
for episode in range(num_episodes):
    env.reset()
    state = torch.FloatTensor(env.prices.reshape(-1, 1)).unsqueeze(0).to(device)
    hidden = None
    total_profit = 0

    for _ in range(len(env.prices) - 1):
        q_values, hidden = model(state, hidden)
        action = epsilon_greedy(q_values, epsilon_start)
        reward, done = env.step(action)

        # Compute target Q-value
        with torch.no_grad():
            next_state = torch.FloatTensor(env.prices[env.current_step:].reshape(-1, 1)).unsqueeze(0).to(device)
            next_q_values, _ = model(next_state, hidden)
            max_next_q_value = torch.max(next_q_values)
            target_q_value = reward + (1 - int(done)) * gamma * max_next_q_value

        # Compute loss and optimize the model
        optimizer.zero_grad()
        loss = loss_fn(q_values.squeeze()[action], target_q_value)
        loss.backward()
        optimizer.step()

        total_profit += reward
        state = next_state

    # Decay epsilon
    epsilon_start = max(epsilon_end, epsilon_start * epsilon_decay)

    # Print progress
    if episode % 10 == 0:
        print(f"Episode {episode}, Total Profit: {total_profit}")

# After training, you can use the trained model for evaluation or further tasks


RuntimeError: Expected hidden[0] size (1, 99, 32), got [1, 100, 32]