In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

In [2]:
class TradingEnvironment:
    def __init__(self, data, window_size, max_hold_days=12):
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.inventory = []
        self.cash = 10000  # Starting cash
        self.max_inventory = 10  # Maximum inventory
        self.state_size = window_size * 5  # Number of features (Open, High, Low, Close, Volume)
        self.max_hold_days = max_hold_days
        self.holding_periods = []

    def reset(self):
        self.current_step = 0
        self.inventory = []
        self.cash = 10000
        self.holding_periods = []
        return self._get_state()

    def _get_state(self):
        return self.data[self.current_step:self.current_step + self.window_size].flatten()

    def step(self, action):
        current_price = self.data[self.current_step, 3]  # Close price
        reward = 0

        if action == 0:  # Buy
            if len(self.inventory) < self.max_inventory and self.cash >= float(current_price):
                self.inventory.append(float(current_price))
                self.cash -= float(current_price)
                self.holding_periods.append(0)
        elif action == 1:  # Sell
            if len(self.inventory) > 0:
                bought_price = self.inventory.pop(0)
                reward = float(current_price) - bought_price
                self.cash += float(current_price)
                self.holding_periods.pop(0)
        # Hold (action == 2) doesn't change inventory or cash

        # Update holding periods and enforce max holding days
        self.holding_periods = [days + 1 for days in self.holding_periods]
        while len(self.holding_periods) > 0 and self.holding_periods[0] > self.max_hold_days:
            # Force sell stocks held for more than max_hold_days
            bought_price = self.inventory.pop(0)
            reward += float(current_price) - bought_price  # Include forced sell in reward
            self.cash += float(current_price)
            self.holding_periods.pop(0)

        self.current_step += 1
        done = self.current_step >= len(self.data) - self.window_size
        next_state = self._get_state() if not done else None

        return next_state, reward, done


In [10]:
# Prepare data
def prepare_data(data, window_size):
    data = data[['Open', 'High', 'Low', 'Close', 'Volume']].values
    n = len(data) - window_size
    states = np.array([data[i:i + window_size] for i in range(n)])
    return states

# Load and preprocess data
data = pd.read_pickle('data/SP500.pkl')
window_size = 9
data = prepare_data(data, window_size)
train_size = int(len(data) * 0.7)
train_data = data[:train_size]
test_data = data[train_size:]

In [14]:
data[0].shape

(9, 5)

In [4]:
class DQN(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

In [5]:
# Hyperparameters
state_size = window_size * 5
action_size = 3
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
learning_rate = 0.001
batch_size = 32
memory_size = 10000
target_update = 10

In [6]:
# Memory for experience replay
class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state):
        self.memory.append((state, action, reward, next_state))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [7]:
# Initialize models and optimizer
policy_net = DQN(state_size, action_size)
target_net = DQN(state_size, action_size)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
memory = ReplayMemory(memory_size)

In [8]:
# Training function
def optimize_model():
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    batch_state, batch_action, batch_reward, batch_next_state = zip(*transitions)

    batch_state = torch.tensor(batch_state, dtype=torch.float32)
    batch_action = torch.tensor(batch_action)
    batch_reward = torch.tensor(batch_reward, dtype=torch.float32)
    batch_next_state = torch.tensor(batch_next_state, dtype=torch.float32)

    current_q_values = policy_net(batch_state).gather(1, batch_action.unsqueeze(1))
    next_q_values = target_net(batch_next_state).max(1)[0].detach()
    expected_q_values = batch_reward + (gamma * next_q_values)

    loss = nn.MSELoss()(current_q_values.squeeze(), expected_q_values)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [9]:
# Train the model
num_episodes = 100
env = TradingEnvironment(train_data, window_size)

for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0
    for t in range(len(train_data) - window_size):
        state_tensor = torch.tensor(state, dtype=torch.float32)
        if random.random() < epsilon:
            action = random.randrange(action_size)
        else:
            with torch.no_grad():
                action = policy_net(state_tensor).argmax().item()

        next_state, reward, done = env.step(action)
        total_reward += reward
        memory.push(state, action, reward, next_state)
        state = next_state

        optimize_model()
        if done:
            break

    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    if episode % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())

    print(f'Episode {episode}, Total Reward: {total_reward}')

TypeError: only size-1 arrays can be converted to Python scalars

In [None]:
# Save the model
torch.save(policy_net.state_dict(), 'dqn_trading_model.pth')

In [None]:
# Evaluate the model
env = TradingEnvironment(test_data, window_size)
state = env.reset()
total_profit = 0
while True:
    state_tensor = torch.tensor(state, dtype=torch.float32)
    with torch.no_grad():
        action = policy_net(state_tensor).argmax().item()

    next_state, reward, done = env.step(action)
    total_profit += reward
    state = next_state
    if done:
        break

print(f'Total Profit: {total_profit}')