In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import random
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import optuna

In [2]:
# Load the raw price frame from a pickle stored relative to the working directory.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
raw_pickle_path = 'data/SP500.pkl'
df_raw = pd.read_pickle(raw_pickle_path)

In [3]:
# Positional 80/20 split (no shuffling): the leading rows become the training
# set and the trailing rows are held out for testing.
split_at = len(df_raw) * 8 // 10
df_train, df_test = df_raw.iloc[:split_at], df_raw.iloc[split_at:]

In [4]:
# Sanity check: the two partitions together should account for every raw row.
n_train_rows, n_test_rows = len(df_train), len(df_test)
print(df_train.shape, df_test.shape)
print(n_train_rows + n_test_rows)
print(len(df_raw))

(4026, 6) (1007, 6)
5033
5033


In [5]:
def create_states(df, window_size=9):
    """Build overlapping look-back windows over the rows of ``df``.

    Window ``k`` contains rows ``k`` .. ``k + window_size - 1`` of ``df``.
    One window is produced for every ``k`` in ``range(len(df) - window_size)``,
    so the final row of ``df`` is never part of any window.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; only ``df.values`` is used.
    window_size : int, default 9
        Number of consecutive rows per window. Must be at least 1.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_windows, window_size, n_columns)`` where
        ``n_windows = max(len(df) - window_size, 0)``. When there are too few
        rows for a single window the result is empty but keeps its trailing
        dimensions, so callers reading ``shape[1]`` / ``shape[2]`` still work.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    # Convert once instead of calling ``.iloc[...].values`` for every window.
    values = df.values
    n_windows = len(values) - window_size
    if n_windows <= 0:
        return np.empty((0, window_size) + values.shape[1:], dtype=values.dtype)

    return np.array([values[start:start + window_size] for start in range(n_windows)])

# Window the training rows into the array of states consumed by the cells below.
states = create_states(df_train, window_size=9)

In [6]:
class DQN(nn.Module):
    """Fully connected Q-network with two hidden layers of 128 units.

    Parameters
    ----------
    input_dim : int
        Number of features per sample after flattening everything but the
        batch axis.
    output_dim : int
        Number of values produced per sample (one per action).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, output_dim)

    def forward(self, x):
        """Return a ``(batch, output_dim)`` tensor for an input whose first axis is the batch."""
        # reshape has the same result as view(batch, -1) but also accepts
        # non-contiguous inputs (e.g. transposed or strided windows), where
        # view raises a RuntimeError.
        x = x.reshape(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

# Network dimensions derived from the windowed training states.
window_len, n_features = states.shape[1], states.shape[2]
input_dim = window_len * n_features  # length of one flattened state window
output_dim = 3  # one output per action (sell / buy / hold)

# Stand-alone network instance; no explicit device placement is done here.
model = DQN(input_dim, output_dim)


In [7]:
class ReplayMemory:
    """Bounded FIFO buffer of transitions with uniform random sampling."""

    def __init__(self, capacity):
        # A deque with ``maxlen`` drops its oldest entry once it is full.
        self.memory = deque(maxlen=capacity)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def push(self, state, action, reward, next_state, done):
        """Store one ``(state, action, reward, next_state, done)`` tuple."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn without replacement.

        ``random.sample`` raises ``ValueError`` when ``batch_size`` exceeds the
        number of stored transitions.
        """
        drawn = random.sample(self.memory, batch_size)
        return drawn


# Module-level buffer holding at most 10000 transitions.
memory = ReplayMemory(10000)

In [8]:
class DQNAgent:
    """Epsilon-greedy agent that trains a ``DQN`` from a ``ReplayMemory``.

    Parameters
    ----------
    state_size : int
        Flattened input size passed to ``DQN``.
    action_size : int
        Number of discrete actions (network outputs).
    lr : float
        Learning rate for the Adam optimizer.
    gamma : float
        Discount factor applied to the best next-state value.
    epsilon : float
        Initial probability of choosing a random action.
    epsilon_min : float
        Decay stops once ``epsilon`` is no longer above this value.
    epsilon_decay : float
        Factor ``epsilon`` is multiplied by at the end of each ``replay`` call.
    """

    def __init__(self, state_size, action_size, lr=0.0001, gamma=0.99, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = ReplayMemory(10000)
        self.gamma = gamma    # discount rate
        self.epsilon = epsilon   # exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()
        self.last_buy_index = -float('inf')  # not read by any method of this class

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay memory."""
        self.memory.push(state, action, reward, next_state, done)

    def act(self, state):
        """Return an action index for ``state`` using an epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the index of the largest network output.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = torch.FloatTensor(state).unsqueeze(0)
        # Pure inference: no autograd graph is needed to pick an action.
        with torch.no_grad():
            act_values = self.model(state)
        return torch.argmax(act_values[0]).item()

    def replay(self, batch_size):
        """Run one optimizer step per sampled transition, then decay epsilon.

        Does nothing (and leaves ``epsilon`` untouched) while the memory holds
        fewer than ``batch_size`` transitions. For every sampled transition the
        target equals the current prediction except at ``action``, where it is
        ``reward`` (terminal) or ``reward + gamma * max Q(next_state)``.
        """
        if len(self.memory) < batch_size:
            return
        minibatch = self.memory.sample(batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = torch.FloatTensor(state).unsqueeze(0)
            next_state = torch.FloatTensor(next_state).unsqueeze(0)

            prediction = self.model(state)

            target = reward
            if not done:
                # The bootstrap value is a constant for this update.
                with torch.no_grad():
                    best_next = torch.max(self.model(next_state)[0]).item()
                target = reward + self.gamma * best_next

            # Build the target from a detached copy so that no gradient flows
            # through it and the graph-attached prediction is never mutated in
            # place. Only the entry for ``action`` differs from the prediction,
            # so only that output contributes to the loss.
            target_f = prediction.detach().clone()
            target_f[0][action] = target

            self.optimizer.zero_grad()
            loss = self.criterion(prediction, target_f)
            loss.backward()
            self.optimizer.step()
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay



In [9]:
# Trading-rule helper and the Optuna objective function
def _constrain_action(action, has_open_position, days_since_last_buy, max_holding_days=11):
    """Apply the trading rules to a raw action (0 = sell, 1 = buy, 2 = hold).

    Rules: a sell without an open position and a buy with an open position
    both become a hold; a position open for more than ``max_holding_days``
    steps is sold regardless of the requested action.

    Returns the tuple ``(action, has_open_position, days_since_last_buy)``
    after the (possibly overridden) action has been applied.
    """
    SELL, BUY, HOLD = 0, 1, 2

    if has_open_position:
        days_since_last_buy += 1

    if action == SELL and not has_open_position:
        action = HOLD  # nothing to sell
    elif action == BUY and has_open_position:
        action = HOLD  # only one open position at a time

    if has_open_position and action != SELL and days_since_last_buy > max_holding_days:
        action = SELL  # held too long: force the exit

    if action == BUY:
        has_open_position, days_since_last_buy = True, 0
    elif action == SELL:
        has_open_position, days_since_last_buy = False, 0

    return action, has_open_position, days_since_last_buy


def objective(trial, states=states, episodes=100, batch_size=32, price_col=3):
    """Optuna objective: train a ``DQNAgent`` and return its mean episode reward.

    The per-step reward is the price change between consecutive states while a
    position is open and 0 otherwise. (A reward equal to the raw price change
    at every step does not depend on the agent's actions: each episode total
    telescopes to "last price minus first price", so every trial would score
    the same and the study could not rank hyperparameters.)

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies ``lr``, ``gamma``, ``epsilon``, ``epsilon_min`` and
        ``epsilon_decay``.
    states : numpy.ndarray
        State windows; ``states.shape[1] * states.shape[2]`` is the network
        input size and ``states[t][-1][price_col]`` is read as the price of
        step ``t``. The default is the module-level ``states`` as it exists
        when this function is defined.
    episodes : int, default 100
        Number of passes over ``states``.
    batch_size : int, default 32
        Replay batch size; replay starts once the memory holds more than this
        many transitions.
    price_col : int, default 3
        Column index of the price inside a state row.
        NOTE(review): assumes column 3 is the closing price - confirm against the data.

    Returns
    -------
    float
        Mean total reward over all episodes.
    """
    state_size = states.shape[1] * states.shape[2]
    action_size = 3

    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform.
    lr = trial.suggest_float('lr', 1e-5, 1e-1, log=True)
    gamma = trial.suggest_float('gamma', 0.8, 0.999)
    epsilon = trial.suggest_float('epsilon', 0.8, 1.0)
    epsilon_min = trial.suggest_float('epsilon_min', 0.01, 0.1)
    epsilon_decay = trial.suggest_float('epsilon_decay', 0.9, 0.999)

    agent = DQNAgent(state_size, action_size, lr, gamma, epsilon, epsilon_min, epsilon_decay)

    total_rewards = []
    last_step = len(states) - 1

    for e in range(episodes):
        state = states[0]
        total_reward = 0
        days_since_last_buy = 0
        has_open_position = False

        for step in range(1, len(states)):
            action, has_open_position, days_since_last_buy = _constrain_action(
                agent.act(state), has_open_position, days_since_last_buy
            )

            next_state = states[step]
            price_change = next_state[-1][price_col] - state[-1][price_col]
            # Only an open position is exposed to the price move of this step.
            reward = price_change if has_open_position else 0.0

            done = step == last_step
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

            if done:
                break

            if len(agent.memory) > batch_size:
                agent.replay(batch_size)

        total_rewards.append(total_reward)
        print(f"Iteration {e+1}/{episodes}, Total Reward: {total_reward}")

    return float(np.mean(total_rewards))

# Run the Optuna search (higher objective value is better) and report the winner.
n_trials = 100
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_trials)

best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

[I 2024-07-16 20:10:02,687] A new study created in memory with name: no-name-98b8fcc4-cd02-4d2e-b1ee-b3abc1b6484f


In [None]:
# Build a stand-alone agent with fixed, hand-picked hyperparameters.
state_size = states.shape[1] * states.shape[2]
action_size = 3
agent_hyperparams = dict(lr=0.0005, gamma=0.95, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995)
agent = DQNAgent(state_size, action_size, **agent_hyperparams)

In [None]:
# episodes = 1
# batch_size = 32
# Episode = []
# Time = []
# Reward = []
# Total_Reward = []
# Action = []
# next_price = []
# prev_price = []
# last_action = 'Hold'
# days_since_last_buy = 0
# has_open_position = False

# for e in range(episodes):
#     state = states[0]
#     total_reward = 0
#     last_action = 'Hold'
#     days_since_last_buy = 0
#     has_open_position = False  # Track if there's an open position
    
#     for time in range(1, len(states)):
#         action = agent.act(state)

#         # Ensure sell only after buy and max 11 days between buy and sell
#         if has_open_position:
#             days_since_last_buy += 1

#         if action == 0:  # Sell
#             if not has_open_position:  # Can't sell if no open position
#                 action = 2  # Change to Hold if not previously bought
#             else:
#                 has_open_position = False  # Sell the open position
#                 days_since_last_buy = 0  # Reset days since last buy

#         if action == 1:  # Buy
#             if has_open_position:  # Can't buy if already have an open position
#                 action = 2  # Change to Hold if already bought
#             else:
#                 has_open_position = True  # Buy, opening a new position
#                 days_since_last_buy = 0  # Reset days since last buy

#         # Force a sell if more than 11 days since last buy
#         if has_open_position and days_since_last_buy > 11:
#             action = 0  # Force a sell

#         # Update last action and reset days since last buy if sell or buy happens
#         if action == 1:  # Buy
#             days_since_last_buy = 0
#             has_open_position = True
#         if action == 0:  # Sell
#             has_open_position = False

#         next_state = states[time]
#         reward = next_state[-1][3] - state[-1][3]  # Example: Reward based on price difference
#         next_price.append(next_state[-1][3])
#         prev_price.append(state[-1][3])
#         done = time == len(states) - 1
#         agent.remember(state, action, reward, next_state, done)
#         state = next_state
#         total_reward += reward
#         Episode.append(e+1)
#         Time.append(time)
#         Reward.append(reward)
#         Total_Reward.append(total_reward)
#         Action.append(action)
#         if done:
#             print(f"Episode {e+1}/{episodes}, Total Reward: {total_reward}")
#             break
#         if len(agent.memory) > batch_size:
#             agent.replay(batch_size)

# log_train = pd.DataFrame({'Episode': Episode, 'Time': Time, 'Reward': Reward, 'Total_Reward': Total_Reward, 'Action': Action, 'Next_Price': next_price, 'Prev_Price': prev_price})
# log_train['Action'] = log_train['Action'].map({0: 'Sell', 1: 'Buy', 2: 'Hold'})
# log_train['Action'].value_counts()


In [None]:
# def evaluate_agent(agent, states):
#     state = states[0]
#     total_reward = 0
#     actions = []
#     days_since_last_buy = 0
#     has_open_position = False
#     last_action = 'Hold'
    
#     for time in range(1, len(states)):
#         action = agent.act(state)

#         # Ensure sell only after buy and max 11 days between buy and sell
#         if has_open_position:
#             days_since_last_buy += 1

#         if action == 0:  # Sell
#             if not has_open_position:  # Can't sell if no open position
#                 action = 2  # Change to Hold if not previously bought
#             else:
#                 has_open_position = False  # Sell the open position
#                 days_since_last_buy = 0  # Reset days since last buy

#         if action == 1:  # Buy
#             if has_open_position:  # Can't buy if already have an open position
#                 action = 2  # Change to Hold if already bought
#             else:
#                 has_open_position = True  # Buy, opening a new position
#                 days_since_last_buy = 0  # Reset days since last buy

#         # Force a sell if more than 11 days since last buy
#         if has_open_position and days_since_last_buy > 11:
#             action = 0  # Force a sell

#         # Update last action and reset days since last buy if sell or buy happens
#         if action == 1:  # Buy
#             days_since_last_buy = 0
#             has_open_position = True
#         if action == 0:  # Sell
#             has_open_position = False

#         next_state = states[time]
#         reward = next_state[-1][3] - state[-1][3]  # Example: Reward based on price difference
#         state = next_state
#         total_reward += reward
#         actions.append(action)

#     return total_reward, actions

# # Build the test states from `df_test` with the same windowing used for the training data
# test_states = create_states(df_test)
# total_reward, actions = evaluate_agent(agent, test_states)
# print(f"Total Reward on Test Data: {total_reward}")

In [None]:
# def plot_actions(df, title='Trading Actions'):
#     plt.figure(figsize=(15, 7))
#     plt.plot(df['Close'], label='Close Price', color='blue')
#     buy_signals = df[df['Action'] == 'Buy']
#     sell_signals = df[df['Action'] == 'Sell']
#     hold_signals = df[df['Action'] == 'Hold']
#     plt.scatter(buy_signals.index, buy_signals['Close'], marker='^', color='green', label='Buy', alpha=1)
#     plt.scatter(sell_signals.index, sell_signals['Close'], marker='v', color='red', label='Sell', alpha=1)
#     plt.scatter(hold_signals.index, hold_signals['Close'], marker='o', color='orange', label='Hold', alpha=0.5)
#     plt.title(title)
#     plt.xlabel('Date')
#     plt.ylabel('Price')
#     plt.legend()
#     plt.show()

# # Visualize actions on training data
# plot_actions(df_train, title='Training Data Actions')

# # Visualize actions on validation data (NOTE: `df_validation` is not defined in the visible cells)
# plot_actions(df_validation, title='Validation Data Actions')

# # Visualize actions on test data
# plot_actions(df_test, title='Test Data Actions')
