In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import random
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the raw data frame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
df_raw = pd.read_pickle('data/SP500.pkl')

In [3]:
# Positional 60% / 20% / 20% split into train / validation / test.
# (Assumes df_raw is already in chronological order — TODO confirm.)
# Each split is an explicit copy: df_test later gets an 'Action' column, and
# writing a column into a bare .iloc slice of df_raw is a chained-assignment
# hazard that only stays quiet because warnings are globally suppressed.
n_rows = len(df_raw)
train_end = n_rows * 6 // 10
val_end = n_rows * 8 // 10

df_train = df_raw.iloc[:train_end].copy()
df_val = df_raw.iloc[train_end:val_end].copy()
df_test = df_raw.iloc[val_end:].copy()

In [4]:
# Sanity check: the three splits together must cover every row of df_raw.
splits = (df_train, df_val, df_test)
print(*(part.shape for part in splits))
print(sum(len(part) for part in splits))
print(len(df_raw))

(3019, 6) (1007, 6) (1007, 6)
5033
5033


In [5]:
def create_states(df, window_size=10):
    """Stack consecutive row windows of ``df`` into one array.

    Window ``k`` holds rows ``k .. k + window_size - 1``. One window is
    produced for every start position ``k`` in ``range(len(df) - window_size)``,
    so the window that would end on the very last row is not included.

    Returns
    -------
    numpy.ndarray
        Shape ``(len(df) - window_size, window_size, n_columns)``, or an
        empty 1-D array when ``df`` has no more than ``window_size`` rows.
    """
    n_windows = len(df) - window_size
    windows = [
        df.iloc[start:start + window_size].values
        for start in range(n_windows)
    ]
    return np.array(windows)

# Rolling-window states for the training and validation splits (default window size).
states, validation_states = map(create_states, (df_train, df_val))

In [6]:
class DQN(nn.Module):
    """Fully connected Q-network: input -> 128 -> 128 -> one value per action."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_dim)

    def forward(self, x):
        """Flatten everything after the batch dimension and return raw action values."""
        batch = x.size(0)
        hidden = x.view(batch, -1)
        # Both hidden layers use ReLU; the output layer is left linear.
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

# Network dimensions derived from the prepared training states
input_dim = states.shape[1] * states.shape[2]  # window_size * number of features
output_dim = 3  # three discrete actions; the training log labels them 0: Sell, 1: Buy, 2: Hold

# Initialize a standalone model. No .to(device) call is made, so it stays on
# the default device. DQNAgent constructs its own DQN instance internally.
model = DQN(input_dim, output_dim) 


In [7]:
class ReplayMemory:
    """Bounded FIFO store of (state, action, reward, next_state, done) tuples."""

    def __init__(self, capacity):
        # A deque with maxlen silently drops the oldest entry once full.
        self.memory = deque([], maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct transitions chosen at random."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

# Initialize a standalone replay memory holding at most 10000 transitions.
# NOTE(review): DQNAgent creates its own ReplayMemory internally, so this
# module-level buffer looks unused by the visible cells — confirm before removing.
memory = ReplayMemory(10000)

In [9]:
class DQNAgent:
    """Epsilon-greedy DQN agent with an experience-replay buffer.

    Parameters
    ----------
    state_size : int
        Number of inputs of the Q-network (length of a flattened state).
    action_size : int
        Number of discrete actions (outputs of the Q-network).
    lr : float, optional
        Learning rate of the Adam optimizer.
    gamma : float, optional
        Discount factor applied to the bootstrapped next-state value.
    epsilon, epsilon_min, epsilon_decay : float, optional
        Initial exploration rate, its floor, and the multiplicative decay
        applied once per ``replay`` call.
    memory_capacity : int, optional
        Maximum number of transitions kept in the replay buffer.

    Every keyword argument defaults to the value that used to be hard-coded,
    so ``DQNAgent(state_size, action_size)`` behaves as before.
    """

    def __init__(self, state_size, action_size, lr=0.001, gamma=0.99,
                 epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995,
                 memory_capacity=10000):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = ReplayMemory(memory_capacity)
        self.gamma = gamma                  # discount rate
        self.epsilon = epsilon              # exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()
        self.last_buy_index = -float('inf')  # Track the last buy index

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.memory.push(state, action, reward, next_state, done)

    def act(self, state, current_index=None):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        ``current_index`` is optional and does not influence the decision; it
        is accepted so that callers which also pass the time step
        (``agent.act(state, t)``) work with this agent.

        Returns
        -------
        int
            A random action with probability ``epsilon``, otherwise the index
            of the largest network output.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # Action selection needs no gradients, so skip building the graph.
        with torch.no_grad():
            batch = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
            q_values = self.model(batch)
        return torch.argmax(q_values[0]).item()

    def replay(self, batch_size):
        """Fit the network on ``batch_size`` sampled transitions, one optimizer step each.

        Does nothing while the buffer holds fewer than ``batch_size``
        transitions. After the updates, epsilon is decayed once and clamped so
        it never drops below ``epsilon_min``.
        """
        if len(self.memory) < batch_size:
            return
        minibatch = self.memory.sample(batch_size)
        for state, action, reward, next_state, done in minibatch:
            state_t = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)

            # Bootstrapped target for the action taken; a constant w.r.t. the weights.
            td_target = float(reward)
            if not done:
                with torch.no_grad():
                    next_t = torch.as_tensor(next_state, dtype=torch.float32).unsqueeze(0)
                    td_target += self.gamma * torch.max(self.model(next_t)[0]).item()

            prediction = self.model(state_t)
            # The target equals the current prediction everywhere except the
            # chosen action, so only that output contributes to the loss.
            # Working on a detached copy keeps the in-place write out of the
            # autograd graph.
            target_q = prediction.detach().clone()
            target_q[0, action] = td_target

            self.optimizer.zero_grad()
            loss = self.criterion(prediction, target_q)
            loss.backward()
            self.optimizer.step()
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

# Initialize the agent
state_size = states.shape[1] * states.shape[2]  # rows per window * columns per row (flattened state length)
action_size = 3  # labelled 0: Sell, 1: Buy, 2: Hold when the training log is built
agent = DQNAgent(state_size, action_size)


In [10]:
episodes = 1
batch_size = 32

# Per-step training log: every list receives exactly one entry per step so
# they can be assembled into a DataFrame afterwards.
Episode = []
Time = []
Reward = []
Total_Reward = []
Action = []
next_price = []
prev_price = []

for e in range(episodes):
    state = states[0]
    total_reward = 0
    for time in range(1, len(states)):
        action = agent.act(state)
        next_state = states[time]

        # Column 3 of the last row of each window is used as the price
        # (presumably the close price — TODO confirm against df_raw's column order).
        current_price = state[-1][3]
        upcoming_price = next_state[-1][3]
        # NOTE(review): the reward is the raw price change and does not depend
        # on `action` — confirm this is the intended reward signal.
        reward = upcoming_price - current_price

        done = time == len(states) - 1
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward

        # Record the step; without these appends the log lists stay empty.
        Episode.append(e + 1)
        Time.append(time)
        Reward.append(reward)
        Total_Reward.append(total_reward)
        Action.append(action)
        next_price.append(upcoming_price)
        prev_price.append(current_price)

        if done:
            print(f"Episode {e+1}/{episodes}, Total Reward: {total_reward}")
            break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)


Episode 1/1, Total Reward: 908.8800048828125


In [11]:
# Assemble the per-step training log, then replace the integer action codes
# with readable labels and show how often each action was taken.
log_train = pd.DataFrame({
    'Episode': Episode,
    'Time': Time,
    'Reward': Reward,
    'Total_Reward': Total_Reward,
    'Action': Action,
    'Next_Price': next_price,
    'Prev_Price': prev_price,
}).assign(Action=lambda frame: frame['Action'].map({0: 'Sell', 1: 'Buy', 2: 'Hold'}))
log_train['Action'].value_counts()

Hold    1965
Buy      620
Sell     423
Name: Action, dtype: int64

In [12]:
# Show the training log; long frames are truncated in the rendered output.
display(log_train)

Unnamed: 0,Episode,Time,Reward,Total_Reward,Action,Next_Price,Prev_Price
0,1,1,0.669922,0.669922,Sell,1128.839966,1128.170044
1,1,2,5.770020,6.439941,Hold,1134.609985,1128.839966
2,1,3,1.209961,7.649902,Sell,1135.819946,1134.609985
3,1,4,-17.669922,-10.020020,Sell,1118.150024,1135.819946
4,1,5,5.939941,-4.080078,Hold,1124.089966,1118.150024
...,...,...,...,...,...,...,...
3003,1,3004,2.020020,923.430054,Hold,2051.600098,2049.580078
3004,1,3005,-1.800049,921.630005,Hold,2049.800049,2051.600098
3005,1,3006,-13.090088,908.539917,Hold,2036.709961,2049.800049
3006,1,3007,-0.770020,907.769897,Hold,2035.939941,2036.709961


In [None]:
# episodes = 50
# batch_size = 32
# Episode = []
# Time = []
# Reward = []
# Total_Reward = []
# Action = []

# for e in range(episodes):
#     state = states[0]
#     total_reward = 0
#     for time in range(1, len(states)):
#         action = agent.act(state, time)
#         next_state = states[time]
#         reward = next_state[-1][3] - state[-1][3]  # Example: Reward based on price difference
        
#         done = time == len(states) - 1 
#         agent.remember(state, action, reward, next_state, done)
#         state = next_state
#         total_reward += reward
#         Episode.append(e+1)
#         Time.append(time)
#         Reward.append(reward)
#         Total_Reward.append(total_reward)
#         Action.append(action)

#         if done:
#             print(f"Episode {e+1}/{episodes}, Total Reward: {total_reward}")
#             break
#         if len(agent.memory) > batch_size:
#             agent.replay(batch_size)


In [None]:
# Rebuild the training log without the price columns and label the actions.
log_train = pd.DataFrame({
    'Episode': Episode,
    'Time': Time,
    'Reward': Reward,
    'Total_Reward': Total_Reward,
    'Action': Action,
}).assign(Action=lambda frame: frame['Action'].map({0: 'Sell', 1: 'Buy', 2: 'Hold'}))
log_train['Action'].value_counts()

In [None]:
def evaluate_agent(agent, states):
    """Walk through ``states`` once and collect the agent's actions and the summed reward.

    Parameters
    ----------
    agent
        Object exposing ``act(state)``. If it has an ``epsilon`` attribute,
        exploration is switched off (``epsilon = 0``) for the duration of the
        run and the previous value is restored afterwards, so evaluation does
        not take random exploratory actions.
    states
        Sequence of 2-D windows. Column 3 of the last row of each window is
        used as the price, exactly as in the training loop.

    Returns
    -------
    tuple
        ``(total_reward, actions)``. ``total_reward`` is the sum of the price
        changes between consecutive states; as in the training loop it does
        not depend on the chosen actions. ``actions`` holds one action per
        transition. For empty ``states`` the result is ``(0, [])``.
    """
    if len(states) == 0:
        return 0, []

    saved_epsilon = getattr(agent, 'epsilon', None)
    if saved_epsilon is not None:
        agent.epsilon = 0.0

    total_reward = 0
    actions = []
    try:
        state = states[0]
        for next_state in states[1:]:
            actions.append(agent.act(state))
            total_reward += next_state[-1][3] - state[-1][3]
            state = next_state
    finally:
        # Always hand the agent back with its original exploration rate.
        if saved_epsilon is not None:
            agent.epsilon = saved_epsilon
    return total_reward, actions

# Build the validation states the same way as the training states.
# The validation split created above is named `df_val`.
validation_states = create_states(df_val)
total_reward, actions = evaluate_agent(agent, validation_states)
print(f"Total Reward on Validation Data: {total_reward}")


In [None]:
# Create an agent with adjusted hyper-parameters. The settings are applied on
# the instance so this also works with the two-argument constructor; the
# optimizer is rebuilt so the new learning rate takes effect.
# NOTE(review): this replaces the previously trained agent with a freshly
# initialised one — confirm that is intended.
agent = DQNAgent(state_size, action_size)
agent.gamma = 0.95
agent.epsilon = 1.0
agent.epsilon_min = 0.01
agent.epsilon_decay = 0.995
agent.optimizer = optim.Adam(agent.model.parameters(), lr=0.0005)


In [None]:

# Evaluate the current agent on the validation states.
# NOTE(review): no training or fine-tuning call is visible between creating
# the agent and this evaluation — confirm a training step was not dropped.
total_reward, actions = evaluate_agent(agent, validation_states)
print(f"Total Reward on Validation Data after fine-tuning: {total_reward}")


In [None]:
def predict_actions(agent, states):
    """Return the agent's action for every state in ``states``.

    The agent's ``last_buy_index`` and ``last_action`` attributes are reset
    first. If the agent has an ``epsilon`` attribute, exploration is switched
    off (``epsilon = 0``) while predicting and restored afterwards, so the
    returned actions come from the learned policy rather than random draws.

    Returns
    -------
    list
        One action per state, in order; empty when ``states`` is empty.
    """
    agent.last_buy_index = -float('inf')  # Reset last buy index
    agent.last_action = 2  # Assume starting with 'Hold' (2: Hold)

    saved_epsilon = getattr(agent, 'epsilon', None)
    if saved_epsilon is not None:
        agent.epsilon = 0.0
    try:
        return [agent.act(state) for state in states]
    finally:
        # Always hand the agent back with its original exploration rate.
        if saved_epsilon is not None:
            agent.epsilon = saved_epsilon


# Predict actions for the test dataset (states are prepared the same way as the training states)
test_states = create_states(df_test)
test_actions = predict_actions(agent, test_states)


In [None]:


# Integer action code -> readable label, using the same encoding as the
# training log (0: Sell, 1: Buy, 2: Hold).
action_map = {0: 'Sell', 1: 'Buy', 2: 'Hold'}

# Mapping actions back to the test dataframe. There are fewer actions than
# rows, so the leading rows are padded with None and end up without a label.
padded_actions = [None] * (len(df_test) - len(test_actions)) + list(test_actions)

# assign() returns a new frame instead of writing a column into a slice of
# df_raw, and keeps this cell safe to re-run.
df_test = df_test.assign(
    Action=pd.Series(padded_actions, index=df_test.index, dtype=object).map(action_map)
)


In [None]:
# Count how often each labelled action occurs in the test frame.
# value_counts() skips missing values by default, so rows without an action
# are not part of the counts.
df_test['Action'].value_counts()

In [None]:
def plot_actions(df, title='Trading Actions'):
    """Plot the 'Close' column of ``df`` and overlay the labelled actions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Close' column. If it also has an 'Action' column,
        rows labelled 'Buy', 'Sell' and 'Hold' are drawn as markers on top of
        the price line; without that column only the price line is drawn.
    title : str, optional
        Figure title.
    """
    # (label, marker, color, alpha) for each action overlay.
    marker_styles = (
        ('Buy', '^', 'green', 1),
        ('Sell', 'v', 'red', 1),
        ('Hold', 'o', 'orange', 0.5),
    )

    plt.figure(figsize=(15, 7))
    plt.plot(df['Close'], label='Close Price', color='blue')

    # Frames that were never labelled have no 'Action' column; skip the overlay
    # for them instead of failing with a KeyError.
    if 'Action' in df.columns:
        for label, marker, color, alpha in marker_styles:
            rows = df[df['Action'] == label]
            plt.scatter(rows.index, rows['Close'], marker=marker, color=color,
                        label=label, alpha=alpha)

    plt.title(title)
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.legend()
    plt.show()

def _with_predicted_actions(df, window_states):
    """Return a copy of ``df`` with an 'Action' column filled from the agent.

    One action is requested per window. There are fewer windows than rows, so
    the leading rows are padded with None and stay unlabelled. Labels use the
    same encoding as the training log (0: Sell, 1: Buy, 2: Hold).
    """
    predicted = [agent.act(state) for state in window_states]
    padded = [None] * (len(df) - len(predicted)) + predicted
    labels = pd.Series(padded, index=df.index, dtype=object).map({0: 'Sell', 1: 'Buy', 2: 'Hold'})
    return df.assign(Action=labels)


# df_train and df_val are never given an 'Action' column by the cells above,
# so labelled copies are built here before plotting.

# Visualize actions on training data
plot_actions(_with_predicted_actions(df_train, states), title='Training Data Actions')

# Visualize actions on validation data (the validation split is `df_val`)
plot_actions(_with_predicted_actions(df_val, validation_states), title='Validation Data Actions')

# Visualize actions on test data
plot_actions(df_test, title='Test Data Actions')
