In [None]:
import yfinance as yf
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import datetime as dt
from plotly import tools
from plotly.graph_objs import *
from plotly.offline import init_notebook_mode, iplot, iplot_mpl
# Date window (ISO strings) for the price-history request.
stock_start_date = '2021-01-01'
stock_end_date = '2023-12-31'

# Fetch the MSFT price history for that window through yfinance.
df = yf.download(
    'MSFT',
    start=stock_start_date,
    end=stock_end_date,
)

In [None]:
def formatPrice(n):
    """Render a number as a dollar amount with two decimals.

    Negative values get the minus sign in front of the dollar sign
    (for example -3.5 becomes "-$3.50"); all other values start with "$".
    """
    magnitude = "{0:.2f}".format(abs(n))
    if n < 0:
        return "-$" + magnitude
    return "$" + magnitude

def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), evaluated with NumPy.

    The input is clamped to [-100, 100] before exponentiation so that
    np.exp cannot overflow on extreme values.
    """
    bounded = np.clip(x, -100, 100)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def getState(data, t, window_size):
    """Build the flat state vector for timestep ``t``.

    The state is the sigmoid of the consecutive differences between the
    ``window_size`` rows of ``data`` ending at row ``t``, flattened into a
    single row. It therefore holds ``window_size - 1`` differences per feature.

    Args:
        data: Array-like whose first axis is time. Both a 1-D series and a
            2-D (timestep, feature) table are accepted.
        t: Index of the most recent row to include.
        window_size: Number of rows the window spans.

    Returns:
        Array of shape ``(1, (window_size - 1) * features)``.
    """
    data = np.asarray(data)
    d = t - window_size + 1
    if d < 0:
        # Not enough history yet: left-pad the time axis with zero rows.
        # Only axis 0 is padded, so the same spec works for 1-D and 2-D data.
        pad_width = [(-d, 0)] + [(0, 0)] * (data.ndim - 1)
        block = np.pad(data[:t + 1], pad_width, 'constant', constant_values=0)
    else:
        block = data[d:t + 1]

    # Row-to-row differences along time, squashed elementwise by sigmoid.
    deltas = sigmoid(np.diff(block, axis=0))

    # If the slice ran past the end of the data there are too few differences;
    # front-fill with zero rows shaped like one timestep so the flattened
    # length stays constant (scalar zeros would make the rows ragged).
    missing = (window_size - 1) - len(deltas)
    if missing > 0:
        filler = np.zeros((missing,) + deltas.shape[1:], dtype=deltas.dtype)
        deltas = np.concatenate([filler, deltas], axis=0)

    return deltas.reshape(1, -1)

def plot_behavior(data_input, states_buy, states_sell, profit):
    """Plot a series with the buy and sell points marked on it.

    Args:
        data_input: Values drawn as the red line (the training loop in this
            notebook passes column 3 of its data).
        states_buy: Indices into ``data_input`` marked with magenta up-triangles.
        states_sell: Indices into ``data_input`` marked with black down-triangles.
        profit: Number shown in the figure title.
    """
    plt.figure(figsize=(15, 5))
    plt.plot(data_input, color='r', lw=2.)

    # One marker-only overlay per signal type; markevery restricts the
    # markers to the recorded indices.
    signal_layers = (
        ('^', 'm', 'Buying signal', states_buy),
        ('v', 'k', 'Selling signal', states_sell),
    )
    for marker, colour, legend_text, positions in signal_layers:
        plt.plot(data_input, marker, markersize=10, color=colour,
                 label=legend_text, markevery=positions)

    plt.title('Total gains: %f' % profit)
    plt.legend()
    plt.show()

In [None]:
class Agent:
    """Deep Q-learning agent that picks one of three trading actions.

    Actions are encoded as 0 = sit, 1 = buy, 2 = sell. Transitions are kept in
    a bounded replay buffer and replayed in mini-batches to fit a small dense
    network that maps a flat state vector to one Q-value per action.
    """

    def __init__(self, state_size, window_size, model_name="", is_eval=False):
        """Set hyperparameters and build (or load) the Q-network.

        Args:
            state_size: Length of the flat state vector fed to the network.
            window_size: Number of timesteps a state covers. Stored for
                reference only; the network shape depends on ``state_size``.
            model_name: Path handed to ``load_model`` when ``is_eval`` is True.
            is_eval: If True, load a saved model and never explore.
        """
        self.state_size = state_size
        self.window_size = window_size
        self.action_size = 3  # sit, buy, sell
        self.memory = deque(maxlen=2000)  # replay buffer; oldest entries drop off
        self.inventory = []
        self.model_name = model_name
        self.is_eval = is_eval

        self.gamma = 0.95  # Discount rate
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_min = 0.01  # Floor for the exploration rate
        self.epsilon_decay = 0.995  # Multiplier applied after each replay step
        self.learning_rate = 0.001  # Initial learning rate
        self.learning_rate_decay = 0.999  # Stored only; not applied by this class
        self.model = load_model(model_name) if is_eval else self._model()

    def _model(self):
        """Build and compile the dense Q-network (state_size -> action_size)."""
        model = Sequential()
        model.add(Input(shape=(self.state_size,)))
        model.add(Dense(units=64, activation="relu"))
        model.add(Dense(units=32, activation="relu"))
        model.add(Dense(units=16, activation="relu"))
        model.add(Dense(self.action_size, activation="linear"))
        model.compile(loss="mse", optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def act(self, state):
        """Choose an action for ``state`` (a batch holding a single state).

        While training, a random action is returned with probability
        ``epsilon``; otherwise the action with the highest predicted Q-value
        is returned.
        """
        if not self.is_eval and random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0 keeps the per-call progress bar out of the notebook output.
        options = self.model.predict(state, verbose=0)
        return int(np.argmax(options[0]))

    def expReplay(self, batch_size):
        """Fit the network on a random mini-batch drawn from the replay buffer.

        Transitions whose state or next state is not shaped
        ``(1, state_size)`` are left out of the batch (with a warning) rather
        than being trained on as zero rows. After a fit, ``epsilon`` is
        multiplied by ``epsilon_decay`` until it reaches ``epsilon_min``.

        Args:
            batch_size: Maximum number of transitions to sample.
        """
        import warnings

        if not self.memory:
            return

        mini_batch = random.sample(self.memory, min(len(self.memory), batch_size))
        expected_shape = (1, self.state_size)
        usable = [
            transition for transition in mini_batch
            if np.shape(transition[0]) == expected_shape
            and np.shape(transition[3]) == expected_shape
        ]

        if len(usable) < len(mini_batch):
            warnings.warn(
                "expReplay skipped transitions whose state shape is not (1, %d)"
                % self.state_size,
                RuntimeWarning,
                stacklevel=2,
            )
        if not usable:
            return

        states = np.vstack([transition[0] for transition in usable]).astype(float)
        next_states = np.vstack([transition[3] for transition in usable]).astype(float)

        # Start from the current predictions so that only the Q-value of the
        # action actually taken is moved towards its target.
        target_f = self.model.predict(states, verbose=0)
        next_Q_values = self.model.predict(next_states, verbose=0)
        for i, (_, action, reward, _, done) in enumerate(usable):
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(next_Q_values[i])
            target_f[i][action] = target

        self.model.fit(states, target_f, epochs=1, verbose=0)

        # Shift gradually from exploration to exploitation.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

In [None]:
# Training configuration and agent construction.
window_size = 1    # number of timesteps each state considers
feature_count = 6  # number of features per timestep
state_size = feature_count * window_size  # length of the flat model input
batch_size = 32    # transitions sampled per replay step
episode_count = 10  # an episode is one complete pass over the data

# Fresh (non-eval) agent sized for the state vector above.
agent = Agent(window_size=window_size, state_size=state_size)

# Training series; X_train is defined outside this section. The training loop
# reads column 3 of each row as the price (presumably Close; confirm).
data = X_train
# Number of steps per episode: the loop visits t = 0..l-1 and looks ahead to
# row t + 1, so the last row is only ever used as a next state.
l = len(data) - 1

In [None]:
# Column of `data` read as the traded price throughout this loop.
price_column = 3
# getState(data, t, n) returns n - 1 consecutive differences, so asking for
# window_size + 1 rows yields window_size differences per feature. That gives
# window_size * feature_count values, the state_size the agent was built with.
# Passing window_size itself produced states one timestep too short, which the
# replay step then discarded.
state_span = window_size + 1

# NOTE: range(episode_count + 1) runs episodes 0..episode_count inclusive.
for e in range(episode_count + 1):
    print(f"Running episode {e}/{episode_count}")
    state = getState(data, 0, state_span)
    total_profit = 0
    agent.inventory = []
    states_sell = []
    states_buy = []

    for t in range(l):
        action = agent.act(state)  # Agent takes an action
        next_state = getState(data, t + 1, state_span)  # Observe the next state
        reward = 0
        price = data[t][price_column]

        if action == 1:  # Buy: record the entry price
            agent.inventory.append(price)
            states_buy.append(t)
            print("Buy: " + formatPrice(price))

        elif action == 2 and len(agent.inventory) > 0:  # Sell the oldest holding
            bought_price = agent.inventory.pop(0)
            gain = price - bought_price
            # Only gains are rewarded; a losing sale earns a reward of zero.
            reward = max(gain, 0)
            total_profit += gain
            states_sell.append(t)
            print("Sell: " + formatPrice(price) + " | Profit: " + formatPrice(gain))

        done = t == l - 1  # True on the last step of the episode

        # Store the transition in memory
        agent.memory.append((state, action, reward, next_state, done))

        # Move to the next state
        state = next_state

        # Perform experience replay once the memory holds more than one batch
        if len(agent.memory) > batch_size:
            agent.expReplay(batch_size)

        # At the end of the episode, report the total profit and plot the trades
        if done:
            print("--------------------------------")
            print("Total Profit: " + formatPrice(total_profit))
            print("--------------------------------")
            plot_behavior(data[:, price_column], states_buy, states_sell, total_profit)

    # Save the model after each episode.
    # NOTE(review): newer Keras releases may require a ".keras" or ".h5" suffix
    # on this path; the name is left unchanged so existing loaders keep working.
    agent.model.save("model_ep" + str(e))