Module 9 - Actor Critic - Classroom Demo

In [None]:
# Mount Google Drive at /content/gdrive so later cells can read project files
# from it. The google.colab package is supplied by the Colab runtime, and the
# mount call prompts for an authorization code (see the cell output).
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
import os
import gc
import warnings
# Blanket filter: hides every warning category for the rest of the session,
# including ones that may point at real problems.
warnings.filterwarnings("ignore")
# DeprecationWarning is already covered by the blanket filter above; this line
# only makes that intent explicit.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Change the working directory to the project folder on the mounted Drive so
# that relative paths used later (e.g. "data/<name>.csv") resolve.
os.chdir('/content/gdrive/My Drive/Colab Notebooks/Stock Prediction using A2C')
# Last expression of the cell: displays the folder contents as a sanity check.
os.listdir()

['data', 'StockPrediction.ipynb']

In [None]:
from keras import layers, models, optimizers
from keras import backend as K


Using TensorFlow backend.


In [None]:
from keras import regularizers
from keras import initializers

In [None]:
# The Actor class takes the state as input and outputs a softmax probability for each action
class Actor:
    """Actor (policy) model.

    Maps a state vector to a softmax distribution over the actions and exposes
    ``train_fn``, a backend function that applies one optimizer step computed
    from action gradients supplied by the caller.
    """

    def __init__(self, state_size, action_size):
        """Store the network dimensions and build the model.

        Params
        ======
            state_size (int): Number of input features in a state
            action_size (int): Number of actions (units of the softmax output)
        """
        self.state_size = state_size
        self.action_size = action_size

        self.build_model()

    def build_model(self):
        """Create ``self.model`` (state -> action probabilities) and ``self.train_fn``."""
        state_input = layers.Input(shape=(self.state_size,), name='states')

        # Two hidden blocks of 16 and then 32 units, each Dense -> BatchNorm -> ReLU.
        # Every Dense layer is configured with an L2(1e-6) kernel regularizer.
        hidden = state_input
        for width in (16, 32):
            hidden = layers.Dense(units=width, kernel_regularizer=regularizers.l2(1e-6))(hidden)
            hidden = layers.BatchNormalization()(hidden)
            hidden = layers.Activation("relu")(hidden)

        # Output head: one softmax probability per action.
        action_probs = layers.Dense(units=self.action_size, activation='softmax', name='actions')(hidden)

        self.model = models.Model(inputs=state_input, outputs=action_probs)

        # The loss is mean(-action_gradients * action_probs); the gradients are
        # not computed here but fed in through their own input tensor.
        grad_input = layers.Input(shape=(self.action_size,))
        policy_loss = K.mean(-grad_input * action_probs)

        adam = optimizers.Adam(lr=.00001)
        weight_updates = adam.get_updates(params=self.model.trainable_weights, loss=policy_loss)
        # Invoked as train_fn([states, action_gradients, learning_phase]); it has
        # no outputs and only applies the weight updates.
        self.train_fn = K.function(
            inputs=[self.model.input, grad_input, K.learning_phase()],
            outputs=[],
            updates=weight_updates)

In [None]:
class Critic:
    """Critic (value) model: scores a (state, action) pair with a single Q-value."""

    def __init__(self, state_size, action_size):
        """Record the input dimensions and construct the network.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size

        self.build_model()

    def build_model(self):
        """Create ``self.model`` ((state, action) -> Q-value) and ``self.get_action_gradients``."""
        # Two separate inputs: the state vector and the action vector.
        state_input = layers.Input(shape=(self.state_size,), name='states')
        action_input = layers.Input(shape=(self.action_size,), name='actions')

        # State pathway: Dense(16) -> BatchNorm -> ReLU -> Dense(32, linear).
        state_path = layers.Dense(units=16, kernel_regularizer=regularizers.l2(1e-6))(state_input)
        state_path = layers.BatchNormalization()(state_path)
        state_path = layers.Activation("relu")(state_path)
        state_path = layers.Dense(units=32, kernel_regularizer=regularizers.l2(1e-6))(state_path)

        # Action pathway: a single linear Dense(32) so it can be summed with the state pathway.
        action_path = layers.Dense(units=32, kernel_regularizer=regularizers.l2(1e-6))(action_input)

        # Merge both pathways by element-wise addition, then apply ReLU.
        merged = layers.Add()([state_path, action_path])
        merged = layers.Activation('relu')(merged)

        # Scalar Q-value head, initialised with small uniform weights in [-0.003, 0.003].
        q_init = initializers.RandomUniform(minval=-0.003, maxval=0.003)
        q_out = layers.Dense(units=1, name='q_values', kernel_initializer=q_init)(merged)

        self.model = models.Model(inputs=[state_input, action_input], outputs=q_out)

        # Trained with mean squared error against the target Q-values passed to train_on_batch.
        adam = optimizers.Adam(lr=0.001)
        self.model.compile(optimizer=adam, loss='mse')

        # Gradient of the Q-value with respect to the action input.
        q_wrt_action = K.gradients(q_out, action_input)

        # Invoked as get_action_gradients([states, actions, learning_phase]).
        self.get_action_gradients = K.function(
            inputs=list(self.model.input) + [K.learning_phase()],
            outputs=q_wrt_action)

In [None]:
import numpy as np
from numpy.random import choice
import random
from collections import namedtuple, deque

In [None]:
class ReplayBuffer:
    """Bounded store of experience tuples with uniform random mini-batch sampling.

    Each experience is a namedtuple with the fields
    ``state``, ``action``, ``reward``, ``next_state`` and ``done``.
    Once ``buffer_size`` entries are held, adding a new one discards the oldest.
    """

    def __init__(self, buffer_size, batch_size):
        """Create an empty buffer.

        Params
        ======
            buffer_size (int): Maximum number of experiences kept
            batch_size (int): Default number of experiences returned by ``sample``
        """
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Append one experience to the buffer."""
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self, batch_size=None):
        """Return a list of distinct experiences drawn uniformly at random.

        Params
        ======
            batch_size (int or None): Number of experiences to draw. ``None``
                (the default) uses the ``batch_size`` given to the constructor.

        Raises ``ValueError`` (from ``random.sample``) when more experiences are
        requested than the buffer currently holds.
        """
        # The argument used to be ignored in favour of self.batch_size; honour it
        # when given, and keep the old behaviour for calls without an argument.
        k = self.batch_size if batch_size is None else batch_size
        return random.sample(self.memory, k=k)

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.memory)
    
    
class Agent:
    """Actor-critic trading agent.

    Owns a local and a target copy of both the Actor and the Critic network,
    a replay buffer of past experiences, and the inventory of purchase prices
    that the driving script maintains.
    """

    def __init__(self, state_size, batch_size, is_eval = False):
        """Build the networks and the replay buffer.

        Params
        ======
            state_size (int): Number of features in a state vector
            batch_size (int): Mini-batch size used when learning from the buffer
            is_eval (bool): When True, ``act`` picks the most probable action
                instead of sampling from the policy
        """
        self.state_size = state_size
        # Three discrete actions; the driving script treats 1 as buy and 2 as
        # sell, and takes no trade for any other value.
        self.action_size = 3
        self.buffer_size = 1000000
        self.batch_size = batch_size
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        # Inventory contains the purchase prices of the currently owned stocks
        self.inventory = []
        self.is_eval = is_eval

        # Discount factor gamma and soft-update parameter tau
        self.gamma = 0.99
        self.tau = 0.001

        # Local networks are the ones being trained; target networks follow
        # them slowly through soft_update.
        self.actor_local = Actor(self.state_size, self.action_size)
        self.actor_target = Actor(self.state_size, self.action_size)

        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initially, weights of the target networks equal those of the local networks
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    def act(self, state):
        """Choose an action index for ``state`` and remember the state.

        While training (``is_eval`` False) the action is sampled from the
        probabilities predicted by the local actor; in evaluation mode the most
        probable action is returned.
        """
        options = self.actor_local.model.predict(state)
        self.last_state = state
        if not self.is_eval:
            # Sample over all actions instead of a hard-coded count of 3.
            return choice(range(self.action_size), p = options[0])
        return np.argmax(options[0])

    def step(self, action, reward, next_state, done):
        """Store one transition and, once enough are stored, learn from a random batch.

        The transition starts from the state most recently passed to ``act``
        (or the ``next_state`` of the previous ``step``).
        """
        self.memory.add(self.last_state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)
        # Advance on every call. Previously this only happened after the buffer
        # had filled past batch_size, leaving last_state stale until then.
        self.last_state = next_state

    def learn(self, experiences):
        """Update the local critic and actor from a batch, then soft-update the targets.

        ``experiences`` is an iterable of experience tuples; ``None`` entries
        are skipped. Each stored action is reshaped to ``action_size`` values
        per row (the driving script stores the actor's probability vector).
        """
        states = np.vstack([e.state for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.state_size)
        actions = np.vstack([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1,1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.float32).reshape(-1,1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.state_size)

        # Next actions come from the target actor; their Q-values from the target critic
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Bootstrapped target; (1 - dones) drops the future term on terminal transitions
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x = [states, actions], y=Q_targets)

        # Gradient of the Q-value w.r.t. the stored actions (learning phase 0),
        # then one training step of the local actor (learning phase 1)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]),(-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Move the target weights a fraction ``tau`` towards the local weights.

        new_target = tau * local + (1 - tau) * target, applied to every weight tensor.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights)

        # Blend tensor by tensor. Wrapping the differently shaped weight arrays
        # in a single np.array fails on NumPy versions that reject ragged input.
        new_weights = [
            self.tau * local_w + (1 - self.tau) * target_w
            for local_w, target_w in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)

In [None]:
import numpy as np
import math

In [None]:
# Helper functions: format a price for printing, read a price column from a CSV file, and build the windowed state vector
def formatPrice(n):
    """Render ``n`` as a dollar amount with two decimals, e.g. ``$3.14`` or ``-$2.00``."""
    prefix = "$" if n >= 0 else "-$"
    return "{}{:.2f}".format(prefix, abs(n))


def getStockData(key):
    """Read one numeric column from ``data/<key>.csv`` into a list of floats.

    The first line is treated as a header and skipped. For every remaining
    non-blank line the fifth comma-separated field (index 4) is parsed as a
    float -- presumably the closing price in a Date,Open,High,Low,Close
    layout; confirm against the data files.

    Params
    ======
        key (str): File name without extension, relative to the ``data`` folder

    Returns the values in file order.
    """
    datavec = []
    # Context manager so the file handle is closed even if parsing fails.
    with open("data/" + key + ".csv", "r") as csv_file:
        lines = csv_file.read().splitlines()

    for line in lines[1:]:
        # Skip blank lines (e.g. a trailing newline) instead of failing on them.
        if not line.strip():
            continue
        datavec.append(float(line.split(",")[4]))

    return datavec
  

def getState(data, t, window):
    """Build the state for time step ``t`` from the last ``window`` prices.

    The ``window`` prices ending at index ``t`` are taken from ``data``; when
    fewer than ``window`` prices exist, the front is padded with ``data[0]``.
    Each pair of consecutive prices is mapped to
    ``1 / (1 + exp(previous - current))``, i.e. the sigmoid of the price
    change, giving ``window - 1`` features.

    Params
    ======
        data (sequence of float): Price series (list or NumPy array)
        t (int): Index of the most recent price to include
        window (int): Number of prices to look at

    Returns a NumPy array of shape ``(1, window - 1)``.
    """
    start = t - window + 1
    if start >= 0:
        vec = list(data[start:t + 1])
    else:
        # Not enough history yet: repeat the first price to fill the window.
        # list(...) keeps the concatenation valid when data is a NumPy array.
        vec = -start * [data[0]] + list(data[0:t + 1])

    scaled_state = []
    for i in range(window - 1):
        drop = vec[i] - vec[i + 1]
        try:
            scaled_state.append(1 / (1 + math.exp(drop)))
        except OverflowError:
            # exp() overflows for very large price drops; the sigmoid's limit there is 0.
            scaled_state.append(0.0)

    return np.array([scaled_state])

In [None]:
# ---------------------------------------------------------------------------
# Training: run the agent over the training prices for several episodes.
# ---------------------------------------------------------------------------
window_size = 50
batch_size = 32
agent = Agent(window_size, batch_size)
data = getStockData("train")
l = len(data) - 1
episode_count = 10

for e in range(episode_count):
    print("Episode " + str(e) + "/" + str(episode_count))
    # getState is given window_size + 1 prices so that it returns window_size features.
    state = getState(data, 0, window_size + 1)

    agent.inventory = []
    total_profit = 0
    done = False
    for t in range(l):
        # Pick an action, then fetch the action probabilities for the same
        # state; the probabilities are what gets stored in the replay buffer.
        action = agent.act(state)
        action_prob = agent.actor_local.model.predict(state)

        next_state = getState(data, t + 1, window_size + 1)
        reward = 0
        # Action == buy: the current price is added to the inventory
        if action == 1:
            agent.inventory.append(data[t])
            print("Buy:" + formatPrice(data[t]))
        # Action == sell: the oldest purchase is removed from the inventory and the profit is calculated
        elif action == 2 and len(agent.inventory) > 0:
            bought_price = agent.inventory.pop(0)
            # Losses are clipped to zero for the reward but still counted in total_profit.
            reward = max(data[t] - bought_price, 0)
            total_profit += data[t] - bought_price
            print("sell: " + formatPrice(data[t]) + "| profit: " + formatPrice(data[t] - bought_price))

        if t == l - 1:
            done = True
        # Store the transition (with the action probabilities) and learn from the buffer
        agent.step(action_prob, reward, next_state, done)
        state = next_state

        if done:
            print("------------------------------------------")
            print("Total Profit: " + formatPrice(total_profit))
            print("------------------------------------------")

# ---------------------------------------------------------------------------
# Evaluation: one pass over the test prices with the trained agent.
# ---------------------------------------------------------------------------
test_data = getStockData("test")
l_test = len(test_data) - 1
state = getState(test_data, 0, window_size + 1)
total_profit = 0
agent.inventory = []
# Evaluation mode: act() returns the most probable action instead of sampling.
# This was previously left at False, so test actions were still random draws.
agent.is_eval = True
done = False
for t in range(l_test):
    action = agent.act(state)

    next_state = getState(test_data, t + 1, window_size + 1)

    if action == 1:
        agent.inventory.append(test_data[t])
        print("Buy: " + formatPrice(test_data[t]))

    elif action == 2 and len(agent.inventory) > 0:
        bought_price = agent.inventory.pop(0)
        total_profit += test_data[t] - bought_price
        print("Sell: " + formatPrice(test_data[t]) + " | profit: " + formatPrice(test_data[t] - bought_price))

    if t == l_test - 1:
        done = True
    # No agent.step() here. The earlier call reused a stale action_prob left
    # over from the final training step and kept training on the test data.
    state = next_state

    if done:
        print("------------------------------------------")
        print("Total Profit: " + formatPrice(total_profit))
        print("------------------------------------------")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Buy:$1338.35
Buy:$1324.80
Buy:$1304.86
Buy:$1316.63
sell: $1320.68| profit: $381.53
sell: $1278.04| profit: $354.32
sell: $1278.18| profit: $366.21
sell: $1315.13| profit: $404.42
Buy:$1325.66
sell: $1329.10| profit: $436.06
Buy:$1344.78
sell: $1357.98| profit: $430.75
Buy:$1319.99
Buy:$1331.85
Buy:$1362.16
Buy:$1365.51
Buy:$1352.46
sell: $1341.47| profit: $435.63
Buy:$1341.45
sell: $1334.76| profit: $380.18
sell: $1353.64| profit: $363.97
Buy:$1372.78
Buy:$1337.89
sell: $1385.30| profit: $387.26
Buy:$1379.32
sell: $1394.23| profit: $399.48
sell: $1401.35| profit: $398.11
sell: $1402.22| profit: $368.85
sell: $1402.80| profit: $358.66
sell: $1405.87| profit: $353.24
sell: $1404.11| profit: $332.45
Buy:$1403.93
Buy:$1415.51
Buy:$1413.49
sell: $1411.13| profit: $360.35
Buy:$1410.44
sell: $1410.49| profit: $353.41
Buy:$1406.58
Buy:$1404.94
Buy:$1429.08
Buy:$1459.99
sell: $1461.05| profit: $395.57
sell: $1433.32| profit: $336