## Q-Learning with function approximation - Average reward setting

In [None]:
import math
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import time

from scipy.optimize import fsolve, minimize
from scipy.integrate import quad
from scipy import linspace, meshgrid, arange, empty, concatenate, newaxis, shape

from collections import deque

## Try TD-learning, SARSA

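### $Q(s,a; \theta) = \theta^{\top} \phi(s,a)$, where $s = (n_1, n_2)$, $\theta = [\theta_1, \cdots, \theta_{10}]$ and $\phi(s,a) = [n_1^2, \ n_1n_2, \ n_2^2, \ n_1, \ n_2, \ a^2, \ n_1 a, \ n_2 a, \ a, \ 1]$ (in the code below, every $n_i$ in $\phi$ is divided by the maximum queue length $M$)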
## Algorithm:
### Suppose we start in state $s_0$. Then for $t \geq 0$,
### $a_t = \begin{cases} \arg\min_{a' \in [0,1]} Q(s_t,a';\theta_t) \hspace{1 cm} \text{w.p.} \ 1-\epsilon \\ \text{Uniform}[0,1] \hspace{4 cm} \text{w.p.} \ \epsilon \end{cases}$
### $C_t, s_{t+1}$ observed
### $\delta_t = (C_t - \overline{C}) + \min_{a' \in [0,1]} Q(s_{t+1}, a' ; \theta_t) - Q(s_t, a_t ; \theta_t)$
### $\theta_{t+1} = \theta_t + \alpha_t \delta_t \phi(s_t, a_t)$
### $\overline{C} \leftarrow \overline{C} + \beta_t \delta_t$

In [None]:
def speed_up(p, n_cores):
    """Return the Amdahl's-law speed-up 1 / ((1 - p) + p / n_cores).

    p: fraction of the work that benefits from extra cores.
    n_cores: number of cores given to one job; anything below 1 is treated
        as exactly 1, so the returned rate is never smaller than 1.
    """
    rate = 1/(1 - p*(1 - 1/max(1, n_cores)))
    return rate


def transition_probabilities(model_pars, state, action):
    """Return the one-step transition probabilities out of state (n1, n2).

    model_pars: [lam, mu, cores, p1, p2, alpha, M], where p1 and p2 are the
        fractions handed to ``speed_up`` for queue 1 and queue 2.
    state: [n1, n2], the number of jobs in each queue.
    action: share of the cores given to queue 1 (queue 2 gets the rest);
        expected to be a scalar in [0, 1].

    Returns [arrival to queue 1, arrival to queue 2, departure from queue 1,
    departure from queue 2, no change], in that order. The last entry is the
    remainder, so the list sums to 1.
    """
    # The parallel fractions get their own names: the previous version reused
    # p1/p2 for the arrival probabilities and then fed those overwritten
    # values to speed_up, so service rates were computed from lam*alpha
    # instead of the job's parallel fraction.
    lam, mu, cores, frac1, frac2, alpha, M = model_pars
    n1, n2 = state

    cores1 = cores*action
    cores2 = cores*(1 - action)

    # Arrivals are blocked once a queue already holds M jobs.
    p1 = lam*alpha if n1 < M else 0
    p2 = lam*(1 - alpha) if n2 < M else 0
    # Departures: at most min(jobs, cores) jobs are in service, each sped up
    # according to the cores available per job. Empty queues cannot depart
    # (and the guard also avoids dividing by zero).
    p3 = min(n1, cores1)*mu*speed_up(frac1, cores1/n1) if n1 > 0 else 0
    p4 = min(n2, cores2)*mu*speed_up(frac2, cores2/n2) if n2 > 0 else 0
    p5 = 1 - p1 - p2 - p3 - p4
    return [p1, p2, p3, p4, p5]

def next_state(model_pars, current_state, action):
    """Sample the state that follows ``current_state`` when ``action`` is taken.

    The successor is drawn with ``np.random.choice`` using the probabilities
    returned by ``transition_probabilities``; the action is whatever the
    caller supplies, no policy is computed here.

    Returns the sampled state as a list [n1, n2].
    """
    n1, n2 = current_state
    move_probabilities = transition_probabilities(model_pars, current_state, action)

    # Candidates are listed in the same order as the probabilities:
    # +1 in queue 1, +1 in queue 2, -1 in queue 1, -1 in queue 2, unchanged.
    candidates = [
        [n1+1, n2],
        [n1, n2+1],
        [n1-1, n2],
        [n1, n2-1],
        [n1, n2],
    ]
    drawn = np.random.choice([0, 1, 2, 3, 4], size = 1, p = move_probabilities)
    return candidates[drawn[0]]

def feature_vector(model_pars, state, action):
    """Build the 10-entry feature vector phi(s, a) used by the linear Q model.

    Queue lengths are divided by M (``model_pars[6]``) wherever they appear.
    The entries are, in order: n1^2, n1*n2, n2^2, n1, n2, a^2, n1*a, n2*a, a
    and a constant 1. The result is a NumPy array with dtype object.
    """
    M = model_pars[6]
    n1, n2 = state

    quadratic_in_state = [n1**2/M**2, n1*n2/M**2, n2**2/M**2]
    linear_in_state = [n1/M, n2/M]
    involving_action = [action**2, n1*action/M, n2*action/M, action]
    bias = [1]

    entries = quadratic_in_state + linear_in_state + involving_action + bias
    return np.array(entries, dtype = 'object')

def Q_approximation(model_pars, state, action, theta):
    """Evaluate the linear model Q(s, a; theta) = theta . phi(s, a)."""
    return np.dot(theta, feature_vector(model_pars, state, action))

def min_argmin_Q(model_pars, state, theta):
    """Minimise Q(state, a; theta) over the action grid {0, 1/cores, ..., 1}.

    Returns [minimising action, minimum Q value]. Ties go to the smallest
    action because ``np.argmin`` returns the first minimum.
    """
    _lam, _mu, cores, _p1, _p2, _alpha, _M = model_pars

    grid_values = []
    for k in range(cores+1):
        grid_values.append(Q_approximation(model_pars, state, (k/cores), theta))
    grid_values = np.array(grid_values)

    best_index = np.argmin(grid_values)
    return [best_index/cores, grid_values[best_index]]

def choose_action(model_pars, state, theta, epsilon):
    """Pick an action with an epsilon-greedy rule on the grid {0, 1/cores, ..., 1}.

    With probability ``epsilon`` a grid point is drawn uniformly at random;
    otherwise the action minimising Q(state, .; theta) is used.

    Returns the action as a plain Python float. The exploration branch used
    to return a length-1 array while the greedy branch returned a scalar;
    both now return the same type so callers get a consistent result.
    """
    cores = model_pars[2]

    explore = np.random.binomial(1, epsilon)
    if explore:
        # No size argument: randint then yields a single integer in [0, cores].
        return float(np.random.randint(cores + 1))/cores
    return float(min_argmin_Q(model_pars, state, theta)[0])

def TD_error(model_pars, current_state, next_state, action, theta, average_cost):
    """Return the average-cost TD error for one observed transition.

    delta = (n1 + n2) - average_cost + min_a' Q(next_state, a') - Q(current_state, action),
    where n1 + n2 (the total number of jobs in ``current_state``) is the
    one-step cost.
    """
    stage_cost = current_state[0] + current_state[1]
    best_next_Q = min_argmin_Q(model_pars, next_state, theta)[1]
    current_Q = Q_approximation(model_pars, current_state, action, theta)
    return stage_cost - average_cost + best_next_Q - current_Q

def Q_learning(model_pars, initial_state, initial_theta, n_iters):
    """Run average-cost Q-learning with a linear function approximator.

    model_pars: [lam, mu, cores, p1, p2, alpha, M] as used by the helpers.
    initial_state: starting state [n1, n2].
    initial_theta: starting weight vector (10 entries, one per feature).
    n_iters: number of simulated transitions.

    The first 20% of the steps use a uniformly random action in [0, 1];
    afterwards the policy is epsilon-greedy with epsilon = 1/sqrt(i + 1).
    Both theta and the average-cost estimate use the step size
    1/(i + 1)^0.75.

    Returns [epsilon_iterates, immediate_cost_iterates, delta_iterates,
    theta_iterates, avg_cost_iterates], each a list with one entry per step.
    """
    current_state = initial_state
    # Start from the caller's weights (previously ignored in favour of zeros).
    # Copying into a float array keeps the caller's object untouched.
    theta = np.array(initial_theta, dtype=float)
    avg_cost = 0

    epsilon_iterates = []
    immediate_cost_iterates = []
    delta_iterates = []
    theta_iterates = []
    avg_cost_iterates = []

    N_0 = 0.2*n_iters

    for i in range(n_iters):
        if i < N_0: # Explore for first N_0 steps
            action = np.random.uniform(low = 0, high = 1)
            epsilon_iterates.append(1)
        else: # After N_0 steps, follow an epsilon-greedy policy
            epsilon = 1/math.sqrt(i+1)
            epsilon_iterates.append(epsilon)
            action = choose_action(model_pars, current_state, theta, epsilon)
        # Normalise to a plain float: a length-1 array here would propagate
        # into the transition probabilities and the TD error.
        action = float(np.ravel(action)[0])

        # Cost and next state
        immediate_cost = current_state[0] + current_state[1]
        immediate_cost_iterates.append(immediate_cost)
        future_state = next_state(model_pars, current_state, action)

        # TD error
        delta = TD_error(model_pars, current_state, future_state, action, theta, avg_cost)
        delta = float(np.ravel(delta)[0])
        delta_iterates.append(delta)

        # theta and average-cost updates share the same step size
        step_size = 1/math.pow(i+1, 0.75)
        phi = np.asarray(feature_vector(model_pars, current_state, action), dtype=float)
        theta = theta + step_size*delta*phi
        theta_iterates.append(theta)
        avg_cost = avg_cost + step_size*delta
        avg_cost_iterates.append(avg_cost)

        # Advance the chain; without this every update was computed at
        # initial_state and the sampled transition was thrown away.
        current_state = future_state

    return [epsilon_iterates, immediate_cost_iterates, delta_iterates, theta_iterates, avg_cost_iterates]

In [None]:
# Model parameters (unscaled)
lam, mu = 4, 1        # lam is split by alpha between the queues; mu multiplies the service terms
cores = 10            # number of cores shared by the two queues
p1, p2 = 0.4, 0.75    # fractions passed to speed_up for queue 1 / queue 2
alpha = 0.4           # share of lam going to queue 1
M = 20                # arrivals to a queue are blocked once it holds M jobs

# Normalising constant: lam plus M*mu times the speed-ups at the full core count.
# Presumably chosen so that the scaled lam and mu yield valid probabilities.
full_speed_up_total = speed_up(p1, cores) + speed_up(p2, cores)
scale = lam + M*mu*(full_speed_up_total)

# Scaled model parameters
model_pars = [lam/scale, mu/scale, cores, p1, p2, alpha, M]

In [None]:
# Q-Learning: start from an empty system with random initial weights
n_iters = 10000
initial_state = [0, 0]
initial_theta = np.random.random(10)

(
    epsilon_iterates,
    immediate_cost_iterates,
    delta_iterates,
    theta_iterates,
    avg_cost_iterates,
) = Q_learning(model_pars, initial_state, initial_theta, n_iters)

## Graph plots

In [None]:
# Plot the TD error recorded at every iteration on the current axes.
td_axes = plt.gca()
td_axes.plot(delta_iterates, color="blue")
td_axes.set_xlabel("Iterate")
td_axes.set_ylabel("TD Error")
td_axes.set_title("TD Error iterates")