# Exercise 1: Q-learning Widget Sales

In [None]:
import numpy as np
import matplotlib.pyplot as plt

Below we define a simple `WidgetShop` class that encodes the true shop dynamics, the daily demand distribution, and the reward function.

In [None]:
class WidgetShop:
    """
    Model of the widget shop: state/action/demand spaces, the demand
    distribution, the inventory dynamics, and the reward function.

    Attributes:
        X: state space (inventory levels)
        U: action space
        D: demand space
        P: probability of each demand value in D
    """

    def __init__(self):
        self.X = np.array([0, 1, 2, 3, 4, 5]) # state space
        self.U = np.array([0, 2, 4]) # action space
        self.D = np.array([0, 1, 2, 3, 4]) # demand space
        self.P = np.array([0.1, 0.3, 0.3, 0.2, 0.1]) # daily demand distribution

    def action_idx(self, u: int) -> int:
        """
        Maps an action in self.U to its index in self.U.

        The index is looked up in self.U rather than derived from the
        action's value, so it stays correct if the action space changes.

        Raises:
            ValueError: if u is not an element of self.U
        """
        matches = np.flatnonzero(self.U == u)
        if matches.size == 0:
            raise ValueError(
                'Action {} is not in the action space {}.'.format(u, self.U))
        return int(matches[0])

    def step(self, x: int, u: int, d: int) -> int:
        """
        Compute the next state given the current state, action, and demand.

        The result x + u - d is clipped to the smallest and largest values
        of the state space self.X.
        """
        return np.clip(x + u - d, self.X.min(), self.X.max())

    def reward(self, x: int, u: int, d: int) -> float:
        """
        Compute the reward given the current state, action, and demand.

        The reward is the revenue from the min(x + u, d) units sold, minus a
        fixed rent, a storage cost proportional to x, and an ordering cost
        of sqrt(u).
        """
        price = 1.2
        cost_rent = 1.
        cost_storage = 0.05*x
        cost_order = np.sqrt(u)
        # Sales cannot exceed what is available after ordering (x + u)
        r = price*np.minimum(x + u, d) - cost_rent - cost_storage - cost_order
        return r

    def simulate(self,
                 policy: callable,
                 T: int,
                 x0: int = 5,
                 seed: int = 0) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Simulate widget sales for a given policy.

        Parameters:
            policy: policy to simulate, callable as policy(x)
            T: number of time steps to simulate
            x0: initial inventory
            seed: seed of the demand random number generator; the default
                reproduces the same demand sequence on every call

        Returns:
            x: states, array of length T + 1 with x[0] = x0
            u: actions, array of length T
            r: rewards, array of length T

        All three arrays are float-valued, so policy(x) is called with a
        float state.
        """
        x = np.zeros(T + 1)  # states
        u = np.zeros(T)      # actions
        r = np.zeros(T)      # rewards
        x[0] = x0            # initial state
        rng = np.random.default_rng(seed) # for reproducibility
        for t in range(T):
            # Sample demand
            d = rng.choice(self.D, p=self.P)

            # Record action, reward, and next state
            u[t] = policy(x[t])
            r[t] = self.reward(x[t], u[t], d)
            x[t+1] = self.step(x[t], u[t], d)

        return x, u, r

#### Exercise 1.1: Model-free Q-learning
In this first exercise, you will use a dataset of past widget sales to learn tabulated Q-values without the use of a model of the environment. Specifically, complete the code below so that it updates the array `Q` using Q-learning.

In [None]:
# Model of the shop; also used to generate the logged transitions below
shop = WidgetShop()

# Generate historical data with a uniformly random policy
# After simulate(), log['x'] has T + 1 states while log['u'] and log['r'] have
# T entries, so transition t is (log['x'][t], log['u'][t], log['r'][t], log['x'][t+1]).
# simulate() allocates these arrays with np.zeros, so they are float-valued;
# cast states to int before using them as row indices into Q.
log = {}
T = 3 * 365
rng = np.random.default_rng(0) # for reproducibility
# NOTE(review): WidgetShop.simulate() also creates its generator with seed 0,
# so the action draws here and the demand draws there come from identically
# seeded generators -- confirm this is intended.
random_policy = lambda x: rng.choice(shop.U)
log['x'], log['u'], log['r'] = shop.simulate(random_policy, T)

# Q-learning
γ = 0.95                   # discount factor
α = 1e-2                   # learning rate
num_epochs = 5 * int(1/α)  # number of epochs

# Q is the table being learned (rows: states, columns: actions).
# Q_epoch[k] stores a snapshot of Q after epoch k; Q_epoch[0] is never written
# by the loop below and therefore keeps the all-zero initial table.
Q = np.zeros((shop.X.size, shop.U.size))
Q_epoch = np.zeros((num_epochs + 1, shop.X.size, shop.U.size))

for k in range(1, num_epochs + 1):
    # Shuffle transition tuple indices
    shuffled_indices = rng.permutation(T)

    ##### YOUR CODE STARTS HERE #####
    # Hint: You can use shop.action_idx() to convert the action to a tabular index
    # Do a Q-update for each transition tuple

    ###### YOUR CODE END HERE ######

    # Record Q-values for this epoch
    # (slice assignment copies the current values, so later updates to Q
    # do not alter earlier snapshots)
    Q_epoch[k] = Q

#### Exercise 1.2: Value Iteration
We will now leverage the model of the widget shop supply and demand to compute the Q-value using value iteration. Finish the code below to update the array `Q_vi` using value iteration. Then, run the code to produce plots comparing the Q-values from value iteration vs the values computed above using the model-free data approach.

In [None]:
# Value iteration on the tabular Q-function, stopped once the largest
# entry-wise change between successive iterates drops below eps.
converged = False
eps = 1e-4       # convergence tolerance on max |Q_vi - Q_vi_prev|
max_iters = 500  # upper bound on the number of sweeps
Q_vi = np.zeros((shop.X.size, shop.U.size))
# Start from +inf so the convergence test cannot pass on the first sweep
Q_vi_prev = np.full(Q_vi.shape, np.inf)

for k in range(max_iters):
    ##### YOUR CODE STARTS HERE #####

    ###### YOUR CODE END HERE ######

    # NOTE: while the section above is empty, Q_vi never changes, so this test
    # passes on the second sweep (k = 1) with Q_vi still all zeros.
    if np.max(np.abs(Q_vi - Q_vi_prev)) < eps:
        converged = True
        # k is the zero-based index of the sweep on which the test passed
        print('Value iteration converged after {} iterations.'.format(k))
        break
    else:
        # Copy values in place so Q_vi_prev does not alias Q_vi
        np.copyto(Q_vi_prev, Q_vi)

if not converged:
    raise RuntimeError('Value iteration did not converge!')

# Plot Q-values for each epoch
# One panel per state: solid lines are the Q-learning values recorded after
# each epoch, dashed lines are the value-iteration values for the same action.
fig, axes = plt.subplots(2, shop.X.size//2, figsize=(12, 6),
                         sharex=True, sharey=True, dpi=150)
fig.subplots_adjust(hspace=0.2)
for i, ax in enumerate(axes.ravel()):
    for j in range(shop.U.size):
        plot = ax.plot(Q_epoch[:, i, j], label='$u = {}$'.format(shop.U[j]))
        # Reuse the curve's colour so each dashed reference matches its curve
        ax.axhline(Q_vi[i, j], linestyle='--', color=plot[0].get_color())
    # Legend and title are per-panel, so set them once after all lines exist
    ax.legend(loc='lower right')
    ax.set_title(r'$x = {}$'.format(shop.X[i]))
# Only the bottom row needs x-labels and the left column y-labels,
# because the axes are shared
for ax in axes[-1, :]:
    ax.set_xlabel('epoch')
for ax in axes[:, 0]:
    ax.set_ylabel('$Q(x,u)$')
plt.show()

#### Exercise 1.3: Simulate Q-learning and Value Iteration Optimal Policies
Write code below to compute:
1. `π_ql(x)` and `π_vi(x)`: lambda functions, using the corresponding Q-value arrays computed above.
2. `r_ql` and `r_vi`: arrays of rewards from simulating the shop with the above policies for 5 years.
3. `profit_ql` and `profit_vi`: arrays of cumulative profits for each day over 5 years (can use `np.cumsum`).

What do you notice about the cumulative profits from the two methods? Why might there be a difference between the undiscounted and discounted cumulative profits? Try playing around with the discount factor to see how this affects the results.

In [None]:
# Simulate optimal policies for 5 years
T = 5 * 365

##### YOUR CODE STARTS HERE #####
# The code after this section expects the following names to be defined here:
#   π_ql, π_vi         -- policies callable on a single state
#   r_ql, r_vi         -- reward arrays of length T
#   profit_ql, profit_vi -- cumulative profit arrays
# The policies are called below with the integer states in shop.X; if they are
# also passed to shop.simulate(), they receive float-valued states there.

###### YOUR CODE END HERE ######

# Show the action each policy selects in every state of the state space
print('Optimal policy (Q-learning):     ', [int(π_ql(x)) for x in shop.X])
print('Optimal policy (value iteration):', [int(π_vi(x)) for x in shop.X])

# Plot results
# Undiscounted cumulative profit of both policies on one set of axes
fig, ax = plt.subplots()
ax.plot(profit_ql, label=r'$Q$-learning')
ax.plot(profit_vi, label=r'value iteration')
ax.set_xlabel(r'day $t$')
ax.set_ylabel(r'cumulative profit $\sum_{k=0}^t r_k$')
ax.legend(loc='lower right')
plt.show()

# Repeat with discounted profits
# The weights γ^t are shared by both reward sequences, so build them once
discount_weights = γ**np.arange(T)
profit_ql_discounted = np.cumsum(discount_weights * r_ql)
profit_vi_discounted = np.cumsum(discount_weights * r_vi)
fig, ax = plt.subplots()
ax.plot(profit_ql_discounted, label=r'$Q$-learning')
ax.plot(profit_vi_discounted, label=r'value iteration')
ax.set_xlabel(r'day $t$')
ax.set_ylabel(r'cumulative discounted profit $\sum_{k=0}^t \gamma^k r_k$')
ax.legend(loc='lower right')
plt.show()