# Reinforcement Learning: An Introduction

---

## Setup

In [None]:
# Variables used for kernel and git management
import ipynbname
NOTEBOOK_NAME = ipynbname.name()

KERNEL_VENV = '.venv'
KERNEL_NAME = 'rl'
KERNEL_DISPLAY_NAME = 'RL-venv'

KERNEL_SAVED = !grep -A4 "kernelspec" "{NOTEBOOK_NAME}.ipynb" | grep -Po "(?<=\"display_name\": \")[^,\"]+"
KERNEL_SAVED = KERNEL_SAVED[0]
                                                                      
IS_INTERACTIVE = ![ ! -e "{NOTEBOOK_NAME}.lock" ] && echo 1
IS_INTERACTIVE = IS_INTERACTIVE[0] if IS_INTERACTIVE else ''

In [None]:
%%bash -s "$KERNEL_VENV" "$KERNEL_NAME" "$KERNEL_DISPLAY_NAME" "$KERNEL_SAVED"
# Create a venv and a kernel for the notebook
# The positional arguments come from the Python variables named on the magic line above.
KERNEL_VENV=$1
KERNEL_NAME=$2
KERNEL_DISPLAY_NAME=$3
KERNEL_SAVED=$4

# Set to 1 as soon as any step below prints a message
OUTPUT_NOT_EMPTY=

# Create the venv unless activating it already resolves `python` to the venv's own interpreter
if [ "$(. "$KERNEL_VENV/bin/activate" 2> /dev/null && which python)" != "$(pwd)/$KERNEL_VENV/bin/python" ]; then
    python -m venv "$KERNEL_VENV" --prompt "$KERNEL_DISPLAY_NAME" --system-site-packages
    echo "Created virtual environment: '$(pwd)/$KERNEL_VENV'"
    OUTPUT_NOT_EMPTY=1
fi

# Compare the first column of the listing exactly: a plain substring grep for a short
# name such as 'rl' would also match other kernel names or any path containing it.
if ! jupyter kernelspec list | awk -v name="$KERNEL_NAME" '$1 == name { found = 1 } END { exit !found }'; then
    # Install the kernel from inside the venv (subshell keeps the activation local)
    (. "$KERNEL_VENV/bin/activate"; python -m ipykernel install --user --name="$KERNEL_NAME" --display-name="$KERNEL_DISPLAY_NAME")
    echo "## Please refresh page and select: Kernel > Change kernel > $KERNEL_DISPLAY_NAME ##"
    OUTPUT_NOT_EMPTY=1
elif [ "$KERNEL_SAVED" != "$KERNEL_DISPLAY_NAME" ]; then
    echo "## Saved kernel is '$KERNEL_SAVED'. Please refresh page and select: Kernel > Change kernel > $KERNEL_DISPLAY_NAME ##"
    OUTPUT_NOT_EMPTY=1
fi

if [ -z "$OUTPUT_NOT_EMPTY" ]; then echo "No news is good news!"; fi

In [None]:
# Ask for the next commit's details only in interactive mode; otherwise leave both values empty.
if not IS_INTERACTIVE:
    COMMIT_MESSAGE = ''
    COMMIT_AMEND = ''
else:
    COMMIT_MESSAGE = input("Next commit message: ")
    # Anything other than 'y' / 'yes' (case-insensitive) means "do not amend"
    COMMIT_AMEND = '--amend' if input("Amend next commit? [y/N] ").lower() in ('y', 'yes') else ''

In [None]:
%%bash -s "$IS_INTERACTIVE" "$NOTEBOOK_NAME" "$KERNEL_SAVED" "$KERNEL_DISPLAY_NAME" "$COMMIT_MESSAGE" "$COMMIT_AMEND"
# Commit and push to git (the push itself is currently disabled, see below)
# The positional arguments come from the Python variables named on the magic line above.
IS_INTERACTIVE=$1
NOTEBOOK_NAME=$2
KERNEL_SAVED=$3
KERNEL_DISPLAY_NAME=$4
COMMIT_MESSAGE=$5
COMMIT_AMEND=$6

# Pre-flight checks: the first failing one prints its message and leaves the flag false
ALL_CHECKS_PASSED=false
# Quoted so the test is a well-formed `-z <string>` check even when the value is empty
if [ -z "$IS_INTERACTIVE" ]; then
    echo "INFO: Skipping git section."
elif [ -z "$COMMIT_MESSAGE" ]; then
    echo "ERROR: Commit message cannot be empty. Please execute the cell above."
# A literal "$COMMIT_MESSAGE" means the magic line was passed through without substitution
elif [ "$COMMIT_MESSAGE" = "\$COMMIT_MESSAGE" ]; then
    echo "ERROR: Missing variables. Please execute all cells above."
elif [ "$KERNEL_SAVED" != "$KERNEL_DISPLAY_NAME" ]; then
    echo "ERROR: Wrong kernel saved. Please select the '$KERNEL_DISPLAY_NAME' kernel and save the notebook."
elif ! git checkout develop 1> /dev/null; then
    echo "ERROR: Couldn't checkout develop. Manual intervention required."
else
    ALL_CHECKS_PASSED=true
fi

if $ALL_CHECKS_PASSED; then
    # Keep an untouched copy of the notebook (with outputs) to restore after the commit
    mkdir -p .backups
    cp "$NOTEBOOK_NAME".ipynb .backups/
    # The .lock file is what the setup cell looks for to decide IS_INTERACTIVE
    mv "$NOTEBOOK_NAME".ipynb "$NOTEBOOK_NAME".lock
    # NOTE(review): presumably nbconvert writes the output-cleared copy back as
    # "$NOTEBOOK_NAME".ipynb, which is the file staged on the next line - confirm.
    python -m nbconvert --clear-output "$NOTEBOOK_NAME".lock
    git add "$NOTEBOOK_NAME".ipynb
    # $COMMIT_AMEND is intentionally unquoted so that an empty value adds no argument
    git commit $COMMIT_AMEND -m "$COMMIT_MESSAGE"
    # Disabled push. The -n test must stay quoted: unquoted and empty it is always true,
    # which would force-push on every commit.
#     git push $([ -n "$COMMIT_AMEND" ] && echo -f)
    # Restore the notebook with its outputs and drop the lock
    cp ".backups/$NOTEBOOK_NAME".ipynb .
    rm "$NOTEBOOK_NAME".lock
fi

In [None]:
# Pypi libraries
import numpy as np
import matplotlib.pyplot as plt

# Matplotlib configs
# %config InlineBackend.figure_formats = ['svg']
# plt.rcParams['figure.dpi'] = 300

## Chapter 2

### 2.3 - The 10-armed Testbed

In [None]:
class ArmedBandit:
    """Batch of independent, stationary multi-armed bandit problems.

    Attributes:
        arms: number of arms (actions) of each bandit.
        runs: number of independent bandit problems simulated side by side.
        runs_range: ``np.arange(runs)``, used to pick one entry per run.
        action_values: ``(arms, runs)`` true action values, drawn from N(0, 1).
        optimal_action: ``(runs,)`` index of the highest-valued arm of each run.
    """
    def __init__(self,arms,runs):
        self.arms, self.runs = arms, runs
        self.runs_range = np.arange(runs)

        # one column of true action values per run
        true_values = np.random.normal(loc=0,scale=1,size=(arms,runs))
        self.action_values = true_values
        self.optimal_action = true_values.argmax(axis=0)

    def step(self,action):
        """Return one noisy reward per run for the given per-run actions.

        ``action`` is used as a row index paired with ``runs_range``, so entry
        ``i`` selects the arm pulled in run ``i``. Each reward is drawn from a
        normal distribution centred on that arm's value with unit deviation.
        """
        chosen_values = self.action_values[action,self.runs_range]
        return np.random.normal(loc=chosen_values,scale=1)

In [None]:
class EpsilonPolicy:
    """Epsilon-greedy action selection with sample-average value estimates.

    State is kept per run so that ``runs`` independent problems are handled
    in a single vectorised call.

    Attributes:
        actions: number of available actions.
        runs: number of independent runs handled side by side.
        runs_arange: ``np.arange(runs)``, used to index one entry per run.
        epsilon: probability of picking a uniformly random action.
        action_reward: ``(actions, runs)`` sum of the rewards credited to each action.
        action_count: ``(actions, runs)`` number of times each action was credited.
        prev_action: ``(runs,)`` integer array with the latest action of each run.
    """
    def __init__(self,actions,runs,epsilon):
        self.actions = actions
        self.runs = runs
        self.runs_arange = np.arange(runs)
        self.epsilon = epsilon

        self.action_reward = np.zeros((actions,runs))
        self.action_count = np.zeros((actions,runs))
        self.prev_action = np.zeros(runs,dtype=int)

    def act(self):
        """Choose one action per run and return them.

        The returned array is ``self.prev_action`` itself (not a copy); it is
        overwritten in place by the next call.
        """
        # rand() is uniform on [0, 1): `>=` makes P(greedy) exactly 1 - epsilon, so
        # epsilon=0 never explores, even if a draw happens to be exactly 0.0
        greedy = np.random.rand(self.runs) >= self.epsilon
        explore = ~greedy

        # random actions
        self.prev_action[explore] = np.random.randint(self.actions,size=np.count_nonzero(explore))
        # greedy actions: argmax of the sample-average estimates
        # (counts of 0 are replaced by 1 to avoid dividing by zero; ties go to the lowest index)
        greedy_counts = self.action_count[:,greedy]
        estimates = self.action_reward[:,greedy] / (greedy_counts + (greedy_counts == 0))
        self.prev_action[greedy] = estimates.argmax(axis=0)
        return self.prev_action

    def update(self,reward):
        """Credit ``reward`` (one value per run) to the action last chosen in each run."""
        self.action_reward[self.prev_action,self.runs_arange] += reward
        self.action_count[self.prev_action,self.runs_arange] += 1

In [None]:
class PolicyAnalyser:
    """Records per-step statistics of several experiments and plots them together.

    Each call to ``create`` opens a new row in both history arrays; ``log``
    then fills that row one step at a time.

    Attributes:
        episode_length: number of steps stored per experiment.
        labels: one legend label per experiment, in creation order.
        reward_history: ``(experiments, episode_length)`` mean reward per step.
        optimal_history: ``(experiments, episode_length)`` fraction of runs whose
            action matched the environment's optimal action at each step.
    """
    def __init__(self,episode_length):
        self.episode_length = episode_length
        self.labels = []
        self.reward_history = np.zeros((0,episode_length))
        self.optimal_history = np.zeros((0,episode_length))

    def create(self,label,env,policy):
        """Start recording a new experiment for the given environment/policy pair."""
        self.env = env
        self.policy = policy

        self.step = 0
        self.labels.append(label)
        # append an empty row for the new experiment
        self.reward_history = np.concatenate((self.reward_history,np.zeros((1,self.episode_length))))
        self.optimal_history = np.concatenate((self.optimal_history,np.zeros((1,self.episode_length))))

    def log(self,reward):
        """Store the statistics of the current step and advance the step counter.

        Reads ``policy.prev_action``, ``env.optimal_action`` and ``env.runs`` of
        the pair registered by the latest ``create`` call.
        """
        self.reward_history[-1,self.step] = reward.mean()
        self.optimal_history[-1,self.step] = (self.policy.prev_action == self.env.optimal_action).sum()/self.env.runs
        self.step += 1

    def plot(self):
        """Draw two figures: mean reward per step and optimal-action fraction per step."""
        # histories are transposed so that each experiment becomes one line
        fig, ax = plt.subplots()
        ax.plot(self.reward_history.transpose(),label=self.labels,linewidth=0.75)
        ax.set(title="Average reward",xlabel="Step",ylabel="Mean reward over runs")
        ax.legend()

        fig, ax = plt.subplots()
        ax.plot(self.optimal_history.transpose(),label=self.labels,linewidth=1)
        ax.set(title="Optimal action",xlabel="Step",ylabel="Fraction of runs choosing the optimal action")
        ax.legend()

In [None]:
def run_bandit_episode(env,policy,datalog,episode_length):
    """Run ``episode_length`` interaction steps between ``policy`` and ``env``.

    Each step asks the policy for an action, obtains the reward from the
    environment, feeds it back to the policy and then records it in ``datalog``.
    """
    for _ in range(episode_length):
        reward = env.step(policy.act())
        policy.update(reward)
        datalog.log(reward)

# Experiment settings for the 10-armed testbed
arms = 10
episode_length = 1000
runs = 2000
epsilon = [0,0.01,0.1]

# One fresh environment and policy per epsilon, all recorded in the same analyser
datalog = PolicyAnalyser(episode_length=episode_length)
for eps in epsilon:
    env = ArmedBandit(arms=arms,runs=runs)
    policy = EpsilonPolicy(actions=arms,runs=runs,epsilon=eps)
    datalog.create(label=f'eps={eps}',env=env,policy=policy)
    run_bandit_episode(env=env,policy=policy,datalog=datalog,episode_length=episode_length)

In [None]:
# Plot the mean reward and the optimal-action fraction recorded for each epsilon
datalog.plot()

## Exercise 2.5 - Nonstationary Bandit Problem

In [None]:
# Non-stationary Armed Bandit
class NSArmedBandit(ArmedBandit):
    """Armed bandit whose action values start at 0 and take a random walk every step.

    Args:
        action_value_dev: standard deviation of the normal increment applied to
            every action value at each step.
        *args, **kwargs: forwarded unchanged to ``ArmedBandit.__init__``.
    """
    def __init__(self,action_value_dev,*args,**kwargs):
        super().__init__(*args,**kwargs)
        self.action_value_dev = action_value_dev
        # all arms start with the same value, replacing the values drawn by the parent
        self.action_values = np.zeros((self.arms,self.runs))
        # keep optimal_action in sync with the zeroed values rather than the discarded draw
        self.optimal_action = self.action_values.argmax(axis=0)

    def step(self,*args,**kwargs):
        """Random-walk the action values, refresh ``optimal_action``, then sample rewards."""
        # each value moves by a normal increment centred on its current value
        self.action_values = np.random.normal(self.action_values,self.action_value_dev)
        self.optimal_action = self.action_values.argmax(axis=0)
        return super().step(*args,**kwargs)

In [None]:
# Standard deviation of the per-step random walk of the action values
random_walk_std_dev = 0.01

# Same sweep over epsilon as before, this time on the nonstationary bandit
datalog = PolicyAnalyser(episode_length=episode_length)
for eps in epsilon:
    env = NSArmedBandit(random_walk_std_dev,arms=arms,runs=runs)
    policy = EpsilonPolicy(actions=arms,runs=runs,epsilon=eps)
    datalog.create(label=f'eps={eps}',env=env,policy=policy)
    run_bandit_episode(env=env,policy=policy,datalog=datalog,episode_length=episode_length)

In [None]:
# Plot the mean reward and the optimal-action fraction recorded on the nonstationary bandit
datalog.plot()