In [36]:
import os
import sys
from itertools import product

import gym
import numpy as np
import pandas as pd
import seaborn as sns
from numpy.random import choice

from hiive.mdptoolbox import mdp
from hiive.mdptoolbox import util
from hiive.mdptoolbox.example import forest
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
# Seed NumPy's legacy global RNG so the stochastic rollouts that use
# `numpy.random.choice` (imported above) are repeatable on a fresh kernel.
np.random.seed(44)

In [7]:
def test_policy(P, R, policy, test_count=1000, gamma=0.9):
    num_state = P.shape[-1]
    total_episode = num_state * test_count
    # start in each state
    total_reward = 0
    for state in range(num_state):
        state_reward = 0
        for state_episode in range(test_count):
            episode_reward = 0
            disc_rate = 1
            while True:
                # take step
                action = policy[state]
                # get next step using P
                probs = P[action][state]
                candidates = list(range(len(P[action][state])))
                next_state =  choice(candidates, 1, p=probs)[0]
                # get the reward
                reward = R[state][action] * disc_rate
                episode_reward += reward
                # when go back to 0 ended
                disc_rate *= gamma
                if next_state == 0:
                    break
            state_reward += episode_reward
        total_reward += state_reward
    return total_reward / total_episode

In [8]:
def grid_search_VI(P, R, discount=0.9, epsilon=[1e-9]):
    """Run value iteration once per stopping threshold and tabulate the results.

    Parameters
    ----------
    P, R :
        Transition and reward arrays, passed unchanged to ``ValueIteration``
        and ``test_policy``.
    discount : float, optional
        Discount factor used both to solve the MDP and to score the policy.
    epsilon : iterable of float, optional
        Stopping thresholds to try; one result row is produced per value.
        The default list is only iterated, never mutated.

    Returns
    -------
    pandas.DataFrame
        One row per epsilon with columns "Epsilon", "Policy", "Iteration",
        "Time", "Reward" and "Value Function".
    """
    columns = ["Epsilon", "Policy", "Iteration",
               "Time", "Reward", "Value Function"]
    rows = []
    for eps in epsilon:
        vi = ValueIteration(P, R, gamma=discount, epsilon=eps, max_iter=int(1e15))
        vi.run()
        # Score with the same discount the policy was optimised for, rather
        # than relying on test_policy's own default gamma.
        reward = test_policy(P, R, vi.policy, gamma=discount)
        rows.append({
            "Epsilon": float(eps),
            "Policy": vi.policy,
            "Iteration": vi.iter,
            "Time": vi.time,
            "Reward": reward,
            "Value Function": vi.V,
        })
    # Build the frame once instead of enlarging it row by row.
    return pd.DataFrame(rows, columns=columns)


Value Iteration

In [190]:
# Build the transition (P) and reward (R) arrays for a 20-state forest-management MDP.
# r1, r2 and p are forwarded to hiive's `forest` example; see its documentation for their meaning.
P,R = forest(20, r1 = 10, r2 = 4, p = 0.1)

In [None]:
# Sweep the value-iteration stopping threshold over seven values (1e-3 down to 1e-9);
# the returned frame holds one row per epsilon.
fm_20_vi = grid_search_VI(P, R, discount = 0.9, epsilon = [1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9])

In [74]:
fm_20_vi

Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward,Value Function
0,0.001,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",55,0.005896,2.843259,"(4.460720290173723, 5.013211594807497, 5.01321..."
1,0.0001,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",66,0.003501,2.842526,"(4.4706146525683454, 5.023100336527209, 5.0231..."
2,1e-05,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",76,0.00358,2.920512,"(4.473560831234312, 5.026046957818786, 5.02604..."
3,1e-06,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",87,0.003875,2.884589,"(4.474643139169861, 5.027129333047953, 5.02712..."
4,1e-07,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",98,0.004393,2.885044,"(4.47498279201032, 5.027468979261533, 5.027468..."
5,1e-08,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",109,0.005316,2.913634,"(4.475089377376456, 5.027575565280265, 5.02757..."
6,1e-09,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",120,0.006105,2.866948,"(4.475122825121185, 5.027609012960728, 5.02760..."


In [191]:
fm_20_vi.Policy.nunique()

1

In [192]:
fm_20_vi.Policy.unique()

array([(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)],
      dtype=object)

Policy Iteration

In [193]:
# Solve the forest MDP (P, R) with policy iteration, then score the resulting
# policy with the same Monte Carlo evaluator used for value iteration.
pi = PolicyIteration(P, R, gamma=0.9, max_iter=1e6)
pi.run()
pi_pol = pi.policy
# test_policy's default gamma is 0.9, which matches the solver's discount here.
pi_reward = test_policy(P, R, pi_pol)
pi_iter = pi.iter
pi_time = pi.time
# Display (iterations, solver time, mean simulated reward).
pi_iter, pi_time, pi_reward

(14, 0.0638279914855957, 2.948054429531793)

In [204]:
list(pi_pol)

[0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [206]:
[i for i in fm_20_vi.Policy.unique()]

[(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)]

Q-Learning

In [79]:
def grid_search_Q(P, R, discount=0.9, alpha_dec=[.99], alpha_min=[0.001],
            epsilon=[1.0], epsilon_decay=[0.99], n_iter=[1000000]):
    """Run Q-learning over a hyper-parameter grid and tabulate the results.

    Every combination of ``n_iter``, ``epsilon``, ``epsilon_decay``,
    ``alpha_dec`` and ``alpha_min`` is trained once. Combinations are visited
    in that nesting order, with ``alpha_min`` varying fastest. After each run
    the run number and the simulated reward are printed.

    Parameters
    ----------
    P, R :
        Transition and reward arrays, passed unchanged to ``QLearning`` and
        ``test_policy``.
    discount : float, optional
        Discount factor used both for learning and for scoring the policy.
    alpha_dec, alpha_min, epsilon, epsilon_decay, n_iter : iterable, optional
        Candidate values for the corresponding ``QLearning`` arguments.
        The default lists are only iterated, never mutated.

    Returns
    -------
    pandas.DataFrame
        One row per combination with columns "Iterations", "Alpha Decay",
        "Alpha Min", "Epsilon", "Epsilon Decay", "Reward", "Time", "Policy",
        "Value Function", "Training Rewards" and "Training Errors".
    """
    columns = ["Iterations", "Alpha Decay", "Alpha Min",
               "Epsilon", "Epsilon Decay", "Reward",
               "Time", "Policy", "Value Function",
               "Training Rewards", "Training Errors"]
    rows = []

    # product() yields combinations in the same order as the equivalent
    # nested for-loops (last argument varies fastest).
    grid = product(n_iter, epsilon, epsilon_decay, alpha_dec, alpha_min)
    for count, (i, eps, eps_dec, a_dec, a_min) in enumerate(grid, start=1):
        q = QLearning(P, R, discount, alpha_decay=a_dec,
                      alpha_min=a_min, epsilon=eps,
                      epsilon_decay=eps_dec, n_iter=i)
        q.run()
        # Score with the same discount the policy was learned with, rather
        # than relying on test_policy's own default gamma.
        reward = test_policy(P, R, q.policy, gamma=discount)
        print("{}: {}".format(count, reward))

        # Per-iteration training traces recorded by the learner.
        stats = q.run_stats
        rows.append({
            "Iterations": i,
            "Alpha Decay": a_dec,
            "Alpha Min": a_min,
            "Epsilon": eps,
            "Epsilon Decay": eps_dec,
            "Reward": reward,
            "Time": q.time,
            "Policy": q.policy,
            "Value Function": q.V,
            "Training Rewards": [s['Reward'] for s in stats],
            "Training Errors": [s['Error'] for s in stats],
        })
    # Build the frame once instead of enlarging it row by row.
    return pd.DataFrame(rows, columns=columns)

In [80]:
# Hyper-parameter grid for Q-learning: four exploration rates crossed with four
# training lengths (the decay and minimum-alpha lists hold a single value each),
# i.e. 4 x 4 = 16 runs in total.
epsilons = [0.5, 0.8, 0.9, 0.99]
epsilon_decays = [.999]
alpha_decs = [0.999]
alpha_mins =[0.001]
# The iteration counts are written in scientific notation (floats), so cast them to int.
iters = [int(e) for e  in [1e5, 1e6, 1e7, 1e8]]
# Each run prints "<run number>: <simulated reward>" as it finishes.
q_df = grid_search_Q(P, R, discount=0.9, alpha_dec=alpha_decs, alpha_min=alpha_mins,
            epsilon=epsilons, epsilon_decay=epsilon_decays, n_iter=iters)

1: 3.328811073128214
2: 3.093887911696705
3: 3.122111119591973
4: 3.2781773512201275
5: 3.1733645875441137
6: 3.34976253076596
7: 3.208006197590274
8: 3.2962114704142356
9: 3.158835124792923
10: 3.298125995132963
11: 3.183542218943943
12: 3.206992649140325
13: 2.9318491170394436
14: 2.876297938751445
15: 2.9239452232667005
16: 2.928126903203329


0.0     5399
1.0     4526
10.0      70
4.0        5
dtype: int64

In [82]:
def running_mean(x, N):
    """Return the moving average of ``x`` over windows of ``N`` consecutive values.

    A zero is prepended before taking the cumulative sum, so the difference of
    two prefix sums that are ``N`` apart is the sum of one full window. The
    result has ``len(x) - N + 1`` entries.
    """
    prefix_sums = np.insert(x, 0, 0).cumsum()
    window_sums = prefix_sums[N:] - prefix_sums[:-N]
    return window_sums / float(N)

In [212]:
q_df['Value Function'][0]

(4.470695063983851,
 5.026038757264145,
 3.958253374790828,
 4.837389394921782,
 0.11310113153958719,
 0.8642783300371065,
 1.201743939141563,
 0.18230613994140643,
 0.530159185135527,
 0.05370833991776588,
 0.4501664242116971,
 0.26308271941216255,
 0.35535420196685047,
 0.9615404641997741,
 0.5284852811194357,
 0.03554313387849705,
 0.39050855011273655,
 0.2538811218077455,
 0.43790070365226513,
 9.304268150025019)