In [10]:
import gymnasium as gym
import numpy as np

# Shared fallback generator: callers that pass no `rng` reuse this one instead
# of constructing (and re-seeding from OS entropy) a new Generator per action.
_DEFAULT_RNG = np.random.default_rng()


def epsilon_greedy(env,state,epsilon,q1,q2,rng=None):
    """Choose an action for `state` with an epsilon-greedy rule over q1 + q2.

    Parameters
    ----------
    env : object exposing `action_space.n` and `action_space.sample()`.
    state : key identifying the current state in both tables.
    epsilon : probability threshold for exploring; a uniform draw in [0, 1)
        below this value triggers a random action.
    q1, q2 : action-value tables indexable as `q[(state, action)]`
        (2-D numpy arrays and tuple-keyed dicts both work).
    rng : optional object with a `random()` method returning a float in
        [0, 1), e.g. a seeded `np.random.default_rng(seed)`. When omitted, a
        shared module-level generator is used.

    Returns
    -------
    Either the result of `env.action_space.sample()` (exploration) or the
    index returned by `np.argmax` over the summed values (exploitation; ties
    resolve to the lowest action index).

    Notes
    -----
    The exploratory action itself is drawn by `env.action_space.sample()`,
    which uses the action space's own random state, not `rng`.
    """
    if rng is None:
        rng = _DEFAULT_RNG

    if rng.random() < epsilon:
        return env.action_space.sample()

    # Act greedily on the combined estimate of both tables.
    values = np.array([q1[(state,a)] + q2[(state,a)] for a in range(env.action_space.n)])
    return np.argmax(values)
    
def double_q_learning(env,q1,q2,episodes,time_limit,gamma,epsilon,alpha,epsilon_decay,
                      epsilon_step=0.001,min_epsilon=0.0):
    """Train two tabular action-value estimates with Double Q-learning.

    Parameters
    ----------
    env : environment exposing `reset()`, `step(action)` returning
        `(next_state, reward, terminated, truncated, info)`, and
        `action_space.n`.
    q1, q2 : action-value tables indexable as `q[(state, action)]`.
        Both are updated IN PLACE and also returned.
    episodes : number of episodes to run.
    time_limit : maximum number of steps taken in one episode.
    gamma : discount factor applied to the bootstrapped next-state value.
    epsilon : initial exploration rate passed to `epsilon_greedy`.
    alpha : learning rate (step size) of each update.
    epsilon_decay : when truthy, epsilon is reduced after every episode.
    epsilon_step : amount subtracted from epsilon per episode (default 0.001,
        the value that was previously hard-coded).
    min_epsilon : lower bound for the decayed epsilon (default 0.0).

    Returns
    -------
    (q1, q2) : the same two table objects that were passed in.
    """
    # One generator for the whole run; building a new one on every step only
    # adds overhead.
    rng = np.random.default_rng()
    n_actions = env.action_space.n

    for _episode in range(episodes):
        state , _info = env.reset()

        terminated = False
        truncated = False
        time_step = 0

        while not terminated and not truncated:
            action = epsilon_greedy(env,state,epsilon,q1,q2)
            next_state , reward , terminated , truncated , _info = env.step(action)

            # A fair coin picks the table to update. That table selects the
            # greedy next action; the OTHER table evaluates it, which is what
            # decouples selection from evaluation in Double Q-learning.
            if rng.random() < 0.5:
                updated , evaluator = q1 , q2
            else:
                updated , evaluator = q2 , q1

            if terminated:
                # No future return exists after a terminal transition, so do
                # not bootstrap from next_state.
                target = reward
            else:
                next_action = np.argmax([updated[(next_state,a)] for a in range(n_actions)])
                target = reward + gamma * evaluator[(next_state,next_action)]

            updated[(state,action)] += alpha * (target - updated[(state,action)])

            state = next_state

            time_step += 1
            # `>=` so that exactly `time_limit` steps are allowed; the former
            # `>` comparison let one extra step through.
            if time_step >= time_limit:
                truncated = True

        if epsilon_decay and epsilon > min_epsilon:
            # Clamp so that epsilon never drifts below the floor.
            epsilon = max(min_epsilon, epsilon - epsilon_step)

    return q1,q2


def walk(q_,slippery,max_steps=60):
    """Render one greedy rollout of an action-value table on FrozenLake 4x4.

    Parameters
    ----------
    q_ : action-value table indexable as `q_[(state, action)]`.
    slippery : forwarded to `gym.make(..., is_slippery=slippery)`.
    max_steps : upper bound on the number of environment steps (default 60,
        the value that was previously hard-coded).

    The environment is created with `render_mode='human'`, so each step is
    drawn on screen. The rollout stops when the environment reports
    `terminated` or `truncated`, or after `max_steps` steps. Returns None.
    """
    env = gym.make("FrozenLake-v1" , map_name='4x4' , is_slippery=slippery , render_mode='human')
    try:
        state , _info = env.reset()
        n_actions = env.action_space.n

        for _step in range(max_steps):
            # The builtin max() returns the first maximal item, so ties go to
            # the lowest action index.
            action = max(range(n_actions), key=lambda a: q_[(state,a)])
            state , _reward , terminated , truncated , _info = env.step(action)
            if terminated or truncated:
                break
    finally:
        # Always release the render window, even if stepping raised.
        env.close()


# Reference hyperparameters (kept as comments only; the experiments go through run()).
#
# Earlier configuration:
#   gamma = 0.9
#   alpha = 0.85
#   epsilon = 0.8
#   episodes = 1000
#   time_limit = 100
#
# Manual training without run():
# alpha = 0.5
# gamma = 0.9
# epsilon = 1
# episodes = 10000
# time_limit = 100

# env = gym.make("FrozenLake-v1" , map_name='4x4' , is_slippery=False , render_mode=None)

# q1 = np.zeros((env.observation_space.n,env.action_space.n),dtype=float)
# q2 = np.zeros((env.observation_space.n,env.action_space.n),dtype=float)

# q1,q2 = double_q_learning(env,q1,q2,episodes,time_limit,gamma,epsilon,alpha,epsilon_decay=True)

In [20]:
def run(time_limit,episodes,epsilon,gamma,alpha,slippery):
    """Train Double Q-learning on FrozenLake 4x4, then render a greedy rollout.

    Parameters
    ----------
    time_limit : maximum steps per training episode (forwarded to
        `double_q_learning`).
    episodes : number of training episodes.
    epsilon : initial exploration rate; training is called with
        `epsilon_decay=True`.
    gamma : discount factor.
    alpha : learning rate.
    slippery : forwarded to `gym.make(..., is_slippery=slippery)` for both
        the training environment and the rendered rollout.

    Returns
    -------
    (q1, q2) : the two trained tables, each a float array of shape
        (env.observation_space.n, env.action_space.n).
    """
    env = gym.make("FrozenLake-v1" , map_name='4x4' , is_slippery=slippery , render_mode=None)
    try:
        table_shape = (env.observation_space.n,env.action_space.n)
        q1 = np.zeros(table_shape,dtype=float)
        q2 = np.zeros(table_shape,dtype=float)
        q1_,q2_ = double_q_learning(env,q1,q2,episodes,time_limit,gamma,epsilon,alpha,epsilon_decay=True)
    finally:
        # Release the training environment even if training raised.
        env.close()

    # Demonstrate the policy on the combined estimate, the same quantity the
    # agent acted on during training (epsilon_greedy uses q1 + q2).
    walk(q1_ + q2_,slippery)
    return q1_,q2_

Report

Double Q_Learning non Slippery mode

In [38]:
# Non-slippery lake, small learning rate (alpha = 0.1).
q1, q2 = run(
    time_limit=100,
    episodes=1000,
    epsilon=1,
    gamma=0.9,
    alpha=0.1,
    slippery=False,
)
q1

array([[0.14977343, 0.5903531 , 0.08999043, 0.23358923],
       [0.22061783, 0.        , 0.        , 0.00156552],
       [0.00125805, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.1481859 , 0.65596767, 0.        , 0.18571382],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.32967254, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.19099169, 0.        , 0.72898372, 0.18575165],
       [0.23080193, 0.489798  , 0.80998831, 0.        ],
       [0.28132771, 0.89999987, 0.        , 0.00728833],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.04890997, 0.73335008, 0.07215706],
       [0.22424271, 0.61183752, 0.99999995, 0.50962905],
       [0.        , 0.        , 0.        , 0.        ]])

In [39]:
# Non-slippery lake, full-size updates (alpha = 1).
q1, q2 = run(
    time_limit=100,
    episodes=1000,
    epsilon=1,
    gamma=0.9,
    alpha=1,
    slippery=False,
)
q1

array([[0.531441, 0.59049 , 0.59049 , 0.531441],
       [0.531441, 0.      , 0.6561  , 0.59049 ],
       [0.59049 , 0.729   , 0.59049 , 0.6561  ],
       [0.6561  , 0.      , 0.59049 , 0.59049 ],
       [0.59049 , 0.6561  , 0.      , 0.531441],
       [0.      , 0.      , 0.      , 0.      ],
       [0.      , 0.81    , 0.      , 0.6561  ],
       [0.      , 0.      , 0.      , 0.      ],
       [0.6561  , 0.      , 0.729   , 0.59049 ],
       [0.6561  , 0.81    , 0.81    , 0.      ],
       [0.729   , 0.9     , 0.      , 0.729   ],
       [0.      , 0.      , 0.      , 0.      ],
       [0.      , 0.      , 0.      , 0.      ],
       [0.      , 0.81    , 0.9     , 0.729   ],
       [0.81    , 0.9     , 1.      , 0.81    ],
       [0.      , 0.      , 0.      , 0.      ]])

We can see that, as in Q-learning, alpha controls the size of the steps by which the values are updated. For this small environment we can choose different values of alpha and still converge to the optimal values, but for more complex problems alpha must be chosen carefully.

In [40]:
# Non-slippery lake, heavy discounting (gamma = 0.1).
q1, q2 = run(
    time_limit=100,
    episodes=1000,
    epsilon=1,
    gamma=0.1,
    alpha=0.1,
    slippery=False,
)
q1

array([[6.66997937e-07, 9.99980994e-06, 3.68334330e-08, 7.32446355e-07],
       [5.64498733e-07, 0.00000000e+00, 1.02803801e-15, 3.24842578e-12],
       [1.29241033e-09, 0.00000000e+00, 0.00000000e+00, 3.79349821e-16],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [6.43062457e-06, 9.99991173e-05, 0.00000000e+00, 7.32669687e-07],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.37654810e-03, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [6.33453649e-05, 0.00000000e+00, 9.99999472e-04, 7.59499538e-06],
       [6.30484273e-05, 9.99999445e-03, 4.41234580e-03, 0.00000000e+00],
       [2.19488171e-04, 9.50647168e-02, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 6.66751371e-03, 9.99999990e

In [41]:
# Non-slippery lake, moderate discounting (gamma = 0.5).
q1, q2 = run(
    time_limit=100,
    episodes=1000,
    epsilon=1,
    gamma=0.5,
    alpha=0.1,
    slippery=False,
)
q1

array([[1.41756759e-02, 3.12499592e-02, 1.20478549e-02, 1.40674184e-02],
       [4.15127687e-03, 0.00000000e+00, 2.92242386e-02, 8.19721325e-04],
       [1.10660270e-03, 9.76626632e-02, 2.46285262e-05, 6.40789492e-03],
       [6.15697003e-05, 0.00000000e+00, 0.00000000e+00, 1.25366999e-06],
       [2.64035430e-02, 6.24999737e-02, 0.00000000e+00, 1.31795628e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.37877676e-01, 0.00000000e+00, 1.09287059e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.49134505e-02, 0.00000000e+00, 1.24999994e-01, 2.08210485e-02],
       [4.59833171e-02, 1.82771232e-01, 2.49999998e-01, 0.00000000e+00],
       [9.68083415e-02, 4.99999999e-01, 0.00000000e+00, 9.38462914e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 7.79641470e-02, 4.70249241e

In [42]:
# Non-slippery lake, light discounting (gamma = 0.9).
q1, q2 = run(
    time_limit=100,
    episodes=1000,
    epsilon=1,
    gamma=0.9,
    alpha=0.1,
    slippery=False,
)
q1

array([[3.30242448e-01, 5.90469794e-01, 1.75667952e-01, 3.30759287e-01],
       [3.68203878e-01, 0.00000000e+00, 6.02828819e-04, 2.25388019e-02],
       [4.29856536e-02, 0.00000000e+00, 0.00000000e+00, 6.00329148e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.80278010e-01, 6.56085992e-01, 0.00000000e+00, 3.58030060e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.71047547e-01, 0.00000000e+00, 6.03551938e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.58109299e-01, 0.00000000e+00, 7.28998417e-01, 4.51306537e-01],
       [3.88228796e-01, 4.30919087e-01, 8.09999164e-01, 0.00000000e+00],
       [4.27709101e-01, 8.99999971e-01, 0.00000000e+00, 9.41940541e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.37325062e-01, 8.41358273e

We can see that with a higher gamma the agent looks further into the future. Gamma also controls the magnitude of the values: with higher gammas, large rewards from the future play a role in the values of the current states.

Double Q_Learning Slippery mode

In [43]:
# Slippery lake, 10,000 training episodes.
q1, q2 = run(
    time_limit=100,
    episodes=10000,
    epsilon=1,
    gamma=0.9,
    alpha=0.1,
    slippery=True,
)
q1

array([[2.24968650e-02, 0.00000000e+00, 6.76688091e-04, 4.25043734e-04],
       [1.01074956e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.79134708e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.79223846e-02, 7.89522318e-04, 1.80289346e-03, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.22721143e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.41071160e-04, 3.46017008e-04, 0.00000000e+00, 4.52745120e-02],
       [3.84431441e-02, 8.57365240e-03, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.16144455e-01, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 6.56100000e-03, 2.33248907e

In [44]:
# Slippery lake, 1,000 training episodes.
q1, q2 = run(
    time_limit=100,
    episodes=1000,
    epsilon=1,
    gamma=0.9,
    alpha=0.1,
    slippery=True,
)
q1

array([[1.31551656e-03, 3.38161458e-03, 1.81072654e-03, 1.44417524e-03],
       [1.44657510e-03, 1.01804753e-03, 8.09038624e-03, 2.98154493e-03],
       [4.77692886e-03, 4.40401563e-03, 2.08560534e-02, 1.97508780e-03],
       [1.59177928e-03, 2.02575598e-03, 2.16124646e-03, 1.29855266e-02],
       [1.01882751e-04, 7.69343775e-04, 2.35733415e-03, 9.96002285e-05],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.97736994e-03, 0.00000000e+00, 2.28653880e-02, 6.15748384e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.28324265e-04, 1.20917480e-04, 3.39563745e-03, 1.07347299e-02],
       [3.31862949e-05, 0.00000000e+00, 0.00000000e+00, 1.52645838e-02],
       [6.37048235e-02, 0.00000000e+00, 1.84866560e-01, 1.45615927e-03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 3.64348876e-02, 8.53409892e

In [45]:
# Slippery lake, 50,000 training episodes.
q1, q2 = run(
    time_limit=100,
    episodes=50000,
    epsilon=1,
    gamma=0.9,
    alpha=0.1,
    slippery=True,
)
q1

array([[2.75185956e-02, 1.40973887e-03, 1.61451559e-03, 1.64373868e-04],
       [1.36179882e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.95341269e-02, 4.59504177e-04, 3.14533867e-03, 4.52772607e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.11919114e-03, 7.31410627e-04, 1.71017906e-03, 5.50594128e-02],
       [9.27320010e-02, 0.00000000e+00, 1.79555744e-03, 0.00000000e+00],
       [5.31441000e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.20259940e-03, 0.00000000e+00, 3.05545738e

As with every other algorithm, introducing uncertainty into the environment makes the problem more complex. In the results we can see that Double Q-learning can find the optimal Q-values, but it needs more time (episodes) and the hyperparameters must be selected more carefully.

Double Q_learning vs Q_learning

Performance: both Q-learning and Double Q-learning perform well and converge to the optimal policy.
Convergence speed: Q-learning is fast and stable, while Double Q-learning is slightly slower because it has to update two separate value functions.
Maximization bias: the main reason for using Double Q-learning is to control maximization bias, and it fixes that problem.

In slippery mode, however, the performance of these algorithms changes.
Q-learning's performance degrades and becomes slower and unstable, while Double Q-learning shows improved speed and performance because it reduces overestimation bias.