In [10]:
import gymnasium as gym
import numpy as np
import random
import wandb
from matplotlib import pyplot as plt
SEED = 150
np.random.seed(SEED)

In [11]:
wandb.login()

True

In [None]:
sweep_config = { 'method': 'random' }
metric = { 'name': 'reward',
    'goal': 'maximize'}
sweep_config['metric'] = metric
parameters_dict = {
    'learning_rate': { 'values': [0.01] },
    'Discretization_Size': { 'values': [42] },
    'episodes': { 'value': 50000 },
    'discount_rate':{'value':0.99},
    'wind': { 'values': [5] },
    }

sweep_config['parameters'] = parameters_dict

In [13]:
import pprint
pprint.pprint(sweep_config)

{'method': 'random',
 'metric': {'goal': 'maximize', 'name': 'reward'},
 'parameters': {'Discretization_Size': {'values': [42]},
                'discount_rate': {'value': 0.99},
                'episodes': {'value': 50000},
                'learning_rate': {'values': [0.01]},
                'wind': {'values': [5]}}}


In [None]:
sweep_id = wandb.sweep(sweep_config, project="RL_PA1_CartPole_QLearning")

Create sweep with ID: 89xwwyky
Sweep URL: https://wandb.ai/me21b145/RL_Assignment_1/sweeps/89xwwyky


In [15]:

render = False

### Q-Learning with Softmax Exploration

In [None]:
def qlearning(config=None):
    env = gym.make("CartPole-v1")
    env = gym.wrappers.RecordEpisodeStatistics(env, buffer_length=10000)
    done = False
    with wandb.init(config=config):
        seeds = [250, 1000]
        for seed in seeds:
            np.random.seed(seed)
            random.seed(seed)
            config = wandb.config
            Discretization_size = config.Discretization_Size
            LEARNING_RATE = config.learning_rate
            EPISODES = config.episodes
            DISCOUNT = config.discount_rate
            wind = config.wind
            buckets = Discretization_size-2
            div_size = 2*wind/buckets
            
            DISCRETE_OBSERVATION_SPACE_SIZE = [Discretization_size] * len(env.observation_space.high)

            discrete_os_win_size = (env.observation_space.high -
                                    env.observation_space.low) / DISCRETE_OBSERVATION_SPACE_SIZE

            def get_discrete_state(state):
                discrete_state = (state - env.observation_space.low) / discrete_os_win_size
                if state[1]<=-wind:
                    discrete_state[1]=0
                elif state[1]>=wind:
                    discrete_state[1]=Discretization_size-1
                else:
                    discrete_state[1]= 1+int(state[1]+wind/div_size)
                if state[3]<=-wind:
                    discrete_state[3]=0
                elif state[3]>=wind:
                    discrete_state[3]=Discretization_size-1
                else:
                    discrete_state[3]= 1+int(state[3]+wind/div_size)
                return tuple(discrete_state.astype(int))
            
            def softmax(x):
                exp_x = np.exp(x - np.max(x))
                return exp_x / exp_x.sum()
            
            q_table = np.random.uniform(
            low=-1, high=1, size=(DISCRETE_OBSERVATION_SPACE_SIZE + [env.action_space.n]))
            rewards_per_episode = []
            return_per_episode = []
            
            for ep in range(EPISODES):
                done = False
                discrete_state = get_discrete_state(env.reset()[0])
                t =0
                total_reward = 0
                truncated = False
                rew = 0


                while not done: 
                    t+=1
                    probabilities = softmax(q_table[discrete_state])
                    sample = np.random.multinomial(1, probabilities)
                    action = np.argmax(sample)
                    new_state, reward, done, truncated,_ = env.step(action)
                    new_discrete_state = get_discrete_state(new_state)
                    current_q = q_table[discrete_state + (action,)]
                    rew += reward
                    if done:
                        new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * reward
                        print(f'episode {ep} finished after {t} timesteps')
                    else:
                        max_future_q = np.max(q_table[new_discrete_state])
                        new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)

                    q_table[discrete_state + (action,)] = new_q
                    discrete_state = new_discrete_state
                    total_reward += reward

                rewards_per_episode.append(total_reward)
                if ep == 0:
                    return_per_episode.append(total_reward)
                else:
                    return_per_episode.append(return_per_episode[-1]*DISCOUNT + total_reward)


                wandb.log({'reward': rew, 'episode': ep})
                wandb.log({'length': t, 'episode': ep})


            plt.figure(figsize=(12, 5))
            plt.plot(rewards_per_episode, label="Reward Per Episode")
            plt.xlabel("Episodes")
            plt.ylabel("Rewards")
            plt.title(f"Q-Learning Training on random seed {seed}")
            plt.legend()
            wandb.log({'Seed': seed, 'Rewards per episode': wandb.Image(plt)})
            plt.close()

            plt.figure(figsize=(12, 5))
            plt.plot(return_per_episode, label="Return Per Episode")
            plt.xlabel("Episodes")
            plt.ylabel("Returns")
            plt.title(f"Q-Learning Training on random seed {seed}")

            wandb.log({'Seed': seed, 'Return per episode': wandb.Image(plt)})
            plt.close()
            env.close()

        return q_table, rewards_per_episode, return_per_episode

### Hyperparameter Tuning

In [17]:
wandb.agent(sweep_id, qlearning,count=1)

[34m[1mwandb[0m: Agent Starting Run: etzqkjux with config:
[34m[1mwandb[0m: 	Discretization_Size: 42
[34m[1mwandb[0m: 	discount_rate: 0.99
[34m[1mwandb[0m: 	episodes: 50000
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	wind: 5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


  discrete_state = (state - env.observation_space.low) / discrete_os_win_size


episode 0 finished after 11 timesteps
episode 1 finished after 15 timesteps
episode 2 finished after 16 timesteps
episode 3 finished after 15 timesteps
episode 4 finished after 12 timesteps
episode 5 finished after 26 timesteps
episode 6 finished after 27 timesteps
episode 7 finished after 20 timesteps
episode 8 finished after 14 timesteps
episode 9 finished after 14 timesteps
episode 10 finished after 33 timesteps
episode 11 finished after 20 timesteps
episode 12 finished after 20 timesteps
episode 13 finished after 21 timesteps
episode 14 finished after 16 timesteps
episode 15 finished after 25 timesteps
episode 16 finished after 13 timesteps
episode 17 finished after 20 timesteps
episode 18 finished after 21 timesteps
episode 19 finished after 13 timesteps
episode 20 finished after 13 timesteps
episode 21 finished after 37 timesteps
episode 22 finished after 57 timesteps
episode 23 finished after 18 timesteps
episode 24 finished after 19 timesteps
episode 25 finished after 9 timeste

0,1
Seed,▁▁██
episode,▁▁▁▂▂▄▄▄▅▅▆▆▆▆▇▇████▁▁▂▂▂▃▃▃▃▃▃▃▃▃▄▆▇▇██
length,▂▁▂▂▃▃▃▂▂▃▃▃▃▂▄▅▂▄▂▃▁▃▃▅▁▄▅▃▆▃▆▅▆▅█▅▃▆▅▄
reward,▂▃▂▂▂▃▂▃▃▂▂▁▃▄▂▃▄▂▄▃▂▄▃▄▃▅▅▁▆▇▅▂▆█▆▆▆▇▆▆

0,1
Seed,1000
episode,49999
length,109
reward,109
