<a href="https://colab.research.google.com/github/Tejaswini170104/CH5020-Term-paper-presentation/blob/main/CartPole_v1_qlearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import gymnasium as gym
import random
import wandb

# W&B Sweep Configuration
sweep_config = {
    "method": "bayes",  # Bayesian Optimization
    "metric": {"name": "avg_regret", "goal": "minimize"},
    "parameters": {
        "alpha": {"values": [0.001, 0.01, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5]},
        "tau": {"values": [1.0, 0.5, 0.1, 0.05, 0.01]},
    }
}

sweep_id = wandb.sweep(sweep_config, project="rl_qlearning_softmax")

# Other constants
GAMMA = 0.99
TAU_MIN = 0.01
MAX_EPISODES = 10000
MAX_STEPS = 500
NUM_BINS = 20
SOLVED_CRITERIA = 500
CONSECUTIVE_EPISODES = 50

# State discretization bounds
state_bounds = [
    [-4.8, 4.8],
    [-3.0, 3.0],
    [-0.418, 0.418],
    [-3.0, 3.0]
]

def discretize_state(state):
    return tuple(np.digitize(state[i], np.linspace(state_bounds[i][0], state_bounds[i][1], NUM_BINS)) - 1 for i in range(len(state)))

def softmax_action(q_table, state, tau):
    q_values = q_table[state]
    exp_q = np.exp(q_values / tau)
    probs = exp_q / np.sum(exp_q)
    return np.random.choice(len(q_values), p=probs)

def check_mean_crossings(returns):
    count = 0
    for i in range(len(returns) - CONSECUTIVE_EPISODES + 1):
        if np.mean(returns[i:i + CONSECUTIVE_EPISODES]) >= SOLVED_CRITERIA:
            count += 1
    return count

def qlearning_softmax():
    # Initialize W&B for each run
    wandb.init(project="rl_qlearning_softmax", config=wandb.config)

    alpha = wandb.config.alpha
    tau = wandb.config.tau
    env = gym.make("CartPole-v1")

    q_table = np.zeros((NUM_BINS, NUM_BINS, NUM_BINS, NUM_BINS, env.action_space.n))
    returns, regret = [], []

    for episode in range(MAX_EPISODES):
        tau = max(TAU_MIN, tau * np.exp(-0.001 * episode))  # Decay tau over time
        state = discretize_state(env.reset()[0])
        total_reward = 0

        for _ in range(MAX_STEPS):
            action = softmax_action(q_table, state, tau)
            next_state, reward, terminated, truncated, _ = env.step(action)
            next_state = discretize_state(next_state)

            best_next_action = np.argmax(q_table[next_state])
            td_target = reward + (GAMMA * q_table[next_state][best_next_action] if not (terminated or truncated) else 0)
            q_table[state][action] += alpha * (td_target - q_table[state][action])

            state = next_state
            total_reward += reward
            if terminated or truncated:
                break

        returns.append(total_reward)
        regret.append(MAX_STEPS - total_reward)

        # Log per episode
        wandb.log({"episode": episode + 1, "reward": total_reward, "regret": regret[-1], "tau": tau})

    avg_regret = np.mean(regret)
    mean_cross_count = check_mean_crossings(returns)

    # Final log per experiment
    wandb.log({"alpha": alpha, "tau": tau, "avg_regret": avg_regret, "mean_cross_count": mean_cross_count})
    wandb.finish()

# Run the Sweep Agent
wandb.agent(sweep_id, function=qlearning_softmax, count=20)  # Run 20 experiments


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: 2i40hybt
Sweep URL: https://wandb.ai/tejaswiniksssn-indian-institute-of-technology-madras/rl_qlearning_softmax/sweeps/2i40hybt


[34m[1mwandb[0m: Agent Starting Run: fficm5rm with config:
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	tau: 1
[34m[1mwandb[0m: Currently logged in as: [33mtejaswiniksssn[0m ([33mtejaswiniksssn-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  exp_q = np.exp(q_values / tau)
  probs = exp_q / np.sum(exp_q)


0,1
episode,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇█
regret,▂▇▇▇▆█▆▆▆▇▇██▇▇▇█▇▁▇▆▇█▃▇▇█▇▇▆▆▆▇▇▆▆▆█▅█
reward,▃▂▄▂▁▁▃▂█▁▁▁▃▁▁▆▂▂▃▁▁▁▂▁▂▁▁▁▂▂▁▁▂▃▁▁▁▂▃▁
tau,█▄▃▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
episode,1222.0
regret,487.0
reward,13.0
tau,0.01


Run fficm5rm errored:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
    self._function()
  File "<ipython-input-1-5274025344ad>", line 68, in qlearning_softmax
    action = softmax_action(q_table, state, tau)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-1-5274025344ad>", line 42, in softmax_action
    return np.random.choice(len(q_values), p=probs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "numpy/random/mtrand.pyx", line 994, in numpy.random.mtrand.RandomState.choice
ValueError: probabilities contain NaN

[34m[1mwandb[0m: [32m[41mERROR[0m Run fficm5rm errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m

0,1
episode,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇██
regret,▆▆▇▂▇▇▇▄▅▁▇▅█▆▇▁▇█▃▇█▇
reward,▃▃▂▇▂▂▂▅▄█▂▄▁▃▂█▂▁▆▂▁▂
tau,█████▇▇▇▇▇▆▆▅▅▅▄▄▃▃▂▂▁

0,1
episode,22.0
regret,454.0
reward,46.0
tau,0.03969


Run rrdu1jmp errored:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
    self._function()
  File "<ipython-input-1-5274025344ad>", line 68, in qlearning_softmax
    action = softmax_action(q_table, state, tau)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-1-5274025344ad>", line 42, in softmax_action
    return np.random.choice(len(q_values), p=probs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "numpy/random/mtrand.pyx", line 994, in numpy.random.mtrand.RandomState.choice
ValueError: probabilities contain NaN

[34m[1mwandb[0m: [32m[41mERROR[0m Run rrdu1jmp errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m

0,1
episode,▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
regret,▅██▇▇▆▅▇▇▇▄▁▆▅▂▃▅▄▃▇▃▁▄▂▃▇▂▄█▇▄▆▃▁▇▃▆▄▇▄
reward,▅▃▂▁▁▁▃▄▃▄▂▂▆▆▇▄▇▄▅▆▆▆▂▅▂▂▅▂▆▄█▂▅▆▆▆▃▂▂▅
tau,██████▇▇▇▇▆▆▆▆▆▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁

0,1
episode,81.0
regret,460.0
reward,40.0
tau,0.03916


Run jc6gpmr5 errored:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
    self._function()
  File "<ipython-input-1-5274025344ad>", line 68, in qlearning_softmax
    action = softmax_action(q_table, state, tau)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-1-5274025344ad>", line 42, in softmax_action
    return np.random.choice(len(q_values), p=probs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "numpy/random/mtrand.pyx", line 994, in numpy.random.mtrand.RandomState.choice
ValueError: probabilities contain NaN

[34m[1mwandb[0m: [32m[41mERROR[0m Run jc6gpmr5 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m