<a href="https://colab.research.google.com/github/Shubhamgoe/My-Projects/blob/Reinforcement-Learning/RL_coding_quiz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# bandit algorithm



import gym
import numpy as np
import random

# Epsilon-greedy bandit: every env step is treated as one arm pull, and the
# FrozenLake state returned by the env is ignored.
env = gym.make("FrozenLake-v1", is_slippery=False)
n_actions = env.action_space.n

# Hyperparameters
epsilon = 0.1  # Probability of picking a random arm
n_iterations = 1000  # Total number of pulls

# Per-arm running mean reward and pull count
Q = np.zeros(n_actions)
N = np.zeros(n_actions)

env.reset()
for episode in range(n_iterations):
    # Draw the exploration coin first; the env sampler is only consulted
    # when we actually explore.
    explore = np.random.uniform(0, 1) < epsilon
    action = env.action_space.sample() if explore else np.argmax(Q)

    state, reward, done, _ = env.step(action)

    # Sample-average update: Q <- Q + (R - Q) / N
    N[action] += 1
    error = reward - Q[action]
    Q[action] += error / N[action]

    # Start a new FrozenLake episode once the current one has ended
    if done:
        env.reset()

# Report the learned estimates
print("Action-Value Estimates (Q):", Q)
print("Action Selection Counts (N):", N)

# Greedy arm under the final estimates
best_action = np.argmax(Q)
print(f"Best Action: {best_action} with Q-value: {Q[best_action]}")


In [None]:
# UCB bandit algorithm



import gym
import numpy as np
import random

# UCB bandit: every env step is treated as one arm pull, and the FrozenLake
# state returned by the env is ignored.
env = gym.make("FrozenLake-v1", is_slippery=False)
n_actions = env.action_space.n

# Hyperparameters
epsilon = 0.1  # Not used by UCB; kept so the cell defines the same names as before
n_iterations = 1000  # Number of iterations
c = 0.1  # Weight of the exploration bonus

# Per-arm running mean reward and pull count
Q = np.zeros(n_actions)  # Estimated value of actions
actions = [i for i in range(env.action_space.n)]
N = [0 for a in actions]  # Number of times each action was selected

env.reset()
t = 0
done = False
for episode in range(n_iterations):
    # Count the pull before selecting so log(t) is defined (t >= 1).
    t += 1

    # An arm that was never pulled has an unbounded confidence bonus, so try
    # each arm once before applying the formula (this also avoids dividing
    # by a zero count).
    untried = [a for a in actions if N[a] == 0]
    if untried:
        action = untried[0]
    else:
        # UCB rule: argmax_a  Q(a) + c * sqrt(ln t / N(a))
        action = int(np.argmax(
            [Q[a] + c * np.sqrt(np.log(t) / N[a]) for a in actions]
        ))

    # Interact with the environment
    state, reward, done, _ = env.step(action)

    # Count the pull exactly once, then apply the sample-average update
    N[action] += 1
    Q[action] += (reward - Q[action]) / N[action]

    # Reset environment if episode is done
    if done:
        env.reset()

# Print results
print("Action-Value Estimates (Q):", Q)
print("Action Selection Counts (N):", N)

# Determine the best action
best_action = np.argmax(Q)
print(f"Best Action: {best_action} with Q-value: {Q[best_action]}")


In [None]:
# Gradient Bandit Algorithms


import gym
import numpy as np
import random

# Gradient bandit: learn one preference per action, act via softmax over the
# preferences, and use the running mean reward as the baseline.
env = gym.make("FrozenLake-v1", is_slippery=False)


def softmax(preferences):
    """Return softmax probabilities for a sequence of preferences."""
    prefs = np.asarray(preferences, dtype=float)
    # Subtracting the max keeps exp() from overflowing without changing the result.
    exps = np.exp(prefs - np.max(prefs))
    return exps / np.sum(exps)


h = [0 for a in range(env.action_space.n)]  # Action preferences
alpha = 0.1  # Step size
avg_rewards = 0  # Running mean of all rewards seen so far (the baseline)
iterations = 1000
done = False
actions = [i for i in range(env.action_space.n)]
env.reset()
# The loop variable is named `step`, not `iter`, so the builtin `iter` is not
# shadowed for the rest of the notebook.
for step in range(iterations):
    policy = softmax(h)
    action = np.random.choice(actions, p=policy)
    state, reward, done, _ = env.step(action)

    # The baseline is updated first, so it already includes the current reward.
    avg_rewards += (reward - avg_rewards) / (step + 1)
    advantage = reward - avg_rewards

    # Raise the chosen action's preference and lower the others in proportion
    # to their probability.
    for a in actions:
        if action == a:
            h[a] += alpha * advantage * (1 - policy[a])
        else:
            h[a] -= alpha * advantage * policy[a]

    if done:
        env.reset()

# Recompute the policy so the printed probabilities match the final preferences.
policy = softmax(h)

# Print results
print("Preferences (H):", h)
print("Softmax Probabilities (policy):", policy)

# Determine the best action
best_action = np.argmax(policy)
print(f"Best Action: {best_action} with Probability: {policy[best_action]}")



In [None]:
# policy iteration


import gym
import numpy as np

# Environment setup (non-slippery FrozenLake)
env = gym.make("FrozenLake-v1", is_slippery=False)

# Index lists for states and actions; the functions in this cell iterate over them.
s = list(range(env.observation_space.n))
a = list(range(env.action_space.n))

# Start from a random deterministic policy (one action per state) and an
# all-zero state-value table.
policy = [np.random.choice(a) for _ in s]
value = [0.0] * len(s)

gamma = 0.9  # Discount factor
theta = 1e-8  # Convergence threshold


def policy_evaluation(policy, value, gamma, theta):
    """Evaluate a deterministic policy by iterative in-place sweeps.

    policy: action index for every state.
    value:  state-value table; it is overwritten in place and also returned.
    gamma:  discount factor.
    theta:  sweeps stop once the largest change in a sweep is below this.

    Reads the module-level state list `s` and the transition table `env.P`.
    """
    while True:
        delta = 0
        for state in s:
            previous = value[state]
            transitions = env.P[state][policy[state]]
            # Expected one-step return; states updated earlier in this sweep
            # already contribute their new values.
            value[state] = np.sum([
                prob * (reward + gamma * value[next_state])
                for prob, next_state, reward, done in transitions
            ])
            delta = max(delta, abs(previous - value[state]))
        if delta < theta:
            return value




def policy_improvement(value, gamma):
    """Make the module-level `policy` greedy with respect to `value`.

    value: state-value table used for the one-step lookahead.
    gamma: discount factor.

    Returns (policy, policy_stable); policy_stable is True when no state's
    action changed. Reads the module-level `s`, `a`, `env.P` and updates the
    module-level `policy` list in place.
    """
    policy_stable = True
    for state in s:
        old_action = policy[state]
        # One-step lookahead value of every action from this state
        action_values = [
            np.sum([
                prob * (reward + gamma * value[next_state])
                for prob, next_state, reward, done in env.P[state][action]
            ])
            for action in a
        ]
        policy[state] = np.argmax(action_values)
        if policy[state] != old_action:
            policy_stable = False
    return policy, policy_stable



# Alternate evaluation and greedy improvement until an improvement pass
# leaves every state's action unchanged.
policy_stable = False
while not policy_stable:
    value = policy_evaluation(policy, value, gamma, theta)
    policy, policy_stable = policy_improvement(value, gamma)

print("Optimal Policy:", policy)

In [None]:
# Value iteration
import gym
import numpy as np

# Environment setup (non-slippery FrozenLake)
env = gym.make("FrozenLake-v1", is_slippery=False)


# Index lists for states and actions
s = list(range(env.observation_space.n))
a = list(range(env.action_space.n))
# Placeholder policy; it is overwritten with the greedy policy after value iteration.
policy = [np.random.choice(a) for _ in s]
value = [0.0] * len(s)

gamma = 0.9  # Discount factor
theta = 1e-8  # Convergence threshold


def value_iteration(policy, value, gamma, theta):
    """Run in-place value iteration and return the converged value table.

    policy: accepted for signature parity; not read by this function.
    value:  state-value table; it is overwritten in place and also returned.
    gamma:  discount factor.
    theta:  sweeps stop once the largest change in a sweep is below this.

    Reads the module-level `s`, `a` and the transition table `env.P`.
    """
    while True:
        delta = 0
        for state in s:
            previous = value[state]
            # Bellman optimality backup: best one-step lookahead over actions
            action_values = [
                np.sum([
                    prob * (reward + gamma * value[next_state])
                    for prob, next_state, reward, done in env.P[state][action]
                ])
                for action in a
            ]
            value[state] = np.max(action_values)
            delta = max(delta, abs(previous - value[state]))
        if delta < theta:
            return value


value = value_iteration(policy, value, gamma, theta)

# Extract the greedy policy with a one-step lookahead on the converged values.
for state in s:
    action_values = [
        np.sum([
            prob * (reward + gamma * value[next_state])
            for prob, next_state, reward, done in env.P[state][action]
        ])
        for action in a
    ]
    policy[state] = np.argmax(action_values)


print("Optimal Policy:", policy)






In [None]:
# Monte Carlo

import gym
import numpy as np

# Environment setup
env = gym.make("FrozenLake-v1", is_slippery=False)

# First-visit Monte Carlo control with a deterministic greedy policy:
# episodes are generated by following `policy`, and each (state, action)
# pair's return is averaged from its first occurrence in the episode.

# state space
s = [i for i in range(env.observation_space.n)]
# action space
a = [i for i in range(env.action_space.n)]
# deterministic policy: one action index per state, initialised at random
policy = [np.random.choice(a) for i in s]
# state-action values
Q = [[0 for j in a] for i in s]
# all first-visit returns observed so far for every (state, action)
returns = [[[] for j in a] for i in s]

num_of_episodes = 1000
gamma = 1
for i in range(num_of_episodes):
    # reset() is used here as returning the start state index directly.
    state = env.reset()
    episode_data = []
    done = False
    # NOTE(review): the policy is deterministic and never explores, so the
    # episode ends only when the env reports `done`; presumably gym's default
    # step limit bounds this — confirm.
    while not done:
        action = policy[state]
        next_state, reward, done, _ = env.step(action)
        episode_data.append((state, action, reward))
        state = next_state

    # Record where each (state, action) first occurs, so the first-visit test
    # below is a dictionary lookup instead of a scan of the episode prefix.
    first_visit = {}
    for t, (state, action, reward) in enumerate(episode_data):
        first_visit.setdefault((state, action), t)

    # Walk the episode backwards, accumulating the discounted return.
    G = 0
    for t in reversed(range(len(episode_data))):
        state, action, reward = episode_data[t]
        G = reward + gamma * G

        if first_visit[(state, action)] == t:
            returns[state][action].append(G)
            Q[state][action] = np.mean(returns[state][action])
            policy[state] = np.argmax(Q[state])


print(policy)
print(Q)




In [None]:
# Monte Carlo (exploring start + epsilon greedy policy improvement)
import gym
import numpy as np

# Environment setup
env = gym.make("FrozenLake-v1", is_slippery=False)

# First-visit Monte Carlo control with random start states and an
# epsilon-soft policy that is made epsilon-greedy w.r.t. Q after each update.

# state space
s = [i for i in range(env.observation_space.n)]
# action space
a = [i for i in range(env.action_space.n)]
# stochastic policy: random probabilities, normalised per state
policy = [[np.random.uniform(0, 1) for j in a] for i in s]
policy = [list(np.array(p) / np.sum(p)) for p in policy]
print(policy)
# state-action values and first-visit counts
Q = [[0 for j in a] for i in s]
N = [[0 for j in a] for i in s]

num_of_episodes = 1000
gamma = 1
epsilon = 0.1
for i in range(num_of_episodes):
    env.reset()
    # Random start state. The env must actually be moved there, otherwise the
    # first recorded state does not match the transition the env performs.
    # NOTE(review): assumes the toy-text FrozenLake env keeps its current
    # state in `unwrapped.s` — confirm for the installed gym version.
    state = env.observation_space.sample()
    env.unwrapped.s = state

    episode_data = []
    done = False
    while not done:
        action = np.random.choice(a, p=policy[state])
        next_state, reward, done, _ = env.step(action)
        episode_data.append((state, action, reward))
        state = next_state

    # Index of the first occurrence of every (state, action) in the episode
    first_visit = {}
    for t, (state, action, reward) in enumerate(episode_data):
        first_visit.setdefault((state, action), t)

    G = 0
    for t in reversed(range(len(episode_data))):
        state, action, reward = episode_data[t]
        G = reward + gamma * G
        if first_visit[(state, action)] == t:
            # Incremental mean of the first-visit returns
            N[state][action] += 1
            Q[state][action] += (G - Q[state][action]) / N[state][action]

            # Epsilon-soft improvement: the greedy action gets
            # 1 - epsilon + epsilon/|A|, every other action epsilon/|A|.
            greedy = np.argmax(Q[state])
            for act in a:
                if act == greedy:
                    policy[state][act] = 1 - epsilon + epsilon / len(a)
                else:
                    policy[state][act] = epsilon / len(a)


print(policy)
# print(Q)




In [None]:
# Off-policy Monte Carlo control (weighted importance sampling)
import gym
import numpy as np

# Environment setup
env = gym.make("FrozenLake-v1", is_slippery=False)

# Off-policy Monte Carlo with weighted importance sampling: episodes come
# from a uniform-random behaviour policy while an epsilon-greedy target
# policy is learned from them.

# state space
s = [i for i in range(env.observation_space.n)]
# action space
a = [i for i in range(env.action_space.n)]
# target policy: random probabilities, normalised per state
target_policy = [[np.random.uniform(0, 1) for j in a] for i in s]
target_policy = [list(np.array(p) / np.sum(p)) for p in target_policy]
# behaviour policy: uniform over actions; it never changes, so build it once
behaviour_policy = [[1 / len(a) for j in a] for i in s]
# state-action values and cumulative importance weights
Q = [[0 for j in a] for i in s]
C = [[0 for j in a] for i in s]

num_of_episodes = 1000
gamma = 1
epsilon = 0.1
for i in range(num_of_episodes):
    env.reset()
    # Random start state. The env must actually be moved there, otherwise the
    # first recorded state does not match the transition the env performs.
    # NOTE(review): assumes the toy-text FrozenLake env keeps its current
    # state in `unwrapped.s` — confirm for the installed gym version.
    state = env.observation_space.sample()
    env.unwrapped.s = state

    episode_data = []
    done = False
    while not done:
        action = np.random.choice(a, p=behaviour_policy[state])
        next_state, reward, done, _ = env.step(action)
        episode_data.append((state, action, reward))
        state = next_state

    G = 0
    W = 1
    for state, action, reward in reversed(episode_data):
        G = reward + gamma * G

        # Weighted importance-sampling update: the step size is W / C, not 1 / C.
        C[state][action] += W
        Q[state][action] += (W / C[state][action]) * (G - Q[state][action])

        # Make the target policy epsilon-greedy w.r.t. the updated Q. The loop
        # uses its own variable so the episode's `action` is still intact for
        # the importance ratio below.
        greedy = np.argmax(Q[state])
        for act in a:
            if act == greedy:
                target_policy[state][act] = 1 - epsilon + epsilon / len(a)
            else:
                target_policy[state][act] = epsilon / len(a)

        # Importance ratio of the action that was actually taken
        W = W * (target_policy[state][action] / behaviour_policy[state][action])


print(target_policy)
# print(Q)




In [None]:
# Q learning
import gym
import numpy as np

# Environment setup
env = gym.make("FrozenLake-v1", is_slippery=False)

# Tabular Q-learning with an epsilon-greedy behaviour policy and a
# 1/N(s, a) step size.

# state space
s = [i for i in range(env.observation_space.n)]
# action space
a = [i for i in range(env.action_space.n)]
# behaviour policy: random probabilities, normalised per state
policy = [[np.random.uniform(0, 1) for j in a] for i in s]
policy = [list(np.array(p) / np.sum(p)) for p in policy]
# state-action values and visit counts
Q = [[0 for j in a] for i in s]
N = [[0 for j in a] for i in s]

num_of_episodes = 1000
gamma = 1
epsilon = 0.1
for i in range(num_of_episodes):
    env.reset()
    # Random start state. The env must actually be moved there, otherwise the
    # first update is credited to a state the agent was never in.
    # NOTE(review): assumes the toy-text FrozenLake env keeps its current
    # state in `unwrapped.s` — confirm for the installed gym version.
    state = env.observation_space.sample()
    env.unwrapped.s = state

    done = False
    while not done:
        action = np.random.choice(a, p=policy[state])
        next_state, reward, done, _ = env.step(action)

        # Q-learning target bootstraps from the greedy action at next_state.
        N[state][action] += 1
        next_action = np.argmax(Q[next_state])
        Q[state][action] += (
            reward + gamma * Q[next_state][next_action] - Q[state][action]
        ) / N[state][action]

        # Keep the behaviour policy epsilon-greedy w.r.t. the updated Q.
        greedy = np.argmax(Q[state])
        for act in a:
            if act == greedy:
                policy[state][act] = 1 - epsilon + epsilon / len(a)
            else:
                policy[state][act] = epsilon / len(a)

        state = next_state


print(policy)
# print(Q)




In [None]:
# SARSA
import gym
import numpy as np

# Environment setup
env = gym.make("FrozenLake-v1", is_slippery=False)

# Tabular SARSA with an epsilon-greedy policy and a 1/N(s, a) step size.

# state space
s = [i for i in range(env.observation_space.n)]
# action space
a = [i for i in range(env.action_space.n)]
# policy: random probabilities, normalised per state
policy = [[np.random.uniform(0, 1) for j in a] for i in s]
policy = [list(np.array(p) / np.sum(p)) for p in policy]
# state-action values and visit counts
Q = [[0 for j in a] for i in s]
N = [[0 for j in a] for i in s]

num_of_episodes = 1000
gamma = 1
epsilon = 0.1
for i in range(num_of_episodes):
    env.reset()
    # Random start state. The env must actually be moved there, otherwise the
    # first update is credited to a state the agent was never in.
    # NOTE(review): assumes the toy-text FrozenLake env keeps its current
    # state in `unwrapped.s` — confirm for the installed gym version.
    state = env.observation_space.sample()
    env.unwrapped.s = state

    action = np.random.choice(a, p=policy[state])
    done = False
    while not done:
        next_state, reward, done, _ = env.step(action)

        # Refresh the epsilon-greedy distribution at next_state. The loop uses
        # its own variable so the `action` being updated is not overwritten.
        greedy = np.argmax(Q[next_state])
        for act in a:
            if act == greedy:
                policy[next_state][act] = 1 - epsilon + epsilon / len(a)
            else:
                policy[next_state][act] = epsilon / len(a)

        # On-policy target: bootstrap from the action that will be executed next.
        next_action = np.random.choice(a, p=policy[next_state])
        N[state][action] += 1
        Q[state][action] += (
            reward + gamma * Q[next_state][next_action] - Q[state][action]
        ) / N[state][action]

        # Carry the sampled action forward instead of re-sampling it.
        state, action = next_state, next_action


print(policy)
# print(Q)




In [None]:
# EXPECTED SARSA
import gym
import numpy as np

# Environment setup
env = gym.make("FrozenLake-v1", is_slippery=False)

# Tabular Expected SARSA with an epsilon-greedy policy and a 1/N(s, a) step
# size: the target uses the policy-weighted average of Q at the next state.

# state space
s = [i for i in range(env.observation_space.n)]
# action space
a = [i for i in range(env.action_space.n)]
# policy: random probabilities, normalised per state
policy = [[np.random.uniform(0, 1) for j in a] for i in s]
policy = [list(np.array(p) / np.sum(p)) for p in policy]
# state-action values and visit counts
Q = [[0 for j in a] for i in s]
N = [[0 for j in a] for i in s]

num_of_episodes = 1000
gamma = 1
epsilon = 0.1
for i in range(num_of_episodes):
    env.reset()
    # Random start state. The env must actually be moved there, otherwise the
    # first update is credited to a state the agent was never in.
    # NOTE(review): assumes the toy-text FrozenLake env keeps its current
    # state in `unwrapped.s` — confirm for the installed gym version.
    state = env.observation_space.sample()
    env.unwrapped.s = state

    done = False
    while not done:
        action = np.random.choice(a, p=policy[state])
        next_state, reward, done, _ = env.step(action)

        # Refresh the epsilon-greedy distribution at next_state. The loop uses
        # its own variable so the `action` being updated is not overwritten.
        greedy = np.argmax(Q[next_state])
        for act in a:
            if act == greedy:
                policy[next_state][act] = 1 - epsilon + epsilon / len(a)
            else:
                policy[next_state][act] = epsilon / len(a)

        # Expected value of Q at next_state under the current policy
        N[state][action] += 1
        target = np.sum([policy[next_state][act] * Q[next_state][act] for act in a])
        Q[state][action] += (
            reward + gamma * target - Q[state][action]
        ) / N[state][action]

        state = next_state

# Report the learned policy once, after training has finished.
print(policy)
# print(Q)




In [None]:
# Double Q learning
import gym
import numpy as np

# Environment setup
env = gym.make("FrozenLake-v1", is_slippery=False)

# Tabular Double Q-learning: two value tables; on each step one table picks
# the greedy next action and the other table evaluates it.

# state space
s = [i for i in range(env.observation_space.n)]
# action space
a = [i for i in range(env.action_space.n)]
# behaviour policy: random probabilities, normalised per state
policy = [[np.random.uniform(0, 1) for j in a] for i in s]
policy = [list(np.array(p) / np.sum(p)) for p in policy]
Q1 = [[0 for j in a] for i in s]
Q2 = [[0 for j in a] for i in s]
num_of_episodes = 1000
gamma = 1
epsilon = 0.1
alpha = 0.1
for i in range(num_of_episodes):
    env.reset()
    # Random start state. The env must actually be moved there, otherwise the
    # first update is credited to a state the agent was never in.
    # NOTE(review): assumes the toy-text FrozenLake env keeps its current
    # state in `unwrapped.s` — confirm for the installed gym version.
    state = env.observation_space.sample()
    env.unwrapped.s = state

    done = False
    while not done:
        action = np.random.choice(a, p=policy[state])
        next_state, reward, done, _ = env.step(action)

        # A fair coin decides which table is updated this step.
        if np.random.uniform(0, 1) < 0.5:
            next_action = np.argmax(Q1[next_state])
            Q1[state][action] += alpha * (
                reward + gamma * Q2[next_state][next_action] - Q1[state][action]
            )
        else:
            next_action = np.argmax(Q2[next_state])
            Q2[state][action] += alpha * (
                reward + gamma * Q1[next_state][next_action] - Q2[state][action]
            )

        # Behaviour policy is epsilon-greedy w.r.t. the sum of both tables.
        greedy = np.argmax([Q1[state][j] + Q2[state][j] for j in a])
        for act in a:
            if act == greedy:
                policy[state][act] = 1 - epsilon + epsilon / len(a)
            else:
                policy[state][act] = epsilon / len(a)

        state = next_state


print(policy)





In [None]:
# n step SARSA
import gym
import numpy as np

# Environment setup
env = gym.make("FrozenLake-v1", is_slippery=False)

# Tabular n-step SARSA with an epsilon-greedy policy and a 1/N(s, a) step size.

# state space
s = [i for i in range(env.observation_space.n)]
# action space
a = [i for i in range(env.action_space.n)]
# policy: random probabilities, normalised per state
policy = [[np.random.uniform(0, 1) for j in a] for i in s]
policy = [list(np.array(p) / np.sum(p)) for p in policy]
# state-action values and update counts
Q = [[0 for j in a] for i in s]
N = [[0 for j in a] for i in s]

num_of_episodes = 1000
gamma = 1
epsilon = 0.1
n = 3
for i in range(num_of_episodes):
    # Every episode must start from a freshly reset env.
    env.reset()
    # Random start state, applied to the env so the recorded state matches.
    # NOTE(review): assumes the toy-text FrozenLake env keeps its current
    # state in `unwrapped.s` — confirm for the installed gym version.
    state = env.observation_space.sample()
    env.unwrapped.s = state

    # Trajectory buffers: states[k] and actions[k] are S_k and A_k, and
    # rewards[k] is the reward received on entering S_k (rewards[0] is unused).
    states = [state]
    actions = [np.random.choice(a, p=policy[state])]
    rewards = [0]

    T = float("inf")  # Episode length; unknown until the env reports done
    t = 0
    while True:
        if t < T:
            next_state, reward, done, _ = env.step(actions[t])
            states.append(next_state)
            rewards.append(reward)
            if done:
                T = t + 1
            else:
                # Choose A_{t+1} now; this same action is executed on the next step.
                actions.append(np.random.choice(a, p=policy[next_state]))

        # tau is the time step whose estimate is updated on this iteration.
        # Updates continue after termination until tau reaches T - 1, so
        # episodes shorter than n still finish.
        tau = t - n + 1
        if tau >= 0:
            # Discounted sum of up to n rewards following step tau, folded
            # from the last reward backwards.
            last = min(tau + n, T)
            G = 0
            for k in reversed(range(tau + 1, last + 1)):
                G = rewards[k] + gamma * G
            # Bootstrap from Q only if the episode has not ended within n steps.
            if tau + n < T:
                G += gamma ** n * Q[states[tau + n]][actions[tau + n]]

            state_up = states[tau]
            action_up = actions[tau]
            N[state_up][action_up] += 1
            Q[state_up][action_up] += (G - Q[state_up][action_up]) / N[state_up][action_up]

            # Keep the policy epsilon-greedy at the state that was just updated.
            greedy = np.argmax(Q[state_up])
            for act in a:
                if act == greedy:
                    policy[state_up][act] = 1 - epsilon + epsilon / len(a)
                else:
                    policy[state_up][act] = epsilon / len(a)

        if tau == T - 1:
            break
        t += 1


print(policy)
# print(Q)




In [None]:
# n step Expected SARSA
import gym
import numpy as np

# Environment setup
env = gym.make("FrozenLake-v1", is_slippery=False)

# Tabular n-step Expected SARSA with an epsilon-greedy policy and a
# 1/N(s, a) step size: the n-step return bootstraps from the policy-weighted
# average of Q at the n-th state.

# state space
s = [i for i in range(env.observation_space.n)]
# action space
a = [i for i in range(env.action_space.n)]
# policy: random probabilities, normalised per state
policy = [[np.random.uniform(0, 1) for j in a] for i in s]
policy = [list(np.array(p) / np.sum(p)) for p in policy]
# state-action values and update counts
Q = [[0 for j in a] for i in s]
N = [[0 for j in a] for i in s]


num_of_episodes = 1000
gamma = 1
epsilon = 0.1
n = 3
for i in range(num_of_episodes):
    # Every episode must start from a freshly reset env.
    env.reset()
    # Random start state, applied to the env so the recorded state matches.
    # NOTE(review): assumes the toy-text FrozenLake env keeps its current
    # state in `unwrapped.s` — confirm for the installed gym version.
    state = env.observation_space.sample()
    env.unwrapped.s = state

    # Trajectory buffers: states[k] and actions[k] are S_k and A_k, and
    # rewards[k] is the reward received on entering S_k (rewards[0] is unused).
    states = [state]
    actions = [np.random.choice(a, p=policy[state])]
    rewards = [0]

    T = float("inf")  # Episode length; unknown until the env reports done
    t = 0
    while True:
        if t < T:
            next_state, reward, done, _ = env.step(actions[t])
            states.append(next_state)
            rewards.append(reward)
            if done:
                T = t + 1
            else:
                # Choose A_{t+1} now; this same action is executed on the next step.
                actions.append(np.random.choice(a, p=policy[next_state]))

        # tau is the time step whose estimate is updated on this iteration.
        # Updates continue after termination until tau reaches T - 1, so
        # episodes shorter than n still finish.
        tau = t - n + 1
        if tau >= 0:
            # Discounted sum of up to n rewards following step tau, folded
            # from the last reward backwards.
            last = min(tau + n, T)
            G = 0
            for k in reversed(range(tau + 1, last + 1)):
                G = rewards[k] + gamma * G
            # Bootstrap from the expected Q under the current policy, but only
            # if the episode has not ended within n steps.
            if tau + n < T:
                nth_state = states[tau + n]
                expected_q = np.sum([policy[nth_state][act] * Q[nth_state][act] for act in a])
                G += gamma ** n * expected_q

            state_up = states[tau]
            action_up = actions[tau]
            N[state_up][action_up] += 1
            Q[state_up][action_up] += (G - Q[state_up][action_up]) / N[state_up][action_up]

            # Keep the policy epsilon-greedy at the state that was just updated.
            greedy = np.argmax(Q[state_up])
            for act in a:
                if act == greedy:
                    policy[state_up][act] = 1 - epsilon + epsilon / len(a)
                else:
                    policy[state_up][act] = epsilon / len(a)

        if tau == T - 1:
            break
        t += 1


print(policy)
# print(Q)


