# Grid World Environment

In [1]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces

class GridWorldEnv(gym.Env):
    """Deterministic 2-D grid world with a start cell, a goal cell and obstacles.

    Positions are ``(row, col)`` pairs indexed from the top-left corner, so
    "up" decreases the row index and "left" decreases the column index.

    Rewards: ``-1`` for an ordinary step, ``+10`` for reaching the goal and
    ``-10`` for stepping onto an obstacle. Reaching the goal or an obstacle
    ends the episode.

    ``reset`` returns only the observation and ``step`` returns the 4-tuple
    ``(observation, reward, done, info)``.

    Args:
        grid_size: ``(n_rows, n_cols)`` of the grid.
        start: Cell the agent is placed on by ``reset``.
        goal: Cell that ends the episode with the positive reward.
        obstacles: Iterable of cells that end the episode with the penalty.
    """

    def __init__(self, grid_size, start, goal, obstacles):
        super().__init__()
        # Normalise every coordinate to a tuple. Positions are compared against
        # ``tuple(self.agent_pos)`` and used as NumPy indices; a list would
        # never compare equal to a tuple (the episode could not terminate) and
        # would trigger fancy indexing in ``render`` (whole rows overwritten).
        self.grid_size = tuple(grid_size)
        self.start = tuple(start)
        self.goal = tuple(goal)
        self.obstacles = [tuple(cell) for cell in obstacles]
        self.action_space = spaces.Discrete(4)  # Four actions: up, down, left, right
        self.observation_space = spaces.Tuple((
            spaces.Discrete(self.grid_size[0]),
            spaces.Discrete(self.grid_size[1]),
        ))
        self.reset()

    def reset(self):
        """Move the agent back to the start cell and return it as a tuple."""
        # Kept as a mutable list so ``step`` can update one coordinate in place.
        self.agent_pos = list(self.start)
        return tuple(self.agent_pos)

    def step(self, action):
        """Apply one move and return ``(observation, reward, done, info)``.

        Actions: 0=up, 1=down, 2=left, 3=right. A move that would leave the
        grid (or an action outside 0-3) leaves the agent where it is; the
        step reward is still computed from the resulting position.
        """
        if action == 0 and self.agent_pos[0] > 0:  # Up
            self.agent_pos[0] -= 1
        elif action == 1 and self.agent_pos[0] < self.grid_size[0] - 1:  # Down
            self.agent_pos[0] += 1
        elif action == 2 and self.agent_pos[1] > 0:  # Left
            self.agent_pos[1] -= 1
        elif action == 3 and self.agent_pos[1] < self.grid_size[1] - 1:  # Right
            self.agent_pos[1] += 1

        position = tuple(self.agent_pos)
        reward = -1
        done = False
        # The goal is checked first, so a cell that is both goal and obstacle
        # counts as the goal.
        if position == self.goal:
            reward = 10
            done = True
        elif position in self.obstacles:
            reward = -10
            done = True

        return position, reward, done, {}

    def render(self, mode='human'):
        """Print the grid: 1=start, 2=goal, -1=obstacle, 3=agent, 0=empty.

        The agent is drawn last, so it hides whatever marker shares its cell.
        ``mode`` is accepted but not used.
        """
        grid = np.zeros(self.grid_size)
        grid[self.start] = 1  # Start position
        grid[self.goal] = 2  # Goal position
        for obstacle in self.obstacles:
            grid[obstacle] = -1  # Obstacle positions
        grid[tuple(self.agent_pos)] = 3  # Agent position
        print(grid)

# Example usage: one episode of uniformly random actions on a 5x5 grid,
# printing the grid after every move.
grid_size = (5, 5)
start, goal = (0, 0), (4, 4)
obstacles = [(i, i) for i in range(1, 4)]  # interior cells of the main diagonal

env = GridWorldEnv(grid_size, start, goal, obstacles)
obs = env.reset()
env.render()

# Keep acting until the environment reports that the episode has ended
# (the agent reached the goal or stepped onto an obstacle).
while True:
    action = env.action_space.sample()
    obs, reward, done, _ = env.step(action)
    env.render()
    print(f"Action: {action}, Reward: {reward}")
    if done:
        break


[[ 3.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.]
 [ 0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  2.]]
[[ 3.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.]
 [ 0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  2.]]
Action: 2, Reward: -1
[[ 1.  3.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.]
 [ 0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  2.]]
Action: 3, Reward: -1
[[ 3.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.]
 [ 0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  2.]]
Action: 2, Reward: -1
[[ 1.  3.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.]
 [ 0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  2.]]
Action: 3, Reward: -1
[[ 1.  0.  3.  0.  0.]
 [ 0. -1.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.]
 [ 0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  2.]]
Action: 3, Reward: -1
[[ 1.  0.  0.  3.  0.]
 [ 0. -1.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.]
 [ 0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  2.]]
Action: 3, Reward: -1
[[ 1.  0.  0.  0.  0.]
 [ 0. -1.  0.  3.  0.]
 [ 0.  0. 

# Multi-Armed Bandit Environment

In [2]:
import numpy as np

class MultiArmedBanditEnv(gym.Env):
    """Single-state k-armed bandit with normally distributed rewards.

    Args:
        k_arms: Number of arms, i.e. the size of the discrete action space.
        reward_distributions: Sequence indexed by arm; entry ``[0]`` is used as
            the mean and entry ``[1]`` as the standard deviation of that arm's
            reward.
    """

    def __init__(self, k_arms, reward_distributions):
        super().__init__()
        self.action_space = spaces.Discrete(k_arms)
        # There is only one state, hence a single possible observation.
        self.observation_space = spaces.Discrete(1)
        self.k_arms = k_arms
        self.reward_distributions = reward_distributions
        self.reset()

    def reset(self):
        """Return the only observation, ``0``."""
        return 0

    def step(self, action):
        """Pull arm ``action`` and return ``(0, reward, False, {})``.

        The episode never ends: ``done`` is always ``False``.
        """
        mean = self.reward_distributions[action][0]
        std = self.reward_distributions[action][1]
        sampled_reward = np.random.normal(mean, std)
        return 0, sampled_reward, False, {}

# Example usage: pull uniformly random arms 100 times and print every reward.
k_arms = 10
# One (mean, std) pair per arm: a random mean from np.random.rand() and std 1.
reward_distributions = [(np.random.rand(), 1) for _ in range(k_arms)]

env = MultiArmedBanditEnv(k_arms, reward_distributions)
obs = env.reset()
for _ in range(100):
    action = env.action_space.sample()
    # ``done`` is unpacked for completeness; this loop runs a fixed 100 pulls.
    obs, reward, done, _ = env.step(action)
    print(f"Action: {action}, Reward: {reward}")


Action: 6, Reward: 0.7870618088156315
Action: 0, Reward: -0.4257115169570683
Action: 1, Reward: -0.10371139520153574
Action: 3, Reward: 0.5196974347441244
Action: 1, Reward: 0.16188072254602276
Action: 8, Reward: 2.801921680050778
Action: 7, Reward: 0.956507470864909
Action: 8, Reward: 2.590806644376442
Action: 3, Reward: -0.49223398354003145
Action: 6, Reward: -1.3373291643326914
Action: 8, Reward: 1.5950496692073695
Action: 8, Reward: 3.960706642120981
Action: 2, Reward: -1.4349237603999578
Action: 5, Reward: -0.06361534895716092
Action: 8, Reward: 0.7325531852865608
Action: 2, Reward: 0.9140492613457103
Action: 7, Reward: -0.8051568387495164
Action: 1, Reward: 0.37778958277793123
Action: 5, Reward: 0.7707816863161439
Action: 3, Reward: 1.548818412281353
Action: 2, Reward: 0.6157867483337305
Action: 4, Reward: 1.484606924893538
Action: 2, Reward: -0.34466077849897925
Action: 9, Reward: 0.14590453350785074
Action: 1, Reward: -0.7321127020637822
Action: 5, Reward: 1.7011852781425287
Ac

# Value Iteration

In [3]:
def value_iteration(env, gamma=0.99, theta=1e-6):
    """Compute a greedy policy for a tabular MDP by value iteration.

    Args:
        env: Environment whose ``observation_space.n`` and ``action_space.n``
            give the number of states and actions, and whose transition model
            ``P`` satisfies: ``P[state][action]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples. If ``env`` has no
            ``P`` attribute, ``env.unwrapped.P`` is used instead.
        gamma: Discount factor.
        theta: Convergence threshold; sweeping stops once the largest value
            change within a sweep is below it.

    Returns:
        ``(policy, value_table)``: an integer array holding the greedy action
        for every state and a float array holding the state values.
    """
    try:
        transitions = env.P
    except AttributeError:
        # Wrapped environments may not forward attributes of the base env.
        transitions = env.unwrapped.P

    n_states = env.observation_space.n
    n_actions = env.action_space.n
    value_table = np.zeros(n_states)
    policy = np.zeros(n_states, dtype=int)

    def action_values(state):
        """Return the expected return of every action taken from ``state``."""
        # A transition flagged ``done`` ends the episode, so nothing is
        # bootstrapped from its successor state.
        return [
            sum(prob * (reward + (0.0 if done else gamma * value_table[next_state]))
                for prob, next_state, reward, done in transitions[state][action])
            for action in range(n_actions)
        ]

    # In-place sweeps: later states in a sweep already see updated values.
    while True:
        delta = 0
        for state in range(n_states):
            previous = value_table[state]
            value_table[state] = max(action_values(state))
            delta = max(delta, abs(previous - value_table[state]))
        if delta < theta:
            break

    # Extract the greedy policy from the converged values.
    for state in range(n_states):
        policy[state] = np.argmax(action_values(state))

    return policy, value_table


# Policy Iteration


In [4]:
def policy_iteration(env, gamma=0.99, theta=1e-6):
    """Compute a policy for a tabular MDP by policy iteration.

    Alternates in-place iterative policy evaluation with greedy policy
    improvement until an improvement pass changes no action. The initial
    policy is drawn at random with ``np.random.choice``.

    Args:
        env: Environment whose ``observation_space.n`` and ``action_space.n``
            give the number of states and actions, and whose transition model
            ``P`` satisfies: ``P[state][action]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples. If ``env`` has no
            ``P`` attribute, ``env.unwrapped.P`` is used instead.
        gamma: Discount factor.
        theta: Evaluation stops once the largest value change within a sweep
            is below this threshold.

    Returns:
        ``(policy, value_table)``: the action chosen for every state and the
        state values of that policy.
    """
    try:
        transitions = env.P
    except AttributeError:
        # Wrapped environments may not forward attributes of the base env.
        transitions = env.unwrapped.P

    n_states = env.observation_space.n
    n_actions = env.action_space.n
    policy = np.random.choice(n_actions, n_states)
    value_table = np.zeros(n_states)

    def expected_return(state, action):
        """Return the one-step backup of ``action`` in ``state``."""
        # A transition flagged ``done`` ends the episode, so nothing is
        # bootstrapped from its successor state.
        return sum(
            prob * (reward + (0.0 if done else gamma * value_table[next_state]))
            for prob, next_state, reward, done in transitions[state][action]
        )

    while True:
        # Policy evaluation: sweep until the values of the current policy settle.
        while True:
            delta = 0
            for state in range(n_states):
                previous = value_table[state]
                value_table[state] = expected_return(state, policy[state])
                delta = max(delta, abs(previous - value_table[state]))
            if delta < theta:
                break

        # Policy improvement: act greedily with respect to the evaluated values.
        policy_stable = True
        for state in range(n_states):
            action_values = [expected_return(state, action)
                             for action in range(n_actions)]
            best_action = np.argmax(action_values)
            if policy[state] != best_action:
                policy_stable = False
            policy[state] = best_action

        if policy_stable:
            break

    return policy, value_table


# Q-Learning

In [5]:
def q_learning(env, alpha=0.1, gamma=0.99, epsilon=0.1, episodes=1000):
    """Learn a tabular action-value function with epsilon-greedy Q-learning.

    Both environment calling conventions are accepted:

    * ``reset() -> obs`` with ``step() -> (obs, reward, done, info)``
    * ``reset() -> (obs, info)`` with
      ``step() -> (obs, reward, terminated, truncated, info)``

    Args:
        env: Environment with integer observations in
            ``range(env.observation_space.n)`` and actions in
            ``range(env.action_space.n)``.
        alpha: Learning rate of the temporal-difference update.
        gamma: Discount factor.
        epsilon: Probability of picking a uniformly random action.
        episodes: Number of episodes to run.

    Returns:
        ``(policy, q_table)``: the greedy action per state and the learned
        table of shape ``(n_states, n_actions)``.
    """
    n_actions = env.action_space.n
    q_table = np.zeros((env.observation_space.n, n_actions))

    def choose_action(state):
        """Pick an epsilon-greedy action for ``state``."""
        if np.random.uniform(0, 1) < epsilon:
            return np.random.choice(n_actions)
        return np.argmax(q_table[state, :])

    def reset_env():
        """Reset ``env`` and return only the initial observation."""
        result = env.reset()
        # The 2-tuple form carries an info dict in the second slot.
        if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
            return result[0]
        return result

    def step_env(action):
        """Step ``env`` and return ``(next_state, reward, episode_over)``."""
        result = env.step(action)
        if len(result) == 5:
            next_state, reward, terminated, truncated, _ = result
            return next_state, reward, bool(terminated or truncated)
        next_state, reward, done, _ = result
        return next_state, reward, done

    for _ in range(episodes):
        state = reset_env()
        done = False
        while not done:
            action = choose_action(state)
            next_state, reward, done = step_env(action)
            td_target = reward + gamma * np.max(q_table[next_state, :])
            q_table[state, action] += alpha * (td_target - q_table[state, action])
            state = next_state

    policy = np.argmax(q_table, axis=1)
    return policy, q_table


# Epsilon-Greedy Policy for Multi-Armed Bandit

In [6]:
def epsilon_greedy_bandit(env, alpha=0.1, epsilon=0.1, episodes=1000):
    """Estimate arm values of a bandit with an epsilon-greedy strategy.

    Args:
        env: Bandit environment; ``env.action_space.n`` is the number of arms
            and ``env.step(arm)`` returns a 4-tuple whose second item is the
            reward.
        alpha: Accepted but not used; estimates are sample averages.
        epsilon: Probability of pulling a uniformly random arm instead of the
            arm with the highest current estimate.
        episodes: Number of pulls.

    Returns:
        Array with the estimated value of every arm.
    """
    n_arms = env.action_space.n
    estimates = np.zeros(n_arms)
    pulls = np.zeros(n_arms)

    for _ in range(episodes):
        explore = np.random.uniform(0, 1) < epsilon
        arm = np.random.choice(n_arms) if explore else np.argmax(estimates)

        _, reward, _, _ = env.step(arm)
        pulls[arm] += 1
        # Incremental sample mean: shift the estimate by error / pull count.
        error = reward - estimates[arm]
        estimates[arm] += error / pulls[arm]

    return estimates


# Upper Confidence Bound (UCB) Algorithm for Multi-Armed Bandit

In [7]:
def ucb_bandit(env, alpha=0.1, episodes=1000):
    """Estimate arm values of a bandit with upper-confidence-bound selection.

    Args:
        env: Bandit environment; ``env.action_space.n`` is the number of arms
            and ``env.step(arm)`` returns a 4-tuple whose second item is the
            reward.
        alpha: Accepted but not used; estimates are sample averages.
        episodes: Number of pulls.

    Returns:
        Array with the estimated value of every arm.
    """
    n_arms = env.action_space.n
    estimates = np.zeros(n_arms)
    pulls = np.zeros(n_arms)

    # ``step_number`` starts at 1, so the first bonus uses log(1) == 0.
    for step_number in range(1, episodes + 1):
        # The 1e-5 keeps the division finite for arms that were never pulled,
        # which gives those arms a very large bonus once log(step) > 0.
        bonus = np.sqrt(2 * np.log(step_number) / (pulls + 1e-5))
        arm = np.argmax(estimates + bonus)

        _, reward, _, _ = env.step(arm)
        pulls[arm] += 1
        # Incremental sample mean: shift the estimate by error / pull count.
        error = reward - estimates[arm]
        estimates[arm] += error / pulls[arm]

    return estimates
