<a href="https://colab.research.google.com/github/SanjayS2348553/Reinforcement-Learning/blob/main/2348553_SANJAY_S_RL_LAB_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install gymnasium numpy


Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0


In [2]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces

class GridWorldEnv(gym.Env):
    def __init__(self, grid_size=5, target_position=(4, 4), obstacle_positions=[]):
        super(GridWorldEnv, self).__init__()

        # Define grid size
        self.grid_size = grid_size
        self.target_position = target_position
        self.obstacle_positions = set(obstacle_positions)

        # Action space: 0 = up, 1 = right, 2 = down, 3 = left
        self.action_space = spaces.Discrete(4)

        # Observation space: agent's position in the grid
        self.observation_space = spaces.Box(low=0, high=grid_size - 1, shape=(2,), dtype=np.int32)

        # Initialize agent's position
        self.reset()

    def reset(self):
        self.agent_position = np.array([0, 0])  # Starting position
        return self.agent_position

    def step(self, action):
        # Move agent
        if action == 0:  # Up
            self.agent_position[0] = max(0, self.agent_position[0] - 1)
        elif action == 1:  # Right
            self.agent_position[1] = min(self.grid_size - 1, self.agent_position[1] + 1)
        elif action == 2:  # Down
            self.agent_position[0] = min(self.grid_size - 1, self.agent_position[0] + 1)
        elif action == 3:  # Left
            self.agent_position[1] = max(0, self.agent_position[1] - 1)

        # Check if agent hits an obstacle
        if tuple(self.agent_position) in self.obstacle_positions:
            reward = -1
            done = False
        # Check if agent reaches the target
        elif np.array_equal(self.agent_position, self.target_position):
            reward = 10
            done = True
        else:
            reward = -0.1
            done = False

        return self.agent_position, reward, done, {}

    def render(self):
        grid = np.zeros((self.grid_size, self.grid_size), dtype=str)
        grid[:] = '.'
        grid[tuple(self.target_position)] = 'T'
        for obs in self.obstacle_positions:
            grid[obs] = 'X'
        grid[tuple(self.agent_position)] = 'A'
        print("\n".join([" ".join(row) for row in grid]))
        print()

# Example usage
if __name__ == "__main__":
    env = GridWorldEnv(obstacle_positions=[(2, 2), (3, 3)])
    obs = env.reset()
    env.render()

    for _ in range(10):
        action = env.action_space.sample()
        obs, reward, done, _ = env.step(action)
        env.render()
        if done:
            break


A . . . .
. . . . .
. . X . .
. . . X .
. . . . T

. . . . .
A . . . .
. . X . .
. . . X .
. . . . T

. . . . .
. A . . .
. . X . .
. . . X .
. . . . T

. A . . .
. . . . .
. . X . .
. . . X .
. . . . T

A . . . .
. . . . .
. . X . .
. . . X .
. . . . T

. A . . .
. . . . .
. . X . .
. . . X .
. . . . T

. . A . .
. . . . .
. . X . .
. . . X .
. . . . T

. . . . .
. . A . .
. . X . .
. . . X .
. . . . T

. . A . .
. . . . .
. . X . .
. . . X .
. . . . T

. A . . .
. . . . .
. . X . .
. . . X .
. . . . T

. . A . .
. . . . .
. . X . .
. . . X .
. . . . T



In [3]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class ContinuousPendulumEnv(gym.Env):
    def __init__(self):
        super(ContinuousPendulumEnv, self).__init__()

        # State: [theta (angle), theta_dot (angular velocity)]
        self.observation_space = spaces.Box(low=np.array([-np.pi, -8]),
                                            high=np.array([np.pi, 8]),
                                            dtype=np.float32)

        # Action: torque applied to the pendulum
        self.action_space = spaces.Box(low=np.array([-2.0]),
                                       high=np.array([2.0]),
                                       dtype=np.float32)

        # Simulation parameters
        self.dt = 0.05
        self.max_steps = 200
        self.steps = 0

        self.reset()

    def reset(self):
        # Randomize initial state
        self.state = np.array([np.random.uniform(-np.pi, np.pi), 0])  # [theta, theta_dot]
        self.steps = 0
        return self.state

    def step(self, action):
        theta, theta_dot = self.state
        torque = np.clip(action, self.action_space.low, self.action_space.high)[0]

        # Dynamics: d2(theta)/dt2 = -g/l * sin(theta) - b/m * d(theta)/dt + torque / (m * l^2)
        g, l, m, b = 9.8, 1.0, 1.0, 0.1  # gravity, length, mass, damping coefficient
        theta_ddot = (-g / l * np.sin(theta) - b / m * theta_dot + torque / (m * l**2))

        # Update state using Euler integration
        theta_dot += theta_ddot * self.dt
        theta += theta_dot * self.dt

        # Wrap theta to [-pi, pi]
        theta = ((theta + np.pi) % (2 * np.pi)) - np.pi

        self.state = np.array([theta, theta_dot])
        self.steps += 1

        # Reward: stabilize the pendulum upright (theta = 0)
        reward = - (theta**2 + 0.1 * theta_dot**2 + 0.01 * torque**2)
        done = self.steps >= self.max_steps

        return self.state, reward, done, {}

    def render(self):
        print(f"State: {self.state}, Steps: {self.steps}")

# Example usage
if __name__ == "__main__":
    env = ContinuousPendulumEnv()
    state = env.reset()
    for _ in range(100):
        action = env.action_space.sample()  # Random action
        next_state, reward, done, _ = env.step(action)
        env.render()
        if done:
            break


State: [ 0.43078081 -0.28195749], Steps: 1
State: [ 0.41020348 -0.41154659], Steps: 2
State: [ 0.38162062 -0.57165721], Steps: 3
State: [ 0.33910784 -0.85025558], Steps: 4
State: [ 0.28582013 -1.06575423], Steps: 5
State: [ 0.22384033 -1.23959607], Steps: 6
State: [ 0.15784706 -1.31986535], Steps: 7
State: [ 0.09185042 -1.31993285], Steps: 8
State: [ 0.0243076  -1.35085627], Steps: 9
State: [-0.04303934 -1.34693882], Steps: 10
State: [-0.10798113 -1.29883583], Steps: 11
State: [-0.17316731 -1.30372365], Steps: 12
State: [-0.23452356 -1.2271249 ], Steps: 13
State: [-0.28516072 -1.01274317], Steps: 14
State: [-0.32627251 -0.82223591], Steps: 15
State: [-0.35649959 -0.60454161], Steps: 16
State: [-0.37921664 -0.45434107], Steps: 17
State: [-0.39044353 -0.22453762], Steps: 18
State: [-0.39171269 -0.02538329], Steps: 19
State: [-0.3864791   0.10467181], Steps: 20
State: [-0.3730814   0.26795397], Steps: 21
State: [-0.35171478  0.42733243], Steps: 22
State: [-0.32516364  0.53102273], Steps: 

  gym.logger.warn(
  gym.logger.warn(


In [4]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces

class StochasticBanditEnv(gym.Env):
    def __init__(self, n_arms=5):
        super(StochasticBanditEnv, self).__init__()

        # Number of arms (actions)
        self.n_arms = n_arms
        self.action_space = spaces.Discrete(n_arms)

        # Reward probabilities for each arm
        self.reward_probs = np.random.uniform(0, 1, size=n_arms)

    def reset(self):
        # No state for bandits
        return None

    def step(self, action):
        # Generate reward based on chosen arm's probability
        assert self.action_space.contains(action), "Invalid action"
        reward = np.random.rand() < self.reward_probs[action]
        return None, float(reward), False, {}

    def render(self):
        print(f"Reward probabilities: {self.reward_probs}")

# Example usage
if __name__ == "__main__":
    env = StochasticBanditEnv()
    env.render()
    env.reset()
    for _ in range(10):
        action = env.action_space.sample()  # Random arm selection
        _, reward, _, _ = env.step(action)
        print(f"Action: {action}, Reward: {reward}")


Reward probabilities: [0.99811657 0.790234   0.9300397  0.80288682 0.98474511]
Action: 1, Reward: 0.0
Action: 0, Reward: 1.0
Action: 3, Reward: 1.0
Action: 4, Reward: 1.0
Action: 1, Reward: 1.0
Action: 2, Reward: 1.0
Action: 1, Reward: 1.0
Action: 4, Reward: 1.0
Action: 3, Reward: 1.0
Action: 1, Reward: 1.0
