In [1]:
!pip install gym numpy




In [2]:
import gym
import numpy as np
import random

In [6]:
# Define the environment
class GridEnv(gym.Env):
    """10x10 grid world with a fixed goal and one randomly wandering obstacle.

    States are ``(row, col)`` tuples. The agent starts at ``(0, 0)`` and the
    episode ends when it reaches the goal at ``(9, 9)``. After every agent move
    the obstacle takes one random step (see :meth:`move_obstacle`).

    Rewards per step: ``+100`` for reaching the goal, ``-10`` for moving onto
    the obstacle's cell, ``-1`` otherwise.

    The class follows the classic Gym calling convention: ``reset()`` returns
    only the observation and ``step()`` returns the 4-tuple
    ``(observation, reward, done, info)``.
    """

    def __init__(self):
        self.grid_size = 10
        # Starting cells are stored once so reset() cannot drift from __init__.
        self.start = (0, 0)
        self.obstacle_start = (5, 5)
        self.state = self.start
        self.goal = (9, 9)
        self.obstacle = self.obstacle_start
        self.action_space = gym.spaces.Discrete(4)  # 4 possible actions: up, down, left, right
        self.observation_space = gym.spaces.Box(low=0, high=self.grid_size-1, shape=(2,), dtype=np.int32)

    def _observation(self):
        """Return the agent position as an array matching ``observation_space``'s dtype."""
        return np.array(self.state, dtype=np.int32)

    def reset(self):
        """Put the agent and the obstacle back on their starting cells.

        Returns:
            The initial observation, an ``int32`` array ``[row, col]``.
        """
        self.state = self.start
        self.obstacle = self.obstacle_start
        return self._observation()

    def step(self, action):
        """Move the agent one cell, score the move, then move the obstacle.

        Moves that would leave the grid are clamped to the border, so the
        agent stays in place when it walks into a wall.

        Args:
            action: 0 = up (row - 1), 1 = down (row + 1),
                2 = left (col - 1), 3 = right (col + 1).

        Returns:
            ``(observation, reward, done, info)`` where ``info`` is an empty dict.

        Raises:
            ValueError: if ``action`` is not one of 0, 1, 2, 3.
        """
        row, col = self.state
        last = self.grid_size - 1

        if action == 0:  # up
            next_state = (max(row - 1, 0), col)
        elif action == 1:  # down
            next_state = (min(row + 1, last), col)
        elif action == 2:  # left
            next_state = (row, max(col - 1, 0))
        elif action == 3:  # right
            next_state = (row, min(col + 1, last))
        else:
            # Without this branch an unknown action surfaced as UnboundLocalError.
            raise ValueError(f"Invalid action {action!r}; expected one of 0, 1, 2, 3")

        if next_state == self.obstacle:
            reward = -10
            done = False
        elif next_state == self.goal:
            reward = 100
            done = True
        else:
            reward = -1
            done = False

        self.state = next_state
        # The obstacle moves after the agent's move has been scored.
        self.move_obstacle()
        return self._observation(), reward, done, {}

    def move_obstacle(self):
        """Move the obstacle one cell in a uniformly random direction.

        The obstacle stays where it is when the chosen move would leave the
        grid or land on the goal cell. Keeping it off the goal matters because
        ``step`` checks the obstacle before the goal: an obstacle parked on the
        goal would turn a goal arrival into a -10 non-terminal move.
        """
        moves = [(0,1), (0,-1), (1,0), (-1,0)]
        d_row, d_col = random.choice(moves)
        next_obstacle = (self.obstacle[0] + d_row, self.obstacle[1] + d_col)

        in_bounds = (
            0 <= next_obstacle[0] < self.grid_size
            and 0 <= next_obstacle[1] < self.grid_size
        )
        if in_bounds and next_obstacle != self.goal:
            self.obstacle = next_obstacle

    def render(self):
        """Print the grid: 1 = agent, 2 = goal, -1 = obstacle, 0 = empty.

        Cells are written in the order agent, goal, obstacle, so when two
        markers share a cell the later one is the one shown.
        """
        grid = np.zeros((self.grid_size, self.grid_size))
        grid[self.state] = 1
        grid[self.goal] = 2
        grid[self.obstacle] = -1
        print(grid)

In [7]:
# Q-learning algorithm
def q_learning(env, episodes=1000, alpha=0.1, gamma=0.99, epsilon=0.1, max_steps=None):
    """Train a tabular Q-function on a grid environment with epsilon-greedy Q-learning.

    Args:
        env: Environment exposing ``grid_size``, ``action_space.n``,
            ``action_space.sample()``, ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done, info)``, where states
            are indexable as ``(row, col)``.
        episodes: Number of training episodes.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of taking a random action instead of the greedy one.
        max_steps: Optional cap on steps per episode. ``None`` (the default)
            runs each episode until the environment reports ``done``.

    Returns:
        Array of shape ``(grid_size, grid_size, n_actions)`` holding the
        learned Q-values.
    """
    q_table = np.zeros((env.grid_size, env.grid_size, env.action_space.n))

    for episode in range(episodes):
        state = env.reset()
        done = False
        steps = 0

        while not done and (max_steps is None or steps < max_steps):
            # Epsilon-greedy action selection.
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state[0], state[1]])

            next_state, reward, done, _ = env.step(action)
            old_value = q_table[state[0], state[1], action]

            # A terminal transition has no future return, so do not bootstrap
            # from the successor state's Q-values when the episode has ended.
            if done:
                target = reward
            else:
                next_max = np.max(q_table[next_state[0], next_state[1]])
                target = reward + gamma * next_max

            q_table[state[0], state[1], action] = old_value + alpha * (target - old_value)

            state = next_state
            steps += 1

        if episode % 100 == 0:
            print(f"Episode {episode}")

    return q_table

In [8]:
# Entry point: train a Q-table, then replay the greedy policy for at most 100 steps
if __name__ == "__main__":
    # Train on a fresh environment using the default hyperparameters.
    env = GridEnv()
    q_table = q_learning(env)

    # Replay the learned policy greedily (no exploration), printing the grid
    # before every move and giving up after 100 steps.
    state, done, steps = env.reset(), False, 0
    while steps < 100 and not done:
        env.render()
        action = np.argmax(q_table[state[0], state[1]])
        state, reward, done, _ = env.step(action)
        steps += 1

    # Show the final position and report the outcome.
    env.render()
    print("Reached the goal!" if done else "Failed to reach the goal.")

Episode 0
Episode 100
Episode 200
Episode 300
Episode 400
Episode 500
Episode 600
Episode 700
Episode 800
Episode 900
[[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  2.]]
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  2.]]
[[ 0.  0.  0.  0.  0