In [None]:
#Task 1
import gym
import numpy as np

# --- Environment and Q-table -------------------------------------------
# Tabular Q-learning on Taxi-v3: the table has one row per discrete state
# and one column per discrete action, all initialised to zero.
env = gym.make('Taxi-v3')
n_states = env.observation_space.n
n_actions = env.action_space.n
q_table = np.zeros([n_states, n_actions])

# Learning rate and discount factor used by the update rule below.
alpha = 0.1
gamma = 0.9

# Training budget: number of episodes and the per-episode step cap.
num_episodes = 1000
max_steps = 100

# --- Training ----------------------------------------------------------
for episode in range(num_episodes):
    state = env.reset()
    done = False
    rewards = 0  # return accumulated in this episode (not read afterwards)

    for step in range(max_steps):
        # Epsilon-greedy selection: take a random action when the uniform
        # draw is at most 0.1, otherwise take the current greedy action.
        if np.random.rand() <= 0.1:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])

        next_state, reward, done, _ = env.step(action)

        # One-step temporal-difference update towards
        # reward + gamma * max_a' Q(next_state, a').
        td_target = reward + gamma * np.max(q_table[next_state])
        td_error = td_target - q_table[state, action]
        q_table[state, action] += alpha * td_error

        rewards += reward
        state = next_state

        if done:
            break

# --- Greedy evaluation -------------------------------------------------
# Run the learned table without exploration and count steps and penalties.
total_epochs, total_penalties = 0, 0
episodes = 100

for _ in range(episodes):
    state = env.reset()
    epochs = penalties = reward = 0
    done = False

    while True:
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        epochs += 1
        # A reward of exactly -10 is counted as one penalty.
        penalties += int(reward == -10)

        if done:
            break

    total_epochs += epochs
    total_penalties += penalties

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

In [None]:
#Task 2
import gym
import numpy as np

# Tabular Q-learning on the Taxi environment in which terminal transitions
# are updated towards the bare reward (no bootstrapping from next_state).
#
# The environment id is 'Taxi-v3', the same one Task 1 uses: gym releases
# that register 'Taxi-v3' reject 'Taxi-v2' as deprecated, so asking for
# 'Taxi-v2' here would stop the cell before any training happens.
env = gym.make('Taxi-v3')

# One row per discrete state, one column per discrete action.
q_table = np.zeros([env.observation_space.n, env.action_space.n])

alpha = 0.1    # learning rate
gamma = 0.9    # discount factor
epsilon = 0.1  # probability threshold for taking a random action

num_episodes = 1000
max_steps = 100  # per-episode step cap during training

for episode in range(num_episodes):
    state = env.reset()
    done = False
    rewards = 0  # return accumulated in this episode

    for step in range(max_steps):
        # Epsilon-greedy selection: greedy when the draw exceeds epsilon.
        if np.random.rand() > epsilon:
            action = np.argmax(q_table[state])
        else:
            action = env.action_space.sample()

        next_state, reward, done, _ = env.step(action)

        # Accumulate before the terminal branch so that the reward of the
        # final transition is part of the episode return as well.
        rewards += reward

        if not done:
            q_table[state, action] += alpha * (reward + gamma * np.max(q_table[next_state]) - q_table[state, action])
        else:
            # Terminal transition: there is no successor value to bootstrap
            # from, so the target is the reward alone.
            q_table[state, action] += alpha * (reward - q_table[state, action])
            break

        state = next_state

# Greedy evaluation of the learned table: no exploration, no updates.
total_epochs, total_penalties = 0, 0
episodes = 100

for _ in range(episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0

    done = False

    while not done:
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        # A reward of exactly -10 is counted as one penalty.
        if reward == -10:
            penalties += 1

        epochs += 1

    total_penalties += penalties
    total_epochs += epochs

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")


In [None]:
#Task 3
import gym
from gym import spaces
import numpy as np

class HealthcareNetworkEnv(gym.Env):
    """Toy routing environment over ``num_nodes`` nodes.

    The observation is the index of the current node and an action is the
    index of the node to move to. Every step is rewarded with ``-action``.
    An episode starts at node 0 and ends as soon as the sink node
    (``num_nodes - 1``) is reached or ``max_steps`` steps have been taken.
    """

    def __init__(self, num_nodes=10, max_steps=100):
        """Create the environment.

        Args:
            num_nodes: Number of nodes; also the size of both the action
                space and the observation space.
            max_steps: Maximum number of steps per episode.
        """
        super(HealthcareNetworkEnv, self).__init__()
        self.num_nodes = num_nodes
        self.max_steps = max_steps
        self.action_space = spaces.Discrete(self.num_nodes)
        self.observation_space = spaces.Discrete(self.num_nodes)
        # The last node is the sink; reaching it ends the episode.
        self.sink_node = self.num_nodes - 1
        self.steps_taken = 0
        self.state = 0

    def reset(self):
        """Start a new episode at node 0 and return the initial state."""
        self.steps_taken = 0
        self.state = 0
        return self.state

    def step(self, action):
        """Move to node ``action`` and report the outcome.

        Args:
            action: Index of the node to move to; must belong to
                ``action_space``.

        Returns:
            A ``(state, reward, done, info)`` tuple where ``state`` is the
            node reached, ``reward`` is ``-action``, ``done`` tells whether
            the episode has ended, and ``info`` is an empty dict.

        Raises:
            ValueError: If ``action`` is not a member of ``action_space``.
        """
        if not self.action_space.contains(action):
            raise ValueError(f"Invalid action {action!r} for {self.action_space}")

        reward = -action
        self.steps_taken += 1

        # Apply the transition first and only then test for termination, so
        # that `done` is reported on the very step that reaches the sink
        # rather than one step later.
        self.state = action
        done = bool(self.state == self.sink_node or self.steps_taken >= self.max_steps)

        return self.state, reward, done, {}

# Train a tabular Q-learning agent on HealthcareNetworkEnv and then measure
# the average return of the greedy policy.
env = HealthcareNetworkEnv(num_nodes=10, max_steps=100)

# One row per state (node), one column per action (target node).
Q_table = np.zeros([env.observation_space.n, env.action_space.n])
alpha = 0.1    # learning rate
gamma = 0.9    # discount factor
epsilon = 0.1  # probability of taking a random action

num_episodes = 1000
for episode in range(num_episodes):
    state = env.reset()
    done = False

    while not done:
        # Epsilon-greedy action selection.
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_table[state])

        next_state, reward, done, _ = env.step(action)

        old_Q = Q_table[state, action]
        next_max_Q = np.max(Q_table[next_state])
        # Q-learning update written as a convex combination of the old
        # estimate and the TD target reward + gamma * max_a' Q(s', a').
        # old_Q must not be subtracted again inside the alpha term: doing
        # so makes the estimate settle at half of the TD target.
        new_Q = (1 - alpha) * old_Q + alpha * (reward + gamma * next_max_Q)
        Q_table[state, action] = new_Q

        state = next_state

# Greedy evaluation: no exploration and no further updates.
total_reward = 0
num_eval_episodes = 100
for _ in range(num_eval_episodes):
    state = env.reset()
    done = False

    while not done:
        action = np.argmax(Q_table[state])
        state, reward, done, _ = env.step(action)
        total_reward += reward

average_reward = total_reward / num_eval_episodes
print(f"Average reward over {num_eval_episodes} evaluation episodes: {average_reward}")
