## Value Iteration Algorithm

In [1]:
import typing as tt
import gymnasium as gym
from collections import defaultdict, Counter
from torch.utils.tensorboard.writer import SummaryWriter
from gymnasium.wrappers import RecordVideo

ENV_NAME = "FrozenLake-v1"
GAMMA = 0.9
TEST_EPISODES = 100

State = int
Action = int
RewardKey = tt.Tuple[State, Action, State]
TransitKey = tt.Tuple[State, Action]

In [2]:
class Agent:
    def __init__(self):
        self.env = gym.make(ENV_NAME)
        self.state, _ = self.env.reset()
        self.rewards: tt.Dict[RewardKey, float] = defaultdict(float)
        self.transits: tt.Dict[TransitKey, Counter] = defaultdict(Counter)
        self.values: tt.Dict[State, float] = defaultdict(float)

    def play_n_random_steps(self, n: int):
        for _ in range(n):
            action = self.env.action_space.sample()
            new_state, reward, is_done, is_trunc, _ = self.env.step(action)
            rw_key = (self.state, action, new_state)
            self.rewards[rw_key] = float(reward)
            tr_key = (self.state, action)
            self.transits[tr_key][new_state] += 1
            if is_done or is_trunc:
                self.state, _ = self.env.reset()
            else:
                self.state = new_state

    def calc_action_value(self, state: State, action: Action) -> float:
        target_counts = self.transits[(state, action)]
        total = sum(target_counts.values())
        action_value = 0.0
        for tgt_state, count in target_counts.items():
            rw_key = (state, action, tgt_state)
            reward = self.rewards[rw_key]
            val = reward + GAMMA * self.values[tgt_state]
            action_value += (count / total) * val
        return action_value

    def select_action(self, state: State) -> Action:
        best_action, best_value = None, None
        for action in range(self.env.action_space.n):
            action_value = self.calc_action_value(state, action)
            if best_value is None or best_value < action_value:
                best_value = action_value
                best_action = action
        return best_action

    def play_episode(self, env: gym.Env) -> float:
        total_reward = 0.0
        state, _ = env.reset()
        while True:
            action = self.select_action(state)
            new_state, reward, is_done, is_trunc, _ = env.step(action)
            rw_key = (state, action, new_state)
            self.rewards[rw_key] = float(reward)
            tr_key = (state, action)
            self.transits[tr_key][new_state] += 1
            total_reward += reward
            if is_done or is_trunc:
                break
            state = new_state
        return total_reward

    def value_iteration(self):
        for state in range(self.env.observation_space.n):
            state_values = [
                self.calc_action_value(state, action)
                for action in range(self.env.action_space.n)
            ]
            self.values[state] = max(state_values)


In [3]:
if __name__ == "__main__":
    test_env = gym.make(ENV_NAME)
    agent = Agent()
    writer = SummaryWriter(comment="-v-iteration")
    iter_no = 0
    best_reward = 0.0
    while True:
        iter_no += 1
        agent.play_n_random_steps(100)
        agent.value_iteration()
        reward = 0.0
        for _ in range(TEST_EPISODES):
            reward += agent.play_episode(test_env)
        reward /= TEST_EPISODES
        writer.add_scalar("reward", reward, iter_no)
        if reward > best_reward:
            print(f"{iter_no}: Best reward updated: {best_reward:.3f} -> {reward:.3f}")
            best_reward = reward
        if reward > 0.8:
            print(f"Solved in {iter_no} iterations!")
            break
    writer.close()

4: Best reward updated: 0.000 -> 0.010
5: Best reward updated: 0.010 -> 0.080
6: Best reward updated: 0.080 -> 0.120
10: Best reward updated: 0.120 -> 0.300
12: Best reward updated: 0.300 -> 0.430
13: Best reward updated: 0.430 -> 0.440
17: Best reward updated: 0.440 -> 0.490
18: Best reward updated: 0.490 -> 0.640
19: Best reward updated: 0.640 -> 0.740
20: Best reward updated: 0.740 -> 0.770
21: Best reward updated: 0.770 -> 0.800
67: Best reward updated: 0.800 -> 0.810
Solved in 67 iterations!


In [4]:
# -------- Testing + Recording Loop --------
num_episodes = 5
print("\n=== Testing & Recording trained agent ===")
video_env = RecordVideo(
        gym.make(ENV_NAME, render_mode="rgb_array"),
        video_folder="./videos",
        episode_trigger=lambda ep_id: True  # record every test episode
    )

for ep in range(num_episodes):
    state, _ = video_env.reset()
    total_reward = 0
    while True:
        action = agent.select_action(state)
        new_state, reward, is_done, is_trunc, _ = video_env.step(action)
        total_reward += reward
        state = new_state
        if is_done or is_trunc:
            break
    print(f"Test Episode {ep+1}: reward={total_reward}")

video_env.close()
print(f"All {num_episodes} test episodes recorded in ./videos folder")


=== Testing & Recording trained agent ===


  logger.warn(


Test Episode 1: reward=1.0
Test Episode 2: reward=1.0
Test Episode 3: reward=0.0
Test Episode 4: reward=1.0
Test Episode 5: reward=1.0
All 5 test episodes recorded in ./videos folder


In [4]:
agent.select_action(1), agent.transits, agent.rewards, agent.values

(3,
 defaultdict(collections.Counter,
             {(0, np.int64(2)): Counter({4: 1935, 0: 1925, 1: 1917}),
              (0, np.int64(3)): Counter({0: 492, 1: 248}),
              (1, np.int64(1)): Counter({2: 112, 0: 99, 5: 71}),
              (1, np.int64(3)): Counter({1: 1471, 0: 1402, 2: 1322}),
              (0, np.int64(1)): Counter({4: 368, 0: 347, 1: 346}),
              (1, np.int64(0)): Counter({5: 99, 1: 98, 0: 98}),
              (1, np.int64(2)): Counter({2: 100, 1: 97, 5: 91}),
              (0, np.int64(0)): Counter({0: 46137, 4: 23157}),
              (2, np.int64(2)): Counter({2: 1500, 3: 1499, 6: 1428}),
              (2, np.int64(1)): Counter({1: 47, 3: 46, 6: 45}),
              (3, np.int64(3)): Counter({3: 3028, 2: 1474}),
              (3, np.int64(0)): Counter({3: 30, 7: 22, 2: 19}),
              (2, np.int64(3)): Counter({3: 51, 2: 47, 1: 43}),
              (2, np.int64(0)): Counter({2: 843, 6: 836, 1: 795}),
              (6, np.int64(2)): Counter({2: 1717,

In [None]:
# Transition probabilities and rewards
env = gym.make(ENV_NAME)
env.unwrapped.P

{0: {0: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  2: [(0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)],
  3: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 1: {0: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False)],
  2: [(0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  3: [(0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 2:

In [2]:
import numpy as np
import gymnasium as gym
from gymnasium.wrappers import RecordVideo

In [3]:
env = gym.make("FrozenLake-v1")

In [4]:
num_states = env.observation_space.n
num_actions = env.action_space.n
num_states, num_actions

(np.int64(16), np.int64(4))

In [None]:
def value_iteration(num_states, num_actions, gamma=0.9):
    """Function to perform value iteration algorithm."""
    policy = np.zeros(num_states)
    V = np.zeros(num_states)
    V_old = -1 * np.ones(num_states)
    while np.max(np.abs(V - V_old)) > 1e-3:
        V_old = V.copy()
        for s in range(num_states):
            q = np.zeros(num_actions)
            for a in range(num_actions):
                for prob, next_s, reward, is_done in env.unwrapped.P[s][a]:
                    q[a] += prob * (reward + gamma * V_old[next_s]) # Computing the Q Value using the bellman equation of optimality
            V[s] = np.max(q) # Comouting the value(estimate) of a state
            policy[s] = np.argmax(q)
    return V, policy

In [6]:
value, policy = value_iteration(num_states, num_actions)

In [7]:
value.reshape(4, 4), policy.reshape(4, 4)

(array([[0.06253203, 0.05606969, 0.07051342, 0.05159888],
        [0.08603505, 0.        , 0.11007757, 0.        ],
        [0.14066756, 0.24422494, 0.29731176, 0.        ],
        [0.        , 0.3775274 , 0.6377414 , 0.        ]]),
 array([[0., 3., 0., 3.],
        [0., 0., 0., 0.],
        [3., 1., 0., 0.],
        [0., 2., 1., 0.]]))

In [10]:
num_episodes = 5
env_name = "FrozenLake-v1"
print("\n=== Testing & Recording trained agent ===")
video_env = RecordVideo(
        gym.make(env_name, render_mode="rgb_array"),
        video_folder="./videos_frozenlake_viter",
        episode_trigger=lambda ep_id: True  # record every test episode
    )

for ep in range(num_episodes):
    state, _ = video_env.reset()
    total_reward = 0
    while True:
        action = int(policy[state])
        new_state, reward, is_done, is_trunc, _ = video_env.step(action)
        total_reward += reward
        state = new_state
        if is_done or is_trunc:
            break
    print(f"Test Episode {ep+1}: reward={total_reward}")

video_env.close()
print(f"All {num_episodes} test episodes recorded in ./videos_frozenlake_viter folder")


=== Testing & Recording trained agent ===
Test Episode 1: reward=1.0
Test Episode 2: reward=1.0


  logger.warn(


Test Episode 3: reward=1.0
Test Episode 4: reward=0.0
Test Episode 5: reward=1.0
All 5 test episodes recorded in ./videos_frozenlake_viter folder


## Value Iteration on the gymnasium Taxi environment

In [1]:
import numpy as np
import gymnasium as gym
from gymnasium.wrappers import RecordVideo

In [2]:
env = gym.make("Taxi-v3")

In [3]:
num_states = env.observation_space.n
num_actions = env.action_space.n
num_states, num_actions

(np.int64(500), np.int64(6))

In [None]:
def value_iteration(num_states, num_actions, gamma=0.9):
    """Function to perform value iteration algorithm."""
    policy = np.zeros(num_states)
    V = np.zeros(num_states)
    V_old = -1 * np.ones(num_states)
    while np.max(np.abs(V - V_old)) > 1e-3:
        V_old = V.copy()
        for s in range(num_states):
            q = np.zeros(num_actions)
            for a in range(num_actions):
                for prob, next_s, reward, is_done in env.unwrapped.P[s][a]:
                    q[a] += prob * (reward + gamma * V_old[next_s]) # Computing the Q Value using the bellman equation of optimality
            V[s] = np.max(q) # Computing the value(estimate) of a state
            policy[s] = np.argmax(q)
    return V, policy

In [5]:
value, policy = value_iteration(num_states, num_actions)

In [6]:
value, policy

(array([ 89.46916234,  32.81563744,  55.26016234,  37.57393009,
          8.42815264,  32.81563744,   8.42815264,  15.28085117,
         32.81563744,  18.08978465,  55.26016234,  21.21187144,
         12.75186641,  18.08978465,  12.75186641,  37.57393009,
        100.52229109,  37.57393009,  62.51229109,  42.85987234,
         79.52229109,  28.53411868,  48.73419109,  32.81563744,
         10.47672475,  37.57393009,  10.47672475,  18.08978465,
         28.53411868,  15.28085117,  48.73419109,  18.08978465,
         15.28085117,  21.21187144,  15.28085117,  42.85987234,
         89.46916234,  42.85987234,  55.26016234,  48.73419109,
         42.85987234,  12.75186641,  24.67980717,  15.28085117,
         24.67980717,  70.56916234,  24.67980717,  37.57393009,
         24.67980717,  12.75186641,  42.85987234,  15.28085117,
         18.08978465,  24.67980717,  18.08978465,  48.73419109,
         48.73419109,  79.52229109,  48.73419109,  55.26016234,
         37.57393009,  10.47672475,  21.

In [7]:
num_episodes = 5
env_name = "Taxi-v3"
print("\n=== Testing & Recording trained agent ===")
video_env = RecordVideo(
        gym.make(env_name, render_mode="rgb_array"),
        video_folder="./videos_taxi_viter",
        episode_trigger=lambda ep_id: True  # record every test episode
    )

for ep in range(num_episodes):
    state, _ = video_env.reset()
    total_reward = 0
    while True:
        action = int(policy[state])
        new_state, reward, is_done, is_trunc, _ = video_env.step(action)
        total_reward += reward
        state = new_state
        if is_done or is_trunc:
            break
    print(f"Test Episode {ep+1}: reward={total_reward}")

video_env.close()
print(f"All {num_episodes} test episodes recorded in ./videos_taxi_viter folder")


=== Testing & Recording trained agent ===


  logger.warn(


Test Episode 1: reward=11
Test Episode 2: reward=4
Test Episode 3: reward=7
Test Episode 4: reward=7
Test Episode 5: reward=12
All 5 test episodes recorded in ./videos_taxi_viter folder


## Tabular Q Learning on the FrozenLake Environment

In [1]:
import typing as tt
import gymnasium as gym
from collections import defaultdict
from torch.utils.tensorboard.writer import SummaryWriter
import os

In [None]:
ENV_NAME = "FrozenLake-v1"
GAMMA = 0.9
ALPHA = 0.2
EPSILON = 0.1
TEST_EPISODES = 20
VIDEO_FOLDER = "videos_frozenlake_ql"

In [4]:
State = int
Action = int
ValuesKey = tt.Tuple[State, Action]

In [5]:
class Agent:
    def __init__(self):
        self.env = gym.make(ENV_NAME)
        self.state, _ = self.env.reset()
        self.values: tt.Dict[ValuesKey, float] = defaultdict(float) 

    def sample_env(self) -> tt.Tuple[State, Action, float, State]:
        action = self.env.action_space.sample()
        old_state = self.state
        new_state, reward, is_done, is_trunc, _ = self.env.step(action)
        if is_done or is_trunc:
            self.state, _ = self.env.reset()
        else:
            self.state = new_state
        return old_state, action, reward, new_state
    
    def best_value_and_action(self, state: State) -> tt.Tuple[float, Action]:
        best_action, best_value = None, None
        for action in range(self.env.action_space.n):
            action_value = self.values[(state, action)]
            if best_value is None or best_value < action_value:
                best_value = action_value
                best_action = action
        if best_action is None:
            best_action = self.env.action_space.sample()
            best_value = self.values[(state, best_action)]
        return best_value, best_action

    def value_update(self, state: State, action: Action, reward: float, next_state: State):
        best_value, _ = self.best_value_and_action(next_state)
        # Q learning update
        self.values[(state, action)] = (1 - ALPHA) * self.values[(state, action)] + ALPHA * (reward + GAMMA * best_value)

    def play_episode(self, env: gym.Env) -> float:
        total_reward = 0.0
        state, _ = env.reset()
        while True:
            _, action = self.best_value_and_action(state)
            new_state, reward, is_done, is_trunc, _ = env.step(action)
            total_reward += reward
            if is_done or is_trunc:
                break
            state = new_state
        return total_reward


In [6]:
if __name__ == "__main__":
    test_env = gym.make(ENV_NAME)
    agent = Agent()
    writer = SummaryWriter(comment="-q-learning")
    iter_no = 0
    best_reward = 0.0
    while True:
        iter_no += 1
        state, action, reward, next_state = agent.sample_env()
        agent.value_update(state, action, reward, next_state)

        test_reward = 0.0
        for _ in range(TEST_EPISODES):
            test_reward += agent.play_episode(test_env)
        test_reward /= TEST_EPISODES
        writer.add_scalar("reward", test_reward, iter_no)
        if test_reward > best_reward:
            print(f"{iter_no}: Best reward updated: {best_reward:.3f} -> {test_reward:.3f}")
            best_reward = test_reward
        if test_reward > 0.8:
            print(f"Solved in {iter_no} iterations!")
            break
    writer.close()


1083: Best reward updated: 0.000 -> 0.100
1085: Best reward updated: 0.100 -> 0.150
1086: Best reward updated: 0.150 -> 0.200
1148: Best reward updated: 0.200 -> 0.250
1642: Best reward updated: 0.250 -> 0.300
1689: Best reward updated: 0.300 -> 0.350
1959: Best reward updated: 0.350 -> 0.400
3109: Best reward updated: 0.400 -> 0.550
3136: Best reward updated: 0.550 -> 0.600
3218: Best reward updated: 0.600 -> 0.700
10457: Best reward updated: 0.700 -> 0.750
14382: Best reward updated: 0.750 -> 0.800
23592: Best reward updated: 0.800 -> 0.850
Solved in 23592 iterations!


In [7]:
def record_agent_video(agent: Agent, env_name: str, video_path: str, num_episodes: int = 1):
  
    os.makedirs(video_path, exist_ok=True)

    env = gym.make(env_name, render_mode="rgb_array")
    env = gym.wrappers.RecordVideo(env, video_path)

    print(f"Recording {num_episodes} episode(s) to {video_path}...")
    for i in range(num_episodes):
        obs, _ = env.reset()
        done = False
        truncated = False
        total_reward = 0
        while not done and not truncated:
            _, action = agent.best_value_and_action(obs)
            obs, reward, done, truncated, _ = env.step(action)
            total_reward += reward
        print(f"Episode {i+1} finished with reward: {total_reward}")
    env.close()
    print("Video recording complete.")

In [9]:
print("\nTraining complete! Now recording a video of the trained agent.")
record_agent_video(agent, ENV_NAME, VIDEO_FOLDER, num_episodes=5) 


Training complete! Now recording a video of the trained agent.


  logger.warn(


Recording 5 episode(s) to videos_frozenlake_ql...
Episode 1 finished with reward: 1.0
Episode 2 finished with reward: 1.0
Episode 3 finished with reward: 0.0
Episode 4 finished with reward: 1.0
Episode 5 finished with reward: 1.0
Video recording complete.


## SARSA learning Algorithm on the Taxi environment

In [1]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

In [9]:
ENV_NAME = "Taxi-v3"
ALPHA = 0.1
GAMMA = 0.99
EPSILON = 1.0
EPSILON_MIN = 0.01
EPSILON_DECAY = 0.995
NUM_EPISODES = 10000
MAX_STEPS = 100
VIDEO_DIR = "videos_taxi_sarsa"

In [None]:
class SarsaAgent:
    def __init__(self):
        self.env = gym.make(ENV_NAME)
        self.n_states = self.env.observation_space.n
        self.n_actions = self.env.action_space.n
        self.Q_table = np.zeros((self.n_states, self.n_actions))

    def params(self):
        return {
            "Q_table": self.Q_table,
            "ALPHA": ALPHA,
            "GAMMA": GAMMA,
            "EPSILON": EPSILON,
            "EPSILON_MIN": EPSILON_MIN,
            "EPSILON_DECAY": EPSILON_DECAY,
            "MAX_STEPS": MAX_STEPS,
            "NUM_EPISODES": NUM_EPISODES,
            "n_actions": self.n_actions,
            "n_states": self.n_states
        }

    def epsilon_greedy(self, state, epsilon, n_actions):
        if np.random.random() < epsilon:
            return np.random.randint(n_actions)
        else:
            return np.argmax(self.Q_table[state])

    def sarsa_update(self, state, action, reward, next_state, next_action, alpha, gamma):
        predict = self.Q_table[state, action]
        target = reward + gamma * self.Q_table[next_state, next_action]
        self.Q_table[state, action] += alpha * (target - predict) # SARSA update rule

    def train_agent(self, params_):
        epsilon = params_["EPSILON"]
        rewards = []

        for episode in range(params_["NUM_EPISODES"]):
            state, _ = self.env.reset()
            action = self.epsilon_greedy(state, epsilon, params_["n_actions"])
            total_reward = 0

            for _ in range(params_["MAX_STEPS"]):
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                next_action = self.epsilon_greedy(next_state, epsilon, params_["n_actions"])

                # SARSA update
                self.sarsa_update(state, action, reward, next_state, next_action,
                                  params_["ALPHA"], params_["GAMMA"])

                state = next_state
                action = next_action
                total_reward += reward

                if terminated or truncated:
                    break

            # Decay epsilon
            #epsilon = max(params_["EPSILON_MIN"], epsilon * params_["EPSILON_DECAY"])
            if epsilon > params_["EPSILON_MIN"]:
                epsilon *= params_["EPSILON_DECAY"]
            rewards.append(total_reward)

            # Logging every 1000 episodes
            if (episode + 1) % 1000 == 0:
                avg_reward = np.mean(rewards[-1000:])
                print(f"Episode: {episode+1}, Avg Reward: {avg_reward:.3f}, Epsilon: {epsilon:.3f}")

        return self.Q_table, rewards

In [11]:
def test_and_record(Q_table):

    # Create environment with video recording
    env = gym.make(ENV_NAME, render_mode="rgb_array")  # For recording
    env = gym.wrappers.RecordVideo(env, VIDEO_DIR, episode_trigger=lambda e: True)

    n_actions = env.action_space.n
    rewards = []

    for ep in range(5):
        state, _ = env.reset()
        total_reward = 0
        done = False

        while not done:
            # Greedy policy (no exploration)
            action = np.argmax(Q_table[state])
            next_state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            state = next_state

            if terminated or truncated:
                done = True

        rewards.append(total_reward)
        print(f"Episode {ep + 1}: Total Reward = {total_reward}")

    env.close()
    print(f"Videos saved in: {VIDEO_DIR}")
    return rewards

In [8]:
if __name__ == "__main__":
    S_agent = SarsaAgent()
    params_ = S_agent.params()
    Q_table, rewards = S_agent.train_agent(params_)

Episode: 1000, Avg Reward: -124.175, Epsilon: 0.010
Episode: 2000, Avg Reward: -2.405, Epsilon: 0.010
Episode: 3000, Avg Reward: 6.792, Epsilon: 0.010
Episode: 4000, Avg Reward: 7.424, Epsilon: 0.010
Episode: 5000, Avg Reward: 7.555, Epsilon: 0.010
Episode: 6000, Avg Reward: 7.376, Epsilon: 0.010
Episode: 7000, Avg Reward: 7.196, Epsilon: 0.010
Episode: 8000, Avg Reward: 7.420, Epsilon: 0.010
Episode: 9000, Avg Reward: 7.182, Epsilon: 0.010
Episode: 10000, Avg Reward: 7.449, Epsilon: 0.010


In [12]:
test_and_record(Q_table)

Episode 1: Total Reward = 12
Episode 2: Total Reward = 4
Episode 3: Total Reward = 7
Episode 4: Total Reward = 9
Episode 5: Total Reward = 8
Videos saved in: videos_taxi_sarsa


[12, 4, 7, 9, 8]

## A doctor agent using RL

In [1]:
import numpy as np
import random

In [None]:
# Define the environment (The Patient)
class PatientEnvironment:
    def __init__(self):
        self.current_health_state = random.choice([0, 1, 2])
        self.states = {0: "Healthy", 1: "Sick", 2: "Critical"}
        self.actions = {0: "Treatment A", 1: "Treatment B"}
        self.num_states = len(self.states)
        self.num_actions = len(self.actions)

    def reset(self):
        self.current_health_state = random.choice([0, 1, 2])
        return self.current_health_state
    
    def step(self, action):
        reward = 0
        done = False
        next_state = self.current_health_state

        # Simplified state transition and reward logic
        if self.current_health_state == 0: # Healthy
            if action == 0: # Treatment A (maintain health)
                reward = 1
                next_state = 0
            else: # Treatment B (mild negative effect on healthy patient)
                reward = -0.5
                next_state = 1 # Becomes sick
        elif self.current_health_state == 1: # Sick
            if action == 0: # Treatment A (effective for sick)
                reward = 2
                next_state = 0 # Becomes healthy
            else: # Treatment B (less effective for sick)
                reward = 0
                next_state = 1 # Stays sick
        elif self.current_health_state == 2: # Critical
            if action == 0: # Treatment A (might save, but risky)
                if random.random() < 0.7: # 70% chance of recovery
                    reward = 5
                    next_state = 1 # Becomes sick (better than critical)
                else: # 30% chance of failure
                    reward = -10
                    next_state = 2 # Stays critical (or worse, not modelled here)
            else: # Treatment B (ineffective for critical)
                reward = -2
                next_state = 2 # Stays critical or worsens

        # If patient becomes healthy, the "episode"  ends
        if next_state == 0:
            done = True

        self.current_health_state = next_state
        return next_state, reward, done
    

In [3]:
# Define RL Agent (Q-learning)
class DoctorAgent:
    def __init__(self, env):
        self.env = env
        self.q_table = np.zeros((env.num_states, env.num_actions))
        self.learning_rate = 0.1
        self.discount_factor = 0.99
        self.epsilon = 1.0 # Exploration-exploitation trade-off
        self.epsilon_decay_rate = 0.001
        self.epsilon_min = 0.01

    def choose_action(self, state):
        # Epsilon-greedy strategy
        if random.uniform(0, 1) < self.epsilon:
            return random.randint(0, self.env.num_actions - 1) # Explore
        else:
            return np.argmax(self.q_table[state, :]) # Exploit

    def learn(self, state, action, reward, next_state):
        # Q-learning formula
        old_value = self.q_table[state, action]
        next_max = np.max(self.q_table[next_state, :])

        new_value = old_value + self.learning_rate * (reward + self.discount_factor * next_max - old_value)
        self.q_table[state, action] = new_value

    def decay_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay_rate)

In [5]:
# Training Loop
if __name__ == "__main__":
    env = PatientEnvironment()
    agent = DoctorAgent(env)

    num_episodes = 2000
    rewards_per_episode = []

    print("--- Starting Training ---")
    for episode in range(num_episodes):
        state = env.reset() # Reset patient for a new episode
        done = False
        episode_reward = 0

        while not done:
            action = agent.choose_action(state)
            next_state, reward, done = env.step(action)
            agent.learn(state, action, reward, next_state)

            state = next_state
            episode_reward += reward

        agent.decay_epsilon()
        rewards_per_episode.append(episode_reward)

        if (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1}: Total Reward = {episode_reward:.2f}, Epsilon = {agent.epsilon:.2f}")

    print("\n--- Training Finished ---")
    print("\nFinal Q-table:")
    print(agent.q_table)

    print("\n--- Testing the trained agent (10 episodes) ---")
    total_test_rewards = 0
    for _ in range(10):
        state = env.reset()
        done = False
        test_reward = 0
        print(f"\nNew Patient (Initial State: {env.states[state]})")
        while not done:
            action = np.argmax(agent.q_table[state, :]) # Agent acts purely based on learned Q-values
            print(f"  Agent chooses: {env.actions[action]} (Current State: {env.states[state]})")
            next_state, reward, done = env.step(action)
            test_reward += reward
            state = next_state
            if not done:
                print(f"  Patient is now: {env.states[state]} (Reward: {reward})")
            else:
                print(f"  Patient is now: {env.states[state]} (Reward: {reward}). Episode Finished.")
        total_test_rewards += test_reward
        print(f"  Total Test Reward for this patient: {test_reward:.2f}")

    print(f"\nAverage Test Reward over 10 episodes: {total_test_rewards / 10:.2f}")

--- Starting Training ---
Episode 100: Total Reward = 1.00, Epsilon = 0.90
Episode 200: Total Reward = 2.00, Epsilon = 0.80
Episode 300: Total Reward = 2.00, Epsilon = 0.70
Episode 400: Total Reward = 2.00, Epsilon = 0.60
Episode 500: Total Reward = 2.00, Epsilon = 0.50
Episode 600: Total Reward = 2.00, Epsilon = 0.40
Episode 700: Total Reward = 7.00, Epsilon = 0.30
Episode 800: Total Reward = -3.00, Epsilon = 0.20
Episode 900: Total Reward = 2.00, Epsilon = 0.10
Episode 1000: Total Reward = 7.00, Epsilon = 0.01
Episode 1100: Total Reward = 7.00, Epsilon = 0.01
Episode 1200: Total Reward = 1.50, Epsilon = 0.01
Episode 1300: Total Reward = 7.00, Epsilon = 0.01
Episode 1400: Total Reward = 2.00, Epsilon = 0.01
Episode 1500: Total Reward = 2.00, Epsilon = 0.01
Episode 1600: Total Reward = 2.00, Epsilon = 0.01
Episode 1700: Total Reward = 7.00, Epsilon = 0.01
Episode 1800: Total Reward = 2.00, Epsilon = 0.01
Episode 1900: Total Reward = 1.50, Epsilon = 0.01
Episode 2000: Total Reward = 1.5

In [1]:
"""Quickstart example for the ICU-Sepsis environment."""

import gymnasium as gym
import icu_sepsis


def main():
    env = gym.make('Sepsis/ICU-Sepsis-v2')

    state, info = env.reset()
    print('Initial state:', state)
    print('Extra info:', info)

    next_state, reward, terminated, truncated, info = env.step(0)
    print('\nTaking action 0:')
    print('Next state:', next_state)
    print('Reward:', reward)
    print('Terminated:', terminated)
    print('Truncated:', truncated)


if __name__ == '__main__':
    main()

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


Initial state: 617
Extra info: {'admissible_actions': [5, 10, 15, 20], 'state_vector': array([-0.25      , -0.08333333, -0.43855   ,  0.33333333, -0.43708271,
        0.11068928,  0.42393451,  0.15369781, -0.26537177, -0.28637706,
       -0.08265458,  0.31503245, -0.06588095,  0.08514915,  1.33252377,
       -0.36103496, -0.28066148, -0.16137368,  1.82694543, -0.58913025,
       -0.48484812,  0.09477288, -0.54081085,  0.02363826,  0.15988011,
       -0.42123487, -0.23597269, -0.8504507 , -1.13166612, -1.21457448,
        0.38335443,  0.91633856,  0.88553353,  0.27268052, -0.29112997,
        0.52982805, -0.30634874,  1.86515795,  2.24978588,  0.75969555,
        0.74538692,  0.70060293,  0.26431062,  0.37329322,  1.27071747,
        0.09261225, -0.01282182]), 'sofa_score': np.float64(9.31060606060606)}

Taking action 0:
Next state: 617
Reward: 0.0
Terminated: False
Truncated: False


In [1]:
import gymnasium as gym
env = gym.make('Blackjack-v1', natural=False, sab=True)

In [2]:
env.action_space

Discrete(2)

In [3]:
env.observation_space

Tuple(Discrete(32), Discrete(11), Discrete(2))

In [44]:
env.observation_space.sample()

(np.int64(1), np.int64(1), np.int64(0))

## First Visit Monte Carlo Prediction on Blackjack 

In [1]:
import gymnasium as gym
import numpy as np
from collections import defaultdict

In [2]:
env = gym.make("Blackjack-v1", sab=True)

In [3]:
# Policy: Stick if player sum >= 20, else hit
def policy(state):
    player_sum, dealer_card, usable_ace = state
    return 0 if player_sum >= 20 else 0 

In [4]:
def first_visit_mc_prediction(env, ploicy, num_episodes=500000, gamma=1.0):
    """
    Estimates the Value function for a given policy using First-Visit Monte Carlo 
    prediction algorithm.A prediction algorithm is a method used to estimate the 
    value of a policy ie how good it is to follow a given policy.
    """
    returns_sum = defaultdict(float) # Sum of returns for each state
    returns_count = defaultdict(float) # Number of first visits
    V = defaultdict(float) # State value estimates

    for episode in range(num_episodes):
        # Generate a sequence of a 3 tuple (state, action, reward) for an episode
        episode_data = []
        state, _ = env.reset()
        done = False

        while not done:
            action = policy(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            episode_data.append((state, action, reward))
            state = next_state
            done = terminated or truncated

        # Compute returns
        G = 0.0
        visited_states = set()
        for t in reversed(range(len(episode_data))):
            state_t, action_t, reward_t = episode_data[t]
            G = gamma * G + reward_t 
         # First visit check
            if state_t not in visited_states:
                visited_states.add(state_t)
                returns_sum[state_t] += G
                returns_count[state_t] += 1.0
                V[state_t] = returns_sum[state_t] / returns_count[state_t]
    return V

In [5]:
V = first_visit_mc_prediction(env, policy, num_episodes=500000)

In [None]:
V # The value function

defaultdict(float,
            {(18, 10, 0): -0.23957951350690623,
             (12, 10, 0): -0.5756250455572564,
             (15, 7, 0): -0.5045871559633027,
             (19, 5, 1): 0.4519650655021834,
             (17, 5, 1): -0.06858407079646017,
             (17, 7, 0): -0.10025817555938038,
             (16, 10, 0): -0.5816532258064516,
             (21, 4, 1): 1.0,
             (11, 10, 0): -0.5868312757201646,
             (21, 5, 1): 1.0,
             (8, 8, 0): -0.5387323943661971,
             (18, 5, 0): 0.23323615160349853,
             (12, 1, 0): -0.7807172251616696,
             (17, 2, 0): -0.15893760539629004,
             (18, 4, 0): 0.19738751814223512,
             (5, 10, 0): -0.5754251234229293,
             (14, 10, 0): -0.569496110923233,
             (17, 3, 0): -0.12164009111617312,
             (10, 5, 0): -0.13288426209430496,
             (13, 1, 0): -0.7828886844526219,
             (7, 2, 0): -0.3005464480874317,
             (20, 5, 0): 0.6616666666666

In [7]:
len(V)

270

## Iterative Policy Evaluation on FrozenLake

In [1]:
import numpy as np
import gymnasium as gym

In [2]:
def iterative_policy_evaluation(env, pi, gamma=0.99, theta=1e-6):
    """Computes the value of the function for a given policy pi"""
    n_states = env.observation_space.n
    V = np.zeros(n_states)
    env_unwrapped = env.unwrapped

    while True:
        delta = 0
        for s in range(n_states):
            v = 0
            for a, action_prob in enumerate(pi[s]):
                for prob, next_state, reward, terminated in env_unwrapped.P[s][a]:
                    v += action_prob * prob * (reward + gamma * V[next_state]) # Update V using the Bellman equation

            delta = max(delta, abs(v - V[s]))
            V[s] = v
        if delta < theta:
            break
    return V
        

In [None]:
if __name__ == "__main__":
    env = gym.make("FrozenLake-v1")
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    policy = np.ones((n_states, n_actions)) / n_actions
    V = iterative_policy_evaluation(env, pi=policy, gamma=0.99, theta=1e-8)

In [None]:
V.reshape(4, 4) # The estimated value function

array([[0.01235611, 0.01042444, 0.01933842, 0.00947774],
       [0.01478704, 0.        , 0.03889445, 0.        ],
       [0.03260247, 0.08433764, 0.13781085, 0.        ],
       [0.        , 0.17034482, 0.43357944, 0.        ]])

## Q learning on Blackjack

In [3]:
import numpy as np
import gymnasium as gym
from collections import defaultdict

In [4]:
class QAgent_Blackjack:
    def __init__(
        self, 
        env, 
        alpha: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        gamma: float = 0.95):
        
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))
        self.alpha = alpha # Learning rate
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon
        self.gamma = gamma # Discount factor
        self.training_error = []

    def get_action(self, env, obs: tuple[int, int, bool]) -> int:
        """Epsilon Greedy policy"""
        if np.random.random() < self.epsilon:
            return env.action_space.sample()
        else:
            return int(np.argmax(self.q_values[obs]))
        
    def update(self,
               obs: tuple[int, int, bool],
               action: int,
               reward: float,
               terminated: bool,
               next_obs: tuple[int, int, bool]):
        self.q_values[obs, action] = (((1 - self.alpha) * self.q_values[obs, action]) + 
                                      self.alpha * (reward + self.gamma * (not terminated) * np.max(self.q_values[next_obs])))
        
    def decay_epsilon(self):
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)     
    

        
        

In [10]:
# hyperparameters
alpha = 0.1
n_episodes = 1000000
initial_epsilon = 1.0
epsilon_decay = initial_epsilon / (n_episodes / 2)
final_epsilon = 0.1
video_dir = "videos_blackjack_ql"

In [6]:
env = gym.make("Blackjack-v1", sab=True)
Q_agent = QAgent_Blackjack(env=env, alpha=alpha,
                           initial_epsilon=initial_epsilon,
                           epsilon_decay=epsilon_decay,
                           final_epsilon=final_epsilon)

In [7]:
rewards = []
for episode in range(n_episodes):
    obs, info = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        action = Q_agent.get_action(env, obs)
        next_obs, reward, terminated, truncated, _ = env.step(action)
        Q_agent.update(obs, action, reward, terminated, next_obs)
        total_reward += reward
        done = terminated or truncated
        obs = next_obs
    Q_agent.decay_epsilon()
    rewards.append(total_reward)
    if (episode + 1) % 1000 == 0:
        avg_reward = np.mean(rewards[-1000:])
        print(f"Episode: {episode+1}, Avg Reward: {avg_reward:.3f}, Epsilon: {Q_agent.epsilon:.3f}")

        

Episode: 1000, Avg Reward: -0.379, Epsilon: 0.998
Episode: 2000, Avg Reward: -0.396, Epsilon: 0.996
Episode: 3000, Avg Reward: -0.370, Epsilon: 0.994
Episode: 4000, Avg Reward: -0.354, Epsilon: 0.992
Episode: 5000, Avg Reward: -0.404, Epsilon: 0.990
Episode: 6000, Avg Reward: -0.372, Epsilon: 0.988
Episode: 7000, Avg Reward: -0.421, Epsilon: 0.986
Episode: 8000, Avg Reward: -0.428, Epsilon: 0.984
Episode: 9000, Avg Reward: -0.317, Epsilon: 0.982
Episode: 10000, Avg Reward: -0.437, Epsilon: 0.980
Episode: 11000, Avg Reward: -0.343, Epsilon: 0.978
Episode: 12000, Avg Reward: -0.392, Epsilon: 0.976
Episode: 13000, Avg Reward: -0.419, Epsilon: 0.974
Episode: 14000, Avg Reward: -0.353, Epsilon: 0.972
Episode: 15000, Avg Reward: -0.413, Epsilon: 0.970
Episode: 16000, Avg Reward: -0.366, Epsilon: 0.968
Episode: 17000, Avg Reward: -0.352, Epsilon: 0.966
Episode: 18000, Avg Reward: -0.340, Epsilon: 0.964
Episode: 19000, Avg Reward: -0.422, Epsilon: 0.962
Episode: 20000, Avg Reward: -0.408, Epsi

In [16]:
def test_and_record(Q_table, env_name):

    # Create environment with video recording
    env = gym.make(env_name, render_mode="rgb_array")  # For recording
    env = gym.wrappers.RecordVideo(env, video_dir, episode_trigger=lambda e: True)

    n_actions = env.action_space.n
    rewards = []

    for ep in range(5):
        state, _ = env.reset()
        total_reward = 0
        done = False

        while not done:
            # Greedy policy (no exploration)
            action = np.argmax(Q_table[state])
            next_state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            state = next_state

            if terminated or truncated:
                done = True

        rewards.append(total_reward)
        print(f"Episode {ep + 1}: Total Reward = {total_reward}")

    env.close()
    print(f"Videos saved in: {video_dir}")
    return rewards

In [22]:
test_and_record(Q_agent.q_values,"Blackjack-v1")

Episode 1: Total Reward = 1.0
Episode 2: Total Reward = 1.0
Episode 3: Total Reward = 1.0
Episode 4: Total Reward = -1.0
Episode 5: Total Reward = -1.0
Videos saved in: videos_blackjack_ql


[1.0, 1.0, 1.0, -1.0, -1.0]

In [1]:
import torch
torch.cuda.is_available()

True

In [2]:
import ptan

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [3]:
import numpy as np

In [4]:
q_vals = np.array([[1, 2, 3], [1, -1, 0]])

In [5]:
selctor = ptan.actions.ArgmaxActionSelector()

In [6]:
selctor(q_vals)

array([2, 0], dtype=int64)

In [7]:
import gymnasium as gym

In [8]:
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.1, selector=ptan.actions.ArgmaxActionSelector())

In [9]:
selctor(q_vals)

array([2, 0], dtype=int64)