In [1]:
import gymnasium as gym
from ptan.experience import ExperienceFirstLast, ExperienceSourceFirstLast
import ptan
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import typing as tt


HIDDEN_SIZE = 128
BATCH_SIZE = 16
TGT_NET_SYNC = 10
GAMMA = 0.9
REPLAY_SIZE = 1000
LR = 1e-3
EPS_DECAY = 0.99


class Net(nn.Module):
    """Fully connected Q-network: Linear -> ReLU -> Linear.

    Maps an observation of width ``obs_size`` to ``n_actions`` outputs
    through a single hidden layer of ``hidden_size`` units.
    """

    def __init__(self, obs_size: int, hidden_size: int, n_actions: int):
        super().__init__()
        # Layers are created in forward order so parameter names stay
        # "net.0.*" (input layer) and "net.2.*" (output layer).
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Cast ``x`` to float32 and return the network output for it."""
        as_float = x.float()
        return self.net(as_float)


@torch.no_grad()
def unpack_batch(batch: tt.List[ExperienceFirstLast], net: Net, gamma: float):
    """Convert sampled experiences into training tensors.

    Args:
        batch: experiences exposing ``state``, ``action``, ``reward`` and
            ``last_state`` (``None`` marks an experience with no last state).
        net: network evaluated on the last states to obtain Q-values.
        gamma: factor applied to the best last-state Q-value.

    Returns:
        Tuple ``(states, actions, targets)`` where
        ``targets = gamma * max_a net(last_state)[a] + reward`` and the
        max term is zeroed for experiences whose ``last_state`` is ``None``.
    """
    states = [exp.state for exp in batch]
    actions = [exp.action for exp in batch]
    rewards = [exp.reward for exp in batch]
    done_masks = [exp.last_state is None for exp in batch]
    # A missing last state is replaced by the first state only so the
    # array stacks cleanly; its Q-value is masked out below.
    last_states = [
        exp.state if exp.last_state is None else exp.last_state
        for exp in batch
    ]

    states_v = torch.as_tensor(np.stack(states))
    actions_v = torch.tensor(actions)
    rewards_v = torch.tensor(rewards)
    last_states_v = torch.as_tensor(np.stack(last_states))

    q_per_action = net(last_states_v)
    best_last_q_v = torch.max(q_per_action, dim=1)[0]
    best_last_q_v[done_masks] = 0.0
    targets_v = best_last_q_v * gamma + rewards_v
    return states_v, actions_v, targets_v


# DQN training script for CartPole-v1: collect one transition per iteration,
# train on a sampled batch, and stop once an episode's reward exceeds 150.
if __name__ == "__main__":
    env = gym.make("CartPole-v1")
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n

    net = Net(obs_size, HIDDEN_SIZE, n_actions)
    # Target network wrapper; refreshed from `net` through tgt_net.sync() below.
    tgt_net = ptan.agent.TargetNet(net)
    # Epsilon-greedy selection layered over an argmax selector, starting at epsilon=1.
    selector = ptan.actions.ArgmaxActionSelector()
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1, selector=selector)
    agent = ptan.agent.DQNAgent(net, selector)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE)
    optimizer = optim.Adam(net.parameters(), LR)

    step = 0
    episode = 0
    solved = False

    while True:
        step += 1
        # Request one new experience from the source into the replay buffer.
        buffer.populate(1)

        # Report episodes that finished since the last check; `steps` is unused here.
        for reward, steps in exp_source.pop_rewards_steps():
            episode += 1
            print(f"{step}: episode {episode} done, reward={reward:.2f}, "
                  f"epsilon={selector.epsilon:.2f}")
            # Overwritten for every reported episode, so the last one decides.
            solved = reward > 150
        if solved:
            print("Whee!")
            break
        # Skip training until the buffer holds at least two batches' worth of samples.
        if len(buffer) < 2*BATCH_SIZE:
            continue
        batch = buffer.sample(BATCH_SIZE)
        # Targets are computed with the target network, not the network being trained.
        states_v, actions_v, tgt_q_v = unpack_batch(batch, tgt_net.target_model, GAMMA)
        optimizer.zero_grad()
        q_v = net(states_v)
        # Keep only the Q-value of the action that was actually taken.
        q_v = q_v.gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
        loss_v = F.mse_loss(q_v, tgt_q_v)
        loss_v.backward()
        optimizer.step()
        # Epsilon decays once per training step; iterations skipped by `continue` do not decay it.
        selector.epsilon *= EPS_DECAY

        # Also only reached on iterations that trained, because of the `continue` above.
        if step % TGT_NET_SYNC == 0:
            tgt_net.sync()

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


24: episode 1 done, reward=23.00, epsilon=1.00
39: episode 2 done, reward=15.00, epsilon=0.93
64: episode 3 done, reward=25.00, epsilon=0.72
74: episode 4 done, reward=10.00, epsilon=0.66
93: episode 5 done, reward=19.00, epsilon=0.54
112: episode 6 done, reward=19.00, epsilon=0.45
134: episode 7 done, reward=22.00, epsilon=0.36
144: episode 8 done, reward=10.00, epsilon=0.32
157: episode 9 done, reward=13.00, epsilon=0.28
171: episode 10 done, reward=14.00, epsilon=0.25
181: episode 11 done, reward=10.00, epsilon=0.22
194: episode 12 done, reward=13.00, epsilon=0.20
203: episode 13 done, reward=9.00, epsilon=0.18
213: episode 14 done, reward=10.00, epsilon=0.16
224: episode 15 done, reward=11.00, epsilon=0.15
234: episode 16 done, reward=10.00, epsilon=0.13
243: episode 17 done, reward=9.00, epsilon=0.12
251: episode 18 done, reward=8.00, epsilon=0.11
260: episode 19 done, reward=9.00, epsilon=0.10
270: episode 20 done, reward=10.00, epsilon=0.09
281: episode 21 done, reward=11.00, ep

## A random agent on the CliffWalking environment

In [7]:
import gymnasium as gym
from gymnasium.wrappers import RecordVideo

# Create the CliffWalking environment; "rgb_array" rendering is requested for video recording.
env = gym.make("CliffWalking-v0", render_mode="rgb_array")

# Wrap the environment to record a video.
# The video will be saved in a "videos_cliffwalking_random" folder.
env = RecordVideo(env, video_folder="./videos_cliffwalking_random")

total_reward = 0.0 
total_steps = 0
# The environment must be reset before the first step.
# The reset method returns the initial observation and an info dict.
obs, info = env.reset()

# Run the environment for a total of 100 steps.
for _ in range(100):
    # Take a random action from the environment's action space.
    action = env.action_space.sample()

    # The step method returns the next observation, the reward, whether the episode is terminated or truncated, and additional info.
    obs, reward, terminated, truncated, info = env.step(action)
    total_reward += reward
    total_steps += 1

    # If the episode is over, reset the environment.
    # total_reward keeps accumulating across resets, so it covers all 100 steps.
    if terminated or truncated:
        obs, info = env.reset()

# Close the environment.
env.close()
print(f"Total reward: {total_reward}, Total steps: {total_steps}") 
print("Video of the random agent has been saved in the 'videos_cliffwalking_random' folder.")

Moviepy - Building video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_random\rl-video-episode-0.mp4.
Moviepy - Writing video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_random\rl-video-episode-0.mp4



                                                    

Moviepy - Done !
Moviepy - video ready c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_random\rl-video-episode-0.mp4
Total reward: -496.0, Total steps: 100
Video of the random agent has been saved in the 'videos_cliffwalking_random' folder.




## Fathoming naive DQN algorithm in depth

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
policy_net = nn.Linear(4, 3)
target_net = nn.Linear(4, 3) # Here both the policy net and target net are equivalent to linear regression

In [3]:
policy_net

Linear(in_features=4, out_features=3, bias=True)

In [None]:
# A hand-written batch of two (state, action, reward, next_state, done) transitions.
# Each state has 4 features and there are 3 possible actions (0, 1, 2), matching
# the nn.Linear(4, 3) networks defined above.
states = torch.tensor([[0.1, 0.2, 0.3, 0.4],
                       [0.5, 0.6, 0.7, 0.8]], dtype=torch.float32) # Current states, one row per sample
actions = torch.tensor([[1], [2]]) # Action taken in each current state (column vector, used as a gather index)
rewards = torch.tensor([[1.0], [0.5]]) # Reward received after taking each action
next_states = torch.tensor([[0.9, 0.1, 0.2, 0.3],
                            [0.4, 0.5, 0.6, 0.7]], dtype=torch.float32) # Next states reached after the actions
dones = torch.tensor([[0], [1]], dtype=torch.float32) # 1 if the episode ended after this step, else 0
gamma = 0.9 # Discount factor


In [5]:
q_values = policy_net(states) # returns the Q-values for each action in the current states
q_values

tensor([[-0.2728,  0.1779,  0.3559],
        [-0.1902,  0.2848,  0.6769]], grad_fn=<AddmmBackward0>)

In [6]:
q_value = q_values.gather(1, actions) # Gather the Q-values corresponding to the actions taken
q_value

tensor([[0.1779],
        [0.6769]], grad_fn=<GatherBackward0>)

In [7]:
with torch.no_grad():
    next_q_values = target_net(next_states) # Q-values for the next states from the target network
    next_q_value = next_q_values.max(1, keepdims=True)[0] # Max Q-value for the next states
next_q_values, next_q_value

(tensor([[-0.6384,  0.2381,  0.3531],
         [-0.4915,  0.2273,  0.4723]]),
 tensor([[0.3531],
         [0.4723]]))

In [None]:
# loss function = (1 / N) * sum((Q(s, a) - (r + gamma * max_a' Q'(s', a') * (1 - done)))^2)
# Q(s, a) is the predicted Q-value for the current state and action
# Q'(s', a') is the target Q-value for the next state and action
# N is the batch size

In [8]:
# Only the policy network's parameters are handed to the optimizer; target_net is not updated here.
optimizer = optim.Adam(policy_net.parameters(), lr=0.01)
# The (1 - dones) factor removes the bootstrap term for samples whose episode ended.
target = rewards + gamma * next_q_value * (1 - dones) # Compute the target Q-value
loss = F.mse_loss(q_value, target) # Compute the loss: mean squared error between current Q-value and target Q-value
loss.backward() # Compute gradients
optimizer.step() # Update the policy network parameters
target, loss

(tensor([[1.3178],
         [0.5000]]),
 tensor(0.6653, grad_fn=<MseLossBackward0>))

## TD(0) (one-step temporal difference) prediction on the FrozenLake environment

In [2]:
import numpy as np
import gymnasium as gym

In [3]:
env = gym.make("FrozenLake-v1")
n_states = env.observation_space.n
n_actions = env.action_space.n
print(f"State space dimension: {n_states}, Action space dimension: {n_actions}")

State space dimension: 16, Action space dimension: 4


In [4]:
# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
num_episodes = 10000  # Number of episodes 

In [5]:
# Initialize value function
V = np.zeros(n_states)

In [6]:
def random_policy(state):
    """Return an action sampled from the module-level ``env.action_space``.

    The ``state`` argument is accepted so the function fits the policy
    signature used by ``TD_0``, but it is not used.
    """
    return env.action_space.sample()

In [7]:
def TD_0(num_episodes, policy=random_policy, values=None):
    """Estimate state values under ``policy`` with one-step TD (TD(0)).

    Uses the module-level ``env``, ``alpha`` (learning rate) and ``gamma``
    (discount factor).

    Args:
        num_episodes: number of episodes to run.
        policy: callable mapping a state to an action.
        values: optional 1-D array to update in place and continue from.
            When omitted, a fresh zero table with one entry per state is
            created, so repeated calls are independent of each other.

    Returns:
        The 1-D array of state-value estimates.
    """
    # The table is local rather than the notebook-global `V`: the calling
    # cell rebinds `V` to a reshaped 4x4 array, and indexing that by state
    # on a re-run would silently update whole rows instead of single states.
    if values is None:
        values = np.zeros(env.observation_space.n)
    for _ in range(num_episodes):
        state, info = env.reset()
        done = False
        while not done:
            action = policy(state)
            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            # A terminated transition has no future return; a truncated one
            # (time limit) still bootstraps from the next state's estimate.
            bootstrap = 0.0 if terminated else values[next_state]
            # TD(0) update: V[s] = (1 - alpha) * V[s] + alpha * (r + gamma * V[s'])
            values[state] = (1 - alpha) * values[state] + alpha * (reward + gamma * bootstrap)
            state = next_state
    return values

In [8]:
V = TD_0(num_episodes, random_policy).reshape((4, 4))

In [9]:
V

array([[0.01025745, 0.00438898, 0.00738122, 0.00517198],
       [0.01507457, 0.        , 0.01988743, 0.        ],
       [0.02726076, 0.04603213, 0.11986327, 0.        ],
       [0.        , 0.08434613, 0.30618266, 0.        ]])

## Double Q learning on FrozenLake

In [2]:
import numpy as np
import gymnasium as gym 
import random

In [3]:
env = gym.make("FrozenLake-v1", is_slippery=False)

In [None]:
class DoubleQLearningAgent:
    """Tabular Double Q-learning agent with epsilon-greedy action selection.

    Keeps two Q-tables of shape (number of states, number of actions),
    both initialised to zero.
    """

    def __init__(self, env, alpha, gamma, epsilon, epsilon_decay, min_epsilon):
        self.env = env
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon
        table_shape = (self.env.observation_space.n, self.env.action_space.n)
        self.Q1 = np.zeros(table_shape)
        self.Q2 = np.zeros(table_shape)

    def choose_action(self, state):
        """Pick an action epsilon-greedily with respect to Q1 + Q2."""
        explore = random.random() < self.epsilon
        if explore:
            return self.env.action_space.sample()
        combined = self.Q1[state] + self.Q2[state]
        return np.argmax(combined)

    def update(self, state, action, reward, next_state, done):
        """Apply one Double Q-learning update to a randomly chosen table.

        The chosen table selects the greedy next action; the other table
        supplies that action's value for the target.
        """
        if random.random() < 0.5:
            learner, evaluator = self.Q1, self.Q2
        else:
            learner, evaluator = self.Q2, self.Q1
        best_next_action = np.argmax(learner[next_state])
        # No bootstrap term once the episode is flagged as done.
        future = 0 if done else self.gamma * evaluator[next_state][best_next_action]
        td_error = (reward + future) - learner[state][action]
        learner[state][action] += self.alpha * td_error

    def decay_epsilon(self):
        """Multiply epsilon by the decay factor, never going below min_epsilon."""
        decayed = self.epsilon * self.epsilon_decay
        self.epsilon = decayed if decayed > self.min_epsilon else self.min_epsilon


In [26]:
# Hyperparameters
alpha = 0.8
gamma = 0.95
epsilon = 1.0   
epsilon_decay = 0.9995
min_epsilon = 0.01
num_episodes = 10000
video_dir = "videos_frozenlake_double_q"

In [24]:
agent = DoubleQLearningAgent(env, alpha, gamma, epsilon, epsilon_decay, min_epsilon)

In [None]:
"""Running the Double Q-Learning agent in the FrozenLake environment."""
# Train the Double Q-learning agent, printing a 1000-episode average every 1000 episodes.
rewards = []
for episode in range(num_episodes):
    state, info = env.reset()
    total_reward = 0
    done = False
    while not done:
        action = agent.choose_action(state)
        next_state, reward, terminated, truncated, info = env.step(action)
        # The episode loop stops on either signal...
        done = terminated or truncated
        # ...but only true termination may zero the bootstrap term: a
        # time-limit truncation leaves next_state non-terminal, so its
        # value must still be used in the target.
        agent.update(state, action, reward, next_state, terminated)
        state = next_state
        total_reward += reward
    # Epsilon decays once per episode, not once per step.
    agent.decay_epsilon()
    rewards.append(total_reward)
    if (episode + 1) % 1000 == 0:
        avg_reward = np.mean(rewards[-1000:])
        print(f"Episode: {episode+1}, Avg Reward: {avg_reward:.3f}, Epsilon: {agent.epsilon:.3f}")


Episode: 1000, Avg Reward: 0.118, Epsilon: 0.606
Episode: 2000, Avg Reward: 0.436, Epsilon: 0.368
Episode: 3000, Avg Reward: 0.686, Epsilon: 0.223
Episode: 4000, Avg Reward: 0.828, Epsilon: 0.135
Episode: 5000, Avg Reward: 0.895, Epsilon: 0.082
Episode: 6000, Avg Reward: 0.922, Epsilon: 0.050
Episode: 7000, Avg Reward: 0.956, Epsilon: 0.030
Episode: 8000, Avg Reward: 0.968, Epsilon: 0.018
Episode: 9000, Avg Reward: 0.978, Epsilon: 0.011
Episode: 10000, Avg Reward: 0.991, Epsilon: 0.010


In [None]:
def test_and_record(Q_table, env_name, video_dir, n_episodes=5):
    """Run the greedy policy of ``Q_table`` and record a video of every episode.

    Args:
        Q_table: array indexed as ``Q_table[state]`` giving per-action values.
        env_name: environment id passed to ``gym.make`` (created with
            ``is_slippery=False`` and ``render_mode="rgb_array"``).
        video_dir: folder the ``RecordVideo`` wrapper writes into.
        n_episodes: number of evaluation episodes (default 5).

    Returns:
        List with the total reward of each episode.
    """
    # Create environment with video recording
    env = gym.make(env_name, is_slippery=False, render_mode="rgb_array")  # For recording
    env = gym.wrappers.RecordVideo(env, video_dir, episode_trigger=lambda e: True)

    rewards = []
    # try/finally so the environment (and its recorder) is closed even if
    # an episode raises part-way through.
    try:
        for ep in range(n_episodes):
            state, _ = env.reset()
            total_reward = 0
            done = False

            while not done:
                # Greedy policy (no exploration)
                action = np.argmax(Q_table[state])
                state, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                done = terminated or truncated

            rewards.append(total_reward)
            print(f"Episode {ep + 1}: Total Reward = {total_reward}")
    finally:
        env.close()

    print(f"Videos saved in: {video_dir}")
    return rewards

In [29]:
test_and_record(agent.Q1 + agent.Q2, "FrozenLake-v1", video_dir)

Moviepy - Building video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_frozenlake_double_q\rl-video-episode-0.mp4.
Moviepy - Writing video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_frozenlake_double_q\rl-video-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_frozenlake_double_q\rl-video-episode-0.mp4
Episode 1: Total Reward = 1.0
Moviepy - Building video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_frozenlake_double_q\rl-video-episode-1.mp4.
Moviepy - Writing video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_frozenlake_double_q\rl-video-episode-1.mp4



                                                  

Moviepy - Done !
Moviepy - video ready c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_frozenlake_double_q\rl-video-episode-1.mp4
Episode 2: Total Reward = 1.0
Moviepy - Building video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_frozenlake_double_q\rl-video-episode-2.mp4.
Moviepy - Writing video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_frozenlake_double_q\rl-video-episode-2.mp4



                                                  

Moviepy - Done !
Moviepy - video ready c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_frozenlake_double_q\rl-video-episode-2.mp4




Episode 3: Total Reward = 1.0
Moviepy - Building video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_frozenlake_double_q\rl-video-episode-3.mp4.
Moviepy - Writing video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_frozenlake_double_q\rl-video-episode-3.mp4



                                                  

Moviepy - Done !
Moviepy - video ready c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_frozenlake_double_q\rl-video-episode-3.mp4
Episode 4: Total Reward = 1.0
Moviepy - Building video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_frozenlake_double_q\rl-video-episode-4.mp4.
Moviepy - Writing video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_frozenlake_double_q\rl-video-episode-4.mp4



                                                  

Moviepy - Done !




Moviepy - video ready c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_frozenlake_double_q\rl-video-episode-4.mp4
Episode 5: Total Reward = 1.0
Videos saved in: videos_frozenlake_double_q


[1.0, 1.0, 1.0, 1.0, 1.0]

## SARSA on CliffWalking

In [2]:
import numpy as np
import gymnasium as gym
import random

In [3]:
env = gym.make("CliffWalking-v0")
env.observation_space.n, env.action_space.n

(48, 4)

In [4]:
class SarsaAgent:
    """Tabular SARSA agent with epsilon-greedy action selection.

    Holds a single zero-initialised Q-table of shape
    (number of states, number of actions).
    """

    def __init__(self, env, alpha, gamma, epsilon, epsilon_decay, min_epsilon):
        self.env = env
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon
        n_states = self.env.observation_space.n
        n_actions = self.env.action_space.n
        self.Q = np.zeros((n_states, n_actions))

    def choose_action(self, state):
        """Pick an action epsilon-greedily with respect to the Q-table."""
        explore = random.random() < self.epsilon
        if explore:
            return self.env.action_space.sample()
        return np.argmax(self.Q[state])

    def update(self, state, action, reward, next_state, next_action, done):
        """Apply one SARSA update using the action actually chosen next."""
        # No bootstrap term once the episode is flagged as done.
        future = 0 if done else self.gamma * self.Q[next_state][next_action]
        td_target = reward + future
        row = self.Q[state]
        row[action] += self.alpha * (td_target - row[action])

    def decay_epsilon(self):
        """Subtract the decay step from epsilon, never going below min_epsilon."""
        lowered = self.epsilon - self.epsilon_decay
        self.epsilon = lowered if lowered > self.min_epsilon else self.min_epsilon

In [10]:
# Hyperparameters
num_episodes = 2500
alpha = 0.5
gamma = 1.0
epsilon = 1.0
epsilon_min = 0.1
epsilon_decay = (epsilon - epsilon_min) / 1600
video_dir = "videos_cliffwalking_sarsa"


In [11]:
agent = SarsaAgent(env, alpha, gamma, epsilon, epsilon_decay, epsilon_min)

In [12]:
"""Training the SARSA agent in the CliffWalking environment."""
# Train the SARSA agent, printing the average of the last 100 episodes every 50 episodes.
rewards = []
for episode in range(num_episodes):
    state, info = env.reset()
    total_reward = 0
    done = False
    # SARSA is on-policy: the first action is chosen before the loop and
    # each later action is chosen before the update that uses it.
    action = agent.choose_action(state)
    while not done:
        next_state, reward, terminated, truncated, info = env.step(action)
        # The episode loop stops on either signal...
        done = terminated or truncated
        next_action = agent.choose_action(next_state)
        # ...but only true termination may zero the bootstrap term; after a
        # truncation Q[next_state][next_action] is still a valid estimate.
        agent.update(state, action, reward, next_state, next_action, terminated)
        state = next_state
        action = next_action
        total_reward += reward
    # Epsilon decays once per episode, not once per step.
    agent.decay_epsilon()
    rewards.append(total_reward)
    if (episode + 1) % 50 == 0:
        avg_reward = np.mean(rewards[-100:])
        print(f"Episode: {episode+1}, Avg Reward: {avg_reward:.3f}, Epsilon: {agent.epsilon:.3f}")

Episode: 50, Avg Reward: -48811.040, Epsilon: 0.972
Episode: 100, Avg Reward: -34092.120, Epsilon: 0.944
Episode: 150, Avg Reward: -15767.280, Epsilon: 0.916
Episode: 200, Avg Reward: -9985.550, Epsilon: 0.887
Episode: 250, Avg Reward: -5761.890, Epsilon: 0.859
Episode: 300, Avg Reward: -2869.040, Epsilon: 0.831
Episode: 350, Avg Reward: -2016.030, Epsilon: 0.803
Episode: 400, Avg Reward: -1541.570, Epsilon: 0.775
Episode: 450, Avg Reward: -1028.120, Epsilon: 0.747
Episode: 500, Avg Reward: -801.970, Epsilon: 0.719
Episode: 550, Avg Reward: -536.840, Epsilon: 0.691
Episode: 600, Avg Reward: -398.750, Epsilon: 0.662
Episode: 650, Avg Reward: -287.820, Epsilon: 0.634
Episode: 700, Avg Reward: -284.010, Epsilon: 0.606
Episode: 750, Avg Reward: -266.080, Epsilon: 0.578
Episode: 800, Avg Reward: -181.770, Epsilon: 0.550
Episode: 850, Avg Reward: -153.310, Epsilon: 0.522
Episode: 900, Avg Reward: -116.210, Epsilon: 0.494
Episode: 950, Avg Reward: -132.350, Epsilon: 0.466
Episode: 1000, Avg R

In [None]:
def test_and_record(Q_table, env_name, video_dir, n_episodes=5, max_steps=None):
    """Run the greedy policy of a (SARSA) Q-table and record every episode.

    Args:
        Q_table: array indexed as ``Q_table[state]`` giving per-action values.
        env_name: environment id passed to ``gym.make`` (created with
            ``render_mode="rgb_array"``).
        video_dir: folder the ``RecordVideo`` wrapper writes into.
        n_episodes: number of evaluation episodes (default 5).
        max_steps: optional cap on steps per episode. ``None`` (default)
            means no cap. A greedy policy that never reaches a terminal
            state would otherwise loop forever if the environment itself
            imposes no step limit.

    Returns:
        List with the total reward of each episode.
    """
    # Create environment with video recording
    env = gym.make(env_name, render_mode="rgb_array")  # For recording
    env = gym.wrappers.RecordVideo(env, video_dir, episode_trigger=lambda e: True)

    rewards = []
    # try/finally so the environment (and its recorder) is closed even if
    # an episode raises part-way through.
    try:
        for ep in range(n_episodes):
            state, _ = env.reset()
            total_reward = 0
            steps_taken = 0
            done = False

            while not done:
                # Greedy policy (no exploration)
                action = np.argmax(Q_table[state])
                state, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                steps_taken += 1
                hit_cap = max_steps is not None and steps_taken >= max_steps
                done = terminated or truncated or hit_cap

            rewards.append(total_reward)
            print(f"Episode {ep + 1}: Total Reward = {total_reward}")
    finally:
        env.close()

    print(f"Videos saved in: {video_dir}")
    return rewards

In [14]:
test_and_record(agent.Q, "CliffWalking-v0", video_dir)

  logger.warn(


Moviepy - Building video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_sarsa\rl-video-episode-0.mp4.
Moviepy - Writing video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_sarsa\rl-video-episode-0.mp4



                                                   

Moviepy - Done !
Moviepy - video ready c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_sarsa\rl-video-episode-0.mp4
Episode 1: Total Reward = -17
Moviepy - Building video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_sarsa\rl-video-episode-1.mp4.
Moviepy - Writing video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_sarsa\rl-video-episode-1.mp4



                                                   

Moviepy - Done !
Moviepy - video ready c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_sarsa\rl-video-episode-1.mp4
Episode 2: Total Reward = -17
Moviepy - Building video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_sarsa\rl-video-episode-2.mp4.
Moviepy - Writing video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_sarsa\rl-video-episode-2.mp4



                                                   

Moviepy - Done !
Moviepy - video ready c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_sarsa\rl-video-episode-2.mp4




Episode 3: Total Reward = -17
Moviepy - Building video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_sarsa\rl-video-episode-3.mp4.
Moviepy - Writing video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_sarsa\rl-video-episode-3.mp4



                                                   

Moviepy - Done !




Moviepy - video ready c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_sarsa\rl-video-episode-3.mp4
Episode 4: Total Reward = -17
Moviepy - Building video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_sarsa\rl-video-episode-4.mp4.
Moviepy - Writing video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_sarsa\rl-video-episode-4.mp4



                                                   

Moviepy - Done !
Moviepy - video ready c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cliffwalking_sarsa\rl-video-episode-4.mp4
Episode 5: Total Reward = -17
Videos saved in: videos_cliffwalking_sarsa


[-17, -17, -17, -17, -17]

## REINFORCE on the CartPole environment

In [2]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]
n_actions = env.action_space.n
state_size, n_actions

(4, 2)

In [5]:
class PolicyNet(nn.Module):
    """Policy network: Linear -> ReLU -> Linear -> Softmax over the last dim.

    Maps an observation of width ``obs_size`` to ``n_actions`` action
    probabilities through one hidden layer of ``hidden_size`` units.
    """

    def __init__(self, obs_size: int, hidden_size: int, n_actions: int):
        super().__init__()
        # Layers are created in forward order so parameter names stay
        # "net.0.*" (input layer) and "net.2.*" (output layer).
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
            nn.Softmax(dim=-1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the action probabilities for input ``x``."""
        action_probs = self.net(x)
        return action_probs

In [6]:
policy = PolicyNet(state_size, 128, n_actions).to(device)
policy

PolicyNet(
  (net): Sequential(
    (0): Linear(in_features=4, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=2, bias=True)
    (3): Softmax(dim=-1)
  )
)

In [7]:
optimizer = optim.Adam(policy.parameters(), lr=0.01)

In [8]:
def compute_returns(rewards, gamma=0.99):
    """Compute the discounted return G_t for every step of one episode.

    Args:
        rewards: per-step rewards in time order (must support ``reversed``).
        gamma: discount factor.

    Returns:
        float32 tensor on the module-level ``device`` whose element ``t`` is
        ``rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...``.
        Empty input gives an empty tensor.
    """
    G = 0
    returns = []
    # Accumulate from the last step backwards. Appending and reversing once
    # is linear, whereas inserting at index 0 on every step is quadratic.
    for r in reversed(rewards):
        G = r + gamma * G
        returns.append(G)
    returns.reverse()
    return torch.tensor(returns, dtype=torch.float32, device=device)

In [7]:
# Hyperparameters
num_episodes = 500
gamma = 0.99
video_dir = "videos_cartpole_reinforce"

In [None]:
# REINFORCE training: roll out one full episode with the current policy,
# then take a single gradient step on the loss built from that episode.
for episode in range(num_episodes):
    state, _ = env.reset()
    done = False
    # Per-step log-probabilities (kept attached to the graph) and rewards.
    log_probs, rewards = [], []
    total_reward = 0
    
    while not done:
        # Add a leading batch dimension of size 1 before the forward pass.
        state_tensor = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
        probs = policy(state_tensor) # Action probabilities for the current state
        dist = torch.distributions.Categorical(probs) # Create the Categorical distribution object
        action = dist.sample() # Sample from the output distribution using the probabilities
        log_probs.append(dist.log_prob(action))
        state, reward, terminated, truncated, _ = env.step(action.item())
        done = terminated or truncated
        rewards.append(reward)
        total_reward += reward
    
    # Discounted return from each step to the end of the episode.
    returns = compute_returns(rewards, gamma)

    loss = 0
    # Each step contributes -log pi(a_t | s_t) * gamma^t * G_t; the returns
    # are used as-is (no baseline or normalisation is applied here).
    for t, (log_prob, Gt) in enumerate(zip(log_probs, returns)):
        loss += -log_prob * ((gamma ** t) * Gt)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # The printed value is the reward of this episode only, not an average.
    if (episode + 1) % 20 == 0:
        print(f"Episode {episode + 1}: {total_reward}")
        

Episode 20: 500.0
Episode 40: 500.0
Episode 60: 500.0
Episode 80: 500.0
Episode 100: 500.0


In [19]:
torch.save(policy.state_dict(), 'Reinforce_Cartpole_net/reinforce_agent.pth')

In [None]:
def test_and_record(env_name, video_dir, n_episodes=5):
    """Load the saved REINFORCE policy and record evaluation episodes.

    Relies on the module-level ``PolicyNet``, ``state_size``, ``n_actions``
    and ``device``. Actions are sampled from the policy's output
    distribution, so episode rewards vary between runs.

    Args:
        env_name: environment id passed to ``gym.make`` (created with
            ``render_mode="rgb_array"``).
        video_dir: folder the ``RecordVideo`` wrapper writes into.
        n_episodes: number of episodes to run and record (default 5).
    """
    # Create environment with video recording
    env = gym.make(env_name, render_mode="rgb_array")
    env = gym.wrappers.RecordVideo(env, video_dir, episode_trigger=lambda e: True)
    # try/finally so the environment (and its recorder) is closed even if
    # loading the checkpoint or an episode raises.
    try:
        # Load the model's state_dict. map_location lets a checkpoint saved
        # on a CUDA device be loaded when only the CPU is available.
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        loaded_state_dict = torch.load(
            "Reinforce_Cartpole_net/reinforce_agent.pth", map_location=device
        )
        policy = PolicyNet(state_size, 128, n_actions).to(device)
        policy.load_state_dict(loaded_state_dict)
        policy.eval()
        for episode in range(n_episodes):
            state, _ = env.reset()
            done = False
            total_reward = 0

            while not done:
                state_tensor = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
                # Evaluation only: no gradients are needed for the forward pass.
                with torch.no_grad():
                    probs = policy(state_tensor)
                dist = torch.distributions.Categorical(probs)
                action = dist.sample()
                state, reward, terminated, truncated, _ = env.step(action.item())
                done = terminated or truncated
                total_reward += reward
            print(f"Episode {episode + 1} reward: {total_reward}")
    finally:
        env.close()
    print(f"Videos saved in: {video_dir}")

In [16]:
test_and_record("CartPole-v1", video_dir)

  logger.warn(


Moviepy - Building video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cartpole_reinforce\rl-video-episode-0.mp4.
Moviepy - Writing video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cartpole_reinforce\rl-video-episode-0.mp4



                                                                

Moviepy - Done !
Moviepy - video ready c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cartpole_reinforce\rl-video-episode-0.mp4
Episode 1 reward: 500.0
Moviepy - Building video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cartpole_reinforce\rl-video-episode-1.mp4.
Moviepy - Writing video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cartpole_reinforce\rl-video-episode-1.mp4



                                                             

Moviepy - Done !
Moviepy - video ready c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cartpole_reinforce\rl-video-episode-1.mp4
Episode 2 reward: 82.0
Moviepy - Building video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cartpole_reinforce\rl-video-episode-2.mp4.
Moviepy - Writing video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cartpole_reinforce\rl-video-episode-2.mp4



                                                                

Moviepy - Done !
Moviepy - video ready c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cartpole_reinforce\rl-video-episode-2.mp4
Episode 3 reward: 500.0
Moviepy - Building video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cartpole_reinforce\rl-video-episode-3.mp4.
Moviepy - Writing video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cartpole_reinforce\rl-video-episode-3.mp4



                                                                

Moviepy - Done !
Moviepy - video ready c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cartpole_reinforce\rl-video-episode-3.mp4
Episode 4 reward: 500.0
Moviepy - Building video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cartpole_reinforce\rl-video-episode-4.mp4.
Moviepy - Writing video c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cartpole_reinforce\rl-video-episode-4.mp4



                                                                

Moviepy - Done !
Moviepy - video ready c:\Users\User\Desktop\VS_Code_Projects\Reinforcement_learning\videos_cartpole_reinforce\rl-video-episode-4.mp4
Episode 5 reward: 500.0
Videos saved in: videos_cartpole_reinforce
