In [None]:
import torch
import torch.nn as nn
import numpy as np
from torch.distributions import Normal
import gymnasium as gym

In [None]:
class Buffer():
    """Fixed-capacity ring buffer of (state, action, reward, next_state, done) transitions.

    Every field lives in its own preallocated NumPy array; once ``mem_size``
    transitions have been written, new ones overwrite the oldest slots.
    """

    def __init__(self, mem_size, input_dims, action_dims):
        """Allocate zero-filled storage for ``mem_size`` transitions.

        Args:
            mem_size: number of slots in the buffer.
            input_dims: length of a single state vector.
            action_dims: length of a single action vector.
        """
        self.max_size = mem_size
        self.counter = 0  # total number of transitions ever stored

        vector_shape = (mem_size, input_dims)
        self.state = np.zeros(vector_shape)
        self.next_state = np.zeros(vector_shape)
        self.action = np.zeros((mem_size, action_dims))
        self.reward = np.zeros(mem_size)
        self.done = np.zeros(mem_size)

    def store_transition(self, state, action, reward, next_state, done):
        """Write one transition into the next slot, wrapping around when full."""
        slot = self.counter % self.max_size

        writes = (
            (self.state, state),
            (self.next_state, next_state),
            (self.action, action),
            (self.reward, reward),
            (self.done, done),
        )
        for storage, value in writes:
            storage[slot] = value

        self.counter += 1

    def sample_buffer(self, batch_size):
        """Draw ``batch_size`` random slots (with replacement) from the filled part.

        Returns:
            Tuple ``(state, action, reward, next_state, done)`` of arrays, each
            indexed by the same randomly chosen slots.
        """
        filled = min(self.counter, self.max_size)
        picks = np.random.choice(filled, batch_size)

        fields = (self.state, self.action, self.reward, self.next_state, self.done)
        return tuple(field[picks] for field in fields)

In [None]:
class ActorNetwork(nn.Module):
    """Policy network: a three-layer MLP producing one output per action dimension.

    The output is the raw value of the last linear layer; no squashing is applied here.
    """

    def __init__(self, input_dims, action_dims):
        """Build the layers.

        Args:
            input_dims: length of the state vector fed to the first layer.
            action_dims: number of outputs of the last layer.
        """
        super().__init__()
        width = 128
        self.fc1 = nn.Linear(input_dims, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, action_dims)

    def forward(self, state):
        """Return the network output for ``state``."""
        features = state
        # Both hidden layers are followed by a ReLU; the output layer is linear.
        for layer in (self.fc1, self.fc2):
            features = torch.relu(layer(features))

        return self.fc3(features)

In [None]:
class CriticNetwork(nn.Module):
    """Q-value network: a three-layer MLP scoring a (state, action) pair with one scalar."""

    def __init__(self, input_dims, action_dims):
        """Build the layers.

        Args:
            input_dims: length of the state vector.
            action_dims: length of the action vector.
        """
        super().__init__()
        width = 128
        # The first layer consumes the state and action concatenated together.
        self.fc1 = nn.Linear(input_dims + action_dims, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, 1)

    def forward(self, state, action):
        """Return the value estimate for each row of ``state`` / ``action``.

        The two inputs are concatenated along dimension 1, so both must be 2-D
        with the same number of rows.
        """
        features = torch.cat([state, action], dim = 1)
        for layer in (self.fc1, self.fc2):
            features = torch.relu(layer(features))

        return self.fc3(features)

In [None]:
class Agent():
    """Twin-critic deterministic actor-critic agent (TD3-style) for continuous actions.

    Holds an actor, two critics, a target copy of each, a replay buffer and the
    optimizers. ``choose_action`` produces a noisy, clamped action for acting in
    the environment; ``learn`` performs one critic update, one actor update and
    one soft target update from a sampled mini-batch.
    """

    def __init__(self, input_dims, action_dims, max_action, min_action, batch_size = 100, mem_size = 10000, tau = 0.005, gamma = 0.99, noise_std = 1.0, noise_clip = 0.5, lr = 0.001):
        """Create networks, optimizers and the replay buffer.

        Args:
            input_dims: length of the state vector.
            action_dims: length of the action vector.
            max_action: upper bound(s) actions are clamped to.
            min_action: lower bound(s) actions are clamped to.
            batch_size: number of transitions sampled per ``learn`` call.
            mem_size: capacity of the replay buffer.
            tau: interpolation factor for the soft target update.
            gamma: discount factor.
            noise_std: standard deviation of the Gaussian noise added to actions.
            noise_clip: the noise is clamped to ``[-noise_clip, noise_clip]``.
            lr: learning rate shared by the three Adam optimizers.
        """
        self.tau = tau
        self.gamma = gamma
        self.batch_size = batch_size
        self.noise_std = noise_std
        self.noise_clip = noise_clip
        # as_tensor accepts arrays, sequences and tensors without a copy-construct warning.
        self.max_action = torch.as_tensor(max_action, dtype = torch.float32)
        self.min_action = torch.as_tensor(min_action, dtype = torch.float32)
        self.memory = Buffer(mem_size, input_dims, action_dims)
        self.actor = ActorNetwork(input_dims, action_dims)
        self.critic1 = CriticNetwork(input_dims, action_dims)
        self.critic2 = CriticNetwork(input_dims, action_dims)

        self.target_actor = ActorNetwork(input_dims, action_dims)
        self.target_critic1 = CriticNetwork(input_dims, action_dims)
        self.target_critic2 = CriticNetwork(input_dims, action_dims)

        self.actor_opt   = torch.optim.Adam(self.actor.parameters(),  lr = lr)
        self.critic1_opt = torch.optim.Adam(self.critic1.parameters(), lr = lr)
        self.critic2_opt = torch.optim.Adam(self.critic2.parameters(), lr = lr)

        self.criterion = nn.MSELoss()

        # Start the targets as exact copies of the online networks; otherwise the
        # first TD targets come from unrelated, independently initialised weights.
        self.update_parameters(tau = 1)

    def update_parameters(self, tau = None):
        """Blend the online weights into the target networks.

        Each target parameter becomes ``tau * online + (1 - tau) * target``.

        Args:
            tau: interpolation factor; ``None`` uses ``self.tau``, ``1`` performs
                a hard copy.
        """
        if tau is None:
            tau = self.tau

        network_pairs = (
            (self.actor, self.target_actor),
            (self.critic1, self.target_critic1),
            (self.critic2, self.target_critic2),
        )

        with torch.no_grad():
            for online, target in network_pairs:
                for src, dst in zip(online.parameters(), target.parameters()):
                    dst.copy_(tau * src + (1 - tau) * dst)

    def _clipped_noise(self, like):
        """Return clamped Gaussian noise with the same shape as ``like``."""
        noise = torch.randn_like(like) * self.noise_std
        return torch.clamp(noise, -self.noise_clip, self.noise_clip)

    def choose_action(self, state):
        """Return a noisy action for ``state``, clamped to the action bounds.

        Args:
            state: array-like or tensor holding one state (or a batch of states).

        Returns:
            A float32 tensor detached from the autograd graph.
        """
        state = torch.as_tensor(state, dtype = torch.float32)
        with torch.no_grad():
            action = self.actor(state)
            # Independent noise per action element rather than one shared scalar.
            action = action + self._clipped_noise(action)
            action = torch.clamp(action, self.min_action, self.max_action)

        return action


    def Remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, next_state, done)

    def learn(self):
        """Run one training step from a sampled mini-batch.

        Does nothing until at least ``batch_size`` transitions have been stored,
        so the agent never trains on an empty or heavily duplicated batch.
        """
        if self.memory.counter < self.batch_size:
            return

        state, action, reward, next_state, done = self.memory.sample_buffer(self.batch_size)

        state = torch.as_tensor(state, dtype = torch.float32)
        action = torch.as_tensor(action, dtype = torch.float32)
        next_state = torch.as_tensor(next_state, dtype = torch.float32)
        # Column vectors so they line up with the critics' (batch, 1) output instead
        # of broadcasting the target to (batch, batch).
        reward = torch.as_tensor(reward, dtype = torch.float32).reshape(-1, 1)
        done = torch.as_tensor(done, dtype = torch.float32).reshape(-1, 1)

        #------------------------- Critic Network training phase --------------------
        # The TD target is a constant for the critic update: no gradients are
        # tracked through the target networks.
        with torch.no_grad():
            target_action = self.target_actor(next_state)
            target_action = target_action + self._clipped_noise(target_action)
            target_action = torch.clamp(target_action, self.min_action, self.max_action)

            self.old_value_1 = self.target_critic1(next_state, target_action)
            self.old_value_2 = self.target_critic2(next_state, target_action)

            # Take the smaller of the two target estimates; do not bootstrap from
            # transitions flagged as done.
            next_value = torch.min(self.old_value_1, self.old_value_2)
            target_value = reward + self.gamma * (1 - done) * next_value

        pred_value_1 = self.critic1(state, action)
        pred_value_2 = self.critic2(state, action)

        loss_critic1 = self.criterion(pred_value_1, target_value)
        loss_critic2 = self.criterion(pred_value_2, target_value)

        critic_loss = loss_critic1 + loss_critic2

        self.critic1_opt.zero_grad()
        self.critic2_opt.zero_grad()

        critic_loss.backward()

        self.critic1_opt.step()
        self.critic2_opt.step()

        #------------------------- Actor Network training phase ---------------------
        # The policy gradient uses the deterministic actor output (no noise); the
        # clamp keeps the action inside the range the critic was trained on.
        new_action = self.actor(state)
        new_action = torch.clamp(new_action, self.min_action, self.max_action)

        value = self.critic1(state, new_action)

        actor_loss = -torch.mean(value)

        self.actor_opt.zero_grad()

        actor_loss.backward()

        self.actor_opt.step()

        #-------------------------- Target Network training phase -------------------
        self.update_parameters()

In [None]:
# Create the Pendulum-v1 environment; `env` is used by the cells that follow.
env = gym.make("Pendulum-v1")

In [None]:
# Show the upper bound of the action space (passed to Agent as max_action below).
print(env.action_space.high)

In [None]:
# Show the lower bound of the action space (passed to Agent as min_action below).
print(env.action_space.low)

In [None]:
# Build the agent from the environment's observation/action dimensions and bounds.
agent = Agent(
    input_dims = env.observation_space.shape[0],
    action_dims = env.action_space.shape[0],
    max_action = env.action_space.high,
    min_action = env.action_space.low,
)
n_games = 1500
score_history = []

for i in range(n_games):
    # reset() returns (observation, info).
    observation, info = env.reset()
    score = 0
    done = False

    while not done:
        # choose_action converts the observation to a tensor itself and returns a
        # tensor; the environment and the replay buffer work with NumPy arrays.
        action = agent.choose_action(observation).detach().cpu().numpy()
        observation_, reward, terminated, truncated, info = env.step(action)

        # The episode loop ends on either signal ...
        done = terminated or truncated

        # ... but only a genuine termination is recorded as `done`: a time-limit
        # truncation is not a terminal state. The full next observation is stored
        # (indexing it with [0] would keep a single scalar).
        agent.Remember(observation, action, reward, observation_, terminated)

        score += reward
        agent.learn()
        observation = observation_

    score_history.append(score)
    # Running average over the most recent 100 episodes.
    avg_score = np.mean(score_history[-100:])

    print(f"epoch : {i} :: score : {score} :: avg_score : {avg_score}")