In [None]:
import torch
import torch.nn as nn
from torch.distributions import Categorical

In [None]:
class ActorCritic(nn.Module):
    """Two-headed MLP: a shared trunk feeding a value head and a policy head.

    Args:
        in_channel: Size of the last dimension of the input state tensor.
        n_actions: Number of discrete actions (width of the policy output).
    """

    def __init__(self, in_channel, n_actions):
        super().__init__()
        # Shared trunk: in_channel -> 1024 -> 512.
        self.fc1 = nn.Linear(in_channel, 1024)
        self.fc2 = nn.Linear(1024, 512)
        # Output heads, both reading the 512-wide trunk features.
        self.value = nn.Linear(512, 1)
        self.policy = nn.Linear(512, n_actions)

    def forward(self, state):
        """Return ``(v, pi)`` for ``state``.

        ``v`` is the value head output (last dimension 1) and ``pi`` is the
        softmax over the policy head output along the last dimension.
        """
        hidden = self.fc1(state).relu()
        hidden = self.fc2(hidden).relu()

        action_scores = self.policy(hidden)
        return self.value(hidden), action_scores.softmax(dim=-1)

In [None]:
class Model():
    """One-step (TD(0)) actor-critic agent around a shared ``ActorCritic`` net.

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        alpha: Stored on the instance; not read by any method of this class.
        learning_rate: Learning rate of the Adam optimizer.
        n_actions: Number of discrete actions.
        in_channel: Size of the observation vector.
    """

    def __init__(self, gamma, alpha, learning_rate, n_actions, in_channel):
        self.gamma = gamma
        # NOTE(review): alpha is kept for interface compatibility but unused here.
        self.alpha = alpha
        self.actor_critic = ActorCritic(in_channel, n_actions)
        # Last sampled action; learn() credits the TD error to it.
        self.action = None
        self.optimizer = torch.optim.Adam(self.actor_critic.parameters(), lr = learning_rate)

    def choose_action(self, state):
        """Sample an action from the current policy and remember it.

        Args:
            state: Observation (tensor or array-like); cast to float32.

        Returns:
            The sampled action tensor (also stored in ``self.action``).
        """
        state = torch.as_tensor(state, dtype=torch.float32)
        # Sampling is not differentiable, so skip building an autograd graph;
        # learn() recomputes the policy with gradients enabled.
        with torch.no_grad():
            _, pi = self.actor_critic(state)
        self.action = Categorical(pi).sample()

        return self.action

    def learn(self, state, reward, next_state, done):
        """Run one TD(0) actor-critic update for the last chosen action.

        Args:
            state: Observation the last action was taken in.
            reward: Reward received for that action.
            next_state: Observation reached after the action.
            done: Truthy if ``next_state`` should not be bootstrapped from.

        Raises:
            RuntimeError: If no action has been sampled via choose_action().
        """
        if self.action is None:
            raise RuntimeError("learn() called before choose_action(): no action to update on")

        # Cast explicitly: NumPy observations may be float64, the net is float32.
        state = torch.as_tensor(state, dtype=torch.float32)
        next_state = torch.as_tensor(next_state, dtype=torch.float32)

        value, pi = self.actor_critic(state)
        log_prob = Categorical(pi).log_prob(self.action)

        # The TD target is treated as a constant (semi-gradient TD): no
        # gradient may flow through the bootstrapped next-state value.
        with torch.no_grad():
            next_value, _ = self.actor_critic(next_state)
            target = reward + self.gamma * next_value * (1 - int(done))

        delta = target - value

        critic_loss = delta**2

        # The advantage only weights the policy gradient; detaching it keeps
        # the actor term from pushing gradients into the value head.
        actor_loss = -delta.detach() * log_prob

        # Reduce to a true scalar so backward() works for any input shape.
        total_loss = (actor_loss + critic_loss).sum()

        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()


In [None]:
import gym
import numpy as np
from gym import wrappers

# Compatibility shims: define NumPy aliases only when this NumPy build lacks
# them (presumably needed by the installed gym release -- TODO confirm).
if not hasattr(np, "bool8"):
    np.bool8 = np.bool_

if not hasattr(np, "int"):
    np.int = int

if not hasattr(np, "float"):
    np.float = float

env = gym.make('CartPole-v0')
agent = Model(gamma = 0.99, alpha = 1e-5, learning_rate = 1e-4, n_actions = env.action_space.n, in_channel = env.observation_space.shape[0])
n_games = 1500
best_score = env.reward_range[0]
score_history = []

# Online training: one actor-critic update per environment step.
# Uses the gym >= 0.26 API: reset() -> (obs, info), step() -> 5-tuple.
for i in range(n_games):
    observation, info = env.reset()
    score = 0
    done = False

    while not done:
        action = agent.choose_action(torch.tensor(observation).unsqueeze(0))
        observation_, reward, terminated, truncated, info = env.step(action.item())

        # The episode ends on either signal.
        done = terminated or truncated

        score += reward
        # Only a true termination should zero the bootstrap value. A time-limit
        # truncation is not a terminal state, so learn() gets `terminated`.
        agent.learn(observation, reward, observation_, terminated)
        observation = observation_

    score_history.append(score)
    # Running mean over the last (up to) 100 episodes.
    avg_score = np.mean(score_history[-100:])

    print(f"epoch : {i} :: score : {score} :: avg_score : {avg_score}")