In [1]:
import gymnasium as gym
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import os

# Make sure the checkpoint directory used by the torch.save() calls below exists.
os.makedirs("outputs", exist_ok=True)

# --- Run configuration ------------------------------------------------------
DEVICE = "cpu"  # torch device string used for the model and for tensors built during training
ACTION_SPACE = [0, 1, 2, 3]  # ids of the discrete actions (the env reports Discrete(4))
EPISODES = 5000  # number of training episodes to run
STEPS = 300  # Increased to allow longer episodes. NOTE(review): not referenced anywhere in the visible code.
GAMMA = 0.99  # discount factor used when accumulating returns
RENDER = False  # when True, env.render() is called on every step

class ReinforceNetwork(nn.Module):
    """Stochastic MLP policy for REINFORCE over a discrete action space.

    The network maps one observation vector to a categorical distribution
    over ``n_outputs`` actions (two tanh hidden layers of 16 and 32 units)
    and samples an action from it.

    Args:
        n_inputs: Length of the observation vector.
        n_outputs: Number of discrete actions.
    """

    def __init__(self, n_inputs, n_outputs):
        super(ReinforceNetwork, self).__init__()
        self.fc1 = nn.Linear(n_inputs, 16)
        self.fc2 = nn.Linear(16, 32)
        self.fc3 = nn.Linear(32, n_outputs)

    def forward(self, x):
        """Sample an action for a single observation.

        Args:
            x: One observation, either a ``torch.Tensor`` or anything
                ``torch.tensor`` accepts (e.g. a NumPy array), of shape
                ``(n_inputs,)`` or ``(1, n_inputs)``. Non-tensor input is
                converted to float32 on the same device as the network.

        Returns:
            Tuple ``(action, log_prob_action)``: the sampled action index and
            a 0-dim tensor holding the log-probability of that action, still
            attached to the autograd graph.
        """
        if not isinstance(x, torch.Tensor):
            # Follow the module's own device instead of a global constant so
            # the network keeps working after ``.to(other_device)``.
            x = torch.tensor(x, dtype=torch.float32, device=self.fc1.weight.device)
        x = x.unsqueeze(0) if x.dim() == 1 else x
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        # log_softmax is numerically stable; log(softmax(.)) yields -inf and
        # NaN gradients as soon as one probability underflows to zero.
        log_probs = torch.log_softmax(self.fc3(x), dim=-1)
        action = self.get_action(log_probs.exp())
        log_prob_action = log_probs.squeeze(0)[action]
        return action, log_prob_action

    def get_action(self, actions):
        """Draw one action index from a vector of action probabilities.

        Args:
            actions: Tensor of probabilities for a single observation, shape
                ``(n_actions,)`` or ``(1, n_actions)``.

        Returns:
            The sampled action index in ``range(n_actions)``.
        """
        probs = actions.squeeze(0).detach().cpu().numpy().astype(np.float64)
        # Renormalise in float64 so rounding in the float32 softmax cannot
        # trip NumPy's "probabilities do not sum to 1" check.
        probs = probs / probs.sum()
        # The number of actions comes from the distribution itself rather
        # than from a hard-coded global list.
        return int(np.random.choice(probs.shape[0], p=probs))
    

# Build the environment, the policy network and its optimiser.
env = gym.make("LunarLander-v2")
print(env.action_space, env.observation_space)

model = ReinforceNetwork(env.observation_space.shape[0], env.action_space.n).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
all_rewards = []  # scaled total reward of every finished episode
best_rolling = -99999  # best rolling average seen so far (checkpoint criterion)

# REINFORCE: roll out one episode, then do one policy-gradient update.
for episode in range(EPISODES):
    done = False
    state, _ = env.reset()

    episode_rewards = []
    log_probs = []

    # --- Roll out one episode with the current policy ---
    while not done:
        if RENDER:
            env.render()

        action, log_prob = model(state)
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # The episode is over when either flag is set; ignoring `truncated`
        # keeps stepping an environment that already hit its time limit.
        state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        log_probs.append(log_prob)
        episode_rewards.append(reward / 100)  # Scaling reward

    total_reward = np.sum(episode_rewards)
    all_rewards.append(total_reward)

    # --- Periodic logging and checkpointing ---
    if episode % 100 == 0:
        rolling_avg_reward = pd.Series(all_rewards).tail(100).mean()
        print(f"EPISODE {episode} SCORE: {total_reward} roll: {rolling_avg_reward}")
        torch.save(model.state_dict(), 'outputs/last_params_cloud.pth')

        if rolling_avg_reward > best_rolling:
            best_rolling = rolling_avg_reward
            print("saving...")
            torch.save(model.state_dict(), 'outputs/best_params_cloud.pth')

    # --- Discounted returns G_t = r_t + GAMMA * G_{t+1} ---
    # Accumulate back-to-front with append + reverse (linear time) instead of
    # repeated insert(0, ...), which is quadratic in the episode length.
    discounted_rewards = []
    Gt = 0
    for reward in reversed(episode_rewards):
        Gt = reward + GAMMA * Gt
        discounted_rewards.append(Gt)
    discounted_rewards.reverse()

    discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32, device=DEVICE)
    # Standardise the returns to reduce gradient variance. A single-element
    # tensor has an undefined (NaN) std, which would poison the weights, so
    # one-step episodes are left unnormalised.
    if discounted_rewards.numel() > 1:
        discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9)
    log_probs = torch.stack(log_probs)

    # --- Policy-gradient step: minimise -sum_t log pi(a_t | s_t) * G_t ---
    policy_gradient = -log_probs * discounted_rewards

    optimizer.zero_grad()
    policy_gradient.sum().backward()
    optimizer.step()

env.close()


Discrete(4) Box([-1.5       -1.5       -5.        -5.        -3.1415927 -5.
 -0.        -0.       ], [1.5       1.5       5.        5.        3.1415927 5.        1.
 1.       ], (8,), float32)
EPISODE 0 SCORE: -1.654563947323429 roll: -1.654563947323429
saving...
EPISODE 100 SCORE: 0.09808566950888081 roll: -1.5655089669989615
saving...
EPISODE 200 SCORE: -0.41694778601551835 roll: -0.8799713440758741
saving...
EPISODE 300 SCORE: -0.08450193812046386 roll: -0.7103306156213588
saving...
EPISODE 400 SCORE: 0.017841346584022255 roll: -0.28085478427436716
saving...
EPISODE 500 SCORE: 0.35434214847085355 roll: -0.7484668508419695
EPISODE 600 SCORE: -2.058811538772516 roll: 0.2760381465031992
saving...


: 

In [None]:
import gymnasium as gym
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import os

# Make sure the checkpoint directory used by the torch.save() calls below exists.
os.makedirs("outputs", exist_ok=True)

# --- Run configuration ------------------------------------------------------
DEVICE = "cpu"  # torch device string used for the model and for tensors built during training
ACTION_SPACE = [0, 1, 2, 3]  # ids of the discrete actions
EPISODES = 5000  # number of training episodes to run
BATCH_SIZE = 8  # number of episodes collected between optimizer updates
GAMMA = 0.99  # discount factor used when accumulating returns
RENDER = False  # when True, env.render() is called on every step

class ReinforceNetwork(nn.Module):
    """Stochastic MLP policy for REINFORCE over a discrete action space.

    The network maps one observation vector to a categorical distribution
    over ``n_outputs`` actions (two tanh hidden layers of 16 and 32 units)
    and samples an action from it.

    Args:
        n_inputs: Length of the observation vector.
        n_outputs: Number of discrete actions.
    """

    def __init__(self, n_inputs, n_outputs):
        super(ReinforceNetwork, self).__init__()
        self.fc1 = nn.Linear(n_inputs, 16)
        self.fc2 = nn.Linear(16, 32)
        self.fc3 = nn.Linear(32, n_outputs)

    def forward(self, x):
        """Sample an action for a single observation.

        Args:
            x: One observation, either a ``torch.Tensor`` or anything
                ``torch.tensor`` accepts (e.g. a NumPy array), of shape
                ``(n_inputs,)`` or ``(1, n_inputs)``. Non-tensor input is
                converted to float32 on the same device as the network.

        Returns:
            Tuple ``(action, log_prob_action)``: the sampled action index and
            a 0-dim tensor holding the log-probability of that action, still
            attached to the autograd graph.
        """
        if not isinstance(x, torch.Tensor):
            # Follow the module's own device instead of a global constant so
            # the network keeps working after ``.to(other_device)``.
            x = torch.tensor(x, dtype=torch.float32, device=self.fc1.weight.device)
        x = x.unsqueeze(0) if x.dim() == 1 else x
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        # log_softmax is numerically stable; log(softmax(.)) yields -inf and
        # NaN gradients as soon as one probability underflows to zero.
        log_probs = torch.log_softmax(self.fc3(x), dim=-1)
        action = self.get_action(log_probs.exp())
        log_prob_action = log_probs.squeeze(0)[action]
        return action, log_prob_action

    def get_action(self, actions):
        """Draw one action index from a vector of action probabilities.

        Args:
            actions: Tensor of probabilities for a single observation, shape
                ``(n_actions,)`` or ``(1, n_actions)``.

        Returns:
            The sampled action index in ``range(n_actions)``.
        """
        probs = actions.squeeze(0).detach().cpu().numpy().astype(np.float64)
        # Renormalise in float64 so rounding in the float32 softmax cannot
        # trip NumPy's "probabilities do not sum to 1" check.
        probs = probs / probs.sum()
        # The number of actions comes from the distribution itself rather
        # than from a hard-coded global list.
        return int(np.random.choice(probs.shape[0], p=probs))

# Build the environment, the policy network and its optimiser.
env = gym.make("LunarLander-v2")
model = ReinforceNetwork(env.observation_space.shape[0], env.action_space.n).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

all_rewards = []  # summed scaled reward of each update batch
all_losses = []  # loss value of each optimizer update
episode_scores = []  # scaled total reward of each episode (used for logging)
batch_rewards = []  # scaled per-step rewards collected since the last update
batch_returns = []  # per-step discounted returns collected since the last update
batch_log_probs = []  # log-probabilities matching batch_returns one-to-one
best_rolling = -99999  # best rolling average seen so far (checkpoint criterion)

# Batched REINFORCE: collect BATCH_SIZE episodes, then do one update.
# NOTE: checkpoints go to the same outputs/*.pth files as the per-episode
# training cell, so running both overwrites the earlier files.
for episode in range(EPISODES):
    state, _ = env.reset()
    episode_rewards = []
    episode_log_probs = []
    done = False

    # --- Roll out one episode with the current policy ---
    while not done:
        if RENDER:
            env.render()

        action, log_prob = model(state)
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # The episode is over when either flag is set; ignoring `truncated`
        # keeps stepping an environment that already hit its time limit.
        state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        episode_log_probs.append(log_prob)
        episode_rewards.append(reward / 100)  # Scaling reward

    # --- Discounted returns for THIS episode only ---
    # The running return restarts at 0 for every episode; accumulating over
    # the concatenated batch would leak the next episode's rewards into the
    # tail of the previous one.
    episode_returns = []
    Gt = 0
    for reward in reversed(episode_rewards):
        Gt = reward + GAMMA * Gt
        episode_returns.append(Gt)
    episode_returns.reverse()

    batch_rewards.extend(episode_rewards)
    batch_returns.extend(episode_returns)
    batch_log_probs.extend(episode_log_probs)

    total_reward = np.sum(episode_rewards)
    episode_scores.append(total_reward)

    # --- Policy-gradient update once a full batch has been collected ---
    if (episode + 1) % BATCH_SIZE == 0 or episode == EPISODES - 1:
        discounted_rewards = torch.tensor(batch_returns, dtype=torch.float32, device=DEVICE)
        # Standardise across the batch to reduce gradient variance; a single
        # element has an undefined (NaN) std, so skip that degenerate case.
        if discounted_rewards.numel() > 1:
            discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9)
        stacked_log_probs = torch.stack(batch_log_probs)

        policy_gradient = -stacked_log_probs * discounted_rewards

        optimizer.zero_grad()
        loss = policy_gradient.sum()
        loss.backward()
        optimizer.step()

        all_rewards.append(np.sum(batch_rewards))
        all_losses.append(loss.item())
        batch_rewards = []
        batch_returns = []
        batch_log_probs = []

    # --- Periodic logging and checkpointing ---
    # Kept outside the update branch: with BATCH_SIZE = 8 an update happens
    # only on odd episode indices, so an `episode % 100 == 0` check nested
    # inside it would never fire.
    if episode % 100 == 0:
        rolling_avg_reward = pd.Series(episode_scores).tail(100).mean()
        print(f"EPISODE {episode} SCORE: {total_reward} roll: {rolling_avg_reward}")
        torch.save(model.state_dict(), 'outputs/last_params_cloud.pth')

        if rolling_avg_reward > best_rolling:
            best_rolling = rolling_avg_reward
            print("saving...")
            torch.save(model.state_dict(), 'outputs/best_params_cloud.pth')

env.close()
