In [None]:
import gymnasium as gym
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import os

# Model checkpoints are saved under ./outputs, so make sure it exists.
os.makedirs("outputs", exist_ok=True)

# --- Training configuration -------------------------------------------------
DEVICE = "cpu"                 # torch device used for the model and tensors
ACTION_SPACE = list(range(4))  # discrete action ids sampled by the policy
EPISODES = 3000                # total number of episodes to play
BATCH_SIZE = 8                 # episodes collected per optimizer step
GAMMA = 0.99                   # discount factor for future rewards
RENDER = False                 # render the environment while training

class ReinforceNetwork(nn.Module):
    """Stochastic policy network for REINFORCE with discrete actions.

    The observation goes through two tanh hidden layers (16 and 32 units)
    and a final linear layer producing one logit per action. ``forward``
    samples an action from the resulting softmax distribution and returns
    it together with its log-probability.
    """

    def __init__(self, n_inputs, n_outputs):
        """Create the layers.

        Args:
            n_inputs: Length of the observation vector.
            n_outputs: Number of discrete actions.
        """
        super().__init__()
        self.fc1 = nn.Linear(n_inputs, 16)
        self.fc2 = nn.Linear(16, 32)
        self.fc3 = nn.Linear(32, n_outputs)

    def forward(self, x):
        """Sample an action for a single observation.

        Args:
            x: One observation, either a tensor or anything accepted by
                ``torch.tensor``. A 1-D input gets a leading batch
                dimension of size 1. Only a single observation is
                supported because the sampling step needs a 1-D
                probability vector.

        Returns:
            Tuple ``(action, log_prob_action)``: the sampled action index
            and a 0-dim tensor with its log-probability, still attached to
            the autograd graph.
        """
        if not isinstance(x, torch.Tensor):
            # Put the input where the parameters live instead of relying on
            # a module-level device constant.
            x = torch.tensor(
                x, dtype=torch.float32, device=self.fc1.weight.device
            )
        x = x.unsqueeze(0) if x.dim() == 1 else x
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        logits = self.fc3(x)

        action = self.get_action(torch.softmax(logits, dim=-1))
        # log_softmax is computed in a numerically stable way;
        # log(softmax(.)) can underflow to -inf for very small probabilities.
        log_prob_action = F.log_softmax(logits, dim=-1).squeeze(0)[action]
        return action, log_prob_action

    def get_action(self, actions):
        """Draw one action index from a probability vector.

        Args:
            actions: Tensor of action probabilities, either 1-D or with a
                leading batch dimension of size 1.

        Returns:
            The sampled action index, in ``range(len(probabilities))``.
        """
        probabilities = actions.squeeze(0).detach().cpu().numpy()
        # Sample over however many actions the network outputs rather than
        # a hard-coded list of action ids.
        return np.random.choice(len(probabilities), p=probabilities)

def _discounted_returns(rewards, gamma):
    """Return the discounted return G_t for every step of ONE episode."""
    returns = []
    running = 0.0
    for step_reward in reversed(rewards):
        running = step_reward + gamma * running
        returns.append(running)
    returns.reverse()
    return returns


# gymnasium selects the render mode when the environment is created; in
# "human" mode frames are drawn during step()/reset(), so no explicit
# env.render() call is needed in the loop.
# NOTE(review): the "LunarLander-v2" id may not be registered in every
# gymnasium release — confirm against the installed version.
env = gym.make("LunarLander-v2", render_mode="human" if RENDER else None)
model = ReinforceNetwork(env.observation_space.shape[0], env.action_space.n).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

all_rewards = []      # scaled total reward of every episode
all_losses = []       # loss value of every optimizer step
batch_returns = []    # per-step discounted returns since the last update
batch_log_probs = []  # matching log-probabilities of the taken actions
best_rolling = float("-inf")

try:
    for episode in range(EPISODES):
        state, _ = env.reset()
        episode_rewards = []
        episode_log_probs = []

        done = False
        while not done:
            action, log_prob = model(state)
            # gymnasium's step() returns five values; the episode is over
            # when it terminates OR is truncated (e.g. by a time limit).
            state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            episode_log_probs.append(log_prob)
            episode_rewards.append(reward / 100)  # Scaling reward

        total_reward = np.sum(episode_rewards)
        all_rewards.append(total_reward)

        rolling_avg_reward = pd.Series(all_rewards).tail(100).mean()
        print(f"EPISODE {episode} SCORE: {total_reward} roll: {rolling_avg_reward}")
        torch.save(model.state_dict(), 'outputs/last_params_gpt.pth')

        if rolling_avg_reward > best_rolling:
            best_rolling = rolling_avg_reward
            print("saving...")
            torch.save(model.state_dict(), 'outputs/best_params_gpt.pth')

        # Discount inside the episode only, so rewards of a later episode
        # never leak into the returns of an earlier one.
        batch_returns.extend(_discounted_returns(episode_rewards, GAMMA))
        batch_log_probs.extend(episode_log_probs)

        # Update every BATCH_SIZE episodes, and once more on the last
        # episode so a trailing partial batch is not thrown away.
        if (episode + 1) % BATCH_SIZE == 0 or episode == EPISODES - 1:
            returns = torch.tensor(batch_returns, dtype=torch.float32, device=DEVICE)
            # Standardize the returns across the batch.
            returns = (returns - returns.mean()) / (returns.std() + 1e-9)
            log_probs = torch.stack(batch_log_probs)

            loss = (-log_probs * returns).sum()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            all_losses.append(loss.item())
            batch_returns = []
            batch_log_probs = []
finally:
    # Release the environment even if training is interrupted.
    env.close()


In [None]:
import matplotlib.pyplot as plt

def _show_curve(title, xlabel, ylabel, *series):
    """Draw one 12x6 line chart with a grid and display it."""
    plt.figure(figsize=(12, 6))
    plt.plot(*series)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid()
    plt.show()


# Total (scaled) reward of every episode.
_show_curve("Reards por Episode", "Episode", "Total Rewards", all_rewards)

# Plot the mean reward of each consecutive block of 100 episodes.
window_starts = range(0, len(all_rewards), 100)
average_rewards = [
    pd.Series(all_rewards).iloc[start:start + 100].mean()
    for start in window_starts
]
_show_curve(
    "Mean Reward for 100 episodes",
    "Episode",
    "Mean Reward ",
    window_starts,
    average_rewards,
)

# Plot how the loss evolves over the training updates.
_show_curve("Training Loss Evolution", "Training", "Loss", all_losses)
