In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random

# Create the environment.
# NOTE(review): render_mode="human" presumably draws a window on every step,
# which would slow down a long training run -- confirm this is intended.
env = gym.make("Acrobot-v1", render_mode="human")

# Lookup table from each discrete action index to a one-element continuous value
# (0: left, 1: no movement, 2: right).
discrete_to_continuous = {
    index: np.array([value])
    for index, value in enumerate((-1.0, 0.0, 1.0))
}

# Actor and Critic neural networks
class Actor(nn.Module):
    """Deterministic policy network mapping a state batch to bounded actions.

    Three fully connected layers (two hidden layers of 256 units with ReLU);
    the output is squashed with tanh and scaled by ``max_action``.

    Args:
        state_dim: Size of the last dimension of the state input.
        action_dim: Number of action components produced.
        max_action: Scale applied to the tanh output.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()
        hidden_units = 256
        self.layer1 = nn.Linear(state_dim, hidden_units)
        self.layer2 = nn.Linear(hidden_units, hidden_units)
        self.layer3 = nn.Linear(hidden_units, action_dim)
        self.max_action = max_action

    def forward(self, state):
        """Return the action for ``state``, bounded to [-max_action, max_action]."""
        features = self.layer1(state).relu()
        features = self.layer2(features).relu()
        squashed = torch.tanh(self.layer3(features))
        return squashed * self.max_action

class Critic(nn.Module):
    """Q-network scoring a (state, action) pair with a single scalar per row.

    The state and action are concatenated along dimension 1 and passed through
    two hidden layers of 256 units with ReLU, followed by a linear output.

    Args:
        state_dim: Size of dimension 1 of the state input.
        action_dim: Size of dimension 1 of the action input.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 256
        self.layer1 = nn.Linear(state_dim + action_dim, hidden_units)
        self.layer2 = nn.Linear(hidden_units, hidden_units)
        self.layer3 = nn.Linear(hidden_units, 1)

    def forward(self, state, action):
        """Return the Q-value estimate, shape (batch, 1), for each pair."""
        joint = torch.cat((state, action), dim=1)
        hidden = self.layer1(joint).relu()
        hidden = self.layer2(hidden).relu()
        return self.layer3(hidden)

# Replay buffer
class ReplayBuffer:
    """Fixed-capacity store of (state, action, reward, next_state, done) tuples.

    Transitions are appended until ``max_size`` is reached; after that the
    oldest entries are overwritten in insertion order.

    Args:
        max_size: Maximum number of transitions kept.
    """

    def __init__(self, max_size=100000):
        self.storage = []
        self.max_size = max_size
        # Next slot to overwrite; only advanced once the buffer is full.
        self.ptr = 0

    def add(self, transition):
        """Store one transition, overwriting the oldest one when full."""
        if len(self.storage) == self.max_size:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            Tuple of float32 tensors ``(states, actions, rewards, next_states,
            dones)``; rewards and dones get a trailing dimension of size 1.

        Raises:
            ValueError: If the buffer holds no transitions.
        """
        if not self.storage:
            raise ValueError("Cannot sample from an empty replay buffer")

        ind = np.random.randint(0, len(self.storage), size=batch_size)
        batch = [self.storage[i] for i in ind]
        # Regroup the sampled tuples field by field.
        states, actions, rewards, next_states, dones = (
            [transition[field] for transition in batch] for field in range(5)
        )

        def to_tensor(values):
            # np.asarray (rather than np.array(..., copy=False)) also accepts
            # plain Python scalars under NumPy 2, where copy=False raises if a
            # copy cannot be avoided.
            return torch.as_tensor(np.asarray(values, dtype=np.float32))

        return (
            to_tensor(states),
            to_tensor(actions),
            to_tensor(rewards).unsqueeze(1),
            to_tensor(next_states),
            to_tensor(dones).unsqueeze(1),
        )

# Hyperparameters and model setup
discount = 0.99  # discount factor applied to the bootstrapped target
tau = 0.005      # interpolation factor used when updating the target networks

state_dim = env.observation_space.shape[0]
action_dim = 1  # single continuous action
max_action = 1.0

# Online networks are built first, followed by their target counterparts.
actor = Actor(state_dim, action_dim, max_action)
critic = Critic(state_dim, action_dim)
actor_target = Actor(state_dim, action_dim, max_action)
critic_target = Critic(state_dim, action_dim)

# Start each target network as an exact copy of its online network.
for online_net, target_net in ((actor, actor_target), (critic, critic_target)):
    target_net.load_state_dict(online_net.state_dict())

actor_optimizer = optim.Adam(params=actor.parameters(), lr=1e-5)
critic_optimizer = optim.Adam(params=critic.parameters(), lr=1e-4)
replay_buffer = ReplayBuffer()

# DDPG update function
def update(batch_size=256):
    """Run one DDPG optimisation step on a minibatch from the replay buffer.

    Updates the critic towards the one-step bootstrapped target, updates the
    actor to maximise the critic's value of its own actions, then moves both
    target networks a fraction ``tau`` towards the online networks.

    Args:
        batch_size: Number of transitions sampled from ``replay_buffer``.
    """
    states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

    # Bootstrapped target from the target networks; detached so that no
    # gradient flows into them.
    next_q = critic_target(next_states, actor_target(next_states))
    bootstrap = ((1 - dones) * discount * next_q).detach()
    td_target = rewards + bootstrap

    # Critic step: regress Q(s, a) onto the target.
    q_estimate = critic(states, actions)
    critic_loss = nn.functional.mse_loss(q_estimate, td_target)
    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    # Actor step: ascend the critic's value of the actor's own actions.
    policy_value = critic(states, actor(states))
    actor_loss = -policy_value.mean()
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    # Soft update of the target networks (actor first, then critic).
    for online_net, target_net in ((actor, actor_target), (critic, critic_target)):
        for online_p, target_p in zip(online_net.parameters(), target_net.parameters()):
            target_p.data.copy_(tau * online_p.data + (1 - tau) * target_p.data)

# Training loop
max_episodes = 5000
max_steps = 1000
batch_size = 256
exploration_noise = 0.2  # std-dev of the Gaussian noise added to the actor output


def select_discrete_action(action_cont):
    """Map a continuous actor output to the nearest discrete action index.

    The previous mapping, ``np.argmax(np.abs(action_cont))``, was applied to a
    one-element array and therefore always returned 0, so only a single
    action was ever sent to the environment.

    Args:
        action_cont: Array-like whose first element is the actor output.

    Returns:
        The key of ``discrete_to_continuous`` whose value is closest to that
        output (the first such key on ties).
    """
    value = float(np.asarray(action_cont).reshape(-1)[0])
    return min(
        discrete_to_continuous,
        key=lambda index: abs(float(discrete_to_continuous[index][0]) - value),
    )


for episode in range(max_episodes):
    state = env.reset()

    # reset() may return (observation, info); keep only the observation.
    if isinstance(state, tuple):
        state = state[0]
    state = np.array(state)

    episode_reward = 0
    for step in range(max_steps):
        # Query the deterministic policy for the current state.
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action_cont = actor(state_tensor).numpy()[0]

        # The policy is deterministic, so exploration has to come from noise
        # added to its output; clip back into the valid action range.
        noise = np.random.normal(0.0, exploration_noise, size=action_cont.shape)
        action_cont = np.clip(action_cont + noise, -max_action, max_action).astype(np.float32)

        # Convert the continuous action into one of the discrete actions.
        action_disc = select_discrete_action(action_cont)

        # Interact with the environment.
        next_state, reward, terminated, truncated, info = env.step(action_disc)
        done = terminated or truncated
        next_state = np.array(next_state)
        episode_reward += reward

        # Store only true termination as the done flag: a time-limit
        # truncation should still bootstrap from the next state's value.
        replay_buffer.add((state, action_cont, reward, next_state, float(terminated)))

        state = next_state

        # Run a DDPG update once enough transitions have been collected.
        if len(replay_buffer.storage) > batch_size:
            update(batch_size)

        if done:
            break

    print(f"Episode {episode}, Reward: {episode_reward}")

# Save the trained models once training has completed.
torch.save(actor.state_dict(), "actor_model_final_acrobot.pth")
torch.save(critic.state_dict(), "critic_model_final_acrobot.pth")
print("Final model saved.")

  if not isinstance(terminated, (bool, np.bool8)):


Episode 0, Reward: -500.0
Episode 1, Reward: -500.0
Episode 2, Reward: -500.0
Episode 3, Reward: -500.0
Episode 4, Reward: -500.0
Episode 5, Reward: -500.0
Episode 6, Reward: -500.0
Episode 7, Reward: -500.0
Episode 8, Reward: -500.0
Episode 9, Reward: -500.0
Episode 10, Reward: -500.0
Episode 11, Reward: -500.0
Episode 12, Reward: -500.0
Episode 13, Reward: -500.0
Episode 14, Reward: -500.0
Episode 15, Reward: -500.0
Episode 16, Reward: -500.0
Episode 17, Reward: -500.0
Episode 18, Reward: -500.0
Episode 19, Reward: -500.0
Episode 20, Reward: -500.0
Episode 21, Reward: -500.0
Episode 22, Reward: -500.0
Episode 23, Reward: -500.0
Episode 24, Reward: -500.0
Episode 25, Reward: -500.0
Episode 26, Reward: -500.0
Episode 27, Reward: -500.0
Episode 28, Reward: -500.0
Episode 29, Reward: -500.0
Episode 30, Reward: -500.0
Episode 31, Reward: -500.0
Episode 32, Reward: -500.0
Episode 33, Reward: -500.0
Episode 34, Reward: -500.0
Episode 35, Reward: -500.0
Episode 36, Reward: -500.0
Episode 37,

KeyboardInterrupt: 