# TER v1

In [None]:
import torch
import torch.nn as nn
import random

class TransformerExperienceReplay:
    def __init__(self, capacity, input_dim):
        # Initializes the TransformerExperienceReplay class.
        self.capacity = capacity  # Maximum number of experiences the buffer can hold.
        self.buffer = []  # List to store experiences.
        self.position = 0  # Tracks the next position to insert an experience.
        # Initializes a transformer model with specified input dimensions and architecture parameters.
        self.transformer = nn.Transformer(d_model=input_dim, nhead=4, num_encoder_layers=2)

    def push(self, experience):
        # Adds an experience to the replay buffer.
        if len(self.buffer) < self.capacity:
            # If buffer is not full, append a placeholder (None).
            self.buffer.append(None)
        # Inserts the new experience at the current position.
        self.buffer[self.position] = experience
        # Updates the position, wrapping around if it reaches the capacity.
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size, current_state):
        # Samples experiences from the buffer based on transformer attention mechanism.
        if len(self.buffer) < batch_size:
            # Returns an empty list if there aren't enough experiences for a batch.
            return []

        # Prepares the current state as the query for the transformer.
        # Adds extra dimensions to fit the transformer's input requirements.
        query = current_state.unsqueeze(0).unsqueeze(1)
        # Stacks all non-None experiences to create a tensor for keys and values.
        keys_values = torch.stack([exp for exp in self.buffer if exp is not None])
        # Adds an extra dimension to the keys and values tensor.
        keys_values = keys_values.unsqueeze(1)

        # Passes the query, keys, and values through the transformer model.
        attention_output = self.transformer(query, keys_values, keys_values)
        # Applies softmax to the attention output to get probabilities for sampling.
        attention_weights = torch.softmax(attention_output, dim=0)

        # Samples indices based on the attention weights.
        sampled_indices = torch.multinomial(attention_weights.view(-1), batch_size, replacement=True)
        # Retrieves experiences corresponding to the sampled indices.
        sampled_experiences = [self.buffer[i] for i in sampled_indices]

        # Returns the sampled experiences.
        return sampled_experiences

    def __len__(self):
        # Returns the number of experiences currently in the buffer.
        return len(self.buffer)

    def count_parameters(self):
        # Counts the number of trainable parameters in the transformer model.
        return sum(p.numel() for p in self.transformer.parameters() if p.requires_grad)

# Example usage:
replay = TransformerExperienceReplay(capacity=1000, input_dim=128)
# Prints the number of trainable parameters in the transformer model.
print("Number of parameters in the transformer:", replay.count_parameters())


# TER v2

In [None]:
import torch
import torch.nn as nn
import random

class TransformerExperienceReplay:
    def __init__(self, capacity, input_dim, nhead=4, num_encoder_layers=2, num_decoder_layers=2):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
        self.transformer = nn.Transformer(d_model=input_dim, nhead=nhead, 
                                          num_encoder_layers=num_encoder_layers, 
                                          num_decoder_layers=num_decoder_layers)

    def push(self, experience):
        # 'experience' is expected to be a tuple: (state, action, reward, next_state, target)
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = experience
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size, current_state):
        if len(self.buffer) < batch_size:
            return []

        query = self._prepare_query(current_state)
        keys_values = self._prepare_keys_values()

        attention_output = self.transformer(query, keys_values, keys_values)
        attention_weights = torch.softmax(attention_output.squeeze(), dim=0)

        sampled_indices = torch.multinomial(attention_weights, batch_size, replacement=True)
        return [self.buffer[i] for i in sampled_indices]

    def _prepare_query(self, current_state):
        query = current_state.unsqueeze(0).unsqueeze(1)
        return query

    def _prepare_keys_values(self):
        keys_values = torch.stack([torch.cat(exp) for exp in self.buffer if exp is not None])
        keys_values = keys_values.unsqueeze(1)
        return keys_values

    def __len__(self):
        return len(self.buffer)

    def count_parameters(self):
        return sum(p.numel() for p in self.transformer.parameters() if p.requires_grad)

# Example usage:
replay = TransformerExperienceReplay(capacity=1000, input_dim=128)
print("Number of parameters in the transformer:", replay.count_parameters())


# TER v3

In [None]:
import torch
import torch.nn as nn
import random

class TransformerExperienceReplay:
    def __init__(self, capacity, input_dim):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
        # Standard transformer configuration can be set here
        self.transformer = nn.Transformer(d_model=input_dim, nhead=4, 
                                          num_encoder_layers=2, num_decoder_layers=2)

    def push(self, experience):
        # 'experience' is expected to be a tuple: (state, action, reward, next_state, target)
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = experience
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size, current_state):
        if len(self.buffer) < batch_size:
            return []

        query = self._prepare_query(current_state)
        keys_values = self._prepare_keys_values()

        attention_output = self.transformer(query, keys_values, keys_values)
        attention_weights = torch.softmax(attention_output.squeeze(), dim=0)

        sampled_indices = torch.multinomial(attention_weights, batch_size, replacement=True)
        return [self.buffer[i] for i in sampled_indices]

    def _prepare_query(self, current_state):
        query = current_state.unsqueeze(0).unsqueeze(1)
        return query

    def _prepare_keys_values(self):
        keys_values = torch.stack([torch.cat(exp) for exp in self.buffer if exp is not None])
        keys_values = keys_values.unsqueeze(1)
        return keys_values

    def __len__(self):
        return len(self.buffer)

    def count_parameters(self):
        return sum(p.numel() for p in self.transformer.parameters() if p.requires_grad)

# Example usage:
replay = TransformerExperienceReplay(capacity=1000, input_dim=128)
print("Number of parameters in the transformer:", replay.count_parameters())


# Test

In [2]:
!pip install gym

Collecting gym
  Using cached gym-0.26.2-py3-none-any.whl
Collecting cloudpickle>=1.2.0 (from gym)
  Downloading cloudpickle-3.0.0-py3-none-any.whl.metadata (7.0 kB)
Collecting gym-notices>=0.0.4 (from gym)
  Downloading gym_notices-0.0.8-py3-none-any.whl (3.0 kB)
Downloading cloudpickle-3.0.0-py3-none-any.whl (20 kB)
Installing collected packages: gym-notices, cloudpickle, gym
Successfully installed cloudpickle-3.0.0 gym-0.26.2 gym-notices-0.0.8


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import copy
import gym



class TransformerExperienceReplay:
    def __init__(self, capacity, input_dim):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
        # Standard transformer configuration can be set here
        self.transformer = nn.Transformer(d_model=input_dim, nhead=4, 
                                          num_encoder_layers=2, num_decoder_layers=2)

    def push(self, experience):
        # 'experience' is expected to be a tuple: (state, action, reward, next_state, target)
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = experience
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size, current_state):
        if len(self.buffer) < batch_size:
            return []

        query = self._prepare_query(current_state)
        keys_values = self._prepare_keys_values()

        attention_output = self.transformer(query, keys_values, keys_values)
        attention_weights = torch.softmax(attention_output.squeeze(), dim=0)

        sampled_indices = torch.multinomial(attention_weights, batch_size, replacement=True)
        return [self.buffer[i] for i in sampled_indices]

    def _prepare_query(self, current_state):
        query = current_state.unsqueeze(0).unsqueeze(1)
        return query

    def _prepare_keys_values(self):
        keys_values = torch.stack([torch.cat(exp) for exp in self.buffer if exp is not None])
        keys_values = keys_values.unsqueeze(1)
        return keys_values

    def __len__(self):
        return len(self.buffer)

    def count_parameters(self):
        return sum(p.numel() for p in self.transformer.parameters() if p.requires_grad)

# Example usage:
replay = TransformerExperienceReplay(capacity=1000, input_dim=128)
print("Number of parameters in the transformer:", replay.count_parameters())


class DQN:
    def __init__(self, state_size, action_size=4):
        l1 = state_size
        l2 = 24
        l3 = 24
        l4 = action_size

        self.model = nn.Sequential(
            nn.Linear(l1, l2),
            nn.ReLU(),
            nn.Linear(l2, l3),
            nn.ReLU(),
            nn.Linear(l3, l4)
        )

        self.model2 = copy.deepcopy(self.model)
        self.model2.load_state_dict(self.model.state_dict())
        self.loss_fn = nn.MSELoss()
        self.learning_rate = 0.001
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def update_target(self):
        self.model2.load_state_dict(self.model.state_dict())

    def get_qvals(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            qvals = self.model(state)
        return qvals.numpy()

    def get_maxQ(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            q_values = self.model2(state)
        return torch.max(q_values).item()

    def train_one_step(self, experiences):
        states, actions, rewards, next_states, dones = zip(*experiences)
        
        states = torch.tensor(states, dtype=torch.float32)
        next_states = torch.tensor(next_states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int64)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32)

        # Compute Q values for current states
        Q_current = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Compute Q values for next states using target network
        Q_next = self.model2(next_states).detach().max(1)[0]
        Q_target = rewards + gamma * Q_next * (1 - dones)  # gamma is the discount factor

        # Compute loss
        loss = self.loss_fn(Q_current, Q_target)

        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()



Number of parameters in the transformer: 2505216




'\n# Example usage\nstate_size = env.observation_space.shape[0]  # Assuming \'env\' is your Gym environment\naction_size = env.action_space.n\ndqn_agent = DQN(state_size, action_size)\n\n\n\n\n# Initialize the Gym environment\nenv = gym.make(\'CartPole-v1\')\n\n# Parameters\ninput_dim = env.observation_space.shape[0] + 2  # state dim + action + reward\ncapacity = 1000\n\n# Initialize TER\nter = TransformerExperienceReplay(capacity, input_dim)\n\n\n\nfor episode in range(num_episodes):\n    state = env.reset()\n    total_reward = 0\n\n    while True:\n        action = agent.select_action(state)\n        next_state, reward, done, _ = env.step(action)\n\n        # Store experience in TER\n        experience = (state, action, reward, next_state)\n        ter.push(experience)\n\n        # Sample from TER for learning\n        if len(ter) > batch_size:\n            experiences = ter.sample(batch_size, current_state=torch.tensor(state))\n            agent.learn(experiences)\n\n        state =

In [5]:
import gym
import torch
import numpy as np
import random

import torch.nn as nn
import torch.optim as optim
import copy
import gym



class TransformerExperienceReplay:
    def __init__(self, capacity, input_dim):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
        # Standard transformer configuration can be set here
        self.transformer = nn.Transformer(d_model=input_dim, nhead=4, 
                                          num_encoder_layers=2, num_decoder_layers=2)

    def push(self, experience):
        # 'experience' is expected to be a tuple: (state, action, reward, next_state, target)
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = experience
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size, current_state):
        if len(self.buffer) < batch_size:
            return []

        query = self._prepare_query(current_state)
        keys_values = self._prepare_keys_values()

        attention_output = self.transformer(query, keys_values, keys_values)
        attention_weights = torch.softmax(attention_output.squeeze(), dim=0)

        sampled_indices = torch.multinomial(attention_weights, batch_size, replacement=True)
        return [self.buffer[i] for i in sampled_indices]

    def _prepare_query(self, current_state):
        query = current_state.unsqueeze(0).unsqueeze(1)
        return query

    def _prepare_keys_values(self):
        keys_values = torch.stack([torch.cat(exp) for exp in self.buffer if exp is not None])
        keys_values = keys_values.unsqueeze(1)
        return keys_values

    def __len__(self):
        return len(self.buffer)

    def count_parameters(self):
        return sum(p.numel() for p in self.transformer.parameters() if p.requires_grad)

# Example usage:
replay = TransformerExperienceReplay(capacity=1000, input_dim=128)
print("Number of parameters in the transformer:", replay.count_parameters())


class DQN:
    def __init__(self, state_size, action_size=4):
        l1 = state_size
        l2 = 24
        l3 = 24
        l4 = action_size

        self.model = nn.Sequential(
            nn.Linear(l1, l2),
            nn.ReLU(),
            nn.Linear(l2, l3),
            nn.ReLU(),
            nn.Linear(l3, l4)
        )

        self.model2 = copy.deepcopy(self.model)
        self.model2.load_state_dict(self.model.state_dict())
        self.loss_fn = nn.MSELoss()
        self.learning_rate = 0.001
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def update_target(self):
        self.model2.load_state_dict(self.model.state_dict())

    def get_qvals(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            qvals = self.model(state)
        return qvals.numpy()

    def get_maxQ(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            q_values = self.model2(state)
        return torch.max(q_values).item()

    def train_one_step(self, experiences):
        states, actions, rewards, next_states, dones = zip(*experiences)
        
        states = torch.tensor(states, dtype=torch.float32)
        next_states = torch.tensor(next_states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int64)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32)

        # Compute Q values for current states
        Q_current = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Compute Q values for next states using target network
        Q_next = self.model2(next_states).detach().max(1)[0]
        Q_target = rewards + gamma * Q_next * (1 - dones)  # gamma is the discount factor

        # Compute loss
        loss = self.loss_fn(Q_current, Q_target)

        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()


# Initialize the Gym environment
env = gym.make('CartPole-v1')

# Parameters
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
input_dim = state_size + 2  # state dim + action + reward
capacity = 1000
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995
batch_size = 64
num_episodes = 1000

# Initialize TER and DQN
ter = TransformerExperienceReplay(capacity, input_dim)
dqn_agent = DQN(state_size, action_size)

def select_action(state, epsilon):
    if np.random.rand() <= epsilon:
        return env.action_space.sample()  # Explore: select a random action
    else:
        q_values = dqn_agent.get_qvals(state)
        return np.argmax(q_values)  # Exploit: select the action with max value

for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0

    while True:
        action = select_action(state, epsilon)
        next_state, reward, done, _ = env.step(action)

        # Adjust reward for terminal states
        reward_adj = reward if not done or total_reward == 499 else -100

        # Store experience in TER
        experience = (state, action, reward_adj, next_state, done)
        ter.push(experience)

        # Decrease epsilon
        epsilon = max(epsilon_min, epsilon_decay * epsilon)

        # Sample from TER for learning
        if len(ter) >= batch_size:
            experiences = ter.sample(batch_size, current_state=torch.tensor(state, dtype=torch.float32))
            dqn_agent.train_one_step(experiences)

        state = next_state
        total_reward += reward

        if done:
            total_reward = total_reward if total_reward == 500 else total_reward + 100
            print(f"Episode {episode}: Total Reward: {total_reward}")
            break

        # Update target network
        if episode % 10 == 0:
            dqn_agent.update_target()

env.close()


AssertionError: embed_dim must be divisible by num_heads

# Test v2

In [9]:
import gym
import torch
import numpy as np
import random

import torch.nn as nn
import torch.optim as optim
import copy
import gym



class TransformerExperienceReplay:
    def __init__(self, capacity, input_dim):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
        # Standard transformer configuration can be set here
        self.transformer = nn.Transformer(d_model=input_dim, nhead=4, 
                                          num_encoder_layers=2, num_decoder_layers=2)

    def push(self, experience):
        # 'experience' is expected to be a tuple: (state, action, reward, next_state, target)
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = experience
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size, current_state):
        if len(self.buffer) < batch_size:
            return []

        query = self._prepare_query(current_state)
        keys_values = self._prepare_keys_values()

        attention_output = self.transformer(query, keys_values, keys_values)
        attention_weights = torch.softmax(attention_output.squeeze(), dim=0)

        sampled_indices = torch.multinomial(attention_weights, batch_size, replacement=True)
        return [self.buffer[i] for i in sampled_indices]

    def _prepare_query(self, current_state):
        query = current_state.unsqueeze(0).unsqueeze(1)
        return query

    def _prepare_keys_values(self):
        keys_values = torch.stack([torch.cat(exp) for exp in self.buffer if exp is not None])
        keys_values = keys_values.unsqueeze(1)
        return keys_values

    def __len__(self):
        return len(self.buffer)

    def count_parameters(self):
        return sum(p.numel() for p in self.transformer.parameters() if p.requires_grad)

# Example usage:
replay = TransformerExperienceReplay(capacity=1000, input_dim=128)
print("Number of parameters in the transformer:", replay.count_parameters())


class DQN:
    def __init__(self, state_size, action_size=4):
        l1 = state_size
        l2 = 24
        l3 = 24
        l4 = action_size

        self.model = nn.Sequential(
            nn.Linear(l1, l2),
            nn.ReLU(),
            nn.Linear(l2, l3),
            nn.ReLU(),
            nn.Linear(l3, l4)
        )

        self.model2 = copy.deepcopy(self.model)
        self.model2.load_state_dict(self.model.state_dict())
        self.loss_fn = nn.MSELoss()
        self.learning_rate = 0.001
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def update_target(self):
        self.model2.load_state_dict(self.model.state_dict())

    def get_qvals(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            qvals = self.model(state)
        return qvals.numpy()

    def get_maxQ(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            q_values = self.model2(state)
        return torch.max(q_values).item()

    def train_one_step(self, experiences):
        states, actions, rewards, next_states, dones = zip(*experiences)
        
        states = torch.tensor(states, dtype=torch.float32)
        next_states = torch.tensor(next_states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int64)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32)

        # Compute Q values for current states
        Q_current = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Compute Q values for next states using target network
        Q_next = self.model2(next_states).detach().max(1)[0]
        Q_target = rewards + gamma * Q_next * (1 - dones)  # gamma is the discount factor

        # Compute loss
        loss = self.loss_fn(Q_current, Q_target)

        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()


# Initialize the Gym environment
env = gym.make('CartPole-v1')

# Parameters
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Adjust input_dim to be divisible by the number of heads in the transformer (which is 4)
input_dim = state_size + 2
if input_dim % 4 != 0:
    input_dim += (4 - input_dim % 4)  # Increase input_dim to the next multiple of 4

capacity = 1000
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995
batch_size = 64
num_episodes = 1000

# Initialize TER and DQN with the adjusted input_dim
ter = TransformerExperienceReplay(capacity, input_dim)
dqn_agent = DQN(state_size, action_size)


def select_action(state, epsilon):
    if np.random.rand() <= epsilon:
        return env.action_space.sample()  # Explore: select a random action
    else:
        q_values = dqn_agent.get_qvals(state)
        return np.argmax(q_values)  # Exploit: select the action with max value

for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0

    while True:
        action = select_action(state, epsilon)
        step_output = env.step(action)
        print("Step output:", step_output)  # Debugging print statement

        try:
            next_state, reward, done, _ , _= step_output
        except ValueError as e:
            print("Error unpacking step output:", e)
            print("Step output:", step_output)
            break

        # Adjust reward for terminal states
        reward_adj = reward if not done or total_reward == 499 else -100

        # Store experience in TER
        experience = (state, action, reward_adj, next_state, done)
        ter.push(experience)

        # Decrease epsilon
        epsilon = max(epsilon_min, epsilon_decay * epsilon)

        # Sample from TER for learning
        if len(ter) >= batch_size:
            experiences = ter.sample(batch_size, current_state=torch.tensor(state, dtype=torch.float32))
            dqn_agent.train_one_step(experiences)

        state = next_state
        total_reward += reward

        if done:
            total_reward = total_reward if total_reward == 500 else total_reward + 100
            print(f"Episode {episode}: Total Reward: {total_reward}")
            break

        # Update target network
        if episode % 10 == 0:
            dqn_agent.update_target()

env.close()


Number of parameters in the transformer: 2505216
Step output: (array([ 0.04639766, -0.2045838 ,  0.01733584,  0.32029274], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.04230599, -0.00971296,  0.0237417 ,  0.03312691], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.04211173,  0.18506062,  0.02440424, -0.25197172], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.04581294,  0.37982574,  0.0193648 , -0.53685826], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.05340946,  0.18443695,  0.00862764, -0.23813714], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.05709819,  0.3794346 ,  0.00386489, -0.52808625], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.06468689,  0.18425848, -0.00669683, -0.23418798], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.06837206,  0.37947547, -0.01138059, -0.5289757 ], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.07596157,  0.18451545, 

  if not isinstance(terminated, (bool, np.bool8)):


TypeError: expected Tensor as element 0 in argument 0, but got tuple

# Test v3

In [13]:
import gym
import torch
import numpy as np
import random

import torch.nn as nn
import torch.optim as optim
import copy
import gym



class TransformerExperienceReplay:
    def __init__(self, capacity, input_dim):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
        # Standard transformer configuration can be set here
        self.transformer = nn.Transformer(d_model=input_dim, nhead=4, 
                                          num_encoder_layers=2, num_decoder_layers=2)

    def push(self, experience):
        # 'experience' is expected to be a tuple: (state, action, reward, next_state, target)
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = experience
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size, current_state):
        if len(self.buffer) < batch_size:
            return []

        query = self._prepare_query(current_state)
        keys_values = self._prepare_keys_values()

        attention_output = self.transformer(query, keys_values, keys_values)
        attention_weights = torch.softmax(attention_output.squeeze(), dim=0)

        sampled_indices = torch.multinomial(attention_weights, batch_size, replacement=True)
        return [self.buffer[i] for i in sampled_indices]

    def _prepare_query(self, current_state):
        query = current_state.unsqueeze(0).unsqueeze(1)
        return query

    def _prepare_keys_values(self):
        processed_experiences = []
        for exp in self.buffer:
            if exp is not None:
                state, action, reward, next_state, done = exp
                
                # Check if state and next_state are in correct format
                if isinstance(state, np.ndarray) and isinstance(next_state, np.ndarray):
                    state_tensor = torch.from_numpy(state).float()
                    next_state_tensor = torch.from_numpy(next_state).float()
                else:
                    # Handle cases where state or next_state might not be a numpy array
                    print("Invalid state or next_state format:", state, next_state)
                    continue  # Skip this experience

                # Convert action, reward, and done to tensors
                action_tensor = torch.tensor([action], dtype=torch.float32)
                reward_tensor = torch.tensor([reward], dtype=torch.float32)
                done_tensor = torch.tensor([done], dtype=torch.float32)
                
                exp_tensor = torch.cat((state_tensor, action_tensor, reward_tensor, next_state_tensor, done_tensor))
                processed_experiences.append(exp_tensor)

        keys_values = torch.stack(processed_experiences)
        keys_values = keys_values.unsqueeze(1)
        return keys_values

    def __len__(self):
        return len(self.buffer)

    def count_parameters(self):
        return sum(p.numel() for p in self.transformer.parameters() if p.requires_grad)

# Example usage:
replay = TransformerExperienceReplay(capacity=1000, input_dim=128)
print("Number of parameters in the transformer:", replay.count_parameters())


class DQN:
    def __init__(self, state_size, action_size=4):
        l1 = state_size
        l2 = 24
        l3 = 24
        l4 = action_size

        self.model = nn.Sequential(
            nn.Linear(l1, l2),
            nn.ReLU(),
            nn.Linear(l2, l3),
            nn.ReLU(),
            nn.Linear(l3, l4)
        )

        self.model2 = copy.deepcopy(self.model)
        self.model2.load_state_dict(self.model.state_dict())
        self.loss_fn = nn.MSELoss()
        self.learning_rate = 0.001
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def update_target(self):
        self.model2.load_state_dict(self.model.state_dict())

    def get_qvals(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            qvals = self.model(state)
        return qvals.numpy()

    def get_maxQ(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            q_values = self.model2(state)
        return torch.max(q_values).item()

    def train_one_step(self, experiences):
        states, actions, rewards, next_states, dones = zip(*experiences)
        
        states = torch.tensor(states, dtype=torch.float32)
        next_states = torch.tensor(next_states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int64)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32)

        # Compute Q values for current states
        Q_current = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Compute Q values for next states using target network
        Q_next = self.model2(next_states).detach().max(1)[0]
        Q_target = rewards + gamma * Q_next * (1 - dones)  # gamma is the discount factor

        # Compute loss
        loss = self.loss_fn(Q_current, Q_target)

        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()


# Initialize the Gym environment
env = gym.make('CartPole-v1')

# Parameters
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Adjust input_dim to be divisible by the number of heads in the transformer (which is 4)
input_dim = state_size + 2
if input_dim % 4 != 0:
    input_dim += (4 - input_dim % 4)  # Increase input_dim to the next multiple of 4

capacity = 1000
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995
batch_size = 64
num_episodes = 1000

# Initialize TER and DQN with the adjusted input_dim
ter = TransformerExperienceReplay(capacity, input_dim)
dqn_agent = DQN(state_size, action_size)


def select_action(state, epsilon):
    if np.random.rand() <= epsilon:
        return env.action_space.sample()  # Explore: select a random action
    else:
        # Ensure state is a numpy array before passing to get_qvals
        state_array = np.array(state) if not isinstance(state, np.ndarray) else state
        q_values = dqn_agent.get_qvals(state_array)
        return np.argmax(q_values)  # Exploit: select the action with max value


for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0

    while True:
        action = select_action(state, epsilon)
        step_output = env.step(action)
        print("Step output:", step_output)  # Debugging print statement

        try:
            next_state, reward, done, _ , _= step_output
        except ValueError as e:
            print("Error unpacking step output:", e)
            print("Step output:", step_output)
            break

        # Adjust reward for terminal states
        reward_adj = reward if not done or total_reward == 499 else -100

        # Store experience in TER
        experience = (state, action, reward_adj, next_state, done)
        ter.push(experience)

        # Decrease epsilon
        epsilon = max(epsilon_min, epsilon_decay * epsilon)

        # Sample from TER for learning
        if len(ter) >= batch_size:
            experiences = ter.sample(batch_size, current_state=torch.tensor(state, dtype=torch.float32))
            dqn_agent.train_one_step(experiences)

        state = next_state
        total_reward += reward

        if done:
            total_reward = total_reward if total_reward == 500 else total_reward + 100
            print(f"Episode {episode}: Total Reward: {total_reward}")
            break

        # Update target network
        if episode % 10 == 0:
            dqn_agent.update_target()

env.close()


Number of parameters in the transformer: 2505216
Step output: (array([-0.05043406, -0.23217718, -0.00655086,  0.29743278], dtype=float32), 1.0, False, False, {})
Step output: (array([-0.0550776 , -0.42720515, -0.00060221,  0.5880425 ], dtype=float32), 1.0, False, False, {})
Step output: (array([-0.0636217 , -0.23207475,  0.01115865,  0.29516992], dtype=float32), 1.0, False, False, {})
Step output: (array([-0.0682632 , -0.03711365,  0.01706204,  0.00602704], dtype=float32), 1.0, False, False, {})
Step output: (array([-0.06900547, -0.23247609,  0.01718258,  0.30404404], dtype=float32), 1.0, False, False, {})
Step output: (array([-0.07365499, -0.42783865,  0.02326347,  0.6020961 ], dtype=float32), 1.0, False, False, {})
Step output: (array([-0.08221176, -0.62327814,  0.03530538,  0.9020148 ], dtype=float32), 1.0, False, False, {})
Step output: (array([-0.09467733, -0.8188602 ,  0.05334568,  1.2055826 ], dtype=float32), 1.0, False, False, {})
Step output: (array([-0.11105453, -0.62446666, 

RuntimeError: the feature number of src and tgt must be equal to d_model

# Test v4

In [18]:
import gym
import torch
import numpy as np
import random

import torch.nn as nn
import torch.optim as optim
import copy
import gym



class TransformerExperienceReplay:
    def __init__(self, capacity, input_dim):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
        # Standard transformer configuration can be set here
        self.transformer = nn.Transformer(d_model=input_dim, nhead=4, 
                                          num_encoder_layers=2, num_decoder_layers=2)

    def push(self, experience):
        # 'experience' is expected to be a tuple: (state, action, reward, next_state, target)
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = experience
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size, current_state):
        if len(self.buffer) < batch_size:
            return []

        query = self._prepare_query(current_state)
        keys_values = self._prepare_keys_values()

        attention_output = self.transformer(query, keys_values, keys_values)
        attention_weights = torch.softmax(attention_output.squeeze(), dim=0)

        sampled_indices = torch.multinomial(attention_weights, batch_size, replacement=True)
        return [self.buffer[i] for i in sampled_indices]

    def _prepare_query(self, current_state):
        # Convert state to tensor if it's a numpy array
        if isinstance(current_state, np.ndarray):
            current_state_tensor = torch.from_numpy(current_state).float()
        else:
            raise ValueError("Current state is not a numpy array")

        # Ensure current_state has the correct size
        if current_state_tensor.nelement() == 128:
            query = current_state_tensor.unsqueeze(0).unsqueeze(1)
        else:
            # Pad or truncate the current_state to have a size of 128
            query = torch.nn.functional.pad(current_state_tensor, (0, 128 - current_state_tensor.nelement()), "constant", 0)
            query = query.unsqueeze(0).unsqueeze(1)
        return query

    def _prepare_keys_values(self):
        processed_experiences = []
        for exp in self.buffer:
            if exp is not None:
                state, action, reward, next_state, done = exp
                
                # Verify that state and next_state are numpy arrays
                if not (isinstance(state, np.ndarray) and isinstance(next_state, np.ndarray)):
                    print("Invalid state or next_state format:", state, next_state)
                    continue

                state_tensor = torch.from_numpy(state).float()
                next_state_tensor = torch.from_numpy(next_state).float()

                # Convert action, reward, and done to tensors
                action_tensor = torch.tensor([action], dtype=torch.float32)
                reward_tensor = torch.tensor([reward], dtype=torch.float32)
                done_tensor = torch.tensor([done], dtype=torch.float32)
                
                # Concatenate tensors to create a single tensor for the experience
                exp_tensor = torch.cat((state_tensor, action_tensor, reward_tensor, next_state_tensor, done_tensor))
                
                # Ensure exp_tensor has the correct size
                if exp_tensor.nelement() == 128:
                    processed_experiences.append(exp_tensor)
                else:
                    # Pad or truncate to have a size of 128
                    padded_tensor = torch.nn.functional.pad(exp_tensor, (0, 128 - exp_tensor.nelement()), "constant", 0)
                    processed_experiences.append(padded_tensor)

        keys_values = torch.stack(processed_experiences)
        keys_values = keys_values.unsqueeze(1)
        return keys_values


    def __len__(self):
        return len(self.buffer)

    def count_parameters(self):
        return sum(p.numel() for p in self.transformer.parameters() if p.requires_grad)

# Example usage:
replay = TransformerExperienceReplay(capacity=1000, input_dim=128)
print("Number of parameters in the transformer:", replay.count_parameters())


class DQN:
    def __init__(self, state_size, action_size=4):
        l1 = state_size
        l2 = 24
        l3 = 24
        l4 = action_size

        self.model = nn.Sequential(
            nn.Linear(l1, l2),
            nn.ReLU(),
            nn.Linear(l2, l3),
            nn.ReLU(),
            nn.Linear(l3, l4)
        )

        self.model2 = copy.deepcopy(self.model)
        self.model2.load_state_dict(self.model.state_dict())
        self.loss_fn = nn.MSELoss()
        self.learning_rate = 0.001
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def update_target(self):
        self.model2.load_state_dict(self.model.state_dict())

    def get_qvals(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            qvals = self.model(state)
        return qvals.numpy()

    def get_maxQ(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            q_values = self.model2(state)
        return torch.max(q_values).item()

    def train_one_step(self, experiences):
        states, actions, rewards, next_states, dones = zip(*experiences)
        
        states = torch.tensor(states, dtype=torch.float32)
        next_states = torch.tensor(next_states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int64)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32)

        # Compute Q values for current states
        Q_current = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Compute Q values for next states using target network
        Q_next = self.model2(next_states).detach().max(1)[0]
        Q_target = rewards + gamma * Q_next * (1 - dones)  # gamma is the discount factor

        # Compute loss
        loss = self.loss_fn(Q_current, Q_target)

        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()


# Initialize the Gym environment
env = gym.make('CartPole-v1')

# Parameters
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Adjust input_dim to be divisible by the number of heads in the transformer (which is 4)
input_dim = state_size + 2
if input_dim % 4 != 0:
    input_dim += (4 - input_dim % 4)  # Increase input_dim to the next multiple of 4

capacity = 1000
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995
batch_size = 64
num_episodes = 1000

# Initialize TER and DQN with the adjusted input_dim
ter = TransformerExperienceReplay(capacity, input_dim)
dqn_agent = DQN(state_size, action_size)


def select_action(state, epsilon):
    if np.random.rand() <= epsilon:
        return env.action_space.sample()  # Explore: select a random action
    else:
        # Debugging: Check the state's shape and type
        print("State before conversion:", state, "Type:", type(state))
        
        # Convert state to a NumPy array if it is not already
        if not isinstance(state, np.ndarray):
            try:
                state_array = np.asarray(state, dtype=np.float32)
            except Exception as e:
                print("Error converting state to NumPy array:", e)
                print("State:", state)
                return env.action_space.sample()  # Fallback action
        else:
            state_array = state

        # Debugging: Check the state array's shape after conversion
        print("State array shape:", state_array.shape)

        q_values = dqn_agent.get_qvals(state_array)
        return np.argmax(q_values)  # Exploit: select the action with max value




for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0

    while True:
        action = select_action(state, epsilon)
        step_output = env.step(action)
        print("Step output:", step_output)  # Debugging print statement

        try:
            next_state, reward, done, _ , _= step_output
        except ValueError as e:
            print("Error unpacking step output:", e)
            print("Step output:", step_output)
            break

        # Adjust reward for terminal states
        reward_adj = reward if not done or total_reward == 499 else -100

        # Store experience in TER
        experience = (state, action, reward_adj, next_state, done)
        ter.push(experience)

        # Decrease epsilon
        epsilon = max(epsilon_min, epsilon_decay * epsilon)

        # Sample from TER for learning
        if len(ter) >= batch_size:
            experiences = ter.sample(batch_size, current_state=torch.tensor(state, dtype=torch.float32))
            dqn_agent.train_one_step(experiences)

        state = next_state
        total_reward += reward

        if done:
            total_reward = total_reward if total_reward == 500 else total_reward + 100
            print(f"Episode {episode}: Total Reward: {total_reward}")
            break

        # Update target network
        if episode % 10 == 0:
            dqn_agent.update_target()

env.close()


Number of parameters in the transformer: 2505216
Step output: (array([ 0.03040762,  0.1812452 , -0.02534003, -0.25116494], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.03403252,  0.37671965, -0.03036333, -0.5517317 ], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.04156691,  0.18203701, -0.04139796, -0.26876774], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.04520765,  0.37772453, -0.04677331, -0.5742151 ], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.05276214,  0.57346994, -0.05825762, -0.8812584 ], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.06423154,  0.3791857 , -0.07588279, -0.6074445 ], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.07181525,  0.575282  , -0.08803167, -0.9230306 ], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.08332089,  0.3814526 , -0.10649229, -0.6592602 ], dtype=float32), 1.0, False, False, {})
State before conversion: [ 0.08332089  0.381452

ValueError: Current state is not a numpy array

In [38]:
import gym
import torch
import numpy as np
import random

import torch.nn as nn
import torch.optim as optim
import copy



class TransformerExperienceReplay:
    def __init__(self, capacity, input_dim):
        self.capacity = capacity
        self.buffer = [None] * capacity
        self.position = 0
        # Standard transformer configuration can be set here
        self.transformer = nn.Transformer(d_model=input_dim, nhead=4, 
                                          num_encoder_layers=2, num_decoder_layers=2)

    def push(self, experience):
        state, action, reward, next_state, done = experience
        state_tensor = state.clone().detach()
        next_state_tensor = next_state.clone().detach()

        action_tensor = torch.tensor([action], dtype=torch.float32)
        reward_tensor = torch.tensor([reward], dtype=torch.float32)
        done_tensor = torch.tensor([done], dtype=torch.float32)
        self.buffer[self.position] = (state_tensor, action_tensor, reward_tensor, next_state_tensor, done_tensor)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size, current_state):
        if len(self.buffer) < batch_size:
            return []

        query = self._prepare_query(current_state)
        keys_values = self._prepare_keys_values()

        attention_output = self.transformer(query, keys_values, keys_values)
        attention_weights = torch.softmax(attention_output.squeeze(), dim=0)

        sampled_indices = torch.multinomial(attention_weights, batch_size, replacement=True)
        return [self.buffer[i] for i in sampled_indices]

    def _prepare_query(self, current_state):
        # Pad or truncate to match the size of d_model
        current_state_padded = torch.nn.functional.pad(current_state, (0, self.transformer.d_model - current_state.nelement()), "constant", 0)
        query = current_state_padded.view(1, 1, self.transformer.d_model)  # Reshape to [1, 1, d_model]
        return query

    def _prepare_keys_values(self):
        processed_experiences = []
        for exp in self.buffer:
            if exp is not None:
                state, action, reward, next_state, done = exp

                # Use the tensors directly as they are already in the correct format
                state_tensor = state
                next_state_tensor = next_state

                # Convert action, reward, and done to tensors
                action_tensor = torch.tensor([action], dtype=torch.float32)
                reward_tensor = torch.tensor([reward], dtype=torch.float32)
                done_tensor = torch.tensor([done], dtype=torch.float32)

                # Concatenate tensors to create a single tensor for the experience
                exp_tensor = torch.cat((state_tensor, action_tensor, reward_tensor, next_state_tensor, done_tensor))

                # Ensure exp_tensor has the correct size
                if exp_tensor.nelement() < self.transformer.d_model:
                    # Pad to match the size of d_model
                    exp_tensor_padded = torch.nn.functional.pad(exp_tensor, (0, self.transformer.d_model - exp_tensor.nelement()), "constant", 0)
                else:
                    # Truncate to match the size of d_model
                    exp_tensor_padded = exp_tensor[:self.transformer.d_model]

                processed_experiences.append(exp_tensor_padded.view(1, 1, self.transformer.d_model))  # Reshape to [1, 1, d_model]

        keys_values = torch.cat(processed_experiences, dim=0)  # Concatenate along the batch dimension
        return keys_values

    def __len__(self):
        return len(self.buffer)

    def count_parameters(self):
        return sum(p.numel() for p in self.transformer.parameters() if p.requires_grad)

# Example usage:
replay = TransformerExperienceReplay(capacity=1000, input_dim=128)
print("Number of parameters in the transformer:", replay.count_parameters())


class DQN:
    def __init__(self, state_size, action_size=4):
        l1 = state_size
        l2 = 24
        l3 = 24
        l4 = action_size

        self.model = nn.Sequential(
            nn.Linear(l1, l2),
            nn.ReLU(),
            nn.Linear(l2, l3),
            nn.ReLU(),
            nn.Linear(l3, l4)
        )

        self.model2 = copy.deepcopy(self.model)
        self.model2.load_state_dict(self.model.state_dict())
        self.loss_fn = nn.MSELoss()
        self.learning_rate = 0.001
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def update_target(self):
        self.model2.load_state_dict(self.model.state_dict())

    def get_qvals(self, state):
        # Assuming 'state' is already a PyTorch tensor
        with torch.no_grad():
            qvals = self.model(state)
        return qvals.numpy() 

    def get_maxQ(self, state):
        with torch.no_grad():
            q_values = self.model2(state)
        return torch.max(q_values).item()

    def train_one_step(self, experiences):
        states, actions, rewards, next_states, dones = zip(*experiences)
        
        states = torch.tensor(states, dtype=torch.float32)
        next_states = torch.tensor(next_states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int64)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32)

        # Compute Q values for current states
        Q_current = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Compute Q values for next states using target network
        Q_next = self.model2(next_states).detach().max(1)[0]
        Q_target = rewards + gamma * Q_next * (1 - dones)  # gamma is the discount factor

        # Compute loss
        loss = self.loss_fn(Q_current, Q_target)

        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()


# Initialize the Gym environment
env = gym.make('CartPole-v1')

# Parameters
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Adjust input_dim to be divisible by the number of heads in the transformer (which is 4)
input_dim = state_size + 2
if input_dim % 4 != 0:
    input_dim += (4 - input_dim % 4)  # Increase input_dim to the next multiple of 4

capacity = 1000
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995
batch_size = 64
num_episodes = 1000

# Initialize TER and DQN with the adjusted input_dim
ter = TransformerExperienceReplay(capacity, input_dim)
dqn_agent = DQN(state_size, action_size)


def select_action(state, epsilon):
    # Assuming state is already a tensor
    q_values = dqn_agent.get_qvals(state.unsqueeze(0))  # Add batch dimension
    return np.argmax(q_values) if np.random.rand() > epsilon else env.action_space.sample()


for episode in range(num_episodes):
    initial_state_info = env.reset()
    state = initial_state_info[0] if isinstance(initial_state_info, tuple) else initial_state_info
    state = torch.from_numpy(state).float()  # Convert initial state to tensor
    total_reward = 0

    while True:
        action = select_action(state, epsilon)
        next_state, reward, done, _, _ = env.step(action)
        next_state = torch.from_numpy(next_state).float()  # Convert next state to tensor

        # Adjust reward for terminal states and convert to tensor
        reward_adj = torch.tensor([reward if not done or total_reward == 499 else -100], dtype=torch.float32)

        ter.push((state, action, reward_adj, next_state, torch.tensor([done], dtype=torch.float32)))
        state = next_state  # Update state to tensor

        # Decrease epsilon
        epsilon = max(epsilon_min, epsilon_decay * epsilon)

        # Sample from TER for learning
        if len(ter) >= batch_size:
            experiences = ter.sample(batch_size, current_state=state)
            dqn_agent.train_one_step(experiences)

        total_reward += reward

        if done:
            total_reward = total_reward if total_reward == 500 else total_reward + 100
            print(f"Episode {episode}: Total Reward: {total_reward}")
            break

        # Update target network
        if episode % 10 == 0:
            dqn_agent.update_target()


env.close()


Number of parameters in the transformer: 2505216


RuntimeError: The shape of the 3D attn_mask is torch.Size([1, 1, 8]), but should be (4, 1, 1).

In [42]:
import gym
import torch
import numpy as np
import random

import torch.nn as nn
import torch.optim as optim
import copy



class TransformerExperienceReplay:
    def __init__(self, capacity, input_dim):
        self.capacity = capacity
        self.buffer = [None] * capacity
        self.position = 0
        self.input_dim = input_dim
        self.transformer = nn.Transformer(d_model=input_dim, nhead=4, 
                                          num_encoder_layers=2, num_decoder_layers=2)


    def push(self, experience):
        state, action, reward, next_state, done = experience
        state_tensor = state.clone().detach()
        next_state_tensor = next_state.clone().detach()

        action_tensor = torch.tensor([action], dtype=torch.float32)
        reward_tensor = torch.tensor([reward], dtype=torch.float32)
        done_tensor = torch.tensor([done], dtype=torch.float32)
        self.buffer[self.position] = (state_tensor, action_tensor, reward_tensor, next_state_tensor, done_tensor)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size, current_state):
        if len(self.buffer) < batch_size:
            return []

        query = self._prepare_query(current_state)
        keys_values = self._prepare_keys_values()

        attention_output = self.transformer(query, keys_values, keys_values)
        attention_weights = torch.softmax(attention_output.squeeze(), dim=0)

        sampled_indices = torch.multinomial(attention_weights, batch_size, replacement=True)
        return [self.buffer[i] for i in sampled_indices]

    def _prepare_query(self, current_state, batch_size=1, seq_length=1):
        current_state_flat = current_state.view(-1)
        # Pad or truncate to match the size of d_model (input_dim)
        if current_state_flat.nelement() < self.input_dim:
            current_state_padded = torch.nn.functional.pad(current_state_flat, (0, self.input_dim - current_state_flat.nelement()), "constant", 0)
        else:
            current_state_padded = current_state_flat[:self.input_dim]

        # Reshape to [batch_size, seq_length, feature_dim]
        query = current_state_padded.view(batch_size, seq_length, -1)
        return query

    def _prepare_keys_values(self, batch_size=1, seq_length=1):
        processed_experiences = []
        for exp in self.buffer:
            if exp is not None:
                state, action, reward, next_state, done = exp
                exp_tensor = torch.cat([s.view(-1) for s in [state, next_state, action, reward, done]])
                # Pad or truncate to match the size of d_model (input_dim)
                if exp_tensor.nelement() < self.input_dim:
                    exp_tensor_padded = torch.nn.functional.pad(exp_tensor, (0, self.input_dim - exp_tensor.nelement()), "constant", 0)
                else:
                    exp_tensor_padded = exp_tensor[:self.input_dim]

                processed_experiences.append(exp_tensor_padded.view(batch_size, seq_length, -1))

        if not processed_experiences:
            return torch.zeros((batch_size, seq_length, self.input_dim))

        keys_values = torch.cat(processed_experiences, dim=0)
        return keys_values


    def __len__(self):
        return len(self.buffer)

    def count_parameters(self):
        return sum(p.numel() for p in self.transformer.parameters() if p.requires_grad)

# Example usage:
replay = TransformerExperienceReplay(capacity=1000, input_dim=128)
print("Number of parameters in the transformer:", replay.count_parameters())


class DQN:
    def __init__(self, state_size, action_size=4):
        l1 = state_size
        l2 = 24
        l3 = 24
        l4 = action_size

        self.model = nn.Sequential(
            nn.Linear(l1, l2),
            nn.ReLU(),
            nn.Linear(l2, l3),
            nn.ReLU(),
            nn.Linear(l3, l4)
        )

        self.model2 = copy.deepcopy(self.model)
        self.model2.load_state_dict(self.model.state_dict())
        self.loss_fn = nn.MSELoss()
        self.learning_rate = 0.001
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def update_target(self):
        self.model2.load_state_dict(self.model.state_dict())

    def get_qvals(self, state):
        # Assuming 'state' is already a PyTorch tensor
        with torch.no_grad():
            qvals = self.model(state)
        return qvals.numpy() 

    def get_maxQ(self, state):
        with torch.no_grad():
            q_values = self.model2(state)
        return torch.max(q_values).item()

    def train_one_step(self, experiences):
        states, actions, rewards, next_states, dones = zip(*experiences)
        
        states = torch.tensor(states, dtype=torch.float32)
        next_states = torch.tensor(next_states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int64)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32)

        # Compute Q values for current states
        Q_current = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Compute Q values for next states using target network
        Q_next = self.model2(next_states).detach().max(1)[0]
        Q_target = rewards + gamma * Q_next * (1 - dones)  # gamma is the discount factor

        # Compute loss
        loss = self.loss_fn(Q_current, Q_target)

        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()


# Initialize the Gym environment
env = gym.make('CartPole-v1')

# Parameters
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Adjust input_dim to be divisible by the number of heads in the transformer (which is 4)
input_dim = state_size + 2
if input_dim % 4 != 0:
    input_dim += (4 - input_dim % 4)  # Increase input_dim to the next multiple of 4

capacity = 1000
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995
batch_size = 64
num_episodes = 1000

# Initialize TER and DQN with the adjusted input_dim
ter = TransformerExperienceReplay(capacity, input_dim)
dqn_agent = DQN(state_size, action_size)


def select_action(state, epsilon):
    # Assuming state is already a tensor
    q_values = dqn_agent.get_qvals(state.unsqueeze(0))  # Add batch dimension
    return np.argmax(q_values) if np.random.rand() > epsilon else env.action_space.sample()


for episode in range(num_episodes):
    initial_state_info = env.reset()
    state = initial_state_info[0] if isinstance(initial_state_info, tuple) else initial_state_info
    state = torch.from_numpy(state).float()  # Convert initial state to tensor
    total_reward = 0

    while True:
        action = select_action(state, epsilon)
        next_state, reward, done, _, _ = env.step(action)
        next_state = torch.from_numpy(next_state).float()  # Convert next state to tensor

        # Adjust reward for terminal states and convert to tensor
        reward_adj = torch.tensor([reward if not done or total_reward == 499 else -100], dtype=torch.float32)

        ter.push((state, action, reward_adj, next_state, torch.tensor([done], dtype=torch.float32)))
        state = next_state  # Update state to tensor

        # Decrease epsilon
        epsilon = max(epsilon_min, epsilon_decay * epsilon)

        # Sample from TER for learning
        if len(ter) >= batch_size:
            experiences = ter.sample(batch_size, current_state=state)
            dqn_agent.train_one_step(experiences)

        total_reward += reward

        if done:
            total_reward = total_reward if total_reward == 500 else total_reward + 100
            print(f"Episode {episode}: Total Reward: {total_reward}")
            break

        # Update target network
        if episode % 10 == 0:
            dqn_agent.update_target()


env.close()


Number of parameters in the transformer: 2505216


RuntimeError: The shape of the 3D attn_mask is torch.Size([1, 1, 8]), but should be (4, 1, 1).

In [43]:
import gym
import torch
import numpy as np
import random

import torch.nn as nn
import torch.optim as optim
import copy



class TransformerExperienceReplay:
    def __init__(self, capacity, input_dim):
        self.capacity = capacity
        self.buffer = [None] * capacity
        self.position = 0
        self.input_dim = input_dim
        self.transformer = nn.Transformer(d_model=input_dim, nhead=4, 
                                          num_encoder_layers=2, num_decoder_layers=2)


    def push(self, experience):
        state, action, reward, next_state, done = experience
        state_tensor = state.clone().detach()
        next_state_tensor = next_state.clone().detach()

        action_tensor = torch.tensor([action], dtype=torch.float32)
        reward_tensor = torch.tensor([reward], dtype=torch.float32)
        done_tensor = torch.tensor([done], dtype=torch.float32)
        self.buffer[self.position] = (state_tensor, action_tensor, reward_tensor, next_state_tensor, done_tensor)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size, current_state):
        if len(self.buffer) < batch_size:
            return []

        query = self._prepare_query(current_state)
        keys_values = self._prepare_keys_values()

        attention_output = self.transformer(query, keys_values, keys_values)
        attention_weights = torch.softmax(attention_output.squeeze(), dim=0)

        sampled_indices = torch.multinomial(attention_weights, batch_size, replacement=True)
        return [self.buffer[i] for i in sampled_indices]

    def _prepare_query(self, current_state, batch_size=1, seq_length=1):
        # Flatten current_state and adjust its length to match d_model
        current_state_flat = current_state.view(-1)
        if current_state_flat.nelement() < self.input_dim:
            current_state_padded = torch.nn.functional.pad(current_state_flat, (0, self.input_dim - current_state_flat.nelement()), "constant", 0)
        else:
            current_state_padded = current_state_flat[:self.input_dim]

        # Reshape to [batch_size, seq_length, feature_dim]
        query = current_state_padded.view(batch_size, seq_length, self.input_dim)
        return query


    def _prepare_keys_values(self, batch_size=1, seq_length=1):
        processed_experiences = []
        for exp in self.buffer:
            if exp is not None:
                state, action, reward, next_state, done = exp
                exp_tensor = torch.cat([s.view(-1) for s in [state, next_state, action, reward, done]])
                if exp_tensor.nelement() < self.input_dim:
                    exp_tensor_padded = torch.nn.functional.pad(exp_tensor, (0, self.input_dim - exp_tensor.nelement()), "constant", 0)
                else:
                    exp_tensor_padded = exp_tensor[:self.input_dim]

                exp_tensor_padded = exp_tensor_padded.view(batch_size, seq_length, self.input_dim)
                processed_experiences.append(exp_tensor_padded)

        if not processed_experiences:
            return torch.zeros((batch_size, seq_length, self.input_dim))

        keys_values = torch.cat(processed_experiences, dim=0)
        return keys_values



    def __len__(self):
        return len(self.buffer)

    def count_parameters(self):
        return sum(p.numel() for p in self.transformer.parameters() if p.requires_grad)

# Example usage:
replay = TransformerExperienceReplay(capacity=1000, input_dim=128)
print("Number of parameters in the transformer:", replay.count_parameters())


class DQN:
    def __init__(self, state_size, action_size=4):
        l1 = state_size
        l2 = 24
        l3 = 24
        l4 = action_size

        self.model = nn.Sequential(
            nn.Linear(l1, l2),
            nn.ReLU(),
            nn.Linear(l2, l3),
            nn.ReLU(),
            nn.Linear(l3, l4)
        )

        self.model2 = copy.deepcopy(self.model)
        self.model2.load_state_dict(self.model.state_dict())
        self.loss_fn = nn.MSELoss()
        self.learning_rate = 0.001
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def update_target(self):
        self.model2.load_state_dict(self.model.state_dict())

    def get_qvals(self, state):
        # Assuming 'state' is already a PyTorch tensor
        with torch.no_grad():
            qvals = self.model(state)
        return qvals.numpy() 

    def get_maxQ(self, state):
        with torch.no_grad():
            q_values = self.model2(state)
        return torch.max(q_values).item()

    def train_one_step(self, experiences):
        states, actions, rewards, next_states, dones = zip(*experiences)
        
        states = torch.tensor(states, dtype=torch.float32)
        next_states = torch.tensor(next_states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int64)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32)

        # Compute Q values for current states
        Q_current = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Compute Q values for next states using target network
        Q_next = self.model2(next_states).detach().max(1)[0]
        Q_target = rewards + gamma * Q_next * (1 - dones)  # gamma is the discount factor

        # Compute loss
        loss = self.loss_fn(Q_current, Q_target)

        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()


# Initialize the Gym environment
env = gym.make('CartPole-v1')

# Parameters
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Adjust input_dim to be divisible by the number of heads in the transformer (which is 4)
input_dim = state_size + 2
if input_dim % 4 != 0:
    input_dim += (4 - input_dim % 4)  # Increase input_dim to the next multiple of 4

capacity = 1000
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995
batch_size = 64
num_episodes = 1000

# Initialize TER and DQN with the adjusted input_dim
ter = TransformerExperienceReplay(capacity, input_dim)
dqn_agent = DQN(state_size, action_size)


def select_action(state, epsilon):
    # Assuming state is already a tensor
    q_values = dqn_agent.get_qvals(state.unsqueeze(0))  # Add batch dimension
    return np.argmax(q_values) if np.random.rand() > epsilon else env.action_space.sample()


for episode in range(num_episodes):
    initial_state_info = env.reset()
    state = initial_state_info[0] if isinstance(initial_state_info, tuple) else initial_state_info
    state = torch.from_numpy(state).float()  # Convert initial state to tensor
    total_reward = 0

    while True:
        action = select_action(state, epsilon)
        next_state, reward, done, _, _ = env.step(action)
        next_state = torch.from_numpy(next_state).float()  # Convert next state to tensor

        # Adjust reward for terminal states and convert to tensor
        reward_adj = torch.tensor([reward if not done or total_reward == 499 else -100], dtype=torch.float32)

        ter.push((state, action, reward_adj, next_state, torch.tensor([done], dtype=torch.float32)))
        state = next_state  # Update state to tensor

        # Decrease epsilon
        epsilon = max(epsilon_min, epsilon_decay * epsilon)

        # Sample from TER for learning
        if len(ter) >= batch_size:
            experiences = ter.sample(batch_size, current_state=state)
            dqn_agent.train_one_step(experiences)

        total_reward += reward

        if done:
            total_reward = total_reward if total_reward == 500 else total_reward + 100
            print(f"Episode {episode}: Total Reward: {total_reward}")
            break

        # Update target network
        if episode % 10 == 0:
            dqn_agent.update_target()


env.close()


Number of parameters in the transformer: 2505216


RuntimeError: the feature number of src and tgt must be equal to d_model

# TEST v5

In [29]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import copy

class TransformerExperienceReplay:
    def __init__(self, capacity, input_dim):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
        self.transformer = nn.Transformer(d_model=input_dim, nhead=4, 
                                          num_encoder_layers=2, num_decoder_layers=2)

    def push(self, experience):
        state, action, reward, next_state, done = experience
        # Convert state and next_state to numpy arrays if they are not already
        state_array = np.array(state, dtype=np.float32) if not isinstance(state, np.ndarray) else state
        next_state_array = np.array(next_state, dtype=np.float32) if not isinstance(next_state, np.ndarray) else next_state
        
        # Ensure buffer size does not exceed capacity
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)

        self.buffer[self.position] = (state_array, action, reward, next_state_array, done)
        self.position = (self.position + 1) % self.capacity


    def sample(self, batch_size, current_state):
        if len(self.buffer) < batch_size:
            return []

        query = self._prepare_query(current_state)
        keys_values = self._prepare_keys_values()

        attention_output = self.transformer(query, keys_values, keys_values)
        attention_weights = torch.softmax(attention_output.squeeze(), dim=0)

        sampled_indices = torch.multinomial(attention_weights, batch_size, replacement=True)
        return [self.buffer[i] for i in sampled_indices]

    def _prepare_query(self, current_state):
        current_state_tensor = torch.from_numpy(current_state).float()
        query = current_state_tensor.unsqueeze(0).unsqueeze(1)
        return query



    def _prepare_keys_values(self):
        processed_experiences = []
        for exp in self.buffer:
            if exp is not None:
                state, action, reward, next_state, done = exp
                if not (isinstance(state, np.ndarray) and isinstance(next_state, np.ndarray)):
                    continue

                state_tensor = torch.from_numpy(state).float()
                next_state_tensor = torch.from_numpy(next_state).float()
                action_tensor = torch.tensor([action], dtype=torch.float32).unsqueeze(0)
                reward_tensor = torch.tensor([reward], dtype=torch.float32).unsqueeze(0)
                done_tensor = torch.tensor([done], dtype=torch.float32).unsqueeze(0)

                exp_tensor = torch.cat((state_tensor, action_tensor, reward_tensor, next_state_tensor, done_tensor), dim=0)
                processed_experiences.append(exp_tensor)

        keys_values = torch.stack(processed_experiences)
        keys_values = keys_values.unsqueeze(1)
        return keys_values

    def __len__(self):
        return len(self.buffer)

    def count_parameters(self):
        return sum(p.numel() for p in self.transformer.parameters() if p.requires_grad)

class DQN:
    def __init__(self, state_size, action_size=4):
        l1, l2, l3, l4 = state_size, 24, 24, action_size
        self.model = nn.Sequential(
            nn.Linear(l1, l2), nn.ReLU(), nn.Linear(l2, l3), nn.ReLU(), nn.Linear(l3, l4))
        self.model2 = copy.deepcopy(self.model)
        self.model2.load_state_dict(self.model.state_dict())
        self.loss_fn = nn.MSELoss()
        self.learning_rate = 0.001
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def update_target(self):
        self.model2.load_state_dict(self.model.state_dict())

    def get_qvals(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            return self.model(state).numpy()

    def get_maxQ(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            return torch.max(self.model2(state)).item()

    def train_one_step(self, experiences):
        states, actions, rewards, next_states, dones = zip(*experiences)
        states = torch.tensor(states, dtype=torch.float32)
        next_states = torch.tensor(next_states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int64)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32)

        Q_current = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        Q_next = self.model2(next_states).detach().max(1)[0]
        Q_target = rewards + gamma * Q_next * (1 - dones)

        loss = self.loss_fn(Q_current, Q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

def select_action(state_info, epsilon):
    # Assuming state_info is a tuple where the first element is the actual state
    state = state_info[0] if isinstance(state_info, tuple) else state_info

    # Convert state to a NumPy array if it is not already
    try:
        state_array = np.asarray(state, dtype=np.float32)
    except ValueError as e:
        print("Error converting state to NumPy array:", e)
        print("State:", state)
        return env.action_space.sample()  # Fallback action

    q_values = dqn_agent.get_qvals(state_array)
    return np.argmax(q_values) if np.random.rand() > epsilon else env.action_space.sample()


env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
input_dim = state_size + 2
if input_dim % 4 != 0:
    input_dim += (4 - input_dim % 4)

capacity = 1000
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
batch_size = 64
num_episodes = 1000

ter = TransformerExperienceReplay(capacity, input_dim)
dqn_agent = DQN(state_size, action_size)

for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0

    while True:
        action = select_action(state, epsilon)
        next_state_info, reward, done, _, _ = env.step(action)
        print("Step output:", step_output)
        # Extract the actual next state from next_state_info
        next_state = next_state_info[0] if isinstance(next_state_info, tuple) else next_state_info
        reward_adj = reward if not done or total_reward == 499 else -100

        ter.push((state[0], action, reward_adj, next_state, done))  # Ensure you're passing the actual state
        state = next_state_info  # Update the state with the new state info

        if len(ter) >= batch_size:
            experiences = ter.sample(batch_size, current_state=state)
            dqn_agent.train_one_step(experiences)

        total_reward += reward
        if done:
            break

        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        if episode % 10 == 0:
            dqn_agent.update_target()

    print(f"Episode {episode}: Total Reward: {total_reward}")

env.close()


Step output: (array([ 0.04411922, -0.02007892,  0.10464764,  0.5665465 ], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.04411922, -0.02007892,  0.10464764,  0.5665465 ], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.04411922, -0.02007892,  0.10464764,  0.5665465 ], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.04411922, -0.02007892,  0.10464764,  0.5665465 ], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.04411922, -0.02007892,  0.10464764,  0.5665465 ], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.04411922, -0.02007892,  0.10464764,  0.5665465 ], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.04411922, -0.02007892,  0.10464764,  0.5665465 ], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.04411922, -0.02007892,  0.10464764,  0.5665465 ], dtype=float32), 1.0, False, False, {})
Step output: (array([ 0.04411922, -0.02007892,  0.10464764,  0.5665465 ], dtype=float32), 1.0, F

RuntimeError: Tensors must have same number of dimensions: got 1 and 2