In [30]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader,TensorDataset

import math
import numpy as np
#import seaborn as sns
import matplotlib.pylab as plt

#Especifico para el gym+dataset "D4RL_Pybullet"
import gym
import d4rl_pybullet

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Config

In [31]:
model_cfg = {
    "state_dim": 4*30,
    "act_dim": 30 , # act_dim=Contextlength?
    "ffn_dim": 12,  #FeedForwardNetwork Dimension
    "embed_dim": 128,
    "num_heads": 16,
    "num_blocks": 1,
    "max_timesteps": 4096,
    "mlp_ratio": 4,
    "dropout": 0.1,
    "vocab_size": 4,
    "rtg_dim": 1

}

## Masked Attention

In [32]:
class MaskedSelfAttention(nn.Module):

    def __init__(self, embed_dim, num_heads, seq_len, dropout):
        super().__init__()

        self.embed_dim = embed_dim # embeding dimensionality, includes all heads
        self.num_heads = num_heads #  num heads
        assert self.embed_dim % self.num_heads == 0 , \
            "Embedding dimension must be multiple of the number of heads."

        self.seq_len = seq_len

        # key, query, value projections
        self.proj_q = nn.Linear(embed_dim, embed_dim)
        self.proj_k = nn.Linear(embed_dim, embed_dim)
        self.proj_v = nn.Linear(embed_dim, embed_dim)

        # output projection
        self.proj_out = nn.Linear(self.embed_dim, self.embed_dim)
        # regularization
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)

    def forward(self, x):
        print("T FORWARD (tocho)")
        B, T, C = x.shape # batch size, sequence length, embedding dimensionality (embed_dim)
        #head_size = self.num_heads, C // self.num_heads

        # calculate query, key, values
        q = self.proj_q(x).view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2) # (B, seqLen, numHeads, headSize) -> (B, numHeads, seqLen, headSize)
        k = self.proj_k(x).view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2) # (B, seqLen, numHeads, headSize) -> (B, numHeads, seqLen, headSize)
        v = self.proj_v(x).view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2) # (B, seqLen, numHeads, headSize) -> (B, numHeads, seqLen, headSize)

        # causal self-attention; Self-attend: (B, numHeads, seqLen, headSize) x (B, numHeads, headSize, seqLen) -> (B, numHeads, seqLen, seqLen)
        # scaled_dot_product
        attn_logits = (q @ k.transpose(-2, -1))
        attn_logits = attn_logits / torch.sqrt(torch.tensor(k.size(-1)))
        # apply mask

        mask = torch.zeros(x.shape[1], x.shape[0]).bool() #toDevice
        subsequent_mask = torch.triu(torch.ones(B, T, T), 1).bool() #toDevice
        selfattn_mask = subsequent_mask + mask.unsqueeze(-2)
        attn_logits = attn_logits.masked_fill(selfattn_mask, float('-inf'))

        softmax = nn.Softmax(dim=-1)
        attention = softmax(attn_logits)

        attention = attention @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        out = self.attn_dropout(attention)

        out = out.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.proj_out(out))
        return y

## MLP

In [33]:
class MLP(nn.Module):

    def __init__(self, embed_dim, ffn_dim, dropout):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, ffn_dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(ffn_dim, embed_dim)
        self.drop= nn.Dropout(dropout)

    def forward(self, x):
        print("MLP FORWARD")
        x = self.act(self.fc1(x))
        x = self.drop(self.fc2(x))
        return x

## Decoder block

In [34]:
class DecoderBlock(nn.Module):

    def __init__(self, embed_dim, num_heads, seq_len, mlp_ratio, dropout):
        super().__init__()

        self.attn = MaskedSelfAttention(embed_dim, num_heads, seq_len, dropout)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.mlp = MLP(embed_dim, int(embed_dim * mlp_ratio),dropout)
        self.ln_2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        print("DECODER BLOCK FORWARD")
        x = self.ln1(x) # normalize
        x = self.attn(x) + x # add residual
        x = self.ln2(x)
        x = self.mlp(x) + x

        return x


## Decision Transformer

In [35]:
class DecisionTransformer(nn.Module):
    def __init__(self, state_dim, act_dim, ffn_dim, embed_dim, num_heads, num_blocks, max_timesteps, mlp_ratio, dropout, vocab_size, rtg_dim=1):
        super().__init__()

        self.ffn_dim = ffn_dim   #Nº de Layers "nn.Linear" ~~ "ffn_dim"
        self.seq_len = act_dim   # Omar/Shuang-> Provisional, revisar esta linea.
        # Construct embedding layer
        self.state_embed = nn.Linear(in_features=state_dim, out_features=ffn_dim)
        self.act_embed = nn.Linear(in_features=act_dim, out_features=ffn_dim)
        self.rtg_embed = nn.Linear(in_features=rtg_dim, out_features=ffn_dim)
        self.pos_embed = nn.Embedding(num_embeddings=max_timesteps, embedding_dim=ffn_dim)

        self.norm = nn.LayerNorm(ffn_dim)

        #TODO: Complete Basic Transformer parameters
        self.transformerGPT = nn.ModuleList([DecoderBlock(embed_dim, num_heads, self.seq_len, mlp_ratio, dropout) for _ in range(num_blocks)])

        self.rtg_pred = nn.Linear(in_features=ffn_dim, out_features=1)
        self.state_pred = nn.Linear(in_features=ffn_dim, out_features=state_dim)
        self.act_pred = nn.Sequential(
            nn.Linear(ffn_dim, act_dim),
            nn.Tanh()
        )

    def forward(self, timestep, max_timesteps, states, actions, returns_to_go):
        print("DT FORWARD")
        print(timestep)
        print(max_timesteps)
        print(states)
        print(actions)
        print(returns_to_go)
        B, T, _ = states.shape # [batch size, seq length, embed_dim]

        pos_embedding = self.pos_embed(max_timesteps, timestep)
        print("DEBUG 1")
        state_embedding = self.state_embed(states)
        act_embedding = self.act_embed(actions)
        rtg_embedding = self.rtg_embed(returns_to_go)
        print("DEBUG 2")
        state_embedding += pos_embedding
        act_embedding += pos_embedding
        returns_to_go += pos_embedding
        print("DEBUG 3")
        # (R{1}, S{1}, A{1}, ..., R{i}, S{i}, A{i}, ..., R{n}, S{n}, A{n}) | 1 < i < n
        stacked_inputs = torch.stack((rtg_embedding, state_embedding, act_embedding), dim=1) # [B, rtg_dim, state_dim, act_dim]
        stacked_inputs = stacked_inputs.permute(0, 2, 1, 3) #[B, state_dim, rtg_dim, act_dim]
        stacked_inputs = stacked_inputs.reshape(B, 3*T, self.ffn_dim) # [B, 3*T, hidden_size]  Nota: ffn_dim a.k.a "hidden_size"
        print("DEBUG 4")
        x = self.norm(stacked_inputs)
        if torch.is_tensor(x):
            print("SHAP INPUT TRANSFORMER TOCHO: ", x.shape)
        else:
            print("LEN INPUT T: ", len(x))
        print(x)
        #TODO: Complete Basic Transformer
        out = self.transformerGPT(x)
        out = out.reshape(B, T, 3, self.ffn_dim).permute(0, 2, 1, 3)  #[B, T, 3, hidden_size] --> [B, 3, T, hidden_size]     Nota: ffn_dim a.k.a "hidden_size"

        returns_to_go_preds = self.rtg_pred(out[:,2])     # predict next return given state and action [0 state, 1 action, 2 rtg]
        state_preds = self.state_pred(out[:,2])           # predict next state given state and action  [0, 1, 2 rtg]
        act_preds = self.act_pred(out[:,1])               # predict next action given state            [0, 1, 2]

        return returns_to_go, state_preds, act_preds

In [36]:
#Creamos instancia decision transformer con nuestra config
model_dt = DecisionTransformer(**model_cfg)
model_dt

DecisionTransformer(
  (state_embed): Linear(in_features=120, out_features=12, bias=True)
  (act_embed): Linear(in_features=30, out_features=12, bias=True)
  (rtg_embed): Linear(in_features=1, out_features=12, bias=True)
  (pos_embed): Embedding(4096, 12)
  (norm): LayerNorm((12,), eps=1e-05, elementwise_affine=True)
  (transformerGPT): ModuleList(
    (0): DecoderBlock(
      (attn): MaskedSelfAttention(
        (proj_q): Linear(in_features=128, out_features=128, bias=True)
        (proj_k): Linear(in_features=128, out_features=128, bias=True)
        (proj_v): Linear(in_features=128, out_features=128, bias=True)
        (proj_out): Linear(in_features=128, out_features=128, bias=True)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (fc1): Linear(in_features=128, out_features=512, bias=True)
        (act): GELU(approximate='no

## DataSet


In [37]:
#Source:  https://github.com/takuseno/d4rl-pybullet

# dataset will be automatically downloaded into ~/.d4rl/datasets
env = gym.make('hopper-bullet-mixed-v0')

# interaction with its environment
dataset = env.get_dataset()

#Hacemos print de Observations/Actions/Rewards/Terminals
print("\n Observations:", dataset['observations'][0]) # observation data in N x dim_observation. O.Aguilera: Mostramos la Observation en el timestep "num_timestep" ->[num_timestep]
print("\n Actions:", dataset['actions'][0]) # action data in N x dim_action. O.Aguilera: Mostramos la Action en el timestep "num_timestep" ->[num_timestep]
print("\n Rewards:", dataset['rewards'][0]) # reward data in N x 1. O.Aguilera: Mostramos la Rewards "num_timestep" ->[num_timestep]
print("\n Terminals:", dataset['terminals'][0]) # terminal flags in N x 1. O.Aguilera: Mostramos la Terminals "num_timestep" ->[num_timestep]. Indica si ha terminat l'episodi (Todo FALSE menos ultimo TRUE)


pybullet build time: Feb  9 2024 18:21:48

 Observations: [ 0.          0.          1.          0.          0.          0.
  0.         -0.          0.93191123  0.          1.0711064   0.
  0.03378947  0.          0.        ]

 Actions: [-0.7135147  -0.45422366 -0.9843897 ]

 Rewards: 0.0

 Terminals: 0.0




In [38]:
#Automatically download and return the dataset
dataset = env.get_dataset()

#We can acces the dataset, and we will obtain Numpy arrays
arrayObservations = dataset['observations'] # Observation data in a [N x dim_observation] numpy array  ==> Para 'hopper-bullet-mixed-v0" = [59345 x 15]
arrayActions = dataset['actions'] # Action data in [N x dim_action] numpy array ==> Para 'hopper-bullet-mixed-v0" = [59345 x 3]
arrayRewards = dataset['rewards'] # Reward data in a [N x 1] numpy array ==> Para 'hopper-bullet-mixed-v0" = [59345 x 1]
arrayTerminals = dataset['terminals'] # Terminal flags in a [N x 1] numpy array ==> Para 'hopper-bullet-mixed-v0" = [59345 x 1]

In [39]:
#Contamos los episodios para el dataset importado
def get_episodes():
    terminals = dataset['terminals'].astype('int32')
    #Las posiciones donde estan los Terminal=1
    if terminals[-1] == 0 : 
        terminals[-1] = 1  
    terminal_pos = np.where(terminals==1)[0]
    return terminal_pos.tolist(), len(terminal_pos)

def get_rtgs(t_positions, rewards):
    # Initialize the starting index of the sub-list in B
    start_idx = 0
    rtgs = []

    
    for t in t_positions:
        end_idx = t + 1
        sub_list = rewards[start_idx:end_idx]
        #print(sub_list)
        for i in range(0, len(sub_list)):
            rtgs.append(sum(sub_list[i+1:]))
        start_idx = end_idx
    return rtgs

def optimized_get_rtgs(t_positions, rewards):

    rewards = np.array(rewards, dtype=np.float64)
    t_positions = np.array(t_positions)

    cumsum_rewards = np.cumsum(rewards)
    
    # Initialize an array to hold the RTGs
    rtgs = np.array([], dtype=int)
    
    # Keep track of the start index of the sub-list in rewards
    start_idx = 0
    for end_idx in t_positions:
        
        segment_rtgs = cumsum_rewards[end_idx] - cumsum_rewards[start_idx:end_idx]
        segment_rtgs = np.append(segment_rtgs, 0)
        rtgs = np.concatenate((rtgs, segment_rtgs))
    
        start_idx = end_idx+1
    return rtgs.tolist()

def get_timestep(terminal_pos):
    start_index = 0
    arrayTimesteps = np.zeros(len(dataset['rewards']), dtype=int)
    for i in terminal_pos:
        arrayTimesteps[start_index:(i+1)] = np.arange((i+1) - start_index)
        start_index = i
    return arrayTimesteps


In [40]:
terminals_pos, num_episodes = get_episodes()

In [41]:
rtgs = optimized_get_rtgs(terminals_pos, dataset['rewards'])

In [42]:
timesteps = get_timestep(terminals_pos)

In [43]:
timesteps[0:25]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,  7,
        8,  9, 10, 11, 12, 13, 14, 15])

In [44]:
A = np.array([0,0,0,1,0,0,1,0,1])
B = np.where(A ==1)[0]
B

array([3, 6, 8])

In [45]:
C = np.roll(B, shift=1) + 1
C[0] = 0
C

array([0, 4, 7])

In [46]:
aux = A[0:4]
aux

array([0, 0, 0, 1])

In [47]:
D = list(zip(C, B +1))
D

[(0, 4), (4, 7), (7, 9)]

In [48]:
D[2]

(7, 9)

In [49]:
sobs = arrayObservations[1:5]
pobs = np.zeros((3,15))
conc = np.concatenate((sobs,pobs), 0)
conc

array([[-4.60774550e-04,  0.00000000e+00,  1.00000000e+00,
        -1.39726877e-01,  0.00000000e+00, -4.94736880e-02,
         0.00000000e+00, -2.97637507e-02,  8.89707685e-01,
        -5.34664631e-01,  1.07110643e+00,  0.00000000e+00,
        -4.48584184e-02, -5.99934816e-01,  0.00000000e+00],
       [-2.59015057e-03,  0.00000000e+00,  1.00000000e+00,
        -2.13199124e-01,  0.00000000e+00, -6.87466413e-02,
         0.00000000e+00, -8.18618238e-02,  9.12356079e-01,
         6.11053050e-01,  9.75129962e-01, -1.22049451e+00,
        -3.98643315e-02,  3.93147528e-01,  0.00000000e+00],
       [-9.72105097e-03,  0.00000000e+00,  1.00000000e+00,
        -3.10635537e-01,  0.00000000e+00, -2.31184497e-01,
         0.00000000e+00, -6.99426532e-02,  1.00235379e+00,
        -1.86572224e-02,  7.96451211e-01, -1.08245051e+00,
         3.35372984e-02,  4.17567253e-01,  0.00000000e+00],
       [-2.42748205e-02,  0.00000000e+00,  1.00000000e+00,
        -4.17304337e-01,  0.00000000e+00, -3.92520458

In [50]:
AA = [1, 2, 3, 4, 5 ,6]
padd_mask = [1, 1, 1, 1, 0, 0]

In [56]:
acts = arrayActions[0:10]
acts = torch.tensor(acts, dtype=torch.float)
acts


tensor([[-0.7135, -0.4542, -0.9844],
        [ 0.2584, -0.5494,  0.7026],
        [ 0.9897, -0.6594, -0.4362],
        [ 0.4028, -0.9043, -0.6581],
        [-0.2461,  0.4336,  0.4671],
        [ 0.2918,  0.3195,  0.3627],
        [-0.5997,  0.8677, -0.5769],
        [ 0.2887,  0.3120, -0.5910],
        [-0.8467,  0.0023, -0.5059],
        [ 0.0000,  0.0000,  0.0000]])

In [65]:
padding = np.ones((10,))
padding[6:] = 0
padding = torch.tensor(padding, dtype=torch.bool)
padding

tensor([ True,  True,  True,  True,  True,  True, False, False, False, False])

In [66]:
padding_unsqueezed = padding.unsqueeze(-1)
padding_unsqueezed

tensor([[ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [False],
        [False],
        [False],
        [False]])

In [67]:
valid_actions = acts[padding]
valid_actions

tensor([[-0.7135, -0.4542, -0.9844],
        [ 0.2584, -0.5494,  0.7026],
        [ 0.9897, -0.6594, -0.4362],
        [ 0.4028, -0.9043, -0.6581],
        [-0.2461,  0.4336,  0.4671],
        [ 0.2918,  0.3195,  0.3627]])

In [16]:
max_timesteps = max(timesteps)
print("\nEl episodio con mayor numero de timesteps de nuestro dataset duró un total de",max_timesteps,"timesteps")


El episodio con mayor numero de timesteps de nuestro dataset duró un total de 1001 timesteps


In [17]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, observations, actions, steps, rtgs, terminals, blocks):
        self.observations = observations
        self.actions = actions
        self.steps = steps
        self.rtgs = rtgs
        self.terminals = terminals
        self.blocks = blocks

    def __len__(self):
        return len(self.observations)

    def __getitem__(self, idx):
        # to avoid blocks in between of 2 trajectories, if the idx is too close to the end of a trajectory, re-position
        # the idx to a block_size away to the end of the trajectory
        episode_ends = np.array(self.terminals)
        episode_starts=np.roll(episode_ends, shift=1) + 1
        episode_starts[0] = 0

        print("episode start", len(episode_starts))
        print("episode end", len(episode_ends))
        print(idx)
        start, end = list(zip(episode_starts, episode_ends +1))[idx]

        episode_length = end - start

        # Sample a start point for the sequence within the episode
        if episode_length >= self.blocks:
            seq_start = np.random.randint(start, end - self.blocks + 1)
            seq_end = seq_start + self.blocks
            n_padding = 0
        else:
            seq_start = start
            seq_end = start + episode_length - 1
            n_padding = self.blocks - episode_length + 1
        

        states = (self.observations[seq_start : seq_end])
        actions = (self.actions[seq_start : seq_end])
        rtgs = (self.rtgs[seq_start : seq_end])
        steps = (self.steps)
        
        if n_padding > 0:
            padding = np.zeros(n_padding)

            states = np.concatenate(states, padding)
            actions = np.concatenate(actions, padding)
            rtgs = np.concatenate(rtgs, padding)
            steps = np.concatenate(steps, padding)
        
        states = torch.FloatTensor(states)
        actions = torch.FloatTensor(actions)
        rtgs = torch.FloatTensor(rtgs)
        steps = torch.FloatTensor(steps)

        return states, actions, rtgs, steps

### DataSet && DataLoader

In [18]:
blocks = 16
dataset = MyDataset(arrayObservations, arrayActions, timesteps, rtgs, terminals_pos, blocks)

DTDataLoader = DataLoader(dataset, batch_size=1, shuffle=False)

In [19]:

# Definimos la función de pérdida
criterion = nn.MSELoss()

# Definir el optimizador
optimizer = optim.Adam(model_dt.parameters(), lr=0.001)

### Train

In [20]:
#Bucle de training

num_epochs = 5
timestep = 0
max_timesteps = max_timesteps #Calculated according to the longest episode in the dataset/env loaded.
for epoch in range(num_epochs):
    total_loss = 0.0  # Inicializar la pérdida total para el epoch

    # Iteración sobre los lotes de datos

    for states, actions, rtgs, steps in DTDataLoader:
        # Paso 1: Reiniciar los gradientes
        #timestep += 1 ==> No es necesario para el training, solo para evaluation
        optimizer.zero_grad()

        # Paso 2: Propagación hacia adelante (Forward pass)
        _, _, act_preds = model_dt(steps, max_timesteps, states, actions, rtgs) # timestep, max_timesteps, states, actions, returns_to_go
        #outputs = model(batch_obs)

        # Paso 3: Calcular la pérdida
        loss = criterion(act_preds, actions)

        # Paso 4: Propagación hacia atrás (Backward pass)
        loss.backward()

        # Paso 5: Actualización de los parámetros del modelo
        optimizer.step()

        # Sumar la pérdida del batch a la pérdida total del epoch
        total_loss += loss.item()

    # Calcular la pérdida media del epoch
    epoch_loss = total_loss / len(DTDataLoader)

    # Imprimir la pérdida media del epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

    # Paso 6 (Opcional): Evaluación del modelo en un conjunto de datos de evaluación
    # Aquí puedes agregar código para evaluar el modelo en un conjunto de datos de evaluación si lo tienes disponible

# Paso 7 (Opcional): Visualización de resultados o métricas de rendimiento
# Aquí puedes agregar código para mostrar otras métricas de rendimiento que desees analizar

episode start 581
episode end 581
0


TypeError: only integer scalar arrays can be converted to a scalar index

In [None]:
episode_ends = np.array([3,7,15])
episode_starts = np.roll(A, shift=1)+ 1

In [None]:
episode_starts
episode_starts[0] = 0
episode_starts

array([0, 4, 8])

In [None]:
a = list(zip(episode_starts, episode_ends +1))
a


[(0, 4), (4, 8), (8, 16)]

In [None]:
#Mostramos los maximos rewards para los Terminal=1 recogidos en "terminal_pos"

sample=terminal_pos[0]
print("\nRewards example:",arrayRewards[sample])

#Construimos el array de rewards to go
arrayReturnToGo = []

for i in terminal_pos:
    arrayReturnToGo.append(arrayRewards[i])

len(arrayReturnToGo)

#Creamos tensor de Returnstogo
tensorReturnToGo = torch.FloatTensor(arrayReturnToGo)
tensorReturnToGo.shape

In [None]:
# Creamos un array "timesteps" donde se identificaran los timesteps para cada episodio respecto al total del dataset
def get_timestep():
    start_index = 0
    arrayTimesteps = np.zeros(len(arrayActions), dtype=int)
    for i in terminal_pos:
        arrayTimesteps[start_index:i] = np.arange(i - start_index)
        start_index = i
    return arrayTimesteps
print("\nEl array Timesteps tendra la misma dimension que las samples del dataset:",len(arrayTimesteps)," samples")
print("\nEjemplo muestreo hasta el episodio 25:\n", arrayTimesteps[0:25])

#Comprovamos en el dataset (concretamente en el arrayTimesteps), cual es el episodio mas largo. Ese sera nuestro "max_timesteps"
max_timesteps = max(arrayTimesteps)
print("\nEl episodio con mayor numero de timesteps de nuestro dataset duró un total de",max_timesteps,"timesteps")

#Lo convertimos a tensor
tensorTimesteps = torch.FloatTensor(arrayTimesteps)
#tensorTimesteps.shape

In [None]:
prueba_omar = np.unique(arrayActions[0:20])
print('action possible numbers: ',prueba_omar )
#assert hparams['vocab_size'] == len(np.unique(arrayActions))
len(prueba_omar)

arrayActions[0:20]

In [None]:
#We transform all the arrays to Float tensors
if torch.cuda.is_available():
  tensorObservations = torch.cuda.FloatTensor(arrayObservations)
  tensorActions = torch.cuda.FloatTensor(arrayActions)
  tensorRewards = torch.cuda.FloatTensor(arrayRewards)
  tensorTerminals = torch.cuda.FloatTensor(arrayTerminals)
  print("\n Tensors in the GPU")
  print("\n////////////////////////////////////////////////////////////////////////////////")
else:
  tensorObservations = torch.FloatTensor(arrayObservations)
  tensorActions = torch.FloatTensor(arrayActions)
  tensorRewards = torch.FloatTensor(arrayRewards)
  tensorTerminals = torch.FloatTensor(arrayTerminals)
  print("\n Tensors in the CPU")

# Tensors shape
print("\nObservations shape:",tensorObservations.shape)
print("\nActions shape:",tensorActions.shape)
print("\nRewards shape:",tensorRewards.shape)
print("\nTerminals shape:",tensorTerminals.shape)
print("\n////////////////////////////////////////////////////////////////////////////////")

sample = 59343
# Tensors content at certain timestep
print("\nObservations example:",tensorObservations[sample])
print("\nActions example:",tensorActions[sample])
print("\nRewards example:",tensorRewards[sample])
print("\nTerminals example:",tensorTerminals[sample])

# Pasar este dataset al DT (Shuang/Omar)!!!

In [None]:
#Source: PyBullet Environment: https://pybullet.org/wordpress/
#                              https://github.com/bulletphysics/bullet3/tree/master

import numpy as np
import gym
import pybullet_envs
import argparse
import os
import torch

from tqdm import tqdm
#from .sac import SAC, seed_everything
#from .utility import save_buffer


def collect(env, sac, logdir, final_step, deterministic):
    buffer = []
    frames = []

    step = 0
    pbar = tqdm(total=final_step)
    while step < final_step:
        obs_t = env.reset()
        ter_t = False
        rew_t = 0.0


        while step < final_step and not ter_t:
            #act_t = sac.act([obs_t], deterministic=deterministic)[0] #SAC (Soft Actor Critic) es un algoritmo de RL
            frames.append(env.render(mode="rgb_array"))
            act_t = env.action_space.sample() # Temporal. Aqui tendriamos que la salida/action del DT.
            buffer.append([obs_t, act_t, [rew_t], [ter_t]])

            obs_t, rew_t, ter_t, _ = env.step(act_t)   #O.Aguilera (04/02/2024): La dimensio no es correcte.

            step += 1
            pbar.update(1)

        if ter_t:
            buffer.append([obs_t, np.zeros_like(act_t), [rew_t], [ter_t]])

    #save_buffer(buffer[:final_step], logdir)

    print('Collected data has been saved.')
    return frames

''' if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--final-step', type=int, default=1000000) #
    parser.add_argument('--load', type=str)
    parser.add_argument('--gpu', action='store_true')
    args = parser.parse_args() '''

#env = gym.make(args.env)
env = gym.make('hopper-bullet-mixed-v0')
#env.seed(args.seed)
env.seed(1)
#seed_everything(args.seed)
#seed_everything(1)

observation_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
device = 'cuda:0'

#sac = SAC(observation_size, action_size, device)
'''
if args.load:
    sac.load(args.load)
    name = 'medium'
    deterministic = True
else:'''
name = 'random'
deterministic = False

#logdir = os.path.join('logs', '{}_{}_{}'.format(args.env, name, args.seed))
#os.makedirs(logdir)


prova=collect(env, None, None, 100, deterministic)

In [None]:
prova[0]
prova[0].shape

In [None]:
from matplotlib import pyplot as plt
plt.imshow(prova[0], interpolation='nearest')
plt.show()

In [None]:
!apt-get install ffmpeg
!pip install notebook-video-writer

In [None]:
from notebook_video_writer import VideoWriter
with VideoWriter(fps=40) as vw:
    for i in range(len(prova)):
        vw.add(prova[i])

In [None]:
env = gym.make('hopper-bullet-mixed-v0')
print(env.action_space.sample())

## Notas Google Meet Domingo 04/02/2024


Edgar Planell
19:33
https://github.com/takuseno/d4rl-pybullet/blob/master/requirements.txt

Edgar Planell
19:37
pip install git+https://github.com/takuseno/d4rl-pybullet

Edgar Planell
19:59
pip setuptools wheel

Tú
19:59
brew install hdf5

Edgar Planell
20:01
versioned-hdf5

Edgar Planell
20:13
    import numpy as np
    import gym
    import pybullet_envs
    import argparse
    import os

    from tqdm import tqdm
    from .sac import SAC, seed_everything
    from .utility import save_buffer


    def collect(env, sac, logdir, final_step, deterministic):
        buffer = []

        step = 0
        pbar = tqdm(total=final_step)
        while step < final_step:
            obs_t = env.reset()
            ter_t = False
            rew_t = 0.0
            while step < final_step and not ter_t:
                act_t = sac.act([obs_t], deterministic=determin

      import numpy as np
      import gym
      import pybullet_envs
      import argparse
      import os

      from tqdm import tqdm
      from .sac import SAC, seed_everything
      from .utility import save_buffer


      def collect(env, sac, logdir, final_step, deterministic):
          buffer = []

          step = 0
          pbar = tqdm(total=final_step)
          while step < final_step:
              obs_t = env.reset()
              ter_t = False
              rew_t = 0.0
              while step < final_step and not ter_t:
                  act_t = sac.act([obs_t], deterministic=determin
https://github.com/takuseno/d4rl-pybullet/blob/master/d4rl_pybullet/collect.py

Edgar Planell
20:30
https://github.com/bulletphysics/bullet3/tree/master

Edgar Planell
20:32
https://github.com/bulletphysics/bullet3/blob/master/examples/pybullet/gym/pybullet_examples/biped2d_pybullet.py

Edgar Planell
20:37
https://github.com/benelot/pybullet-gym/tree/master

Josep Maria Bach Ramírez
20:43
env.render(mode="rgb_array")

Josep Maria Bach Ramírez
20:45
from matplotlib import pyplot as plt
plt.imshow(prova[0], interpolation='nearest')
plt.show()

Josep Maria Bach Ramírez
20:47
!apt-get install ffmpeg
!pip install notebook-video-writer

Josep Maria Bach Ramírez
20:48
from notebook_video_writer import VideoWriter
with VideoWriter(fps=40) as vw:
    for i in range(len(prova)):
        vw.add(prova[i])
        
Josep Maria Bach Ramírez
20:52
https://gymnasium.farama.org/api/env/

In [None]:
''' import pybullet as p
import pybullet_data
import os
import time
GRAVITY = -9.8
dt = 1e-3
iters = 2000
import pybullet_data

physicsClient = p.connect(p.GUI)
p.setAdditionalSearchPath(pybullet_data.getDataPath())
p.resetSimulation()
#p.setRealTimeSimulation(True)
p.setGravity(0, 0, GRAVITY)
p.setTimeStep(dt)
planeId = p.loadURDF("plane.urdf")
cubeStartPos = [0, 0, 1.13]
cubeStartOrientation = p.getQuaternionFromEuler([0., 0, 0])
botId = p.loadURDF("biped/biped2d_pybullet.urdf", cubeStartPos, cubeStartOrientation)

#disable the default velocity motors
#and set some position control with small force to emulate joint friction/return to a rest pose
jointFrictionForce = 1
for joint in range(p.getNumJoints(botId)):
  p.setJointMotorControl2(botId, joint, p.POSITION_CONTROL, force=jointFrictionForce)

#for i in range(10000):
#     p.setJointMotorControl2(botId, 1, p.TORQUE_CONTROL, force=1098.0)
#     p.stepSimulation()
#import ipdb
#ipdb.set_trace()
import time
p.setRealTimeSimulation(1)
while (1):
  #p.stepSimulation()
  #p.setJointMotorControl2(botId, 1, p.TORQUE_CONTROL, force=1098.0)
  p.setGravity(0, 0, GRAVITY)
  time.sleep(1 / 240.)
time.sleep(1000) '''