In [1]:


path = 'data'
file_prefix_list = ['depth_frames', 'can_pos_quat']
file_suffix_range = range(1, 5)


# load data
import numpy as np
import os

data = {}
data['depth_frames'] = []
data['can_pos_quat'] = []
# interate over all files
for idx in file_suffix_range:
    can_pos_quat_path = f"can_pos_quat_{idx}.npy"
    depth_frames_path = f"depth_frames_{idx}.npy"
    
    can_pos_quat = np.load(os.path.join(path, can_pos_quat_path))
    depth_frames = np.load(os.path.join(path, depth_frames_path))
    print(f"Loaded {can_pos_quat.shape[0]} samples from {can_pos_quat_path}")
    # concat
    data['can_pos_quat'].append(can_pos_quat)
    data['depth_frames'].append(depth_frames)
    

# concatenate all data
data['can_pos_quat'] = np.concatenate(data['can_pos_quat'], axis=0)
data['depth_frames'] = np.concatenate(data['depth_frames'], axis=0)

print(f"Concatenated can_pos_quat with shape {data['can_pos_quat'].shape}")
print(f"Concatenated depth_frames with shape {data['depth_frames'].shape}")

Loaded 1000 samples from can_pos_quat_1.npy
Loaded 1000 samples from can_pos_quat_2.npy
Loaded 1000 samples from can_pos_quat_3.npy
Loaded 1000 samples from can_pos_quat_4.npy
Concatenated can_pos_quat with shape (4000, 7)
Concatenated depth_frames with shape (4000, 256, 256, 1)


In [2]:
# Post process

# Cutting the quaternion to only the first 3 elements
data['can_pos_quat'] = data['can_pos_quat'][:, :3]
print(f"Cutting the quaternion to only the first 3 elements. New shape: {data['can_pos_quat'].shape}")


Cutting the quaternion to only the first 3 elements. New shape: (4000, 3)


In [3]:
# Normalize can_pos_quat
mean = data['can_pos_quat'].mean(axis=0)
std = data['can_pos_quat'].std(axis=0)
data['can_pos_quat'] = (data['can_pos_quat'] - mean) / std
print(f"Normalized can_pos_quat with mean {mean} and std {std}")

Normalized can_pos_quat with mean [ 0.10136824 -0.24822153  0.86      ] and std [7.20919620e-02 1.00940761e-01 2.50910404e-14]


In [4]:
# Load Dataset on GPU

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data['can_pos_quat'] = torch.tensor(data['can_pos_quat'], device=device)
data['depth_frames'] = torch.tensor(data['depth_frames'], device=device)

print(f"Loaded data on device {device}")

Loaded data on device cuda


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

class DepthCanDataset(Dataset):
    def __init__(self, depth_frames, can_pos_quat):
        # Depth frames and corresponding position/quaternion data
        # Permute depth frames from (batch, 256, 256, 1) to (batch, 1, 256, 256)
        self.depth_frames = torch.tensor(depth_frames, dtype=torch.float32).permute(0, 3, 1, 2)
        self.can_pos_quat = torch.tensor(can_pos_quat, dtype=torch.float32)

    def __len__(self):
        return len(self.can_pos_quat)

    def __getitem__(self, idx):
        depth_frame = self.depth_frames[idx]
        can_pos_quat = self.can_pos_quat[idx]
        
        return depth_frame, can_pos_quat


# Create the dataset
dataset = DepthCanDataset(data['depth_frames'], data['can_pos_quat'][:, :3])


  self.depth_frames = torch.tensor(depth_frames, dtype=torch.float32).permute(0, 3, 1, 2)
  self.can_pos_quat = torch.tensor(can_pos_quat, dtype=torch.float32)


In [6]:
import torch
import gymnasium as gym
import numpy as np
import mediapy as media
# 필요한 클래스 및 함수 임포트
from cleanrl.cleanrl.ppo_continuous_action import Agent, Args, ppo_make_env
import cv2
import tqdm



def evaluate_object_model(model,num_episodes,visualize=False, 
                          load_eval_data=False,
                          success_threshold=0.01,
                          ):

    frames = []
    # Argument 설정
    task_id = 'pickplace'
    gamma = 0.99
    render_camera = ['birdview']#,'agentview'] #('frontview', 'birdview', 'agentview', 'robot0_robotview', 'robot0_eye_in_hand')
    camera_names = render_camera
    model.eval()

    # 환경 생성
    env = gym.vector.SyncVectorEnv(
        [ppo_make_env(
            task_id=task_id, 
            reward_shaping=True,
            idx=0, 
            control_mode="OSC_POSITION",
            capture_video=False, 
            run_name="eval", 
            gamma= gamma, 
            active_rewards="r",
            active_image=True, 
            fix_object=False,
            wandb_enabled=False,
            verbose=False,
            control_freq=20,
            render_camera=render_camera,
            camera_names=camera_names,

            )
        ]
    )


    def colorize_depth(frame):
        # Assuming the depth image is in float32 and contains values representing distances.
        # Normalize the depth image to 0-255 for visualization
        min_depth = np.min(frame)
        max_depth = np.max(frame)
        normalized_depth = 255 * (frame - min_depth) / (max_depth - min_depth)
        normalized_depth = normalized_depth.astype(np.uint8)

        # Apply a colormap for better visualization (COLORMAP_JET is commonly used)
        colorized_depth = cv2.applyColorMap(normalized_depth, cv2.COLORMAP_JET)
        return colorized_depth


    # 디바이스 설정 (cuda가 가능하면 cuda 사용)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device == torch.device("cuda"):
        pass
    else :
        assert device == torch.device("cpu")

    # 평가 수행
    viewer_image_key = 'birdview'+'_depth'

    # generate samples
    #model.to(device)
    visualize = False
    success_threshold = success_threshold
    success_count = 0
    
    with torch.no_grad():
        if load_eval_data:
            global_std = std.copy()
            global_mean = mean.copy()
            global_std = torch.tensor(global_std, device=device)
            global_mean = torch.tensor(global_mean, device=device)
            eval_data = {}

            path = 'data'
            if num_episodes == 100:
                can_pos_quat_path = 'can_pos_quat_eval.npy'
                depth_frames_path = 'depth_frames_eval.npy'
            elif num_episodes == 1000:
                can_pos_quat_path = 'can_pos_quat_eval_1000.npy'
                depth_frames_path = 'depth_frames_eval_1000.npy'
            
            can_pos_quat = np.load(os.path.join(path, can_pos_quat_path))
            depth_frames = np.load(os.path.join(path, depth_frames_path))
            eval_data['can_pos_quat'] = can_pos_quat
            eval_data['depth_frames'] = depth_frames
            print(f"Loaded eval data with can_pos_quat shape {can_pos_quat.shape} and depth_frames shape {depth_frames.shape}")
            # post process
            eval_data['can_pos_quat'] = torch.tensor(eval_data['can_pos_quat'], device=device)
            eval_data['depth_frames'] = torch.tensor(eval_data['depth_frames'], device=device)
            
            eval_data['can_pos_quat'] = eval_data['can_pos_quat'][:, :3]
            #print(eval_data['can_pos_quat'][:5])
            #eval_data['can_pos_quat'] = (eval_data['can_pos_quat'] - global_mean) / global_std # normalize
            #print(eval_data['can_pos_quat'][:5])
            # cuda

            
            model_input = torch.tensor(eval_data['depth_frames'], dtype=torch.float32).permute(0, 3, 1, 2).to(device)
            
            predicted_target = model(model_input)
            predicted_target = (predicted_target * global_std) + global_mean
            #print(predicted_target[:10])
            #print(predicted_target[:10], eval_data['can_pos_quat'][:10])
            gt = eval_data['can_pos_quat']
            #print(predicted_target.device, gt.device)
            error = torch.norm(predicted_target - gt, dim=1)
            #error = np.linalg.norm(predicted_target - gt, axis=1)
            success_count = torch.sum(error < success_threshold).item()
            #success_count = np.sum(error < success_threshold)
            success_rate = success_count / num_episodes
            #print(f"Success rate: {success_rate}, Success count: {success_count}, Total episodes: {num_episodes}")
            return {'success_rate': success_rate, 'success_count': success_count, 'total_episodes': num_episodes}
        
            

        

        for i in tqdm.tqdm(range(num_episodes)):
            obs, _ = env.reset()
            obs = torch.Tensor(obs).to(device)

            image_frame = env.envs[0].image_states[viewer_image_key]
            if not viewer_image_key.endswith('depth'):
                image_frame = np.array(image_frame[::-1, :, :], dtype=np.uint8)  # numpy 배열로 변환
            else:
                image_frame = np.array(image_frame[::-1, :, :], dtype=np.float32)

            model_input = torch.tensor(image_frame, dtype=torch.float32).unsqueeze(0).permute(0, 3, 1, 2).to(device)
            
            
            predicted_target = model(model_input).cpu().detach().numpy()
            predicted_target = (predicted_target * std) + mean
        

            gt = env.envs[0].sim.data.get_body_xpos('Can_main')
            # calculate the error
            error = np.linalg.norm(predicted_target - env.envs[0].sim.data.get_body_xpos('Can_main'))
            if error < success_threshold:
                success_count += 1
            if visualize :
                print(f"Predicted: {predicted_target}, GT: {gt}, Error: {error}")
                image_frame = colorize_depth(image_frame)
                
                can_pos = env.envs[0].sim.data.get_body_xpos('Can_main')  # Assuming the object is called 'Can'
                can_quat = env.envs[0].sim.data.get_body_xquat('Can_main')
                
                error_text = f"Error: {error.round(4)}"
                # in text, round values
                pos_text = f"GT pos: {can_pos.round(4)}"
                predict_text = f"Pd pos: {predicted_target.round(4)}"
                cv2.putText(image_frame, error_text, (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
                cv2.putText(image_frame, pos_text, (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
                cv2.putText(image_frame, predict_text, (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
                
                frames.append(image_frame)
        success_rate = success_count / num_episodes
        print(f"Success rate: {success_rate}, Success count: {success_count}, Total episodes: {num_episodes}")
        if visualize:      
            media.show_video(frames, fps=20)
        return {'success_rate': success_rate, 'success_count': success_count, 'total_episodes': num_episodes}

#evaluate_object_model(model, 100, visualize=False, load_eval_data=True)



In [7]:


import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torch
import torch.nn as nn
import torch.nn.functional as F


class DepthImageCoordinateRegressor(nn.Module):
    def __init__(self):
        super(DepthImageCoordinateRegressor, self).__init__()
        # Convolutional layers with larger kernels and strides, no padding
        self.conv1 = nn.Conv2d(1, 64, kernel_size=8, stride=4, padding=0)  # Output: (64, 63, 63)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=0)  # Output: (128, 21, 21)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=0)  # Output: (256, 10, 10)
        self.conv4 = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=0)  # Output: (512, 8, 8)
        
        self.bn1 = nn.BatchNorm2d(64)
        self.bn2 = nn.BatchNorm2d(128)
        self.bn3 = nn.BatchNorm2d(256)
        # Final pooling layer to reduce spatial dimensions
        self.pool = nn.MaxPool2d((8, 8))  # Output: (512, 6, 6)
        
        # Fully connected layers
        self.fc1 = nn.Linear(512, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 3)  # Output layer for x, y, z (3 real values)

    def forward(self, x):
        # Convolutional layers
        x = F.leaky_relu(self.bn1(self.conv1(x)))  # Output: (64, 62, 62)
        x = F.leaky_relu(self.bn2(self.conv2(x)))  # Output: (128, 30, 30)
        x = F.leaky_relu(self.bn3(self.conv3(x)))  # Output: (256, 14, 14)
        x = self.pool(self.conv4(x))  # Output: (512, 1, 1)
        
        # Pooling layer
        # x = self.pool(x)  # Output: (512, 6, 6)
        
        # Flatten the tensor for fully connected layers
        x = x.reshape(-1, 512)
        
        # Fully connected layers
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        x = self.fc3(x)  # Output: (batch_size, 3) - x, y, z
        
        return x



# Define batch size
batch_size = 512
learning_rate = 1e-3
num_epochs = 5000

# Model, loss function, optimizer
model = DepthImageCoordinateRegressor()
criterion = nn.MSELoss()#torch.nn.L1Loss() #nn.MSELoss()  # Mean squared error for regression
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# add lr scheduler
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1, verbose=True)
# Move model to appropriate device
# Create DataLoader
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)



#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
import time
# Example of training loop
def train_model(train_loader, eval_interval=100, eval_load=False, eval_episodes=50, eval_success_threshold=0.01):
    model.train()
    
    for epoch in range(num_epochs):
        epoch_strat_time = time.time()

        running_loss = 0.0
        for images, targets in train_loader:
            # Move data to appropriate device
            images, targets = images.to(device), targets.to(device)
            #print(f"target_shape: {targets.shape}")
            # Zero the parameter gradients
            optimizer.zero_grad()
            # Forward pass
            outputs = model(images)
            #print(f"output_shape: {outputs.shape}")
            loss = criterion(outputs, targets)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            
            # Print statistics
            running_loss += loss.item()
        
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader)}')
        #print(f"Epoch time: {time.time() - epoch_strat_time}, Data per hour: {int(len(train_loader) * batch_size / (time.time() - epoch_strat_time) * 3600)}, lr: {scheduler.get_last_lr()}")
        if epoch % eval_interval == 0 and epoch != 0:
            model.eval()
            eval_ret = evaluate_object_model(model, num_episodes=eval_episodes, visualize=False, 
                                             load_eval_data=eval_load)
            print(f"Success rate: {eval_ret['success_rate']}, lr: {scheduler.get_last_lr()}")
            model.train()
        
        scheduler.step()


In [8]:
# Train the model
train_model(train_loader, eval_load=True, eval_episodes=1000, eval_interval=50, 
            eval_success_threshold=0.01)

Epoch [1/5000], Loss: 0.7230866327881813
Epoch [2/5000], Loss: 0.39447206631302834
Epoch [3/5000], Loss: 0.21959501318633556
Epoch [4/5000], Loss: 0.15059364400804043
Epoch [5/5000], Loss: 0.11421135254204273
Epoch [6/5000], Loss: 0.09634633548557758
Epoch [7/5000], Loss: 0.08914897218346596
Epoch [8/5000], Loss: 0.08138623926788568
Epoch [9/5000], Loss: 0.07331704627722502
Epoch [10/5000], Loss: 0.06802753452211618
Epoch [11/5000], Loss: 0.06874298630282283
Epoch [12/5000], Loss: 0.06625239364802837
Epoch [13/5000], Loss: 0.06414586957544088
Epoch [14/5000], Loss: 0.06559728737920523
Epoch [15/5000], Loss: 0.06464461889117956
Epoch [16/5000], Loss: 0.061561651062220335
Epoch [17/5000], Loss: 0.06014058040454984
Epoch [18/5000], Loss: 0.06025381200015545
Epoch [19/5000], Loss: 0.060132110957056284
Epoch [20/5000], Loss: 0.059110716450959444
Epoch [21/5000], Loss: 0.05629888642579317
Epoch [22/5000], Loss: 0.05953348195180297
Epoch [23/5000], Loss: 0.06012192368507385
Epoch [24/5000], L

KeyboardInterrupt: 

In [9]:
import time

# save model
torch.save(model.state_dict(), f'checkpoints/model_1000.pth')
print("Model saved to model.pth")

Model saved to model.pth


In [10]:

# load model
model = DepthImageCoordinateRegressor()
model.load_state_dict(torch.load(f'checkpoints/model_1000.pth'))
model.eval()
print("Model loaded from model.pth")


Model loaded from model.pth
