In [54]:
import json
import torch
import numpy as np
from transformers import CLIPImageProcessor
from transformers import AutoProcessor, AutoModelForZeroShotImageClassification
from pathlib import Path


In [53]:
image_processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14-336")

In [None]:
def extract_embodiedscan_frames(frames):
    if not isinstance(frames, list):
        frames = [frames]
    if 'scannet' in frames[0] or '3rscan' in frames[0]:
        images = []
        depths = []
        poses = []
        if 'scannet' in frames[0]:
            video = frames[0].split('/')[-4] + '/' + frames[0].split('/')[-2]
        elif '3rscan' in frames[0]:
            video = frames[0].split('/')[-4] + '/' + frames[0].split('/')[-3]
        video_info = scene[video]
        for frame in frames:
            path = Path(frame)
            frame_name = str(Path(*path.parts[-4:]))
            pose = np.array(video_info[frame_name]['pose']) # 4x4 array
            image = frame
            if 'scannet' in frame:
                depth = frame.replace('jpg', 'png')
            elif '3rscan' in frame:
                depth = frame.replace('color.jpg', 'depth.png')
            else:
                raise NotImplementedError
            # we need to ensure that the frame has valid pose
            images.append(image)
            depths.append(depth)
            poses.append(pose)
        depth_intrinsic_file = np.array(video_info['depth_intrinsic'])  # 4x4 array
        intrinsic_file = np.array(video_info['intrinsic']) # 4x4 array
        axis_align_matrix_file = np.array(video_info['axis_align_matrix'])  # 4x4 array
        video_info = dict()
        video_info['sample_image_files'] = images
        video_info['sample_depth_image_files'] = depths
        video_info['sample_pose_files'] = poses
        video_info['depth_intrinsic_file'] = depth_intrinsic_file
        video_info['intrinsic_file'] = intrinsic_file
        video_info['axis_align_matrix_file'] = axis_align_matrix_file
    else:
        raise NotImplementedError

    return video_info

In [46]:
def process_videos(videos, mode='random', device=None, text=None):
    if isinstance(videos, str):
        videos = [videos]
    new_videos = []
    for video in videos:
        video = preprocess(video, return_tensors='pt', mode=mode, device=device, text=text)
        new_videos.append(video)

def preprocess( video: str, 
                return_tensors='pt', 
                mode='random', 
                device=None, 
                text=None,
                do_rescale=True,
                do_normalize=True,
                do_depth_scale=True):
        """
            video:  1. str video id / single video frame
                    2. list  list of video frames
        """
        video_info = extract_embodiedscan_frames(video)

        dataset = video_info['dataset']
        sample_frame_num = video_info['sample_frame_num']

        depth_scale = 1000

        images = []
        depth_images = []
        poses = []

        if 'depth_intrinsic_file' in video_info:
            depth_intrinsic = video_info['depth_intrinsic_file']
            if not isinstance(depth_intrinsic, np.ndarray):
                depth_intrinsic = np.loadtxt(depth_intrinsic)

        intrinsic = video_info['intrinsic_file']  # (V, 4, 4) or (4, 4)
        if not isinstance(intrinsic, np.ndarray):
            intrinsic = np.loadtxt(intrinsic)

        for id, image_file in enumerate(video_info['sample_image_files']):
            image = Image.open(image_file).convert('RGB')
            image_size = image.size
            image = image_processor.preprocess(images=image, do_rescale=do_rescale, do_normalize=do_normalize, return_tensors=return_tensors)['pixel_values'][0] # [3, H, W]
            depth_image = Image.open(video_info['sample_depth_image_files'][id])
            depth_image_size = depth_image.size
            depth_image, resize_shape = preprocess_depth_image(depth_image, do_depth_scale=do_depth_scale, depth_scale=depth_scale)
            depth_image = torch.as_tensor(np.ascontiguousarray(depth_image)).float() # [H, W]
            pose = video_info['sample_pose_files'][id]
            if not isinstance(pose, np.ndarray):
                pose = np.loadtxt(pose)
            pose = torch.from_numpy(pose).float()  # [4, 4]
            images.append(image)
            depth_images.append(depth_image)
            poses.append(pose)

        #if dataset == 'scannet':
            #intrinsic = preprocess_instrinsic(depth_intrinsic, depth_image_size, resize_shape)
        #else:
        intrinsic = preprocess_instrinsic(intrinsic, image_size, resize_shape)  # 3rscan / matterport

        intrinsic = torch.from_numpy(intrinsic).float()

        if intrinsic.dim() == 2:  # scannet/3rscan
            intrinsic = intrinsic.unsqueeze(0).repeat(sample_frame_num, 1, 1)  # (V, 4, 4)

        axis_align_matrix = video_info['axis_align_matrix_file']
        if not isinstance(axis_align_matrix, np.ndarray):
            axis_align_matrix = np.loadtxt(axis_align_matrix)

        axis_align_matrix = torch.from_numpy(axis_align_matrix).float()
        
        # transform pose to axis_align pose
        poses = [axis_align_matrix @ pose for pose in poses]

        video_dict = dict()
        video_dict['images'] = torch.stack(images)  # (V, 3, 336, 336)
        video_dict['depth_images'] = torch.stack(depth_images)  # (V, 336,336)
        video_dict['poses'] = torch.stack(poses)  # (V, 4, 4)
        video_dict['intrinsic'] = intrinsic  # (V, 4, 4)

        return video_dict

SyntaxError: invalid syntax (1716784140.py, line 58)

In [44]:
video_path = '../data/3RScan/754e884c-ea24-2175-8b34-cead19d4198d'
videos_dict = process_videos(video_path)

TypeError: preprocess() missing 1 required positional argument: 'video'

In [34]:
scan_id = '754e884c-ea24-2175-8b34-cead19d4198d'
sub_key = f'3rscan/{scan_id}/sequence/frame-000000.color.jpg'
with open('../data/embodiedscan_infos_full.json') as file:
    data = json.load(file)

In [35]:
scan = None
for key, value in data.items():
    if key.startswith('3rscan'):
        if sub_key in value:
            scan = value
            

In [37]:
# 3Scan from our dataset
# 0.0123306 -0.999799 0.0158277 -0.00961019
# -0.297868 0.0114376 0.954539 -0.00204422
# -0.954527 -0.0164846 -0.297667 0.022526
# 0 0 0 1

In [38]:
scan[sub_key]

{'pose': [[0.0123306, -0.999799, 0.0158277, -0.00961019],
  [-0.297868, 0.0114376, 0.954539, -0.00204422],
  [-0.954527, -0.0164846, -0.297667, 0.022526],
  [0.0, 0.0, 0.0, 1.0]],
 'depth': '3rscan/754e884c-ea24-2175-8b34-cead19d4198d/sequence/frame-000000.depth.pgm'}