A demo script showing how to use the TAC depth encoder

In [3]:
import os
import numpy as np
import torch
from torch.nn import functional as F
import matplotlib.pyplot as plt
from matplotlib import cm
import json
from PIL import Image
import requests
import natsort
# Work from the TAC repository root: later cells use repo-relative paths such
# as "config/v2/v2_tac.yaml" and "tac_model.pth". Set the TAC_ROOT environment
# variable to point at a checkout elsewhere; the original location is kept as
# the default so existing setups behave exactly as before.
os.chdir(os.environ.get("TAC_ROOT", "/root/TAC/"))

In [6]:
import imports
from config.default import get_config
from common.registry import registry

# Build the TAC model described by the YAML config and restore its weights.
config = get_config("config/v2/v2_tac.yaml")

# The registry resolves the model name stored in the config to a model class.
model_cls = registry.get_model(config.MODEL.name)
model = model_cls.from_config(config)

# map_location="cpu" lets a checkpoint that was saved from a GPU process be
# restored on a machine without CUDA; load_state_dict then copies the values
# into the model's existing parameters.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
ckpt = torch.load("tac_model.pth", map_location="cpu")

# This notebook only runs inference, so disable training-only behaviour
# (e.g. dropout) before extracting embeddings.
model.eval()

# strict=False tolerates key mismatches; the recorded output of this cell
# reports no missing keys and two unexpected `position_ids` entries.
# Kept as the last expression so the key report is still displayed.
model.load_state_dict(ckpt["state_dict"], strict=False)

_IncompatibleKeys(missing_keys=[], unexpected_keys=['image_transformer.vision_model.embeddings.position_ids', 'depth_transformer.vision_model.embeddings.position_ids'])

In [21]:
# Depth range used for clipping and normalisation (metres, per the "to meters"
# conversion below).
MIN_DEPTH = 0.0
MAX_DEPTH = 10.0
# Raw pixel values are divided by this factor to obtain metres.
DEPTH_SCALE = 1000

depth_path = "test.png"
# Open the image in a context manager so the file handle is released as soon
# as the pixels have been copied into a numpy array.
with Image.open(depth_path) as depth_image:
    depth = np.array(depth_image).astype("float32") / DEPTH_SCALE  # to meters
depth = np.clip(depth, MIN_DEPTH, MAX_DEPTH)  # clip to [MIN_DEPTH, MAX_DEPTH]
depth = (depth - MIN_DEPTH) / (MAX_DEPTH - MIN_DEPTH)  # normalize to [0, 1]
# Add a trailing channel axis and replicate it three times.
# NOTE(review): assumes the PNG decodes to a 2-D (H, W) array - confirm.
depth = np.expand_dims(depth, axis=2).repeat(3, axis=2)
# do_rescale=False because the values were already scaled to [0, 1] above;
# the processor still resizes, normalizes and converts to a tensor.
depth = model.depth_processor(depth, do_rescale=False, return_tensors="pt").pixel_values

# Inference only: skip autograd bookkeeping so the embeddings are plain
# tensors that do not hold on to a computation graph.
with torch.no_grad():
    # Embedding with the FC projection, normalized with F.normalize. Per the
    # original note, this feature lies in a space shared with the RGB modality.
    depth_embedding = F.normalize(model.embed_depth(depth))
    # Embedding without the FC projection; may be used for other downstream
    # fine-tuning.
    depth_embedding_nofc = model.embed_depth(depth, fc=False)

Separate the depth encoder and use it on its own

In [15]:
# Take the depth branch out of the full TAC model and write only its weights
# to disk, so a later cell can reload them into a plain CLIP vision model.
depth_encoder = model.depth_transformer
depth_encoder_weights = depth_encoder.state_dict()
torch.save(depth_encoder_weights, "depth_encoder.pth")

In [17]:
from transformers import CLIPImageProcessor, CLIPVisionModel, CLIPVisionConfig
# Rebuild the depth encoder as a plain Hugging Face CLIP vision model, using
# the library's default vision configuration, and restore the weights saved to
# "depth_encoder.pth".
# NOTE(review): `config` and `ckpt` rebind the names used for the TAC config
# and TAC checkpoint in the model-loading cell; re-run that cell before using
# them for TAC again.
config = CLIPVisionConfig()
depth_encoder = CLIPVisionModel(config=config)

# map_location="cpu" keeps the load working on machines without CUDA even if
# the tensors were saved from a GPU.
ckpt = torch.load("depth_encoder.pth", map_location="cpu")
# Strict loading (the default): any key or shape mismatch between the saved
# weights and the default architecture raises instead of being ignored.
depth_encoder.load_state_dict(ckpt)
# A directly constructed module starts in training mode; switch to eval for
# inference.
depth_encoder.eval()

# Preprocessing settings are taken from the public CLIP ViT-B/32 checkpoint.
depth_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [20]:
# Depth range used for clipping and normalisation (metres, per the "to meters"
# conversion below).
MIN_DEPTH = 0.0
MAX_DEPTH = 10.0
# Raw pixel values are divided by this factor to obtain metres.
DEPTH_SCALE = 1000

depth_path = "test.png"
# Open the image in a context manager so the file handle is released as soon
# as the pixels have been copied into a numpy array.
with Image.open(depth_path) as depth_image:
    depth = np.array(depth_image).astype("float32") / DEPTH_SCALE  # to meters
depth = np.clip(depth, MIN_DEPTH, MAX_DEPTH)  # clip to [MIN_DEPTH, MAX_DEPTH]
depth = (depth - MIN_DEPTH) / (MAX_DEPTH - MIN_DEPTH)  # normalize to [0, 1]
# Add a trailing channel axis and replicate it three times.
# NOTE(review): assumes the PNG decodes to a 2-D (H, W) array - confirm.
depth = np.expand_dims(depth, axis=2).repeat(3, axis=2)
# do_rescale=False because the values were already scaled to [0, 1] above;
# the processor still resizes, normalizes and converts to a tensor.
depth = depth_processor(depth, do_rescale=False, return_tensors="pt").pixel_values

# Inference only: skip autograd bookkeeping while running the encoder.
with torch.no_grad():
    encoder_outputs = depth_encoder(pixel_values=depth)
# Keep the hidden state of the first token of every sequence: the embedding
# without FC, which may be used for other downstream fine-tuning.
outputs = encoder_outputs["last_hidden_state"][:, 0, :]

In [23]:
# Check consistency: the standalone encoder must reproduce the TAC model's
# no-FC embedding exactly. torch.equal also requires identical shapes, whereas
# an element-wise `!=` would silently broadcast mismatched shapes.
assert torch.equal(depth_embedding_nofc, outputs), (
    "standalone depth encoder output differs from model.embed_depth(depth, fc=False)"
)

Push to the Hugging Face Hub

In [7]:
from huggingface_hub import notebook_login
# Authenticate with the Hugging Face Hub from inside the notebook; the call
# renders a login widget (see the VBox output). The push_to_hub calls in the
# following cell upload to the Hub, so run this first.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
# Upload the depth encoder weights and the matching image processor to the
# same Hub repository.
# safe_serialization=False uploads a pytorch_model.bin file (see the progress
# output of this cell); pass safe_serialization=True instead to upload in the
# safetensors format.
HUB_REPO_ID = "TAC-ViT-base"

model.depth_transformer.push_to_hub(HUB_REPO_ID, safe_serialization=False)
model.depth_processor.push_to_hub(HUB_REPO_ID)

pytorch_model.bin:   0%|          | 0.00/350M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/RavenK/TAC-ViT-base/commit/4d73c79e726d9263da51919747a55984cb8966f0', commit_message='Upload model', commit_description='', oid='4d73c79e726d9263da51919747a55984cb8966f0', pr_url=None, pr_revision=None, pr_num=None)

In [17]:
from transformers import CLIPImageProcessor, CLIPVisionModel, CLIPVisionConfig
import numpy as np
# Load the depth encoder and its image processor back from the Hub repository
# that the push cell uploaded to.
tac_depth_model = CLIPVisionModel.from_pretrained("RavenK/TAC-ViT-base")
tac_depth_processor = CLIPImageProcessor.from_pretrained("RavenK/TAC-ViT-base")

# Assuming test.png is a depth image with a scale factor 1000
MIN_DEPTH = 0.0
MAX_DEPTH = 10.0
DEPTH_SCALE = 1000

depth_path = "test.png"
# Open the image in a context manager so the file handle is released as soon
# as the pixels have been copied into a numpy array.
with Image.open(depth_path) as depth_image:
    depth = np.array(depth_image).astype("float32") / DEPTH_SCALE  # to meters
depth = np.clip(depth, MIN_DEPTH, MAX_DEPTH)  # clip to [MIN_DEPTH, MAX_DEPTH]
depth = (depth - MIN_DEPTH) / (MAX_DEPTH - MIN_DEPTH)  # normalize to [0, 1]
# Add a trailing channel axis and replicate it three times.
# NOTE(review): assumes the PNG decodes to a 2-D (H, W) array - confirm.
depth = np.expand_dims(depth, axis=2).repeat(3, axis=2)
# do_rescale=False because the values were already scaled to [0, 1] above;
# the processor still resizes, normalizes and converts to a tensor.
depth = tac_depth_processor(depth, do_rescale=False, return_tensors="pt").pixel_values

# Inference only: skip autograd bookkeeping while running the encoder.
with torch.no_grad():
    encoder_outputs = tac_depth_model(pixel_values=depth)
# Keep the hidden state of the first token of every sequence: the embedding
# without FC, which may be used for other downstream fine-tuning.
outputs = encoder_outputs["last_hidden_state"][:, 0, :]

In [22]:
# Check consistency: the Hub-loaded encoder must reproduce the TAC model's
# no-FC embedding exactly. torch.equal also requires identical shapes, whereas
# an element-wise `!=` would silently broadcast mismatched shapes.
assert torch.equal(depth_embedding_nofc, outputs), (
    "Hub-loaded depth encoder output differs from model.embed_depth(depth, fc=False)"
)