In [70]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [71]:
import torch
import numpy as np
import urllib.request
import sys, os
import glob
from PIL import Image

# (선택) decord로 비디오 읽기 – 안 쓰고 직접 frame 텐서를 만들고 싶으면 이 부분만 바꿔도 됨
from decord import VideoReader, cpu

In [72]:
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from model import Model

In [73]:
# -----------------------------------
# 1. device 설정 & 모델 / preprocessor 로드
# -----------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

# torch.hub에서 preprocessor / encoder 로드
processor = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_preprocessor')
loaded  = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_vit_giant')

if isinstance(loaded, tuple):
    vjepa2_encoder = loaded[0]    # encoder만 사용
else:
    vjepa2_encoder = loaded

vjepa2_encoder = vjepa2_encoder.to(device).eval()

Using cache found in C:\Users\JM/.cache\torch\hub\facebookresearch_vjepa2_main
Using cache found in C:\Users\JM/.cache\torch\hub\facebookresearch_vjepa2_main


In [74]:
# ---------------------------------------------------
# 2. frames 폴더에서 _0.jpg ~ _7.jpg 읽기
# ---------------------------------------------------
# frames_dir = "../A horse galloping on a street"
# frames_dir = "A basketball free falls in the air"
frames_dir = "../A tiger leaping over a small rock on a flat ground"

# 파일 목록 정렬 (_0.jpg, _1.jpg, ... 순서 보장)
frame_paths = glob.glob(os.path.join(frames_dir, "*.jpg"))

def frame_key(path):
    # 예: path = "frames/_3.jpg" -> stem = "_3" -> idx = 3
    name = os.path.splitext(os.path.basename(path))[0]  # "_3"
    idx = int(name.split("_")[-1])
    return idx

frame_paths = sorted(frame_paths, key=frame_key)

print("frames:", frame_paths)

# 각 프레임을 np.ndarray(H,W,3)로 읽어서 리스트로 만들기
imgs = [Image.open(p).convert("RGB") for p in frame_paths]
buffer = [np.array(im) for im in imgs]   # <-- ★ 여기! 리스트로 전달할 애

frames: ['../A tiger leaping over a small rock on a flat ground\\_0.jpg', '../A tiger leaping over a small rock on a flat ground\\_1.jpg', '../A tiger leaping over a small rock on a flat ground\\_2.jpg', '../A tiger leaping over a small rock on a flat ground\\_3.jpg', '../A tiger leaping over a small rock on a flat ground\\_4.jpg', '../A tiger leaping over a small rock on a flat ground\\_5.jpg', '../A tiger leaping over a small rock on a flat ground\\_6.jpg', '../A tiger leaping over a small rock on a flat ground\\_7.jpg']


In [75]:
# ---------------------------------------
# 3) processor로 clip 생성
# ---------------------------------------
inputs = processor(buffer)   # 보통 [clip_tensor] 형태

if isinstance(inputs, list):
    clip = inputs[0]
else:
    clip = inputs

print("clip type:", type(clip))
print("clip shape:", clip.shape)  # 예: (3, T, 256, 256) 또는 (T, 3, 256, 256)

# clip을 (C, T, H, W)로 통일
if clip.ndim == 4 and clip.shape[0] != 3:
    # (T, C, H, W) → (C, T, H, W)
    if clip.shape[1] == 3:
        clip = clip.permute(1, 0, 2, 3)
    else:
        raise ValueError(f"Unexpected clip shape: {clip.shape}")

print("normalized clip shape:", clip.shape)  # (3, T, H, W)

C, T, H, W = clip.shape

clip type: <class 'torch.Tensor'>
clip shape: torch.Size([3, 8, 256, 256])
normalized clip shape: torch.Size([3, 8, 256, 256])


In [76]:
# ---------------------------------------
# 4) 누적 world state w_t (t ≥ 2) 계산
# ---------------------------------------
# V-JEPA2 patch_embed3D의 temporal kernel size = 2 이므로
# 최소 T=2 이상부터 입력 가능
min_T = 2
if T < min_T:
    raise ValueError(f"Need at least {min_T} frames, but got T={T}")

w_list = []

for t in range(min_T, T + 1):
    # 앞 t개 프레임 사용: (3, t, H, W)
    clip_t = clip[:, :t, :, :]
    video_t = clip_t.unsqueeze(0).to(device)  # (1, 3, t, H, W)

    with torch.no_grad():
        out_t = vjepa2_encoder(video_t)       # (B, N, D) or dict

    # encoder output에서 토큰 텐서 선택
    if isinstance(out_t, dict):
        if "x" in out_t:
            tokens = out_t["x"]
        else:
            tokens = None
            for k, v in out_t.items():
                if torch.is_tensor(v):
                    tokens = v
                    # print("using key:", k)
                    break
            if tokens is None:
                raise RuntimeError("No tensor output found in encoder dict.")
    else:
        tokens = out_t

    # tokens: (1, N, D) → 평균해서 world state w_t (1, D)
    w_t = tokens.mean(dim=1)
    w_list.append(w_t)

# (T - min_T + 1, D) 형태로 stack
W = torch.cat(w_list, dim=0)  # shape: ((T-1), D)  ← 여기서 첫 행이 w_2에 해당
print("Final W shape:", W.shape)

Final W shape: torch.Size([7, 1408])


In [77]:
# W: shape (T-1, D)  # 여기서는 (7, 1408)
# 인덱스 매핑: W[0] = w2, W[1] = w3, ..., W[6] = w8

# 1) 1차 차분: world-state velocity-like 벡터
vel = W[1:, :] - W[:-1, :]          # shape: (6, 1408)

# 2) 2차 차분: world-state acceleration-like 벡터
acc = vel[1:, :] - vel[:-1, :]      # shape: (5, 1408)

# 3) 간단한 스칼라 지표들
vel_norm_per_step = vel.norm(dim=1)     # (6,) – 각 step마다 state 변화량
acc_norm_per_step = acc.norm(dim=1)     # (5,) – 각 step마다 가속도 변화량

vel_loss  = (vel**2).mean()             # 전체 velocity 에너지
phys_loss = (acc**2).mean()             # 전체 가속도 변화량 (부드러움/물리성)

print("vel_norm_per_step:", vel_norm_per_step)
print("acc_norm_per_step:", acc_norm_per_step)
print("vel_loss:", vel_loss.item())
print("phys_loss:", phys_loss.item())

vel_norm_per_step: tensor([ 0.0000, 10.9501,  0.0000,  9.8504,  0.0000,  9.5318], device='cuda:0')
acc_norm_per_step: tensor([10.9501, 10.9501,  9.8504,  9.8504,  9.5318], device='cuda:0')
vel_loss: 0.03643355518579483
phys_loss: 0.07453503459692001
