In [1]:
import torch
from process.data import MPIIDataset
from model.modules import FCBlock,MixerLayer
from torchvision.transforms import transforms
from torch.utils.data import DataLoader

In [2]:
# =============================
# Configuration
# =============================
# Dataset locations, given relative to the notebook's working directory.
CSV_PATH = '../mpii_human_pose_v1_u12_2/mpii_human_pose.csv'
IMAGE_DIR = '../images/'

# Number of joints per pose and the image side length in pixels
# (presumably the resize target -- confirm against the preprocessing cell).
NUM_JOINTS = 16
IMG_SIZE = 224

In [3]:
# Image preprocessing pipeline handed to MPIIDataset.
# ToPILImage runs first, so the dataset is presumably passing in an ndarray or
# tensor image -- TODO confirm against MPIIDataset.
# NOTE(review): Resize changes the pixel grid; confirm that MPIIDataset
# rescales or normalizes the joint coordinates to match.
transform = transforms.Compose([
    transforms.ToPILImage(),
    # Read the side length from the IMG_SIZE config constant rather than
    # repeating a hard-coded 224 that could silently drift from it.
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    # Convert the PIL image back into a tensor.
    transforms.ToTensor(),
])

In [4]:
# Build the MPII dataset from the annotation CSV and the image directory,
# passing `transform` along (presumably applied per image -- confirm in
# process.data).
mpii = MPIIDataset(CSV_PATH,IMAGE_DIR,transform)

In [5]:
# Peek at the first sample and display the shape of its element at index 1.
first_sample = mpii[0]
first_sample[1].shape

torch.Size([16, 2])

In [6]:
# Shuffled mini-batch loader over the MPII dataset, 16 samples per batch.
data = DataLoader(
    dataset=mpii,
    shuffle=True,
    batch_size=16,
)

In [7]:
from model.encoder import CompositionalEncoder,VectorQuantizer,Decoder
import torch

# Smoke test: push one random (1, 16, 2) tensor through
# encoder -> vector quantizer -> decoder and print the shape after each stage.
random_tensor = torch.randn(1, 16, 2)

with torch.no_grad():
    encoder = CompositionalEncoder(k=16, d=2, h=256, m=16)
    vq = VectorQuantizer(v=512, h=256, commitment_cost=0.25)
    decoder = Decoder(k=16, d=2, h=256, m=16)

    encoded = encoder(random_tensor)
    print(encoded.shape)

    # The quantizer returns an indexable result; element 0 is what gets decoded.
    quantized = vq(encoded)[0]
    print(quantized.shape)

    output = decoder(quantized)
    print(output.shape)

torch.Size([1, 16, 256])
torch.Size([1, 16, 256])
torch.Size([1, 16, 2])


In [None]:
# Note (VQ codebook size): a larger number of codebook entries V lowers the
# quantization error, but it also makes the classification task harder because
# there are more categories to choose from.
from model.backbone import SwinTransformerV2
import torch

IMAGE_SIZE = 256

# Swin-B backbone configured for feature extraction.
model = SwinTransformerV2(
    # --- input geometry ---
    in_chans=3,
    img_size=IMAGE_SIZE,
    patch_size=4,
    window_size=8,                # 8 divides the 256-pixel input side
    # --- Swin-B width, depth and attention heads ---
    embed_dim=128,
    depths=[2, 2, 18, 2],
    num_heads=[4, 8, 16, 32],
    # --- head: num_classes=0 means feature extraction only ---
    num_classes=0,
    # --- remaining switches ---
    ape=False,
    patch_norm=True,
    use_checkpoint=False,
)
# Random stand-in for a batch of one 3-channel IMAGE_SIZE x IMAGE_SIZE image
# (no real image is loaded here).
dummy_input = torch.randn(1, 3, IMAGE_SIZE, IMAGE_SIZE)

# This is only a shape check, so run the forward pass without autograd
# tracking; otherwise the intermediate activations are retained for a
# backward pass that never happens.
# NOTE(review): the model has not been switched to eval mode here -- call
# model.eval() first if deterministic features are needed.
with torch.no_grad():
    # forward_features is called instead of the model itself (presumably to
    # skip the classification head -- confirm in model.backbone).
    features = model.forward_features(dummy_input)
print(features.shape)  # Recorded run printed [1, 1024], i.e. [batch, feature_dim]

torch.Size([1, 1024])
