In [1]:
# VIT6D.ipynb refactored for Minty (Linux Mint)
import torch
import torch.nn as nn
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import os
from tqdm import tqdm
import timm
import time
import json
import numpy as np

In [2]:
with open("GlobVar.json", "r") as file:
    gv = json.load(file)
mod_id = gv['mod_id']

In [3]:

# Constants and environment setup
BATCH_ID = 5
BATCH_SIZE = 32
NUM_EPOCHS = 20
LEARNING_RATE = 1e-4
IMG_SIZE = 224  # Required input size for ViT

# Paths assume Linux-style forward slashes
BASE_DIR = os.path.expanduser("~/SKRIPSI/SCRIPTS")  # Refactor path to be absolute
DATASET_DIR = os.path.join(BASE_DIR, f"dataset/batch{BATCH_ID}")
MODEL_SAVE_PATH = os.path.join(BASE_DIR, f"model/ViT6DP_batch{BATCH_ID}.{mod_id}.pth")

# Use CUDA if available
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()

In [4]:
class PoseDataset(Dataset):
    def __init__(self, image_dir, label_csv, transform=None):
        self.image_dir = image_dir
        self.labels = pd.read_csv(label_csv)
        self.transform = transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        row = self.labels.iloc[idx]
        img_path = os.path.join(self.image_dir, row['image_name'])
        image = Image.open(img_path).convert('RGB')
        label = torch.tensor(row[1:].values.astype('float32'))  # x, y, z, pitch, roll, yaw
        if self.transform:
            image = self.transform(image)
        return image, label

In [5]:
transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize([0.5] * 3, [0.5] * 3)
])

In [6]:
def get_dataloader(split):
    image_dir = os.path.join(DATASET_DIR, split, 'images')
    label_csv = os.path.join(DATASET_DIR, split, 'labels.csv')
    dataset = PoseDataset(image_dir, label_csv, transform)
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=(split == 'train'))

train_loader = get_dataloader('train')
val_loader = get_dataloader('val')

In [7]:
class ViT6DP(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = timm.create_model('vit_base_patch16_224', pretrained=True)
        self.backbone.head = nn.Sequential(
            nn.Linear(self.backbone.head.in_features, 512),
            nn.ReLU(),
            nn.Linear(512, 6)
        )

    def forward(self, x):
        return self.backbone(x)

In [8]:
def normalize_pose(pose):
    translation = pose[:, :3]                     # [x, y, z]
    rotation = pose[:, 3:]
    return torch.cat([translation, rotation], dim=1)

In [9]:
def compute_rmse(pred, target):
    pred_trans, pred_rot = pred[:, :3], pred[:, 3:]
    target_trans, target_rot = target[:, :3], target[:, 3:]

    trans_rmse = torch.sqrt(nn.MSELoss()(pred_trans, target_trans))
    rot_rmse = torch.sqrt(nn.MSELoss()(pred_rot, target_rot))

    return trans_rmse.item(), rot_rmse.item()

In [10]:
def combined_loss(pred, target, alpha=1.0, beta=1.0):
    pred_trans, pred_rot = pred[:, :3], pred[:, 3:]
    target_trans, target_rot = target[:, :3], target[:, 3:]

    # Optionally normalize rotation
    pred_rot = pred_rot
    target_rot = target_rot

    trans_loss = nn.MSELoss()(pred_trans, target_trans)
    rot_loss = nn.MSELoss()(pred_rot, target_rot)
    
    return alpha * trans_loss + beta * rot_loss


In [11]:
model = ViT6DP().to(DEVICE)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [12]:
def train(validate=True):
    now = []
    now.append(time.time())
    for epoch in range(NUM_EPOCHS):
        print("\n")
        model.train()
        running_loss = 0.0
        for images, labels in tqdm(train_loader):
            
            images, labels = images.to(DEVICE), normalize_pose(labels.to(DEVICE))
            outputs = normalize_pose(model(images))

            loss = combined_loss(outputs, labels, alpha=1.0, beta=1.0)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        
        now.append(time.time())
        avg_train_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{NUM_EPOCHS}, Train Loss: {avg_train_loss:.4f}")
        print(f"Time took per epoch : {epoch+1}: {int(now[epoch+1]-now[epoch])}s")
        if validate:
            # Validation
            model.eval()
            val_loss = 0.0
            total_trans_rmse, total_rot_rmse = 0.0, 0.0

            with torch.no_grad():
                for images, labels in val_loader:
                    images, labels = images.to(DEVICE), normalize_pose(labels.to(DEVICE))
                    outputs = normalize_pose(model(images))
                    
                    loss = combined_loss(outputs, labels)
                    val_loss += loss.item()

                    trans_rmse, rot_rmse = compute_rmse(outputs, labels)
                    total_trans_rmse += trans_rmse
                    total_rot_rmse += rot_rmse

            avg_val_loss = val_loss / len(val_loader)
            print(f"Val Loss: {avg_val_loss:.4f}")
            print(f"RMSE - Translation: {total_trans_rmse / len(val_loader):.4f}, "
                  f"Rotation: {total_rot_rmse / len(val_loader):.4f}")
        else:
            print("Skipping validation for this epoch.")

In [13]:
train(validate=True)





  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 1/20, Train Loss: 0.1234
Time took per epoch : 1: 32s
Val Loss: 0.1019
RMSE - Translation: 0.0671, Rotation: 0.3113




100%|██████████| 50/50 [00:32<00:00,  1.55it/s]


Epoch 2/20, Train Loss: 0.0955
Time took per epoch : 2: 35s
Val Loss: 0.0947
RMSE - Translation: 0.0561, Rotation: 0.3021




100%|██████████| 50/50 [00:32<00:00,  1.55it/s]


Epoch 3/20, Train Loss: 0.0953
Time took per epoch : 3: 35s
Val Loss: 0.0973
RMSE - Translation: 0.0533, Rotation: 0.3067




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 4/20, Train Loss: 0.0889
Time took per epoch : 4: 35s
Val Loss: 0.0911
RMSE - Translation: 0.0573, Rotation: 0.2962




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 5/20, Train Loss: 0.0820
Time took per epoch : 5: 35s
Val Loss: 0.0831
RMSE - Translation: 0.0531, Rotation: 0.2829




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 6/20, Train Loss: 0.0622
Time took per epoch : 6: 35s
Val Loss: 0.0882
RMSE - Translation: 0.0474, Rotation: 0.2920




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 7/20, Train Loss: 0.0484
Time took per epoch : 7: 35s
Val Loss: 0.0949
RMSE - Translation: 0.0447, Rotation: 0.3036




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 8/20, Train Loss: 0.0332
Time took per epoch : 8: 35s
Val Loss: 0.0873
RMSE - Translation: 0.0446, Rotation: 0.2893




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 9/20, Train Loss: 0.0222
Time took per epoch : 9: 35s
Val Loss: 0.1096
RMSE - Translation: 0.0458, Rotation: 0.3272




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 10/20, Train Loss: 0.0183
Time took per epoch : 10: 35s
Val Loss: 0.0864
RMSE - Translation: 0.0395, Rotation: 0.2901




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 11/20, Train Loss: 0.0101
Time took per epoch : 11: 35s
Val Loss: 0.0888
RMSE - Translation: 0.0405, Rotation: 0.2927




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 12/20, Train Loss: 0.0070
Time took per epoch : 12: 35s
Val Loss: 0.0836
RMSE - Translation: 0.0366, Rotation: 0.2855




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 13/20, Train Loss: 0.0054
Time took per epoch : 13: 35s
Val Loss: 0.1006
RMSE - Translation: 0.0361, Rotation: 0.3143




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 14/20, Train Loss: 0.0075
Time took per epoch : 14: 35s
Val Loss: 0.0945
RMSE - Translation: 0.0374, Rotation: 0.3038




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 15/20, Train Loss: 0.0081
Time took per epoch : 15: 35s
Val Loss: 0.0939
RMSE - Translation: 0.0395, Rotation: 0.3015




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 16/20, Train Loss: 0.0092
Time took per epoch : 16: 35s
Val Loss: 0.0977
RMSE - Translation: 0.0381, Rotation: 0.3097




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 17/20, Train Loss: 0.0116
Time took per epoch : 17: 35s
Val Loss: 0.0899
RMSE - Translation: 0.0396, Rotation: 0.2961




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 18/20, Train Loss: 0.0085
Time took per epoch : 18: 35s
Val Loss: 0.0926
RMSE - Translation: 0.0399, Rotation: 0.3004




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 19/20, Train Loss: 0.0087
Time took per epoch : 19: 35s
Val Loss: 0.0838
RMSE - Translation: 0.0374, Rotation: 0.2863




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 20/20, Train Loss: 0.0057
Time took per epoch : 20: 35s
Val Loss: 0.0880
RMSE - Translation: 0.0365, Rotation: 0.2935


In [14]:
torch.save(model.state_dict(), MODEL_SAVE_PATH)
gv['mod_id'] = mod_id + 1
# Save the updated JSON back to the file
with open("GlobVar.json", "w") as file:
    json.dump(gv, file, indent=4)

In [15]:
test_loader = get_dataloader('test')

In [16]:
def test_model(model_path):
    model = ViT6DP().to(DEVICE)
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.eval()

    test_total_loss = 0.0
    test_total_trans_rmse, test_total_rot_rmse = 0.0, 0.0
    preds = []
    gts = []

    with torch.no_grad():
        for images, labels in tqdm(test_loader):
            images, labels = images.to(DEVICE), normalize_pose(labels.to(DEVICE))
            outputs = normalize_pose(model(images))

            loss = combined_loss(outputs, labels)
            test_total_loss += loss.item()

            trans_rmse, rot_rmse = compute_rmse(outputs, labels)
            test_total_trans_rmse += trans_rmse
            test_total_rot_rmse += rot_rmse

            preds.extend(outputs.cpu().numpy())
            gts.extend(labels.cpu().numpy())

    test_avg_loss = test_total_loss / len(test_loader)
    print(f"Test Loss: {test_avg_loss:.4f}")
    print(f"Test RMSE - Translation: {test_total_trans_rmse / len(test_loader):.4f}, "
          f"Rotation: {test_total_rot_rmse / len(test_loader):.4f}")

    return preds, gts, test_avg_loss, test_total_trans_rmse, test_total_rot_rmse

In [17]:
predictions, ground_truths, test_avg_loss, test_total_trans_rmse, test_total_rot_rmse = test_model(MODEL_SAVE_PATH)

100%|██████████| 15/15 [00:05<00:00,  2.66it/s]

Test Loss: 0.0882
Test RMSE - Translation: 0.0374, Rotation: 0.2934





In [18]:
def validate_model(model_path):
    model = ViT6DP().to(DEVICE)
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.eval()

    val_total_loss = 0.0
    val_total_trans_rmse, val_total_rot_rmse = 0.0, 0.0
    preds = []
    gts = []

    with torch.no_grad():
        for images, labels in tqdm(val_loader):
            images, labels = images.to(DEVICE), normalize_pose(labels.to(DEVICE))
            outputs = normalize_pose(model(images))

            loss = combined_loss(outputs, labels)
            val_total_loss += loss.item()

            trans_rmse, rot_rmse = compute_rmse(outputs, labels)
            val_total_trans_rmse += trans_rmse
            val_total_rot_rmse += rot_rmse

            preds.extend(outputs.cpu().numpy())
            gts.extend(labels.cpu().numpy())

    val_avg_loss = val_total_loss / len(val_loader)
    print(f"Validation Loss: {val_avg_loss:.4f}")
    print(f"Validation RMSE - Translation: {val_total_trans_rmse / len(val_loader):.4f}, "
          f"Rotation: {val_total_rot_rmse / len(val_loader):.4f}")

    return preds, gts, val_avg_loss, val_total_trans_rmse, val_total_rot_rmse

In [19]:
val_predictions, val_ground_truths, val_avg_loss, val_total_trans_rmse, val_total_rot_rmse = validate_model(MODEL_SAVE_PATH)

100%|██████████| 8/8 [00:02<00:00,  2.86it/s]

Validation Loss: 0.0880
Validation RMSE - Translation: 0.0365, Rotation: 0.2935





In [20]:
torch.cuda.empty_cache()

In [21]:
# Calculate accuracy metrics
def calculate_accuracy_metrics(predictions, ground_truths):
    # Convert to numpy arrays if they're not already
    preds = np.array(predictions)
    gts = np.array(ground_truths)
    
    # Translation metrics (convert from normalized to cm)
    # Assuming original data was in meters and normalized to 0±1 range (~1m range)
    trans_pred = preds[:, :3] * 100  # Convert to cm
    trans_gt = gts[:, :3] * 100      # Convert to cm
    trans_error = np.abs(trans_pred - trans_gt)
    trans_accuracy_cm = np.mean(trans_error)
    
    # Rotation metrics (convert from normalized to degrees)
    # Assuming original data was in degrees and normalized by 360
    rot_pred = preds[:, 3:] * 360    # Convert to degrees
    rot_gt = gts[:, 3:] * 360        # Convert to degrees
    rot_error = np.abs(rot_pred - rot_gt)
    # Handle angle wrapping (e.g., 359° and 1° should have 2° difference, not 358°)
    rot_error = np.minimum(rot_error, 360 - rot_error)
    rot_accuracy_deg = np.mean(rot_error)
    
    return trans_accuracy_cm, rot_accuracy_deg

In [22]:
# Calculate accuracy metrics for validation and test sets
val_trans_acc, val_rot_acc = calculate_accuracy_metrics(val_predictions, val_ground_truths)
test_trans_acc, test_rot_acc = calculate_accuracy_metrics(predictions, ground_truths)


In [None]:
# Generate eval.md with evaluation metrics
eval_content = f"""# Evaluation Results - Batch {BATCH_ID}

## Training Configuration
- Batch Size: {BATCH_SIZE}
- Epochs: {NUM_EPOCHS}
- Learning Rate: {LEARNING_RATE}
- Image Size: {IMG_SIZE}
- Device: {DEVICE}

## Model Architecture
- Backbone: ViT Base Patch16 224
- Head: Linear(768->512->6)

## Evaluation Metrics

### Validation Set
- Average Loss: {val_avg_loss:.4f}
- Translation RMSE: {val_total_trans_rmse / len(val_loader):.4f} (normalized)
- Rotation RMSE: {val_total_rot_rmse / len(val_loader):.4f} (normalized)
- Translation Accuracy: {val_trans_acc:.2f} cm
- Rotation Accuracy: {val_rot_acc:.2f}°

### Test Set
- Average Loss: {test_avg_loss:.4f}
- Translation RMSE: {test_total_trans_rmse / len(test_loader):.4f} (normalized)
- Rotation RMSE: {test_total_rot_rmse / len(test_loader):.4f} (normalized)
- Translation Accuracy: {test_trans_acc:.2f} cm
- Rotation Accuracy: {test_rot_acc:.2f}°

## File Locations
- Dataset Directory: {DATASET_DIR}
- Model Save Path: {MODEL_SAVE_PATH}
"""

eval_path = os.path.join(os.path.dirname(MODEL_SAVE_PATH), f"Eval_{BATCH_ID}.{mod_id}.md")
with open(eval_path, 'w') as f:
    f.write(eval_content)
    
print(f"Evaluation report saved to: {eval_path}")

Evaluation report saved to: /home/moreno/SKRIPSI/SCRIPTS/model/Eval_5.md
