In [1]:
# VIT6D.ipynb refactored for Minty (Linux Mint)
import torch
import torch.nn as nn
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import os
from tqdm import tqdm
import timm
import time
import datetime
import json
import numpy as np
import csv

PREREQUISITES AND DECLARATIONS AND YADA YADA YADA 

In [2]:
with open("GlobVar.json", "r") as file:
    gv = json.load(file)
mod_id = gv['mod_id']
# Constants and environment setup
BATCH_ID = 1
BATCH_SIZE = 32
NUM_EPOCHS = 20
LEARNING_RATE = 1e-4
IMG_SIZE = 224  # Required input size for ViT

# Paths assume Linux-style forward slashes
BASE_DIR = os.path.expanduser("~/SKRIPSI/SCRIPTS")  # Refactor path to be absolute
DATASET_DIR = os.path.join(BASE_DIR, f"dataset/batch{BATCH_ID}")
MODEL_SAVE_PATH = os.path.join(BASE_DIR, f"model/ViT6DP_batch{BATCH_ID}.{mod_id}.pth")

# Use CUDA if available
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()

In [3]:
class PoseDataset(Dataset):
    def __init__(self, image_dir, label_csv, transform=None):
        self.image_dir = image_dir
        self.labels = pd.read_csv(label_csv)
        self.transform = transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        row = self.labels.iloc[idx]
        img_path = os.path.join(self.image_dir, row['image_name'])
        image = Image.open(img_path).convert('RGB')
        label = torch.tensor(row[1:].values.astype('float32'))  # x, y, z, pitch, roll, yaw
        if self.transform:
            image = self.transform(image)
        return image, label

In [4]:
transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize([0.5] * 3, [0.5] * 3)
])

In [5]:
def get_dataloader(split):
    image_dir = os.path.join(DATASET_DIR, split, 'images')
    label_csv = os.path.join(DATASET_DIR, split, 'labels.csv')
    dataset = PoseDataset(image_dir, label_csv, transform)
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=(split == 'train'))

train_loader = get_dataloader('train')
val_loader = get_dataloader('val')
test_loader = get_dataloader('test')

In [6]:
def get_dataset_stats(loader):
    all_labels = []
    for _, labels in loader:
        all_labels.append(labels)
    all_labels = torch.cat(all_labels, dim=0)
    
    # Translation stats (x,y,z)
    trans_stats = {
        'min': all_labels[:, :3].min(dim=0)[0],
        'max': all_labels[:, :3].max(dim=0)[0],
        'mean': all_labels[:, :3].mean(dim=0),
        'std': all_labels[:, :3].std(dim=0)
    }
    
    # Rotation stats (pitch, roll, yaw)
    rot_stats = {
        'min': all_labels[:, 3:].min(dim=0)[0],
        'max': all_labels[:, 3:].max(dim=0)[0],
        'mean': all_labels[:, 3:].mean(dim=0),
        'std': all_labels[:, 3:].std(dim=0)
    }
    
    return trans_stats, rot_stats

# Get stats for each dataset split
train_trans_stats, train_rot_stats = get_dataset_stats(train_loader)
val_trans_stats, val_rot_stats = get_dataset_stats(val_loader)
test_trans_stats, test_rot_stats = get_dataset_stats(test_loader)
print(f"Train Translation stat:{train_trans_stats}      |       Train Rotation stat: {train_rot_stats}")
print(f"Validation Translation stat:{val_trans_stats}     |       Validation Rotation stat: {val_rot_stats}")
print(f"Test Translation stat:{test_trans_stats}      |       Test Rotation stat: {test_rot_stats}")


Train Translation stat:{'min': tensor([-0.2061, -0.1280,  0.0981]), 'max': tensor([0.2004, 0.1593, 0.6350]), 'mean': tensor([-0.0092,  0.0355,  0.3953]), 'std': tensor([0.0525, 0.0431, 0.0779])}      |       Train Rotation stat: {'min': tensor([-1.0000, -1.0000, -0.9999, -0.9999, -0.9490, -0.9959]), 'max': tensor([0.9999, 1.0000, 0.9997, 0.9986, 0.9766, 0.9765]), 'mean': tensor([ 0.1674, -0.0010, -0.0022, -0.1675, -0.0266,  0.0288]), 'std': tensor([0.7254, 0.5623, 0.5539, 0.6985, 0.3727, 0.4095])}
Validation Translation stat:{'min': tensor([-0.1940, -0.1262,  0.1145]), 'max': tensor([0.1277, 0.1506, 0.6368]), 'mean': tensor([-0.0093,  0.0361,  0.3949]), 'std': tensor([0.0512, 0.0447, 0.0856])}     |       Validation Rotation stat: {'min': tensor([-0.9989, -0.9985, -0.9958, -0.9977, -0.8561, -0.9641]), 'max': tensor([0.9998, 0.9985, 0.9947, 0.9969, 0.9158, 0.9269]), 'mean': tensor([ 0.0725, -0.0196,  0.0007, -0.0803,  0.0033, -0.0223]), 'std': tensor([0.7408, 0.5672, 0.5597, 0.7074, 0.3

In [7]:
class ViT6DP(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = timm.create_model('vit_base_patch16_224', pretrained=True)
        self.backbone.head = nn.Sequential(
            nn.Linear(self.backbone.head.in_features, 512),
            nn.ReLU(),
            nn.Linear(512, 9)
        )

    def forward(self, x):
        return self.backbone(x)

In [8]:
def compute_rmse(pred, target):
    pred_trans, pred_rot = pred[:, :3], pred[:, 3:]
    target_trans, target_rot = target[:, :3], target[:, 3:]

    trans_rmse = torch.sqrt(nn.MSELoss()(pred_trans, target_trans))
    rot_rmse = torch.sqrt(nn.MSELoss()(pred_rot, target_rot))

    return trans_rmse.item(), rot_rmse.item()


In [9]:
def combined_loss(pred, target, alpha=1.0, beta=1.0):
    pred_trans, pred_rot = pred[:, :3], pred[:, 3:]
    target_trans, target_rot = target[:, :3], target[:, 3:]

    trans_loss = nn.MSELoss()(pred_trans, target_trans)
    rot_loss = nn.MSELoss()(pred_rot, target_rot)

    return alpha * trans_loss + beta * rot_loss

In [10]:
model = ViT6DP().to(DEVICE)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

TRAINING

In [11]:
def train(validate=True):
    now = [time.time()]
    for epoch in range(NUM_EPOCHS):
        print("\n")
        model.train()
        running_loss = 0.0
        for images, labels in tqdm(train_loader):
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)  # shape: [B, 9] -> [tx, ty, tz, r1...r6]

            outputs = model(images)  # no normalize_pose anymore

            loss = combined_loss(outputs, labels, alpha=1.0, beta=1.0)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        now.append(time.time())
        avg_train_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{NUM_EPOCHS}, Train Loss: {avg_train_loss:.4f}")
        print(f"Time per epoch {epoch + 1}: {int(now[epoch + 1] - now[epoch])}s")

        if validate:
            model.eval()
            val_loss = 0.0
            total_trans_rmse, total_rot_rmse = 0.0, 0.0
            with torch.no_grad():
                for images, labels in val_loader:
                    images = images.to(DEVICE)
                    labels = labels.to(DEVICE)
                    outputs = model(images)

                    loss = combined_loss(outputs, labels)
                    val_loss += loss.item()

                    trans_rmse, rot_rmse = compute_rmse(outputs, labels)
                    total_trans_rmse += trans_rmse
                    total_rot_rmse += rot_rmse

            avg_val_loss = val_loss / len(val_loader)
            print(f"Val Loss: {avg_val_loss:.4f}")
            print(f"RMSE - Translation: {total_trans_rmse / len(val_loader):.4f}, "
                  f"Rotation: {total_rot_rmse / len(val_loader):.4f}")
        else:
            print("Skipping validation for this epoch.")


In [12]:
train(validate=True)





100%|██████████| 50/50 [00:32<00:00,  1.55it/s]


Epoch 1/20, Train Loss: 0.3623
Time per epoch 1: 32s
Val Loss: 0.3352
RMSE - Translation: 0.0669, Rotation: 0.5745




100%|██████████| 50/50 [00:32<00:00,  1.55it/s]


Epoch 2/20, Train Loss: 0.3202
Time per epoch 2: 35s
Val Loss: 0.3539
RMSE - Translation: 0.0645, Rotation: 0.5909




100%|██████████| 50/50 [00:32<00:00,  1.55it/s]


Epoch 3/20, Train Loss: 0.3238
Time per epoch 3: 35s
Val Loss: 0.3192
RMSE - Translation: 0.0650, Rotation: 0.5597




100%|██████████| 50/50 [00:32<00:00,  1.55it/s]


Epoch 4/20, Train Loss: 0.2914
Time per epoch 4: 35s
Val Loss: 0.2718
RMSE - Translation: 0.0632, Rotation: 0.5169




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 5/20, Train Loss: 0.2413
Time per epoch 5: 35s
Val Loss: 0.2496
RMSE - Translation: 0.0743, Rotation: 0.4938




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 6/20, Train Loss: 0.1875
Time per epoch 6: 35s
Val Loss: 0.2627
RMSE - Translation: 0.0600, Rotation: 0.5054




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 7/20, Train Loss: 0.1269
Time per epoch 7: 35s
Val Loss: 0.2155
RMSE - Translation: 0.0558, Rotation: 0.4576




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 8/20, Train Loss: 0.0907
Time per epoch 8: 35s
Val Loss: 0.1912
RMSE - Translation: 0.0564, Rotation: 0.4317




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 9/20, Train Loss: 0.0575
Time per epoch 9: 35s
Val Loss: 0.2026
RMSE - Translation: 0.0506, Rotation: 0.4429




100%|██████████| 50/50 [00:32<00:00,  1.53it/s]


Epoch 10/20, Train Loss: 0.0369
Time per epoch 10: 35s
Val Loss: 0.2086
RMSE - Translation: 0.0480, Rotation: 0.4505




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 11/20, Train Loss: 0.0257
Time per epoch 11: 35s
Val Loss: 0.2025
RMSE - Translation: 0.0522, Rotation: 0.4410




100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Epoch 12/20, Train Loss: 0.0153
Time per epoch 12: 35s
Val Loss: 0.1918
RMSE - Translation: 0.0475, Rotation: 0.4311




100%|██████████| 50/50 [00:32<00:00,  1.53it/s]


Epoch 13/20, Train Loss: 0.0088
Time per epoch 13: 35s
Val Loss: 0.1992
RMSE - Translation: 0.0467, Rotation: 0.4383




100%|██████████| 50/50 [00:32<00:00,  1.53it/s]


Epoch 14/20, Train Loss: 0.0054
Time per epoch 14: 35s
Val Loss: 0.2088
RMSE - Translation: 0.0490, Rotation: 0.4465




100%|██████████| 50/50 [00:32<00:00,  1.53it/s]


Epoch 15/20, Train Loss: 0.0038
Time per epoch 15: 35s
Val Loss: 0.1970
RMSE - Translation: 0.0459, Rotation: 0.4366




100%|██████████| 50/50 [00:32<00:00,  1.53it/s]


Epoch 16/20, Train Loss: 0.0027
Time per epoch 16: 35s
Val Loss: 0.1924
RMSE - Translation: 0.0453, Rotation: 0.4306




100%|██████████| 50/50 [00:32<00:00,  1.53it/s]


Epoch 17/20, Train Loss: 0.0020
Time per epoch 17: 35s
Val Loss: 0.1958
RMSE - Translation: 0.0464, Rotation: 0.4349




100%|██████████| 50/50 [00:32<00:00,  1.53it/s]


Epoch 18/20, Train Loss: 0.0024
Time per epoch 18: 35s
Val Loss: 0.1894
RMSE - Translation: 0.0464, Rotation: 0.4275




100%|██████████| 50/50 [00:32<00:00,  1.53it/s]


Epoch 19/20, Train Loss: 0.0017
Time per epoch 19: 35s
Val Loss: 0.1927
RMSE - Translation: 0.0456, Rotation: 0.4314




100%|██████████| 50/50 [00:32<00:00,  1.53it/s]


Epoch 20/20, Train Loss: 0.0013
Time per epoch 20: 35s
Val Loss: 0.1960
RMSE - Translation: 0.0453, Rotation: 0.4351


SAVE + INCREMENT

In [13]:
torch.save(model.state_dict(), MODEL_SAVE_PATH)
# Increment
mod_id += 1
gv['mod_id'] = mod_id
# Save the updated JSON back to the file
with open("GlobVar.json", "w") as file:
    json.dump(gv, file, indent=4)

TEST THE MODEL

In [14]:
def test_model(model_path):
    model = ViT6DP().to(DEVICE)
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.eval()

    test_total_loss = 0.0
    test_total_trans_rmse, test_total_rot_rmse = 0.0, 0.0
    preds, gts = [], []

    with torch.no_grad():
        for images, labels in tqdm(test_loader):
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)
            outputs = model(images)

            loss = combined_loss(outputs, labels)
            test_total_loss += loss.item()

            trans_rmse, rot_rmse = compute_rmse(outputs, labels)
            test_total_trans_rmse += trans_rmse
            test_total_rot_rmse += rot_rmse

            preds.extend(outputs.cpu().numpy())
            gts.extend(labels.cpu().numpy())

    test_avg_loss = test_total_loss / len(test_loader)
    print(f"Test Loss: {test_avg_loss:.4f}")
    print(f"Test RMSE - Translation: {test_total_trans_rmse / len(test_loader):.4f}, "
          f"Rotation: {test_total_rot_rmse / len(test_loader):.4f}")

    return preds, gts, test_avg_loss, test_total_trans_rmse, test_total_rot_rmse


In [15]:
predictions, ground_truths, test_avg_loss, test_total_trans_rmse, test_total_rot_rmse = test_model(MODEL_SAVE_PATH)

100%|██████████| 15/15 [00:05<00:00,  2.66it/s]

Test Loss: 0.1793
Test RMSE - Translation: 0.0452, Rotation: 0.4147





VALIDATE THE MODEL

In [16]:
def validate_model(model_path):
    model = ViT6DP().to(DEVICE)
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.eval()

    val_total_loss = 0.0
    val_total_trans_rmse, val_total_rot_rmse = 0.0, 0.0
    preds, gts = [], []

    with torch.no_grad():
        for images, labels in tqdm(val_loader):
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)
            outputs = model(images)

            loss = combined_loss(outputs, labels)
            val_total_loss += loss.item()

            trans_rmse, rot_rmse = compute_rmse(outputs, labels)
            val_total_trans_rmse += trans_rmse
            val_total_rot_rmse += rot_rmse

            preds.extend(outputs.cpu().numpy())
            gts.extend(labels.cpu().numpy())

    val_avg_loss = val_total_loss / len(val_loader)
    print(f"Validation Loss: {val_avg_loss:.4f}")
    print(f"Validation RMSE - Translation: {val_total_trans_rmse / len(val_loader):.4f}, "
          f"Rotation: {val_total_rot_rmse / len(val_loader):.4f}")

    return preds, gts, val_avg_loss, val_total_trans_rmse, val_total_rot_rmse


In [17]:
val_predictions, val_ground_truths, val_avg_loss, val_total_trans_rmse, val_total_rot_rmse = validate_model(MODEL_SAVE_PATH)

100%|██████████| 8/8 [00:02<00:00,  2.85it/s]

Validation Loss: 0.1960
Validation RMSE - Translation: 0.0453, Rotation: 0.4351





In [18]:
torch.cuda.empty_cache()

CALCULATIONS AND SUCH (ALSO OUTPUTTING TO MD AND CSV)

In [19]:
def vectors_to_rotation_matrix(r1, r2):
    """Convert orthogonalized r1 and r2 vectors into a rotation matrix."""
    r1 = r1 / np.linalg.norm(r1)
    r2 = r2 / np.linalg.norm(r2)
    r3 = np.cross(r1, r2)
    return np.stack([r1, r2, r3], axis=1)

def calculate_translation_rmse(preds, gts):
    """Euclidean distance between predicted and GT translations (in meters)."""
    errors = np.linalg.norm(preds - gts, axis=1)  # Shape: [N]
    rmse = np.sqrt(np.mean(errors**2))
    return rmse * 1000  # Convert to mm

def calculate_rotation_rmse(preds_r1r2, gts_r1r2):
    """Angular difference (in degrees) between predicted and GT rotation matrices."""
    angles = []
    for pred, gt in zip(preds_r1r2, gts_r1r2):
        R_pred = vectors_to_rotation_matrix(pred[:3], pred[3:])
        R_gt = vectors_to_rotation_matrix(gt[:3], gt[3:])
        R_diff = R_pred.T @ R_gt
        angle = np.arccos(np.clip((np.trace(R_diff) - 1) / 2.0, -1.0, 1.0))
        angles.append(np.degrees(angle))
    return np.sqrt(np.mean(np.array(angles)**2))  # RMSE in degrees

In [20]:
val_trans_accuracy, val_rot_accuracy = val_total_trans_rmse / len(val_loader),  val_total_rot_rmse / len(val_loader)
test_trans_accuracy, test_rot_accuracy = test_total_trans_rmse / len(test_loader),  test_total_rot_rmse / len(test_loader)

In [21]:
train_rot_mag_min = train_rot_stats['min'].norm().item()
train_rot_mag_max = train_rot_stats['max'].norm().item()
val_rot_mag_min = val_rot_stats['min'].norm().mean().item()
val_rot_mag_max = val_rot_stats['max'].norm().mean().item()
test_rot_mag_min = test_rot_stats['min'].norm().mean().item()
test_rot_mag_max = test_rot_stats['max'].norm().mean().item()

WRITE TO MD

In [27]:
eval_content = f"""# Evaluation Results - Batch {BATCH_ID}

## Training Configuration
- Batch Size: {BATCH_SIZE}
- Epochs: {NUM_EPOCHS}
- Learning Rate: {LEARNING_RATE}
- Image Size: {IMG_SIZE}
- Device: {DEVICE}
- Optimizer : Adam

## Model Architecture
- Backbone: ViT Base Patch16 224
- Head: Linear(768->512->6)

## Evaluation Metrics

### Validation Set
- Average Loss: {val_avg_loss:.4f}
- Translation RMSE: {val_total_trans_rmse / len(val_loader):.4f}
- Translation Accuracy: {val_trans_accuracy:.2f} cm
- Rotation RMSE: {val_total_rot_rmse / len(val_loader):.4f}
- Rotation Accuracy: {val_rot_accuracy:.2f}°

### Test Set
- Average Loss: {test_avg_loss:.4f}
- Translation RMSE: {test_total_trans_rmse / len(test_loader):.4f}
- Translation Accuracy: {test_trans_accuracy:.2f} cm
- Rotation RMSE: {test_total_rot_rmse / len(test_loader):.4f}
- Rotation Accuracy: {test_rot_accuracy:.2f}°

## Dataset Statistics
### Training Set
- Translation range: [{train_trans_stats['min'].mean():.2f}, {train_trans_stats['max'].mean():.2f}] m
- Rotation magnitude range: [{train_rot_mag_min:.2f}, {train_rot_mag_max:.2f}]

### Validation Set
- Translation range: [{val_trans_stats['min'].mean():.2f}, {val_trans_stats['max'].mean():.2f}] m
- Rotation magnitude range: [{val_rot_mag_min:.2f}, {val_rot_mag_max:.2f}]

### Test Set
- Translation range: [{test_trans_stats['min'].mean():.2f}, {test_trans_stats['max'].mean():.2f}] m
- Rotation magnitude range: [{test_rot_mag_min:.2f}, {test_rot_mag_max:.2f}]

## File Locations
- Dataset Directory: {DATASET_DIR}
- Model Save Path: {MODEL_SAVE_PATH}
"""


eval_path = os.path.join(BASE_DIR, f"model/ViT6DP_EVAL_batch{BATCH_ID}.{mod_id-1}.md")
with open(eval_path, 'w') as f:
    f.write(eval_content)
    
print(f"Evaluation report saved to: {eval_path}")

Evaluation report saved to: /home/moreno/SKRIPSI/SCRIPTS/model/ViT6DP_EVAL_batch1.1.md


WRITE TO CSV

In [28]:
# First, define the CSV file path
csv_path = os.path.join(BASE_DIR, "model/eval_results.csv")

# Check if CSV exists to determine if we need to write headers
write_header = not os.path.exists(csv_path)


csv_data = {
    'timestamp': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'dataset_id': BATCH_ID,
    'model_id': mod_id,
    'batch_size': BATCH_SIZE,
    'epochs': NUM_EPOCHS,
    'learning_rate': LEARNING_RATE,
    'val_loss': val_avg_loss,
    'val_trans_rmse': val_total_trans_rmse / len(val_loader),
    'val_trans_acc_cm': val_trans_accuracy,
    'val_rot_rmse': val_total_rot_rmse / len(val_loader),
    'val_rot_acc_deg': val_rot_accuracy,
    'test_loss': test_avg_loss,
    'test_trans_rmse': test_total_trans_rmse / len(test_loader),
    'test_trans_acc_cm': test_trans_accuracy,
    'test_rot_rmse': test_total_rot_rmse / len(test_loader),
    'test_rot_acc_deg': test_rot_accuracy,
    'train_trans_min': train_trans_stats['min'].mean().item(),
    'train_trans_max': train_trans_stats['max'].mean().item(),
    'train_rot_min': (train_rot_stats['min'].mean()*360).item(),
    'train_rot_max': (train_rot_stats['max'].mean()*360).item(),
    'val_rot_min': val_rot_mag_min,
    'val_rot_max': val_rot_mag_max,
    'test_rot_min': test_rot_mag_min,
    'test_rot_max': test_rot_mag_max,
    'train_rot_mag_min': train_rot_mag_min,
    'train_rot_mag_max': train_rot_mag_max,
    'model_path': MODEL_SAVE_PATH,
    'eval_path' : eval_path
}


# Write to CSV
with open(csv_path, 'a', newline='') as csvfile:
    fieldnames = csv_data.keys()
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    if write_header:
        writer.writeheader()
    writer.writerow(csv_data)

print(f"Results appended to CSV: {csv_path}")

Results appended to CSV: /home/moreno/SKRIPSI/SCRIPTS/model/eval_results.csv
