<a href="https://colab.research.google.com/github/Sarvinayangar/Final_Project/blob/main/Durin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

from google.colab import drive

# Expose Google Drive to this Colab runtime.
drive.mount('/content/drive')

# Dataset layout inside Drive: two video folders plus an annotations folder.
DATA_ROOT = "/content/drive/MyDrive/Oya Celiktutan - SAP Final Project -Dataset"

TRAIN_DIR = f"{DATA_ROOT}/train_set"
TEST_DIR = f"{DATA_ROOT}/test_set"
ANN_DIR = f"{DATA_ROOT}/annotations"
TRAIN_CSV = f"{ANN_DIR}/train_set_labels.csv"  # label file for the training videos

# Report whether every expected location is reachable before going further.
for _label, _path, _exists in (
    ("TRAIN_DIR:", TRAIN_DIR, os.path.isdir),
    ("TEST_DIR :", TEST_DIR, os.path.isdir),
    ("ANN_DIR  :", ANN_DIR, os.path.isdir),
    ("TRAIN_CSV:", TRAIN_CSV, os.path.isfile),
):
    print(_label, _path, "exists:", _exists(_path))



Mounted at /content/drive
TRAIN_DIR: /content/drive/MyDrive/Oya Celiktutan - SAP Final Project -Dataset/train_set exists: True
TEST_DIR : /content/drive/MyDrive/Oya Celiktutan - SAP Final Project -Dataset/test_set exists: True
ANN_DIR  : /content/drive/MyDrive/Oya Celiktutan - SAP Final Project -Dataset/annotations exists: True
TRAIN_CSV: /content/drive/MyDrive/Oya Celiktutan - SAP Final Project -Dataset/annotations/train_set_labels.csv exists: True


In [None]:
!pip install opencv-python pandas




In [4]:
import os
import re
import random
import numpy as np
import pandas as pd

import cv2
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from torchvision import transforms
import torchvision.models.video as video_models

# -------------------------
# CONFIG
# -------------------------
NUM_CLASSES = 30  # size of the action head
NUM_TOOLS   = 4   # 0=none, 1=object, 2=drill, 3=polisher
NUM_FRAMES  = 16  # frames sampled from each video
IMG_SIZE    = 112 # frames are resized to IMG_SIZE x IMG_SIZE

BATCH_SIZE  = 4
EPOCHS      = 10
LR          = 1e-4
SEED        = 42  # single source of truth for every RNG seeded below

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Seed every random number generator the notebook relies on.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

# -------------------------
# Helpers
# -------------------------

def subject_id_from_filename(fn: str) -> int:
    """Return the subject number embedded in a video filename.

    The first ``SID<digits>`` occurrence is used, e.g.
    ``CID01_SID03_VID02`` yields ``3``.

    Raises:
        ValueError: if ``fn`` contains no ``SID<digits>`` token.
    """
    match = re.search(r"SID(\d+)", fn)
    if match is None:
        raise ValueError(f"Could not parse SID from {fn}")
    return int(match.group(1))

def tool_from_class_id(cid: int) -> int:
    """Map an action class id to its tool id.

    Tool ids: 0 = none, 1 = object, 2 = drill, 3 = polisher. Any class id
    that is not listed in one of the groups below maps to 0.
    """
    tool_groups = (
        (1, frozenset((1, 22, 28))),                          # object
        (2, frozenset((*range(2, 21, 2), 23, 25, 29))),       # drill
        (3, frozenset((*range(3, 22, 2), 24, 26, 30))),       # polisher
    )
    for tool_id, members in tool_groups:
        if cid in members:
            return tool_id
    return 0

def build_video_path(root_dir: str, filename: str) -> str:
    """Return ``root_dir`` joined with ``filename``, appending ``.avi`` if absent.

    The extension check is case-insensitive, so an existing ``.AVI`` suffix
    is kept as it is.
    """
    already_avi = filename.lower().endswith(".avi")
    video_name = filename if already_avi else filename + ".avi"
    return os.path.join(root_dir, video_name)



Using device: cpu


In [5]:
# The label CSV has no header row; each line looks like
# CID01_SID01_VID01,DeliverObject,1
df = pd.read_csv(TRAIN_CSV, header=None)
df.columns = ["filename", "class_name", "class_id"]
df = df.astype({"class_id": int})

# Derive the subject from the filename and the tool from the class id.
df = df.assign(
    subject_id=df["filename"].map(subject_id_from_filename),
    tool_id=df["class_id"].map(tool_from_class_id),
)

print("DataFrame shape:", df.shape)
display(df.head())

subjects = sorted(df["subject_id"].unique())
print("Subjects:", subjects)

# Subject-wise split: the two highest subject ids are held out for validation,
# so no subject contributes videos to both splits.
train_subjects, val_subjects = subjects[:-2], subjects[-2:]
print("Train subjects:", train_subjects)
print("Val subjects:", val_subjects)

train_df = df.loc[df["subject_id"].isin(train_subjects)].reset_index(drop=True)
val_df = df.loc[df["subject_id"].isin(val_subjects)].reset_index(drop=True)

print("Train videos:", len(train_df), "Val videos:", len(val_df))



DataFrame shape: (2099, 5)


Unnamed: 0,filename,class_name,class_id,subject_id,tool_id
0,CID01_SID01_VID01,DeliverObject,1,1,1
1,CID01_SID01_VID02,DeliverObject,1,1,1
2,CID01_SID01_VID03,DeliverObject,1,1,1
3,CID01_SID01_VID04,DeliverObject,1,1,1
4,CID01_SID01_VID05,DeliverObject,1,1,1


Subjects: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10)]
Train subjects: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8)]
Val subjects: [np.int64(9), np.int64(10)]
Train videos: 1679 Val videos: 420


In [9]:
# -------------------------
# OpenCV video loader -> (T, H, W, C) uint8, RGB
# -------------------------
def read_video_cv2(path):
    """Decode every frame of the video at ``path`` with OpenCV.

    Args:
        path: location of the video file.

    Returns:
        A uint8 tensor of RGB frames shaped (T, H, W, 3). If no frame could
        be decoded, a warning is printed and an all-zero placeholder shaped
        (NUM_FRAMES, IMG_SIZE, IMG_SIZE, 3) is returned so that the caller
        can still build a batch.
    """
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV yields BGR; convert to RGB channel order.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(torch.from_numpy(frame))
    finally:
        # Release the capture handle even if a frame conversion raises.
        cap.release()

    if not frames:
        # Make the fallback visible: otherwise a bad path or codec would turn
        # into all-zero inputs without any trace.
        print("Warning: no frames decoded from", path, "- using an all-zero clip")
        return torch.zeros((NUM_FRAMES, IMG_SIZE, IMG_SIZE, 3), dtype=torch.uint8)

    return torch.stack(frames, dim=0)  # (T, H, W, 3)

# -------------------------
# Transforms
# -------------------------
# Per-frame preprocessing: resize, convert to float in [0, 1], normalise.
# The mean/std are the Kinetics-400 statistics published with torchvision's
# pretrained r3d_18 weights, so inputs match what the backbone was trained on.
base_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ConvertImageDtype(torch.float32),
    transforms.Normalize(mean=[0.43216, 0.394666, 0.37645],
                         std=[0.22803, 0.22145, 0.216989]),
])

class HRI30VideoDataset(Dataset):
    """Labelled video clips described by a DataFrame.

    Every row of ``df`` must provide ``filename``, ``class_id`` (1-based) and
    ``tool_id``. Each item is ``(frames, action_label, tool_label)`` where
    ``frames`` is a float tensor shaped (C, T, H, W), ``action_label`` is the
    0-based class index and ``tool_label`` is the tool id.

    Args:
        df: one row per video.
        root_dir: directory that contains the video files.
        num_frames: frames sampled per video; defaults to the notebook-level
            ``NUM_FRAMES`` when omitted.
        random_sampling: when True (the default) frames of long videos are
            drawn at random and kept in temporal order; when False they are
            evenly spaced, which makes the item deterministic (useful for
            validation).
    """

    def __init__(self, df: pd.DataFrame, root_dir: str,
                 num_frames=None, random_sampling: bool = True):
        self.df = df.reset_index(drop=True)
        self.root_dir = root_dir
        # Only fall back to the global when the caller gave no explicit value.
        self.num_frames = NUM_FRAMES if num_frames is None else int(num_frames)
        if self.num_frames <= 0:
            raise ValueError("num_frames must be a positive integer")
        self.random_sampling = random_sampling

    def __len__(self):
        return len(self.df)

    def _sample_indices(self, n: int):
        """Return ``self.num_frames`` frame indices for a video with ``n`` frames.

        Raises:
            ValueError: if ``n`` is not positive (there is nothing to sample).
        """
        k = self.num_frames
        if n <= 0:
            # The previous doubling loop never terminated for an empty video.
            raise ValueError("cannot sample frames from an empty video")
        if n <= k:
            # Short video: repeat it from the start until k indices exist.
            return [i % n for i in range(k)]
        if self.random_sampling:
            return sorted(random.sample(range(n), k))
        return np.linspace(0, n - 1, k, dtype=int).tolist()

    def _placeholder_clip(self):
        """All-zero uint8 clip shaped (num_frames, IMG_SIZE, IMG_SIZE, 3)."""
        return torch.zeros((self.num_frames, IMG_SIZE, IMG_SIZE, 3), dtype=torch.uint8)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        video_path = build_video_path(self.root_dir, row["filename"])

        try:
            video = read_video_cv2(video_path)    # (T,H,W,3)
        except Exception as e:
            # Best effort: keep the batch alive with a blank clip.
            print("Error reading", video_path, "->", e)
            video = self._placeholder_clip()
        if video.shape[0] == 0:
            video = self._placeholder_clip()

        indices = self._sample_indices(video.shape[0])
        frames = video[indices]                  # (T,H,W,3)
        frames = frames.permute(0, 3, 1, 2)      # (T,C,H,W)
        frames = torch.stack([base_transform(f) for f in frames])
        frames = frames.permute(1, 0, 2, 3)      # (C,T,H,W)

        action_label = int(row["class_id"]) - 1  # shift 1-based id to 0-based
        tool_label   = int(row["tool_id"])

        return frames, action_label, tool_label


class TestVideoDataset(Dataset):
    """Unlabelled video clips for inference.

    Each item is ``(frames, filename)`` where ``frames`` is a float tensor
    shaped (C, T, H, W) and ``filename`` is the entry of ``filenames`` the
    clip was read from. Frame selection is deterministic.

    Args:
        filenames: sequence of video filenames relative to ``root_dir``.
        root_dir: directory that contains the video files.
        num_frames: frames sampled per video; defaults to the notebook-level
            ``NUM_FRAMES`` when omitted.
    """

    def __init__(self, filenames, root_dir: str, num_frames=None):
        self.filenames = filenames
        self.root_dir = root_dir
        # Only fall back to the global when the caller gave no explicit value.
        self.num_frames = NUM_FRAMES if num_frames is None else int(num_frames)
        if self.num_frames <= 0:
            raise ValueError("num_frames must be a positive integer")

    def __len__(self):
        return len(self.filenames)

    def _sample_indices(self, n: int):
        """Return ``self.num_frames`` frame indices (a list of ints) for ``n`` frames.

        Raises:
            ValueError: if ``n`` is not positive (there is nothing to sample).
        """
        k = self.num_frames
        if n <= 0:
            # The previous doubling loop never terminated for an empty video.
            raise ValueError("cannot sample frames from an empty video")
        if n <= k:
            # Short video: repeat it from the start until k indices exist.
            return [i % n for i in range(k)]
        # Evenly spaced frames; a plain list keeps both branches the same type.
        return np.linspace(0, n - 1, k, dtype=int).tolist()

    def _placeholder_clip(self):
        """All-zero uint8 clip shaped (num_frames, IMG_SIZE, IMG_SIZE, 3)."""
        return torch.zeros((self.num_frames, IMG_SIZE, IMG_SIZE, 3), dtype=torch.uint8)

    def __getitem__(self, idx):
        fn = self.filenames[idx]
        video_path = build_video_path(self.root_dir, fn)

        try:
            video = read_video_cv2(video_path)   # (T,H,W,3)
        except Exception as e:
            # Best effort: keep the batch alive with a blank clip.
            print("Error reading", video_path, "->", e)
            video = self._placeholder_clip()
        if video.shape[0] == 0:
            video = self._placeholder_clip()

        indices = self._sample_indices(video.shape[0])
        frames = video[indices]                  # (T,H,W,3)
        frames = frames.permute(0, 3, 1, 2)      # (T,C,H,W)
        frames = torch.stack([base_transform(f) for f in frames])
        frames = frames.permute(1, 0, 2, 3)      # (C,T,H,W)

        return frames, fn

# Both splits read their videos from TRAIN_DIR; they differ only in the rows used.
train_dataset = HRI30VideoDataset(train_df, TRAIN_DIR)
val_dataset   = HRI30VideoDataset(val_df,   TRAIN_DIR)

# Pinned host memory only speeds up host-to-GPU copies; on a CPU-only runtime
# PyTorch warns that the request cannot be honoured, so ask for it only on CUDA.
use_pinned_memory = device.type == "cuda"

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                          shuffle=True, num_workers=2, pin_memory=use_pinned_memory)
val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                          shuffle=False, num_workers=2, pin_memory=use_pinned_memory)

print("Train batches:", len(train_loader), "Val batches:", len(val_loader))



Train batches: 420 Val batches: 105


In [7]:
# -------------------------
# Class-balanced weights
# -------------------------
# Number of training videos per action class (class ids are 1-based).
class_counts = np.zeros(NUM_CLASSES, dtype=np.int64)
np.add.at(class_counts, train_df["class_id"].to_numpy() - 1, 1)

# Inverse-frequency weights; the epsilon keeps the division finite.
class_weights = 1.0 / (class_counts + 1e-6)

# Extra emphasis on the historically weak classes.
weak_class_idx = np.array([25, 27, 29, 30]) - 1
class_weights[weak_class_idx] *= 1.5

# Rescale so the average weight is 1.
class_weights /= class_weights.mean()
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32, device=device)
print("Class weights:", class_weights_tensor)

# -------------------------
# Multi-task 3D ResNet18
# -------------------------
# Prefer the Kinetics-400 checkpoint; fall back to random initialisation if
# the weights enum cannot be imported or the checkpoint cannot be loaded.
try:
    from torchvision.models.video import R3D_18_Weights
    backbone = video_models.r3d_18(weights=R3D_18_Weights.KINETICS400_V1)
except Exception as err:
    print("Could not load pretrained weights, using random init:", err)
    backbone = video_models.r3d_18(weights=None)
else:
    print("Loaded r3d_18 with Kinetics weights.")

# Remember the feature width, then drop the classifier so the backbone
# outputs pooled features for the task heads.
in_features, backbone.fc = backbone.fc.in_features, nn.Identity()

class MultiTaskR3D18(nn.Module):
    """Shared backbone with two linear heads: action class and tool type.

    Args:
        backbone: module mapping an input batch to features of width
            ``in_features``.
        in_features: width of the feature vector produced by ``backbone``.
        num_classes: number of action logits.
        num_tools: number of tool logits.
    """

    def __init__(self, backbone, in_features, num_classes, num_tools):
        super().__init__()
        self.backbone = backbone
        # Heads are created in this order so parameter initialisation and
        # state-dict keys stay stable.
        self.action_head = nn.Linear(in_features, num_classes)
        self.tool_head   = nn.Linear(in_features, num_tools)

    def forward(self, x):
        """Return ``(action_logits, tool_logits)`` for the batch ``x``."""
        shared = self.backbone(x)
        action_logits = self.action_head(shared)
        tool_logits = self.tool_head(shared)
        return action_logits, tool_logits

# Assemble the two-headed network and move it to the selected device.
model = MultiTaskR3D18(
    backbone=backbone,
    in_features=in_features,
    num_classes=NUM_CLASSES,
    num_tools=NUM_TOOLS,
)
model = model.to(device)

# The action loss is class-weighted; the tool loss is unweighted.
ce_tool   = nn.CrossEntropyLoss()
ce_action = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = optim.Adam(model.parameters(), lr=LR)



Class weights: tensor([0.9370, 0.9370, 0.9370, 0.9370, 0.9370, 0.9370, 0.9370, 0.9370, 0.9370,
        0.9540, 0.9370, 0.9370, 0.9370, 0.9370, 0.9370, 0.9370, 0.9370, 0.9370,
        0.9370, 0.9370, 0.9370, 0.9370, 0.9370, 0.9370, 1.4055, 0.9370, 1.4055,
        0.9370, 1.4055, 1.4055])
Downloading: "https://download.pytorch.org/models/r3d_18-b3b3357e.pth" to /root/.cache/torch/hub/checkpoints/r3d_18-b3b3357e.pth


100%|██████████| 127M/127M [00:01<00:00, 116MB/s]


Loaded r3d_18 with Kinetics weights.


In [10]:
def train_one_epoch(model, loader, optimizer):
    """Run one optimisation pass over ``loader``.

    Relies on the notebook-level ``device``, ``ce_action`` and ``ce_tool``.

    Returns:
        ``(mean_loss, action_accuracy, tool_accuracy)``, each averaged over
        the number of samples seen.
    """
    model.train()
    loss_sum = 0.0
    hits_action = 0
    hits_tool = 0
    seen = 0

    for clips, y_action, y_tool in loader:
        clips = clips.to(device)
        y_action = y_action.to(device)
        y_tool = y_tool.to(device)

        optimizer.zero_grad()
        logits_action, logits_tool = model(clips)

        # Joint objective: action loss plus the tool loss at half weight.
        loss = ce_action(logits_action, y_action) + 0.5 * ce_tool(logits_tool, y_tool)

        loss.backward()
        optimizer.step()

        # Weight the batch loss by batch size so the epoch mean is per sample.
        n_batch = clips.size(0)
        loss_sum += loss.item() * n_batch
        seen += n_batch

        hits_action += logits_action.argmax(1).eq(y_action).sum().item()
        hits_tool += logits_tool.argmax(1).eq(y_tool).sum().item()

    return loss_sum / seen, hits_action / seen, hits_tool / seen


@torch.no_grad()
def eval_model(model, loader):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Relies on the notebook-level ``device``, ``ce_action`` and ``ce_tool``.

    Returns:
        ``(mean_loss, action_accuracy, tool_accuracy)``, each averaged over
        the number of samples seen.
    """
    model.eval()
    loss_sum = 0.0
    hits_action = 0
    hits_tool = 0
    seen = 0

    for clips, y_action, y_tool in loader:
        clips = clips.to(device)
        y_action = y_action.to(device)
        y_tool = y_tool.to(device)

        logits_action, logits_tool = model(clips)

        # Same joint objective as in training: tool loss at half weight.
        loss = ce_action(logits_action, y_action) + 0.5 * ce_tool(logits_tool, y_tool)

        # Weight the batch loss by batch size so the mean is per sample.
        n_batch = clips.size(0)
        loss_sum += loss.item() * n_batch
        seen += n_batch

        hits_action += logits_action.argmax(1).eq(y_action).sum().item()
        hits_tool += logits_tool.argmax(1).eq(y_tool).sum().item()

    return loss_sum / seen, hits_action / seen, hits_tool / seen


# -------- Train --------
# Track the best validation action accuracy and where its weights are stored.
best_val_acc = 0.0
best_path = "/content/best_multitask_r3d18.pth"

for epoch in range(1, EPOCHS + 1):
    print(f"\n--- Epoch {epoch}/{EPOCHS} ---")
    train_loss, train_acc, train_tool_acc = train_one_epoch(model, train_loader, optimizer)
    val_loss, val_acc, val_tool_acc = eval_model(model, val_loader)

    # L = loss, A = action accuracy, T = tool accuracy.
    train_summary = f"Train L {train_loss:.3f} A {train_acc:.3f} T {train_tool_acc:.3f} | "
    val_summary = f"Val L {val_loss:.3f} A {val_acc:.3f} T {val_tool_acc:.3f}"
    print(train_summary + val_summary)

    # Checkpoint only on a strict improvement in validation action accuracy.
    if val_acc <= best_val_acc:
        continue
    best_val_acc = val_acc
    torch.save(model.state_dict(), best_path)
    print("  -> Saved best model, val action acc:", best_val_acc)



--- Epoch 1/8 ---


KeyboardInterrupt: 

In [None]:
# Restore the checkpoint saved during training and switch to inference mode.
model.load_state_dict(torch.load(best_path, map_location=device))
model.to(device)
model.eval()

# Collect test filenames (sorted so the run order is reproducible).
test_files = sorted(f for f in os.listdir(TEST_DIR) if f.lower().endswith(".avi"))
print("Num test videos:", len(test_files))

test_dataset = TestVideoDataset(test_files, TEST_DIR)
# Pinned memory only helps host-to-GPU copies, so request it only on CUDA.
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                          shuffle=False, num_workers=2,
                          pin_memory=(device.type == "cuda"))

all_fns = []
all_preds = []

with torch.no_grad():
    for frames, fns in test_loader:
        frames = frames.to(device)
        action_logits, _tool_logits = model(frames)  # only the action head is exported
        # argmax yields 0..29; shift back to the 1..30 class ids.
        preds = action_logits.argmax(1).cpu() + 1

        all_fns.extend(fns)
        all_preds.extend(preds.tolist())  # plain Python ints

pred_df = pd.DataFrame({"filename": all_fns, "class_id": all_preds})
pred_df = pred_df.sort_values("filename").reset_index(drop=True)
out_csv = DATA_ROOT + "/test_set_labels_colab.csv"
pred_df.to_csv(out_csv, index=False)
print("Saved predictions to:", out_csv)
