In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os
project_path = '/content/drive/MyDrive/action-recognition-vit'  # Update with your project path
os.chdir(project_path)

In [5]:
!pip install -r requirements.txt

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->-r requirements.txt (line 2))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->-r requirements.txt (line 2))
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->-r requirements.txt (line 2))
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->-r requirements.txt (line 2))
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->-r requirements.txt (line 2))
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->-r requirements.txt (line 2))
  Downl

In [23]:
import zipfile
import os


# Unzip dataset
zip_path = '/content/drive/MyDrive/action-recognition-vit/HMDB_simp.zip'
extract_path = '/content/drive/MyDrive/action-recognition-vit'
os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"Dataset extracted to {extract_path}")


Dataset extracted to /content/drive/MyDrive/action-recognition-vit


In [24]:
dataset_path = '/content/drive/MyDrive/action-recognition-vit/HMDB_simp'  # Update with your dataset path
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset not found at {dataset_path}")
else:
    print("Dataset found!")

Dataset found!


In [25]:
import torch
import numpy as np
import json
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from transformers.trainer_utils import IntervalStrategy
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)  # Suppress FutureWarnings
from torch import nn, optim
from torch.utils.tensorboard import SummaryWriter
from torch.cuda.amp import GradScaler, autocast  # For mixed precision training
import logging
import sys
import argparse
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image
from sklearn.model_selection import train_test_split

from transformers import AutoImageProcessor, AutoModelForVideoClassification
import os
from transformers import TimesformerForVideoClassification

In [27]:
import os
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split

class MultiClipHMDBDataset(Dataset):
    def __init__(self, root_dir, clip_size=8, frame_rate=32, transform=None):
        """
        Virtual dataset that creates multiple clips per video with frame augmentation

        Args:
            root_dir (str): Path to dataset directory
            clip_size (int): Number of frames per clip
            frame_rate (int): Sampling rate (every Nth frame)
            transform: Torchvision transforms
        """
        self.root_dir = root_dir
        self.clip_size = clip_size
        self.frame_rate = frame_rate
        self.transform = transform
        self.data = self._load_data()

    def _augment_frames_list(self, frames_list):
        """Augment frames when we have fewer than clip_size"""
        if len(frames_list) == 0:
            # Create black frame paths
            return ['black_frame'] * self.clip_size

        elif len(frames_list) == 1:
            # Repeat the single frame
            return frames_list * self.clip_size

        else:
            # Frame interpolation and temporal reversal
            augmented_frames = []

            # Add original frames
            for i, frame in enumerate(frames_list):
                augmented_frames.append(frame)

                # Add interpolated frame between consecutive frames
                if i < len(frames_list) - 1:
                    augmented_frames.append(f"interp_{frame}_{frames_list[i+1]}")

            # If still not enough, use temporal reversal
            while len(augmented_frames) < self.clip_size:
                reversed_frames = augmented_frames[::-1]
                augmented_frames.extend(reversed_frames)

            return augmented_frames[:self.clip_size]

    def _load_data(self):
        """Create virtual clips from videos with multiple clips per video"""
        data = []
        subfolders = sorted(os.listdir(self.root_dir))

        if len(subfolders) != 25:
            raise ValueError(f"Dataset must contain exactly 25 subfolders. Found {len(subfolders)}.")

        for label, action in enumerate(subfolders):
            action_path = os.path.join(self.root_dir, action)
            if not os.path.isdir(action_path):
                continue

            for video_folder in os.listdir(action_path):
                video_path = os.path.join(action_path, video_folder)
                if not os.path.isdir(video_path):
                    continue

                # Skip already processed folders
                if '_' in video_folder and video_folder.split('_')[-1].isdigit():
                    continue

                # Get image frames
                all_frames = sorted([f for f in os.listdir(video_path)
                                   if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp'))])

                if len(all_frames) == 0:
                    continue

                # Sample frames
                sampled_frames = all_frames[::self.frame_rate]

                # Create mutually exclusive clips (no overlapping frames)
                clips_created = 0

                if len(sampled_frames) >= self.clip_size:
                    # Calculate how many complete clips we can make
                    num_complete_clips = len(sampled_frames) // self.clip_size

                    if num_complete_clips >= 2:
                        # Create multiple non-overlapping clips
                        for clip_idx in range(num_complete_clips):
                            start_idx = clip_idx * self.clip_size
                            end_idx = start_idx + self.clip_size
                            clip_frames = sampled_frames[start_idx:end_idx]

                            data.append((video_path, label, clips_created, clip_frames))
                            clips_created += 1

                    else:
                        # Only enough frames for 1 complete clip (8-15 frames)
                        # Take the first clip_size frames
                        clip_frames = sampled_frames[:self.clip_size]
                        data.append((video_path, label, clips_created, clip_frames))
                        clips_created += 1

                else:
                    # Use augmentation for insufficient frames (<8 frames)
                    augmented_frames = self._augment_frames_list(sampled_frames)
                    data.append((video_path, label, clips_created, augmented_frames))
                    clips_created += 1

                # Calculate remaining frames that were not used
                frames_used = min(len(sampled_frames), (len(sampled_frames) // self.clip_size) * self.clip_size)
                frames_discarded = len(sampled_frames) - frames_used


        print(f"Total clips created: {len(data)}")
        return data

    def __len__(self):
        return len(self.data)

    def _load_frames_from_clip(self, video_path, clip_frames):
        """Load frames for a specific clip"""
        frames = []

        for frame_name in clip_frames:
            if frame_name == 'black_frame':
                # Create black frame
                img = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))

            elif frame_name.startswith('interp_'):
                # Handle interpolated frames
                # Format: "interp_frame1_frame2"
                parts = frame_name.replace('interp_', '').split('_')
                if len(parts) >= 2:
                    frame1_name = '_'.join(parts[:-1])
                    frame2_name = parts[-1]

                    frame1_path = os.path.join(video_path, frame1_name)
                    frame2_path = os.path.join(video_path, frame2_name)

                    if os.path.exists(frame1_path) and os.path.exists(frame2_path):
                        try:
                            img1 = np.array(Image.open(frame1_path))
                            img2 = np.array(Image.open(frame2_path))
                            # Simple interpolation
                            img_avg = np.mean([img1, img2], axis=0, dtype=np.uint8)
                            img = Image.fromarray(img_avg)
                        except Exception as e:
                            print(f"Error interpolating frames: {e}")
                            img = Image.open(frame1_path)  # Fallback to first frame
                    else:
                        # Fallback to any available frame
                        available_frames = [f for f in os.listdir(video_path)
                                          if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp'))]
                        if available_frames:
                            img = Image.open(os.path.join(video_path, available_frames[0]))
                        else:
                            img = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))
                else:
                    # Fallback
                    available_frames = [f for f in os.listdir(video_path)
                                      if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp'))]
                    if available_frames:
                        img = Image.open(os.path.join(video_path, available_frames[0]))
                    else:
                        img = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))

            else:
                # Regular frame
                frame_path = os.path.join(video_path, frame_name)
                if os.path.exists(frame_path):
                    try:
                        img = Image.open(frame_path)
                    except Exception as e:
                        print(f"Error loading frame {frame_path}: {e}")
                        img = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))
                else:
                    print(f"Frame not found: {frame_path}")
                    img = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))

            frames.append(img)

        # Apply brightness variation for repeated frames
        frame_names_unique = list(set([f for f in clip_frames if not f.startswith('interp_')]))
        if len(frame_names_unique) == 1 and frame_names_unique[0] != 'black_frame':
            # All frames are the same, apply brightness variation
            varied_frames = []
            for i, frame in enumerate(frames):
                brightness_factor = 0.8 + (i % 5) * 0.1
                frame_np = np.asarray(frame).astype(np.float32)
                frame_np = frame_np * brightness_factor
                frame_np = np.clip(frame_np, 0, 255).astype(np.uint8)
                varied_frames.append(Image.fromarray(frame_np))
            frames = varied_frames

        # Ensure exactly clip_size frames
        while len(frames) < self.clip_size:
            if frames:
                frames.append(frames[-1])  # Repeat last frame
            else:
                frames.append(Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8)))

        frames = frames[:self.clip_size]
        return frames

    def __getitem__(self, idx):
        video_path, label, clip_idx, clip_frames = self.data[idx]

        # Load frames for this specific clip
        frames = self._load_frames_from_clip(video_path, clip_frames)

        # Apply transforms
        if self.transform:
            frames = [self.transform(frame) for frame in frames]

        return torch.stack(frames), label

def get_dataloader(root_dir, batch_size=8, clip_size=8, train_ratio=0.8, val_ratio=0.1, frame_rate=32):
    """
    Create dataloaders using virtual multi-clip dataset with augmentation
    """
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Create virtual dataset
    dataset = MultiClipHMDBDataset(
        root_dir=root_dir,
        clip_size=clip_size,
        frame_rate=frame_rate,
        transform=transform
    )

    if len(dataset) == 0:
        raise ValueError("No valid clips found in dataset. Check your data structure and frame_rate parameter.")

    # Create splits
    indices = range(len(dataset))
    labels = [label for _, label, _, _ in dataset.data]

    # Check if we have enough samples for each class
    unique_labels, counts = np.unique(labels, return_counts=True)
    min_count = np.min(counts)
    print(f"Minimum samples per class: {min_count}")

    if min_count < 2:
        print("Some classes have only 1 sample. Using random split instead of stratified split.")
        # First split: train+val vs test
        train_val_idx, test_idx = train_test_split(
            indices, test_size=1-(train_ratio+val_ratio), random_state=42
        )

        # Second split: train vs val
        train_idx, val_idx = train_test_split(
            train_val_idx, test_size=val_ratio/(train_ratio+val_ratio), random_state=42
        )
    else:
        # First split: train+val vs test
        train_val_idx, test_idx = train_test_split(
            indices, test_size=1-(train_ratio+val_ratio), stratify=labels, random_state=42
        )

        # Second split: train vs val
        train_val_labels = [labels[i] for i in train_val_idx]
        train_idx, val_idx = train_test_split(
            train_val_idx, test_size=val_ratio/(train_ratio+val_ratio),
            stratify=train_val_labels, random_state=42
        )

    # Create subsets
    train_dataset = torch.utils.data.Subset(dataset, train_idx)
    val_dataset = torch.utils.data.Subset(dataset, val_idx)
    test_dataset = torch.utils.data.Subset(dataset, test_idx)

    # Create dataloaders
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True,
        num_workers=0, pin_memory=False
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False,
        num_workers=0, pin_memory=False
    )
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False,
        num_workers=0, pin_memory=False
    )

    print(f"Dataset splits: Train={len(train_dataset)}, Val={len(val_dataset)}, Test={len(test_dataset)}")
    return train_loader, val_loader, test_loader

def analyze_dataset_statistics(dataset):
    """
    Print detailed statistics about the dataset showing multiple clips per video
    """
    print("=== DATASET STATISTICS ===")
    print(f"Total clips: {len(dataset)}")

    # Analyze by video and action class
    video_clips = {}
    action_stats = {}

    for video_path, label, clip_idx, clip_frames in dataset.data:
        action_name = os.path.basename(os.path.dirname(video_path))
        video_name = os.path.basename(video_path)

        # Count clips per video
        video_key = f"{action_name}/{video_name}"
        if video_key not in video_clips:
            video_clips[video_key] = 0
        video_clips[video_key] += 1

        # Count clips per action
        if action_name not in action_stats:
            action_stats[action_name] = {'clips': 0, 'videos': set()}
        action_stats[action_name]['clips'] += 1
        action_stats[action_name]['videos'].add(video_name)

    print("\n=== CLIPS PER VIDEO DISTRIBUTION ===")
    clips_per_video_dist = {}
    for video, clip_count in video_clips.items():
        if clip_count not in clips_per_video_dist:
            clips_per_video_dist[clip_count] = 0
        clips_per_video_dist[clip_count] += 1

    for clip_count in sorted(clips_per_video_dist.keys()):
        video_count = clips_per_video_dist[clip_count]
        print(f"{clip_count} clips per video: {video_count} videos")

    print("\n=== BY ACTION CLASS ===")
    for action, stats in sorted(action_stats.items()):
        avg_clips_per_video = stats['clips'] / len(stats['videos']) if stats['videos'] else 0
        print(f"{action}: {stats['clips']} clips from {len(stats['videos'])} videos (avg: {avg_clips_per_video:.1f} clips/video)")

    print(f"\n=== SUMMARY ===")
    total_videos = len(video_clips)
    total_clips = len(dataset)
    avg_clips_per_video = total_clips / total_videos if total_videos > 0 else 0
    print(f"Total videos: {total_videos}")
    print(f"Total clips: {total_clips}")
    print(f"Average clips per video: {avg_clips_per_video:.2f}")

    # Show some examples of multi-clip videos
    multi_clip_videos = [(v, c) for v, c in video_clips.items() if c > 1]
    if multi_clip_videos:
        print(f"\n=== EXAMPLES OF MULTI-CLIP VIDEOS ===")
        for video, clip_count in sorted(multi_clip_videos, key=lambda x: x[1], reverse=True)[:10]:
            print(f"{video}: {clip_count} clips")

    return video_clips, action_stats

In [28]:
def load_timesformer_model():
    """
    Load the pre-trained TimeSformer model for video classification.

    """
    import logging
    warnings.filterwarnings("ignore", message="Some weights of TimesformerForVideoClassification were not initialized")
    logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

    label_index_dict={'brush_hair': 0, 'cartwheel': 1, 'catch': 2, 'chew': 3, 'climb': 4, 'climb_stairs': 5, 'draw_sword': 6, 'eat': 7, 'fencing': 8, 'flic_flac': 9, 'golf': 10, 'handstand': 11, 'kiss': 12, 'pick': 13, 'pour': 14, 'pullup': 15, 'pushup': 16, 'ride_bike': 17, 'shoot_bow': 18, 'shoot_gun': 19, 'situp': 20, 'smile': 21, 'smoke': 22, 'throw': 23, 'wave': 24}
    index_label_dict={0: 'brush_hair', 1: 'cartwheel', 2: 'catch', 3: 'chew', 4: 'climb', 5: 'climb_stairs', 6: 'draw_sword', 7: 'eat', 8: 'fencing', 9: 'flic_flac',10: 'golf', 11: 'handstand', 12: 'kiss', 13: 'pick', 14: 'pour', 15: 'pullup', 16: 'pushup', 17: 'ride_bike', 18: 'shoot_bow', 19: 'shoot_gun', 20: 'situp', 21: 'smile', 22: 'smoke', 23: 'throw', 24: 'wave'}
    # Load the processor and model from Hugging Face
    processor =AutoImageProcessor.from_pretrained("facebook/timesformer-base-finetuned-k600")
    ckpt = "facebook/timesformer-base-finetuned-k600"
    model = TimesformerForVideoClassification.from_pretrained(ckpt,label2id = label_index_dict,id2label = index_label_dict,ignore_mismatched_sizes = True)
    #model.classifier = torch.nn.Linear(model.config.hidden_size, 25)
    # Optionally load fine-tuned weights if available
    checkpoint_path = "/content/drive/MyDrive/action-recognition-vit/timesformer_model.pth"  # Update this path if you have fine-tuned weights
    if checkpoint_path and os.path.exists(checkpoint_path):
        model.load_state_dict(torch.load(checkpoint_path, map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')))
        print("Loaded fine-tuned weights from:", checkpoint_path)
    else:
        print("Using pre-trained TimeSformer weights.")

    return processor, model

In [29]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    if isinstance(predictions, torch.Tensor):
        predictions = predictions.cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()

    # Top-1 accuracy
    top1_predictions = np.argmax(predictions, axis=1)
    top1_accuracy = np.mean(top1_predictions == labels)

    # Top-5 accuracy
    top5_predictions = np.argsort(predictions, axis=1)[:, -5:]
    top5_correct = np.array([labels[i] in top5_predictions[i] for i in range(len(labels))])
    top5_accuracy = np.mean(top5_correct)

    return {
        "top1_accuracy": top1_accuracy,
        "top5_accuracy": top5_accuracy,
        "eval_top1_accuracy": top1_accuracy,
        "eval_top5_accuracy": top5_accuracy,
    }


In [31]:
def train_model_with_trainer(data_dir, epochs, batch_size, learning_rate):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    train_loader, val_loader, test_loader = get_dataloader(data_dir, batch_size)
    processor, model = load_timesformer_model()

    train_dataset = train_loader.dataset
    val_dataset = val_loader.dataset

    def data_collator(batch):
        videos = torch.stack([item[0] for item in batch])
        labels = torch.tensor([item[1] for item in batch])
        return {
            'pixel_values': videos,
            'labels': labels
        }
    optimizer = optim.SGD(model.parameters(), momentum= 0.9, weight_decay= 1e-3,
    			  lr= learning_rate)

    training_args = TrainingArguments(output_dir = "./results",
                                      overwrite_output_dir = True,
                                      eval_strategy = 'epoch',
                                      per_device_train_batch_size = 8,
                                      per_device_eval_batch_size = 8,
                                      num_train_epochs = epochs,
                                      logging_dir = os.path.normpath(os.path.join('./results', 'logs')),
                                      logging_strategy = "epoch",
                                      save_strategy = 'epoch',
                                      save_total_limit = 1,
                                      remove_unused_columns = False,
                                      load_best_model_at_end = True,
                                      metric_for_best_model = 'eval_top1_accuracy',
                                      greater_is_better = True,
                                      label_smoothing_factor = 0.1,
                                      report_to = "tensorboard",
                                      push_to_hub = False,
                                      save_only_model= True)


    early_stopping_callback = EarlyStoppingCallback(early_stopping_patience= 5 )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        optimizers=(optimizer,None),
        eval_dataset=val_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback]
    )

    print("Starting training...")
    trainer.train()

    print("\nEvaluating on test set...")
    test_dataset = test_loader.dataset
    test_results = trainer.evaluate(eval_dataset=test_dataset)

    print(f"\n=== FINAL TEST RESULTS ===")
    print(f"Test Top-1 Accuracy: {test_results.get('eval_top1_accuracy', 'N/A'):.4f}")
    print(f"Test Top-5 Accuracy: {test_results.get('eval_top5_accuracy', 'N/A'):.4f}")
    print(f"Test Loss: {test_results.get('eval_loss', 'N/A'):.4f}")

    trainer.save_model("./timesformer_model")

    with open("./test_results.json", "w") as f:
        json.dump(test_results, f, indent=2)

    print("Training complete. Model and results saved.")
    return test_results


In [32]:
def evaluate_saved_model(model_path, data_dir, batch_size=8):
    from transformers import TimesformerForVideoClassification

    model = TimesformerForVideoClassification.from_pretrained(model_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    _, _, test_loader = get_dataloader(data_dir, batch_size)

    def data_collator(batch):
        videos = torch.stack([item[0] for item in batch])
        labels = torch.tensor([item[1] for item in batch])
        return {
            'pixel_values': videos,
            'labels': labels
        }

    trainer = Trainer(
        model=model,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    results = trainer.evaluate(eval_dataset=test_loader.dataset)

    print(f"=== Model Evaluation: {model_path} ===")
    print(f"Top-1 Accuracy: {results.get('eval_top1_accuracy', 'N/A'):.4f}")
    print(f"Top-5 Accuracy: {results.get('eval_top5_accuracy', 'N/A'):.4f}")
    print(f"Loss: {results.get('eval_loss', 'N/A'):.4f}")

    return results


In [33]:
train_loader, val_loader, test_loader = get_dataloader(
    root_dir="/content/drive/MyDrive/action-recognition-vit/HMDB_simp",
    batch_size=8,
    clip_size=8,
    frame_rate=32
)

Total clips created: 1260
Minimum samples per class: 50
Dataset splits: Train=1007, Val=127, Test=126


In [34]:
train_model_with_trainer(
        data_dir="/content/drive/MyDrive/action-recognition-vit/HMDB_simp",
        epochs=20,
        batch_size=8,
        learning_rate=0.005
    )

Using device: cuda
Total clips created: 1260
Minimum samples per class: 50
Dataset splits: Train=1007, Val=127, Test=126
Using pre-trained TimeSformer weights.
Starting training...


model.safetensors:   0%|          | 0.00/487M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Top1 Accuracy,Top5 Accuracy
1,2.1602,1.345165,0.80315,0.976378
2,1.0386,1.080723,0.84252,0.984252
3,0.8243,1.030909,0.858268,0.976378
4,0.7227,1.023571,0.874016,0.968504
5,0.6748,1.013771,0.88189,0.976378
6,0.6522,1.010637,0.88189,0.952756
7,0.6424,1.013888,0.889764,0.96063
8,0.6375,1.026655,0.874016,0.96063
9,0.6349,1.031386,0.874016,0.952756
10,0.6333,1.036304,0.874016,0.952756



Evaluating on test set...



=== FINAL TEST RESULTS ===
Test Top-1 Accuracy: 0.8492
Test Top-5 Accuracy: 1.0000
Test Loss: 1.0259
Training complete. Model and results saved.


{'eval_top1_accuracy': 0.8492063492063492,
 'eval_top5_accuracy': 1.0,
 'eval_loss': 1.025852084159851,
 'eval_runtime': 8.7083,
 'eval_samples_per_second': 14.469,
 'eval_steps_per_second': 1.837,
 'epoch': 12.0}