# Full ViT Experiment with 20,000 Samples
- **Purpose**: Maximize ViT-Base performance within Colab constraints using a larger, balanced dataset.
- **Details**: Trained for **6** epochs with early stopping (patience=2), achieving **78.85%** accuracy on the 2,000-sample test set.
- **Outcome**: Strong results for financial classes (e.g., Invoice: **72.8%**), saved model to `models/rvl_cdip_vit_model/`.

In [None]:
# Checking system resources
import torch
import psutil
import os

# CPU and RAM info
print(f"Total RAM: {psutil.virtual_memory().total / (1024**3):.2f} GB")
print(f"Available RAM: {psutil.virtual_memory().available / (1024**3):.2f} GB")
print(f"Used RAM: {psutil.virtual_memory().used / (1024**3):.2f} GB")
print(f"CPU Count: {os.cpu_count()}")

# GPU info
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / (1024**3):.2f} GB")
    print(f"Allocated GPU Memory: {torch.cuda.memory_allocated(0) / (1024**2):.2f} MB")
    print(f"Free GPU Memory: {torch.cuda.memory_reserved(0) / (1024**2):.2f} MB")
else:
    print("No GPU available yet. Try again later or check runtime settings.")

# Disk info
!df -h

Total RAM: 12.67 GB
Available RAM: 11.20 GB
Used RAM: 1.17 GB
CPU Count: 2
GPU Available: True
GPU Name: Tesla T4
Total GPU Memory: 14.74 GB
Allocated GPU Memory: 0.00 MB
Free GPU Memory: 0.00 MB
Filesystem      Size  Used Avail Use% Mounted on
overlay         113G   40G   74G  36% /
tmpfs            64M     0   64M   0% /dev
shm             5.7G     0  5.7G   0% /dev/shm
/dev/root       2.0G  1.2G  820M  59% /usr/sbin/docker-init
/dev/sda1        92G   72G   21G  79% /opt/bin/.nvidia
tmpfs           6.4G  904K  6.4G   1% /var/colab
tmpfs           6.4G     0  6.4G   0% /proc/acpi
tmpfs           6.4G     0  6.4G   0% /proc/scsi
tmpfs           6.4G     0  6.4G   0% /sys/firmware


In [None]:
# Step 1: Installing required libraries & Setting up the Environment
!pip install -q transformers datasets torch torchvision accelerate

import torch
from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_dataset, IterableDataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from collections import defaultdict
from torchvision import transforms

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print(f"GPU name: {torch.cuda.get_device_name(0)}")
!free -h
!df -h

Using device: cuda
GPU name: Tesla T4
               total        used        free      shared  buff/cache   available
Mem:            12Gi       1.9Gi       846Mi       2.0Mi         9Gi        10Gi
Swap:             0B          0B          0B
Filesystem      Size  Used Avail Use% Mounted on
overlay         113G   40G   74G  36% /
tmpfs            64M     0   64M   0% /dev
shm             5.7G     0  5.7G   0% /dev/shm
/dev/root       2.0G  1.2G  820M  59% /usr/sbin/docker-init
/dev/sda1        92G   72G   21G  79% /opt/bin/.nvidia
tmpfs           6.4G  916K  6.4G   1% /var/colab
tmpfs           6.4G     0  6.4G   0% /proc/acpi
tmpfs           6.4G     0  6.4G   0% /proc/scsi
tmpfs           6.4G     0  6.4G   0% /sys/firmware


In [None]:
# Step 2: Loading Dataset with Balanced Streaming and Augmentation
# Open RVL-CDIP in streaming mode; the splits are consumed lazily further below.
dataset = load_dataset("aharley/rvl_cdip", streaming=True)

# Integer class id -> human-readable class name (position in the list is the id).
label_map = dict(enumerate([
    "letter", "form", "email", "handwritten",
    "advertisement", "scientific report", "scientific publication", "specification",
    "file folder", "news article", "budget", "invoice",
    "presentation", "questionnaire", "resume", "memo",
]))
num_labels = len(label_map)

# Image processor matching the ViT checkpoint that is fine-tuned in Step 3.
processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")

class BalancedStreamingDataset(IterableDataset):
    """Class-balanced, size-capped iterable over one streaming dataset split.

    Iterating walks the underlying stream in order and keeps an example only
    while its class is below ``total_samples // num_classes``. Each kept image
    is converted to RGB, optionally augmented, run through the module-level
    ``processor`` and yielded as ``{"pixel_values": ..., "labels": ...}``.

    Args:
        dataset_split: Iterable of examples exposing ``"image"`` and ``"label"``.
        total_samples: Maximum number of examples yielded per pass; also the
            value reported by ``len()``.
        num_classes: Number of classes used to derive the per-class quota.
        augment: Apply the random rotation / translation / colour-jitter
            pipeline. Defaults to True (the previous behaviour); pass False for
            validation and test data so evaluation is not randomly perturbed.
    """

    def __init__(self, dataset_split, total_samples, num_classes=16, augment=True):
        self.dataset = dataset_split
        self.total_samples = total_samples
        self.target_per_class = total_samples // num_classes  # ~1250 for 20,000
        self.num_classes = num_classes
        self._epoch = 0
        self.use_augmentation = augment
        self.augment = transforms.Compose([
            transforms.RandomRotation(10),
            transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
            transforms.ColorJitter(brightness=0.2, contrast=0.2),
        ])

    def __iter__(self):
        """Yield processed examples until the per-class quotas or the total cap is hit."""
        # A zero quota (total_samples < num_classes) can never yield anything;
        # return immediately instead of reading the whole stream for nothing.
        if self.target_per_class <= 0:
            return

        # Fresh counters on every pass, so each iteration restarts the selection.
        class_counts = defaultdict(int)
        samples_yielded = 0
        for example in self.dataset:
            label = example["label"]
            if class_counts[label] >= self.target_per_class:
                continue  # this class already reached its quota
            class_counts[label] += 1

            image = example["image"].convert("RGB")
            if self.use_augmentation:
                image = self.augment(image)
            inputs = processor(images=image, return_tensors="pt")
            yield {
                # The processor returns a batch of one; drop the batch axis.
                "pixel_values": inputs["pixel_values"].squeeze(0),
                "labels": label
            }

            samples_yielded += 1
            if samples_yielded >= self.total_samples:
                break

    def __len__(self):
        """Return the configured sample cap (an upper bound on items per pass)."""
        return self.total_samples

    def set_epoch(self, epoch: int):
        """Record the current epoch; ``__iter__`` does not currently read it."""
        self._epoch = epoch

train_size = 20000  # 20,000 samples
val_size = 2000
test_size = 2000
# Augment only the training stream; validation and test images are left
# unmodified so the reported metrics are not perturbed by random transforms.
train_dataset = BalancedStreamingDataset(dataset["train"], train_size)
val_dataset = BalancedStreamingDataset(dataset["validation"], val_size, augment=False)
test_dataset = BalancedStreamingDataset(dataset["test"], test_size, augment=False)
print(f"Training size: {train_size}, Validation size: {val_size}, Test size: {test_size}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Training size: 20000, Validation size: 2000, Test size: 2000


In [None]:
# Step 3: Load Pre-trained ViT Model
# id2label / label2id are stored in the model config, so the checkpoint saved
# later carries the real class names instead of generic LABEL_<i> placeholders.
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=num_labels,
    id2label=label_map,
    label2id={name: idx for idx, name in label_map.items()},
    ignore_mismatched_sizes=True
)
model.to(device)
print(f"GPU memory allocated: {torch.cuda.memory_allocated(device) / 1024**2:.2f} MB")

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPU memory allocated: 327.34 MB


In [None]:
# Step 4: Define Metrics
def compute_metrics(eval_pred):
    """Turn an evaluation ``(logits, labels)`` pair into a metrics dict.

    The predicted class is the arg-max over the last axis of the logits.
    Precision, recall and F1 use ``average="weighted"``.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    scores, targets = eval_pred
    predicted = np.argmax(scores, axis=-1)
    weighted_p, weighted_r, weighted_f1, _support = precision_recall_fscore_support(
        targets, predicted, average="weighted"
    )
    metrics = {"accuracy": accuracy_score(targets, predicted)}
    metrics.update(precision=weighted_p, recall=weighted_r, f1=weighted_f1)
    return metrics

In [None]:
# Step 5: Set Up Training Arguments
training_args = TrainingArguments(
    # Where checkpoints and logs are written, and how often to log.
    output_dir="./rvl_cdip_vit",
    logging_dir="./logs",
    logging_steps=50,
    # Evaluate and checkpoint once per epoch; at the end, reload the checkpoint
    # with the best "accuracy" value.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Optimisation settings.
    num_train_epochs=7,
    learning_rate=3e-5,
    weight_decay=0.01,
    fp16=True,
    # Batching: 8 per device x 8 accumulation steps.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,  # Effective batch size 64
)

In [None]:
# Step 6: Train the Model with Early Stopping
# Early-stopping callback configured with a patience of 2.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    callbacks=[early_stopping]
)

print("Starting training...")
trainer.train()

allocated_mb = torch.cuda.memory_allocated(device) / 1024**2
print(f"GPU memory allocated post-training: {allocated_mb:.2f} MB")

Starting training...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mgavhaneprasad14092001[0m ([33mgavhaneprasad14092001-indian-school-of-mines[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4753,1.456652,0.6325,0.654019,0.6325,0.62184
2,1.1496,1.163137,0.7005,0.71386,0.7005,0.69905
3,0.9907,1.047009,0.7205,0.747403,0.7205,0.723401
4,0.8442,0.920603,0.7575,0.779007,0.7575,0.761513
5,0.7979,0.855802,0.777,0.791837,0.777,0.780083
6,0.6795,0.801961,0.7895,0.794783,0.7895,0.790739


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 438958ce-54be-440f-8624-f48b32e70980)')' thrown while requesting GET https://huggingface.co/datasets/rvl_cdip/resolve/main/data/rvl-cdip.tar.gz
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 5e41a995-8973-47b7-a30a-bd4a885d803f)')' thrown while requesting GET https://huggingface.co/datasets/rvl_cdip/resolve/main/data/rvl-cdip.tar.gz
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: f2f72e2c-1ffd-444c-a8e4-aaf3475cae61)')' thrown while requesting GET https://huggingface.co/datasets/rvl_cdip/resolve/main/data/rvl-cdip.tar.gz
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)")

GPU memory allocated post-training: 1000.00 MB


In [None]:
# Step 7: Evaluate on Test Set
# Reuse the test_dataset built in Step 2: every iteration starts the stream
# again with fresh per-class counters, so no new instance is needed.
# A single predict() pass supplies both the aggregate metrics and the raw
# predictions; calling evaluate() and then predict() would run the slow
# streamed test set twice. metric_key_prefix="eval" keeps the eval_* key names.
predictions = trainer.predict(test_dataset, metric_key_prefix="eval")
test_results = predictions.metrics
print("Test Results:", test_results)

preds = np.argmax(predictions.predictions, axis=-1)
labels = predictions.label_ids

# Per-class accuracy for the finance-related classes: form, budget, invoice, memo.
financial_classes = [1, 10, 11, 15]
for cls in financial_classes:
    mask = labels == cls
    # Report 0 when the class is absent from the evaluated samples.
    acc = accuracy_score(labels[mask], preds[mask]) if mask.any() else 0
    print(f"Accuracy for {label_map[cls]} (label {cls}): {acc:.4f}")

Test Results: {'eval_loss': 0.8201947212219238, 'eval_accuracy': 0.7885, 'eval_precision': 0.7906251192026067, 'eval_recall': 0.7885, 'eval_f1': 0.7887677059200332, 'eval_runtime': 451.1928, 'eval_samples_per_second': 4.433, 'eval_steps_per_second': 0.554, 'epoch': 6.9792}


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 3846b642-2059-429b-9bd7-c40703be98a9)')' thrown while requesting GET https://huggingface.co/datasets/rvl_cdip/resolve/main/data/rvl-cdip.tar.gz
Retrying in 1s [Retry 1/5].


Accuracy for form (label 1): 0.6160
Accuracy for budget (label 10): 0.6240
Accuracy for invoice (label 11): 0.7280
Accuracy for memo (label 15): 0.7120


In [None]:
# Step 8: Save the Model
model.save_pretrained("./rvl_cdip_vit_model")
processor.save_pretrained("./rvl_cdip_vit_model")
!du -sh ./rvl_cdip_vit_model
!df -h

# Optional: Save to Google Drive
from google.colab import drive
drive.mount('/content/drive')
!cp -r ./rvl_cdip_vit_model /content/drive/MyDrive/rvl_cdip_vit_model

328M	./rvl_cdip_vit_model
Filesystem      Size  Used Avail Use% Mounted on
overlay         113G   47G   67G  42% /
tmpfs            64M     0   64M   0% /dev
shm             5.7G   16K  5.7G   1% /dev/shm
/dev/root       2.0G  1.2G  820M  59% /usr/sbin/docker-init
/dev/sda1        92G   73G   20G  79% /opt/bin/.nvidia
tmpfs           6.4G  1.1M  6.4G   1% /var/colab
tmpfs           6.4G     0  6.4G   0% /proc/acpi
tmpfs           6.4G     0  6.4G   0% /proc/scsi
tmpfs           6.4G     0  6.4G   0% /sys/firmware
