## Initial Experiment with 5,000 Samples

- **Purpose:** Test ViT-Base on a small subset to establish a baseline for document classification.  
- **Details:** Unstratified sampling (the first 5,000 streamed examples, with no class balancing) led to uneven class distribution, resulting in lower accuracy (**59.15%** on the test subset).  
- **Outcome:** Identified need for balanced sampling, implemented in later experiments.  


In [None]:
# Installing required libraries
!pip install -q transformers datasets torch torchvision accelerate

import torch
from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer
from datasets import load_dataset, IterableDataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Check GPU and resources
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print(f"GPU name: {torch.cuda.get_device_name(0)}")
!free -h  # Check RAM
!df -h   # Check disk space

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m76.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Step 2: Load Dataset with Streaming
# Open RVL-CDIP from the Hub in streaming mode.
dataset = load_dataset("aharley/rvl_cdip", streaming=True)

# Class names are listed in label-id order; enumerate() assigns ids 0..15.
label_map = dict(enumerate([
    "letter",
    "form",
    "email",
    "handwritten",
    "advertisement",
    "scientific report",
    "scientific publication",
    "specification",
    "file folder",
    "news article",
    "budget",
    "invoice",
    "presentation",
    "questionnaire",
    "resume",
    "memo",
]))
num_labels = len(label_map)

# Image preprocessing that matches the ViT checkpoint fine-tuned below.
processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")

class StreamingDataset(IterableDataset):
    """Bounded, preprocessed view over a streamed dataset split.

    Iterating yields at most ``max_samples`` examples, in the order the
    underlying split produces them (no shuffling or class balancing happens
    here). Each example's image is converted to RGB and passed through the
    module-level ``processor`` before being yielded.

    Args:
        dataset_split: Iterable of examples that expose an ``"image"`` entry
            (an object with ``.convert("RGB")``) and a ``"label"`` entry.
        max_samples: Upper bound on the number of examples yielded per pass.

    NOTE(review): ``IterableDataset`` is the name imported from ``datasets`` at
    the top of the notebook, and its ``__init__`` is not invoked here — confirm
    this is intentional.
    """

    def __init__(self, dataset_split, max_samples):
        self.dataset = dataset_split
        self.max_samples = max_samples
        self._epoch = 0  # Last epoch number received through set_epoch()

    def __iter__(self):
        """Yield ``{"pixel_values", "labels"}`` dicts for the first ``max_samples`` examples."""
        from itertools import islice

        # islice stops after max_samples items without requesting one more from
        # the stream, so no extra example is fetched just to be discarded.
        # max(..., 0) keeps a negative cap behaving as "yield nothing".
        for example in islice(self.dataset, max(self.max_samples, 0)):
            image = example["image"].convert("RGB")
            inputs = processor(images=image, return_tensors="pt")
            yield {
                # squeeze(0) removes the leading size-1 batch axis of the processor output.
                "pixel_values": inputs["pixel_values"].squeeze(0),
                "labels": example["label"],
            }

    def __len__(self):
        """Return the configured cap ``max_samples``; this is not a measured count."""
        return self.max_samples

    def set_epoch(self, epoch: int):
        """Record the epoch number; iteration order is not affected by it."""
        self._epoch = epoch  # Update epoch value as required by Trainer

# Number of examples taken from each split for this small-scale run.
train_size, val_size, test_size = 5000, 2000, 2000

# One capped StreamingDataset per split, built in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    StreamingDataset(dataset[split_name], cap)
    for split_name, cap in (
        ("train", train_size),
        ("validation", val_size),
        ("test", test_size),
    )
)

print(f"Training size: {train_size}, Validation size: {val_size}, Test size: {test_size}")

Training size: 5000, Validation size: 2000, Test size: 2000


In [None]:
# Step 3: Load Pre-trained ViT Model
# id2label / label2id store the class names in the model config, so the
# checkpoint saved later reports document-type names instead of generic ids.
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=num_labels,
    id2label=label_map,
    label2id={name: idx for idx, name in label_map.items()},
    ignore_mismatched_sizes=True
)
model.to(device)
if device.type == "cuda":
    print(f"GPU memory allocated: {torch.cuda.memory_allocated(device) / 1024**2:.2f} MB")
else:
    # The CUDA allocator figure is only meaningful when the model lives on a GPU.
    print("GPU memory allocated: n/a (running on CPU)")

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPU memory allocated: 654.69 MB


In [None]:
# Step 4: Define Metrics
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each
            sample is the argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 scores an undefined per-class precision/recall as 0 — the
    # same value the default setting uses — without emitting a warning on
    # every evaluation in which some class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="weighted", zero_division=0
    )
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [None]:
# Step 5: Set Up Training Arguments
# Effective batch size per device is 4 x 8 = 32 samples per optimizer step.
# NOTE(review): the recorded run reports metrics for epochs 1 and 2 only and a
# final epoch of 2.9856 although num_train_epochs=3 — presumably because 5000
# training samples is not a multiple of the 32-sample effective batch; confirm.
training_args = TrainingArguments(
    output_dir="./rvl_cdip_vit",
    eval_strategy="epoch",
    save_strategy="epoch",  # matches eval_strategy, as load_best_model_at_end compares per-eval checkpoints
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="./logs",
    logging_steps=50,
    # Mixed precision needs a CUDA device; fall back to full precision on the
    # CPU path that the device-selection cell allows for.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=8,
)

In [None]:
# Step 6: Train the Model
cuda_ready = torch.cuda.is_available()
print("Checking GPU availability before training...")
print(f"GPU available: {cuda_ready}")
if cuda_ready:
    # These queries require a CUDA device, so they are skipped on CPU-only runtimes.
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
print("Starting training...")
trainer.train()
if cuda_ready:
    print(f"GPU memory allocated post-training: {torch.cuda.memory_allocated(device) / 1024**2:.2f} MB")

Checking GPU availability before training...
GPU available: True
Current device: 0
Device name: Tesla T4
Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1711,2.079495,0.5195,0.503776,0.5195,0.472049
2,1.6447,1.750604,0.5915,0.61107,0.5915,0.559292


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


GPU memory allocated post-training: 1325.74 MB


In [None]:
# Step 7: Evaluate on Test Set
test_dataset = StreamingDataset(dataset["test"], test_size)

# A single predict() pass returns both the aggregate metrics and the raw
# predictions, so the test stream is fetched and scored once instead of twice
# (evaluate() followed by predict()). metric_key_prefix="eval" keeps the
# "eval_*" metric names that evaluate() reported.
# NOTE: compared with evaluate(), the printed dict may omit the 'epoch' entry.
predictions = trainer.predict(test_dataset, metric_key_prefix="eval")
test_results = predictions.metrics
print("Test Results:", test_results)

preds = np.argmax(predictions.predictions, axis=-1)
labels = predictions.label_ids

# Per-class accuracy: among the samples whose true label is cls, the fraction
# that was predicted as cls. A class with no test samples is reported as 0.
financial_classes = [1, 10, 11, 15]  # form, budget, invoice, memo
for cls in financial_classes:
    mask = labels == cls
    cls_preds = preds[mask]
    cls_labels = labels[mask]
    acc = accuracy_score(cls_labels, cls_preds) if len(cls_labels) > 0 else 0
    print(f"Accuracy for {label_map[cls]} (label {cls}): {acc:.4f}")

Test Results: {'eval_loss': 1.7476862668991089, 'eval_accuracy': 0.5915, 'eval_precision': 0.5596926554267293, 'eval_recall': 0.5915, 'eval_f1': 0.5564453816774622, 'eval_runtime': 158.5213, 'eval_samples_per_second': 12.617, 'eval_steps_per_second': 3.154, 'epoch': 2.9856}
Accuracy for form (label 1): 0.2126
Accuracy for budget (label 10): 0.4275
Accuracy for invoice (label 11): 0.6975
Accuracy for memo (label 15): 0.1788


In [None]:
# Step 8: Save the Model
model.save_pretrained("./rvl_cdip_vit_model")
processor.save_pretrained("./rvl_cdip_vit_model")
!du -sh ./rvl_cdip_vit_model
!df -h

328M	./rvl_cdip_vit_model
Filesystem      Size  Used Avail Use% Mounted on
overlay         113G   43G   70G  38% /
tmpfs            64M     0   64M   0% /dev
shm             5.7G   24K  5.7G   1% /dev/shm
/dev/root       2.0G  1.2G  820M  59% /usr/sbin/docker-init
/dev/sda1        92G   71G   22G  77% /opt/bin/.nvidia
tmpfs           6.4G  308K  6.4G   1% /var/colab
tmpfs           6.4G     0  6.4G   0% /proc/acpi
tmpfs           6.4G     0  6.4G   0% /proc/scsi
tmpfs           6.4G     0  6.4G   0% /sys/firmware
