# Train and evaluate the student without distillation to get a baseline

In [None]:
!pip install transformers datasets accelerate tensorboard evaluate --upgrade

Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting accelerate
  Downloading accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Collecting tensorboard
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (f

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from datasets import load_dataset, DatasetDict
from transformers import (
    ViTFeatureExtractor, ViTForImageClassification,
    TrainingArguments, Trainer, DefaultDataCollator
)
import evaluate
from sklearn.model_selection import train_test_split

In [None]:
dataset = load_dataset("cifar10")

train_subset_size = 5000
val_subset_size = 1000
test_subset_size = 2000

train_data = dataset["train"].shuffle(seed=42)
train_indices, val_indices = train_test_split(range(len(train_data)), test_size=val_subset_size, random_state=42)

dataset["train"] = train_data.select(range(train_subset_size))
dataset["validation"] = train_data.select(val_indices)
dataset["test"] = dataset["test"].shuffle(seed=42).select(range(test_subset_size))

# Load the feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained("nateraw/vit-base-patch16-224-cifar10")

# Preprocessing function
def process(examples):
    inputs = feature_extractor(examples["img"], return_tensors="np")
    examples["pixel_values"] = inputs["pixel_values"]
    return examples

# Apply the preprocessing to the dataset
processed_datasets = dataset.map(process, batched=True, remove_columns=["img"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/120M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/23.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/918 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# lightweight CNN for CIFAR-10
class SmallCNN(nn.Module):
    def __init__(self, num_classes=10, input_size=224):
        super(SmallCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.5)

        self._flattened_size = self._get_flattened_size(input_size)

        self.fc1 = nn.Linear(self._flattened_size, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def _get_flattened_size(self, input_size):
        with torch.no_grad():
            x = torch.zeros(1, 3, input_size, input_size)
            x = self.pool(F.relu(self.conv1(x)))
            x = self.pool(F.relu(self.conv2(x)))
            x = self.pool(F.relu(self.conv3(x)))
        return x.numel()

    def forward(self, pixel_values, labels=None):
        x = self.pool(F.relu(self.conv1(pixel_values)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(x.size(0), -1)
        x = self.dropout(F.relu(self.fc1(x)))
        logits = self.fc2(x)

        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)

        return {"logits": logits, "loss": loss}

In [None]:
# Initialize the small CNN model
num_labels = len(processed_datasets["train"].features["label"].names)
small_cnn_model = SmallCNN(num_classes=num_labels)

# Define Training Arguments
baseline_training_args = TrainingArguments(
    output_dir="small-cnn-baseline",
    num_train_epochs=10,  # Fewer epochs for quick testing
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    learning_rate=1e-3,
    logging_dir=None,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    push_to_hub=False
)

In [None]:
# Evaluation metric
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    acc = accuracy.compute(references=labels, predictions=preds)
    return {"accuracy": acc["accuracy"]}

In [None]:
data_collator = DefaultDataCollator()

# Trainer for baseline training
baseline_trainer = Trainer(
    model=small_cnn_model,
    args=baseline_training_args,
    train_dataset=processed_datasets["train"],
    eval_dataset=processed_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [5]:
# Train the lightweight CNN baseline
baseline_trainer.train()

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,1.9554,1.650326,0.397
2,1.6783,1.405208,0.489
3,1.5294,1.204969,0.576
4,1.3609,1.04474,0.659
5,1.1568,0.793216,0.738
6,0.9441,0.544634,0.833
7,0.7106,0.342877,0.918
8,0.511,0.198239,0.956


Epoch,Training Loss,Validation Loss,Accuracy
1,1.9554,1.650326,0.397
2,1.6783,1.405208,0.489
3,1.5294,1.204969,0.576
4,1.3609,1.04474,0.659
5,1.1568,0.793216,0.738
6,0.9441,0.544634,0.833
7,0.7106,0.342877,0.918
8,0.511,0.198239,0.956
9,0.3744,0.138284,0.972
10,0.3109,0.107329,0.982


TrainOutput(global_step=1570, training_loss=1.0531955330235183, metrics={'train_runtime': 5813.7719, 'train_samples_per_second': 8.6, 'train_steps_per_second': 0.27, 'total_flos': 0.0, 'train_loss': 1.0531955330235183, 'epoch': 10.0})

In [6]:
# Evaluate on the test set
baseline_test_results = baseline_trainer.evaluate(processed_datasets["test"])
print("Baseline test accuracy: ", baseline_test_results["eval_accuracy"])

Baseline test accuracy:  0.5265
