# HuggingFace の `datasets` を用いてMNIST

## 0. 必要なライブラリ

In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [23]:
!pip install datasets transformers accelerate evaluate



In [24]:
!pip install scikit-learn



In [25]:
!pip install ipywidgets



In [1]:
import torch

print(torch.__version__)
print(torch.cuda.is_available())

2.7.1+cu118
True


## 1. データの読み込み

In [2]:
from datasets import load_dataset

# MNISTの読み込み（28×28モノクロ画像）
dataset = load_dataset("mnist")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 60000
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 10000
    })
})


## 2. 画像の変換処理（28×28 → 224×224）

### ① 画像を `float` & `normalized` で保存

In [3]:
from transformers import AutoImageProcessor
from torchvision.transforms import Compose, Resize, ToTensor, Grayscale, Normalize

# 画像前処理器（モデルに合ったサイズにする）
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k", use_fast=True)

# transformersのImageProcessorで正規化 + PIL to Tensor
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)

# torchvisionで画像変換
transform = Compose([
    Resize((224, 224)),
    Grayscale(num_output_channels=3),
    ToTensor(),
    normalize
])

### ② `datasets.map()` で `tensor` を `numpy` に変換して保存

In [4]:
# 前処理関数を定義（datasets.map に使う）
def preprocess(example):
    image = transform(example["image"])
    example["pixel_values"] = image

    return example

# 前処理を全データに適用
dataset = dataset.map(preprocess, remove_columns=["image"])

### ③ `Trainer` が学習時に `PyTorch tensor` として読み込めるように変換

In [5]:
# set_formatでテンソルに変換
dataset.set_format(type="torch", columns=["pixel_values", "label"])

## 3. ViTモデルの準備

In [6]:
from transformers import AutoModelForImageClassification

model = AutoModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=10  # MNISTは 0〜9
)

model = model.to("cuda")

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
print(model.device)
print(torch.cuda.is_available())

cuda:0
True


## 4. 学習と評価

In [8]:
import torch

print(torch.__version__)
print(torch.cuda.is_available())

2.7.1+cu118
True


In [9]:
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=preds, references=labels)

training_args = TrainingArguments(
    output_dir="./mnist-vit-2",
    per_device_train_batch_size=64,
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    logging_dir="./logs",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=image_processor,  # ViTでは ImageProcessor を tokenizer として使う
    compute_metrics=compute_metrics
)

trainer.train()  # 学習

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3067,0.034569,0.9958
2,0.0308,0.019926,0.9966
3,0.0131,0.016937,0.997


TrainOutput(global_step=2814, training_loss=0.07696520214650168, metrics={'train_runtime': 23513.7465, 'train_samples_per_second': 7.655, 'train_steps_per_second': 0.12, 'total_flos': 1.394955826274304e+19, 'train_loss': 0.07696520214650168, 'epoch': 3.0})

## 5. ログを残す

In [10]:
for log in trainer.state.log_history:
    print(log)

{'loss': 0.3067, 'grad_norm': 0.43833282589912415, 'learning_rate': 4.113361762615494e-05, 'epoch': 0.5330490405117271, 'step': 500}
{'eval_loss': 0.034569066017866135, 'eval_accuracy': 0.9958, 'eval_runtime': 333.8142, 'eval_samples_per_second': 29.957, 'eval_steps_per_second': 3.745, 'epoch': 1.0, 'step': 938}
{'loss': 0.0569, 'grad_norm': 1.1011090278625488, 'learning_rate': 3.224946695095949e-05, 'epoch': 1.0660980810234542, 'step': 1000}
{'loss': 0.0308, 'grad_norm': 0.030460184440016747, 'learning_rate': 2.3365316275764036e-05, 'epoch': 1.5991471215351813, 'step': 1500}
{'eval_loss': 0.019925637170672417, 'eval_accuracy': 0.9966, 'eval_runtime': 335.2159, 'eval_samples_per_second': 29.832, 'eval_steps_per_second': 3.729, 'epoch': 2.0, 'step': 1876}
{'loss': 0.0187, 'grad_norm': 0.12122004479169846, 'learning_rate': 1.4481165600568586e-05, 'epoch': 2.1321961620469083, 'step': 2000}
{'loss': 0.0131, 'grad_norm': 0.01887434348464012, 'learning_rate': 5.597014925373135e-06, 'epoch': 