# HuggingFace の `datasets` を用いてMNIST

## 0. 必要なライブラリ

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip install datasets transformers accelerate evaluate

In [None]:
!pip install scikit-learn

In [1]:
import torch

print(torch.__version__)
print(torch.cuda.is_available())

2.7.1+cu118
True


## 1. データの読み込み

In [2]:
from datasets import load_dataset

# MNISTの読み込み（28×28モノクロ画像）
dataset = load_dataset("mnist")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 60000
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 10000
    })
})


## 2. 画像の変換処理（28×28 → 224×224）

In [3]:
from transformers import AutoImageProcessor
from torchvision.transforms import Compose, Resize, ToTensor, Grayscale

# 画像前処理器（モデルに合ったサイズにする）
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k", use_fast=True)

# torchvisionで画像変換
transform = Compose([
    Resize((224, 224)),
    Grayscale(num_output_channels=3),
    ToTensor()
])

# 前処理関数を定義（datasets.map に使う）
def preprocess(example):
    image = transform(example["image"])
    example["pixel_values"] = image
    return example

# 前処理を全データに適用
dataset = dataset.map(preprocess)

Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

## 3. ViTモデルの準備

In [4]:
from transformers import AutoModelForImageClassification

model = AutoModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=10  # MNISTは 0〜9
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 4. 学習と評価

In [5]:
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=preds, references=labels)

training_args = TrainingArguments(
    output_dir="./mnist-vit",
    per_device_train_batch_size=32,
    save_strategy="epoch",
    num_train_epochs=3,
    logging_dir="./logs",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=image_processor,  # ViTでは ImageProcessor を tokenizer として使う
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
500,0.338
1000,0.0701
1500,0.0452
2000,0.0364
2500,0.0228
3000,0.0174
3500,0.0178
4000,0.0103
4500,0.0087
5000,0.0086


TrainOutput(global_step=5625, training_loss=0.05180937139723036, metrics={'train_runtime': 17532.2719, 'train_samples_per_second': 10.267, 'train_steps_per_second': 0.321, 'total_flos': 1.394955826274304e+19, 'train_loss': 0.05180937139723036, 'epoch': 3.0})

## 5. 推論

In [6]:
import torch

image = dataset["test"][0]["image"]
inputs = image_processor(image.convert("RGB"), return_tensors="pt").to(model.device)
with torch.no_grad():
    logits = model(**inputs).logits
    pred = logits.argmax(-1).item()

print(f"予測: {pred}, 正解: {dataset['test'][0]['label']}")

予測: 7, 正解: 7
