# HuggingFace の `datasets` と ViT を用いた MNIST 画像分類

## 0. 必要なライブラリ

In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [23]:
!pip install datasets transformers accelerate evaluate



In [24]:
!pip install scikit-learn



In [25]:
!pip install ipywidgets



In [1]:
import torch

# Sanity-check the environment: print the installed PyTorch version and
# whether a CUDA device is available, one value per line.
print(torch.__version__, torch.cuda.is_available(), sep="\n")

2.7.1+cu118
True


## 1. データの読み込み

In [2]:
from datasets import load_dataset

# Load MNIST (28x28 monochrome images). The printed summary lists the
# available splits together with their features and row counts.
dataset = load_dataset(path="mnist")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 60000
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 10000
    })
})


## 2. 画像の変換処理（28×28 → 224×224）

### ① 画像を `float` テンソルに変換し、正規化する変換（transform）を定義

In [3]:
from transformers import AutoImageProcessor
from torchvision.transforms import Compose, Resize, ToTensor, Grayscale, Normalize

# Image processor that belongs to the ViT checkpoint; its mean/std are reused
# below so the inputs are normalized with the checkpoint's own statistics.
image_processor = AutoImageProcessor.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    use_fast=True,
)

# Per-channel normalization driven by the image processor's mean and std.
normalize = Normalize(
    mean=image_processor.image_mean,
    std=image_processor.image_std,
)

# torchvision pipeline, applied in order:
#   1. resize to 224x224
#   2. expand the grayscale image to 3 channels
#   3. convert the PIL image to a tensor
#   4. normalize
transform = Compose(
    [
        Resize((224, 224)),
        Grayscale(num_output_channels=3),
        ToTensor(),
        normalize,
    ]
)

### ② `datasets.map()` で前処理を適用し、変換後の `tensor` を `pixel_values` 列として保存

In [None]:
# Preprocessing function (intended to be passed to datasets.map)
def preprocess(example):
    """Add a ``pixel_values`` entry holding the transformed image.

    Args:
        example: A single record with an ``"image"`` entry.

    Returns:
        The same ``example`` object, updated in place with
        ``example["pixel_values"] = transform(example["image"])``.
    """
    example["pixel_values"] = transform(example["image"])
    return example

# Apply the preprocessing to all of the data and drop the raw "image" column
# once "pixel_values" has been produced from it.
dataset = dataset.map(function=preprocess, remove_columns=["image"])

### ③ `Trainer` が学習時に `PyTorch tensor` として読み込めるように変換

In [6]:
# Use set_format so that the "pixel_values" and "label" columns are returned
# as PyTorch tensors when the dataset is read.
dataset.set_format("torch", columns=["pixel_values", "label"])

## 3. ViTモデルの準備

In [7]:
from transformers import AutoModelForImageClassification

# Load the pretrained `google/vit-base-patch16-224-in21k` checkpoint with a
# 10-class classification head for MNIST.
model = AutoModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=10,  # MNIST has the classes 0-9
)

# Move the model to the GPU when CUDA is available and stay on the CPU
# otherwise, so this cell does not raise on a machine without CUDA.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Show the device the model currently lives on and whether CUDA is available,
# one value per line.
print(model.device, torch.cuda.is_available(), sep="\n")

cuda:0
True


## 4. 学習と評価

In [None]:
import torch

# Re-check the PyTorch version and CUDA availability before training, one
# value per line.
print(torch.__version__, torch.cuda.is_available(), sep="\n")

In [None]:
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np

# Load the "accuracy" metric via `evaluate`; compute_metrics calls its
# `compute` method with the predicted and reference labels.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score predictions with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A two-item iterable ``(logits, labels)``.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max class of each
        row of logits compared against ``labels``.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return accuracy.compute(predictions=predicted_classes, references=references)

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./mnist-vit",
    logging_dir="./logs",
    # No external experiment tracker.
    report_to="none",
    # Training schedule.
    num_train_epochs=3,
    per_device_train_batch_size=64,
    # Evaluate and save a checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Data-loading settings.
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
)

# Wire the model, the training configuration, the train/test splits and the
# metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # The image processor is passed via `processing_class`; the older
    # `tokenizer=` keyword is deprecated for this purpose.
    processing_class=image_processor,
    compute_metrics=compute_metrics,
)

trainer.train()  # run training