# `sms-spam` をTransformerを使って学習してみる

# 1. データの読み込みと前処理

In [8]:
from datasets import load_dataset
from transformers import AutoTokenizer

# データセット読み込み
dataset = load_dataset("sms_spam")

# 2クラス（ham or spam）
label_names = dataset["train"].features["label"].names

# 80% train / 20% test に分割
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

# トークナイザ
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

# トークナイズ関数
def tokenize_fn(example):
    return tokenizer(example["sms"], truncation=True, padding="max_length", max_length=128)

# 全体に適用
tokenized_dataset = split_dataset.map(tokenize_fn, batched=True)

# モデル用の column を整える
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

## 2. モデルの準備

In [9]:
from transformers import AutoModelForSequenceClassification

# 2クラス分類用の DistilBERT
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 3. 学習設定と `Trainer` の利用

In [10]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert-sms-spam",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_steps=10,
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"]
)

In [11]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0911,0.032209
2,0.0458,0.060291
3,0.0004,0.061173


TrainOutput(global_step=837, training_loss=0.03341491820052569, metrics={'train_runtime': 124.2873, 'train_samples_per_second': 107.63, 'train_steps_per_second': 6.734, 'total_flos': 443004097955328.0, 'train_loss': 0.03341491820052569, 'epoch': 3.0})

## 4. 精度の評価

In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 評価用関数
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

trainer.compute_metrics = compute_metrics

# 評価
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.03220876678824425, 'eval_accuracy': 0.9919282511210762, 'eval_precision': 0.9861111111111112, 'eval_recall': 0.9530201342281879, 'eval_f1': 0.9692832764505119, 'eval_runtime': 1.7078, 'eval_samples_per_second': 652.895, 'eval_steps_per_second': 40.989, 'epoch': 3.0}
