In [6]:
from datasets import load_dataset

dataset = load_dataset("imdb", split={"train": "train[:2000]", "test": "test[:500]"})

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=256)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 2000/2000 [00:00<00:00, 3021.36 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 2914.84 examples/s]


In [9]:
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [10]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results", 
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8, 
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.000279
2,0.013000,9.4e-05
3,0.013000,6.8e-05




TrainOutput(global_step=750, training_loss=0.00866904636969169, metrics={'train_runtime': 6361.5154, 'train_samples_per_second': 0.943, 'train_steps_per_second': 0.118, 'total_flos': 397402195968000.0, 'train_loss': 0.00866904636969169, 'epoch': 3.0})

In [11]:
trainer.evaluate()

{'eval_loss': 6.762630073353648e-05,
 'eval_runtime': 74.0586,
 'eval_samples_per_second': 6.751,
 'eval_steps_per_second': 0.851,
 'epoch': 3.0}