In [1]:
# Install the notebook's dependencies into the kernel's environment:
# `datasets` and `evaluate`, `transformers` with its `torch` extra, and an
# upgraded (`-U`) `accelerate`.
# NOTE(review): no versions are pinned, so a fresh install may resolve to
# releases other than the ones that produced the recorded outputs — pin them
# once the working versions are known.
%pip install datasets evaluate
%pip install transformers[torch]
%pip install accelerate -U

# Restart the runtime/kernel after installing so the imports below pick up
# the freshly installed packages.

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
# Prefer the GPU when PyTorch reports one as available; otherwise use the CPU.
# NOTE(review): `device` is not referenced by any later cell visible here.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# 1. Data preparation

In [3]:
from datasets import load_dataset

# Load the "imdb" dataset; later cells read its "text" column and its
# "train" and "test" splits.
imdb = load_dataset("imdb")

# Class-name <-> class-index lookups passed to `from_pretrained` when the
# model is built. `id2label` is derived from `label2id` so the two mappings
# cannot drift apart.
label2id = {"NEGATIVE": 0, "POSITIVE": 1}
id2label = {index: name for name, index in label2id.items()}

In [4]:
from transformers import AutoTokenizer

# Load the tokenizer for "distilbert-base-uncased", the same checkpoint name
# the classification model is loaded from.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    The tokenizer is called with ``truncation=True`` and no padding argument;
    padding is left to the ``DataCollatorWithPadding`` used at training time.

    Args:
        examples: A mapping with a ``"text"`` entry (the notebook calls this
            through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's output for ``examples["text"]``.
    """
    reviews = examples["text"]
    return tokenizer(reviews, truncation=True)


# Apply preprocess_function across the dataset, a batch of examples at a time
# (batched=True), keeping the result under a new name so `imdb` stays raw.
tokenized_imdb = imdb.map(preprocess_function, batched=True)

In [5]:
from transformers import DataCollatorWithPadding

# Collator built from the tokenizer and handed to the Trainer as
# `data_collator`; preprocess_function only truncates, so padding is applied
# when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 2. Model

In [6]:
from transformers import AutoModelForSequenceClassification

# Load "distilbert-base-uncased" with a 2-class sequence-classification head.
# The recorded output notes that the classifier / pre_classifier weights are
# newly initialized, i.e. they are learned during fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Display the model's module tree as the cell output.
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

# 3. Training

In [8]:
import numpy as np
import evaluate

# Load the "accuracy" metric from `evaluate`; compute_metrics calls its
# `.compute(predictions=..., references=...)`.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the ``accuracy`` metric.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``.

    Returns:
        The result of ``accuracy.compute`` for the argmax (over axis 1) of
        the predictions, compared against ``labels``.
    """
    scores, references = eval_pred
    # Collapse per-class scores to a single predicted class index per row.
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [9]:
from transformers import TrainingArguments, Trainer


# Fine-tuning configuration: evaluate and checkpoint once per epoch, and
# reload the best checkpoint when training finishes.
# NOTE(review): in the recorded run, eval_loss is lowest after epoch 2 and
# rises afterwards while the training loss keeps falling, so 10 epochs looks
# like more than needed — consider fewer epochs or early stopping.
training_args = TrainingArguments(
    output_dir="out_dir",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Without a limit, all 10 per-epoch checkpoints accumulate in out_dir.
    # Combined with load_best_model_at_end, a limit of 2 still retains the
    # best checkpoint alongside the most recent one.
    save_total_limit=2,
    load_best_model_at_end=True,
)

# NOTE(review): eval_dataset is the "test" split and the best checkpoint is
# chosen from those evaluations, so the reported test accuracy doubles as the
# selection criterion. Consider holding out part of "train" for validation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned TrainOutput is shown as the cell's result.
trainer.train()


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/aivn12s1/.local/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


  0%|          | 0/15630 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.3195, 'learning_rate': 1.9360204734484968e-05, 'epoch': 0.32}
{'loss': 0.2575, 'learning_rate': 1.872040946896993e-05, 'epoch': 0.64}
{'loss': 0.2321, 'learning_rate': 1.8080614203454897e-05, 'epoch': 0.96}


  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.26567861437797546, 'eval_accuracy': 0.90824, 'eval_runtime': 230.1271, 'eval_samples_per_second': 108.636, 'eval_steps_per_second': 6.792, 'epoch': 1.0}
{'loss': 0.1601, 'learning_rate': 1.744081893793986e-05, 'epoch': 1.28}
{'loss': 0.1577, 'learning_rate': 1.6801023672424827e-05, 'epoch': 1.6}
{'loss': 0.1646, 'learning_rate': 1.616122840690979e-05, 'epoch': 1.92}


  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.24324841797351837, 'eval_accuracy': 0.9296, 'eval_runtime': 230.1882, 'eval_samples_per_second': 108.607, 'eval_steps_per_second': 6.79, 'epoch': 2.0}
{'loss': 0.1056, 'learning_rate': 1.5521433141394756e-05, 'epoch': 2.24}
{'loss': 0.0899, 'learning_rate': 1.488163787587972e-05, 'epoch': 2.56}
{'loss': 0.1003, 'learning_rate': 1.4241842610364684e-05, 'epoch': 2.88}


  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.302964448928833, 'eval_accuracy': 0.92744, 'eval_runtime': 230.2062, 'eval_samples_per_second': 108.598, 'eval_steps_per_second': 6.79, 'epoch': 3.0}
{'loss': 0.0626, 'learning_rate': 1.3602047344849649e-05, 'epoch': 3.2}
{'loss': 0.0669, 'learning_rate': 1.2962252079334613e-05, 'epoch': 3.52}
{'loss': 0.0616, 'learning_rate': 1.2322456813819578e-05, 'epoch': 3.84}


  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.3415318429470062, 'eval_accuracy': 0.93144, 'eval_runtime': 230.2528, 'eval_samples_per_second': 108.576, 'eval_steps_per_second': 6.788, 'epoch': 4.0}
{'loss': 0.0432, 'learning_rate': 1.1682661548304543e-05, 'epoch': 4.16}
{'loss': 0.0364, 'learning_rate': 1.1042866282789508e-05, 'epoch': 4.48}
{'loss': 0.0347, 'learning_rate': 1.0403071017274472e-05, 'epoch': 4.8}


  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.45246097445487976, 'eval_accuracy': 0.92424, 'eval_runtime': 230.2116, 'eval_samples_per_second': 108.596, 'eval_steps_per_second': 6.789, 'epoch': 5.0}
{'loss': 0.027, 'learning_rate': 9.763275751759437e-06, 'epoch': 5.12}
{'loss': 0.0204, 'learning_rate': 9.123480486244403e-06, 'epoch': 5.44}
{'loss': 0.0332, 'learning_rate': 8.483685220729368e-06, 'epoch': 5.76}


  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.4594961404800415, 'eval_accuracy': 0.93112, 'eval_runtime': 230.2599, 'eval_samples_per_second': 108.573, 'eval_steps_per_second': 6.788, 'epoch': 6.0}
{'loss': 0.0217, 'learning_rate': 7.843889955214333e-06, 'epoch': 6.08}
{'loss': 0.0152, 'learning_rate': 7.204094689699297e-06, 'epoch': 6.4}
{'loss': 0.0142, 'learning_rate': 6.5642994241842614e-06, 'epoch': 6.72}


  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.49574753642082214, 'eval_accuracy': 0.92568, 'eval_runtime': 230.3165, 'eval_samples_per_second': 108.546, 'eval_steps_per_second': 6.786, 'epoch': 7.0}
{'loss': 0.014, 'learning_rate': 5.924504158669226e-06, 'epoch': 7.04}
{'loss': 0.009, 'learning_rate': 5.284708893154191e-06, 'epoch': 7.36}
{'loss': 0.0093, 'learning_rate': 4.644913627639156e-06, 'epoch': 7.68}
{'loss': 0.0094, 'learning_rate': 4.005118362124121e-06, 'epoch': 8.0}


  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.5415156483650208, 'eval_accuracy': 0.9288, 'eval_runtime': 230.2925, 'eval_samples_per_second': 108.558, 'eval_steps_per_second': 6.787, 'epoch': 8.0}
{'loss': 0.0068, 'learning_rate': 3.3653230966090854e-06, 'epoch': 8.32}
{'loss': 0.0081, 'learning_rate': 2.72552783109405e-06, 'epoch': 8.64}
{'loss': 0.0064, 'learning_rate': 2.085732565579015e-06, 'epoch': 8.96}


  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.516925573348999, 'eval_accuracy': 0.93056, 'eval_runtime': 230.2854, 'eval_samples_per_second': 108.561, 'eval_steps_per_second': 6.787, 'epoch': 9.0}
{'loss': 0.0056, 'learning_rate': 1.4459373000639796e-06, 'epoch': 9.28}
{'loss': 0.0023, 'learning_rate': 8.061420345489445e-07, 'epoch': 9.6}
{'loss': 0.0018, 'learning_rate': 1.6634676903390917e-07, 'epoch': 9.92}


  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.5505360960960388, 'eval_accuracy': 0.93156, 'eval_runtime': 230.2891, 'eval_samples_per_second': 108.559, 'eval_steps_per_second': 6.787, 'epoch': 10.0}
{'train_runtime': 9364.6981, 'train_samples_per_second': 26.696, 'train_steps_per_second': 1.669, 'train_loss': 0.06713474385263976, 'epoch': 10.0}


TrainOutput(global_step=15630, training_loss=0.06713474385263976, metrics={'train_runtime': 9364.6981, 'train_samples_per_second': 26.696, 'train_steps_per_second': 1.669, 'train_loss': 0.06713474385263976, 'epoch': 10.0})