# Chapter 13
## Section: Language modeling using pre-trained models

In [None]:
!pip install transformers==4.28.0
!pip install transformers datasets

In [2]:
from datasets import load_dataset
docs = load_dataset(f"Cohere/wikipedia-22-12-simple-embeddings", split="train")

Downloading readme:   0%|          | 0.00/3.84k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/409M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/408M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/407M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/404M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/485859 [00:00<?, ? examples/s]

In [3]:
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
# Load dataset
from datasets import load_dataset
dataset = load_dataset("imdb")

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
# Tokenization
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

# train_dataset, test_dataset = dataset["train"].map(tokenize, batched=True), dataset["test"].map(tokenize, batched=True)
# get small subsets of imdb dataset for training and testing
train_dataset = dataset["train"].train_test_split(test_size=0.01)["test"].map(tokenize, batched=True)
test_dataset = dataset["test"].train_test_split(test_size=0.01)["test"].map(tokenize, batched=True)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [5]:
# Model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Training
training_args = TrainingArguments(output_dir="./results", num_train_epochs=3,
                                  per_device_train_batch_size=8,
                                  per_device_eval_batch_size=8, logging_dir="./logs")

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [6]:
trainer = Trainer(model=model, args=training_args,
                  train_dataset=train_dataset,eval_dataset=test_dataset)

trainer.train()



Step,Training Loss


TrainOutput(global_step=96, training_loss=0.47670431931813556, metrics={'train_runtime': 2262.1255, 'train_samples_per_second': 0.332, 'train_steps_per_second': 0.042, 'total_flos': 99350548992000.0, 'train_loss': 0.47670431931813556, 'epoch': 3.0})

In [7]:
# Evaluation
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.44858163595199585, 'eval_runtime': 214.5124, 'eval_samples_per_second': 1.165, 'eval_steps_per_second': 0.149, 'epoch': 3.0}
