In [2]:
!pip install datasets
!pip install -U accelerate
!pip install -U transformers
!pip install sentencepiece
!pip install sagemaker

Collecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/e2/cf/db41e572d7ed958e8679018f8190438ef700aeb501b62da9e1eed9e4d69a/datasets-2.15.0-py3-none-any.whl.metadata
  Using cached datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow-hotfix (from datasets)
  Obtaining dependency information for pyarrow-hotfix from https://files.pythonhosted.org/packages/e4/f4/9ec2222f5f5f8ea04f66f184caafd991a39c8782e31f5b0266f101cb68ca/pyarrow_hotfix-0.6-py3-none-any.whl.metadata
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting xxhash (from datasets)
  Obtaining dependency information for xxhash from https://files.pythonhosted.org/packages/80/8a/1dd41557883b6196f8f092011a5c1f72d4d44cf36d7b67d4a5efe3127949/xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting aio

In [48]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForSequenceClassification, \
    BertConfig, BertModel, BertTokenizer, AdamW, DataCollatorWithPadding
from datasets import load_dataset

In [7]:
classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [9]:
# classifier("I've been waiting for a HuggingFace course my whole life.")
classifier(
    ["I've been waiting for a HuggingFace course my whole life.", "I hate this so much!"]
)

[{'label': 'POSITIVE', 'score': 0.9598049521446228},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

In [10]:
classifier = pipeline("zero-shot-classification")

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
config.json: 100%|██████████| 1.15k/1.15k [00:00<00:00, 7.36MB/s]
model.safetensors: 100%|██████████| 1.63G/1.63G [00:05<00:00, 305MB/s]
tokenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 118kB/s]
vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 4.93MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 45.0MB/s]
tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 13.4MB/s]


In [11]:
classifier(
    "This is a course about the Transformers library",
    candidate_labels=["education", "politics", "business", "academic", "IT", "Tech", "Machine learning"],
)

{'sequence': 'This is a course about the Transformers library',
 'labels': ['academic',
  'education',
  'Tech',
  'IT',
  'business',
  'Machine learning',
  'politics'],
 'scores': [0.9623221158981323,
  0.019890105351805687,
  0.0075290328823029995,
  0.0047127739526331425,
  0.00263696676120162,
  0.0018863442819565535,
  0.0010226941667497158]}

In [12]:
generator = pipeline("text-generation")

No model was supplied, defaulted to gpt2 and revision 6c0e608 (https://huggingface.co/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
model.safetensors: 100%|██████████| 548M/548M [00:01<00:00, 354MB/s] 
generation_config.json: 100%|██████████| 124/124 [00:00<00:00, 704kB/s]


In [13]:
generator("In this course, we will teach you how to")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this course, we will teach you how to create real-time and real-time interactive websites.\n\nYou will: build a unique website\n\nDesign a site that is accessible to millions\n\nBuild and validate your websites using our tools'}]

In [14]:
generator = pipeline("text-generation", model="distilgpt2")

config.json: 100%|██████████| 762/762 [00:00<00:00, 5.34MB/s]
model.safetensors: 100%|██████████| 353M/353M [00:01<00:00, 206MB/s] 
generation_config.json: 100%|██████████| 124/124 [00:00<00:00, 885kB/s]
vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 41.4MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 42.8MB/s]
tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 3.00MB/s]


In [15]:
generator(
    "In this course, we will teach you how to",
    max_length=30,
    num_return_sequences=2,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this course, we will teach you how to use the “GCSV“ in both Microsoft Outlook and Office 365 using Microsoft Excel for'},
 {'generated_text': 'In this course, we will teach you how to read a language that will give you additional training\n\n\n\n\nAnd in the future, a'}]

In [16]:
unmasker = pipeline("fill-mask")

No model was supplied, defaulted to distilroberta-base and revision ec58a5b (https://huggingface.co/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
config.json: 100%|██████████| 480/480 [00:00<00:00, 3.47MB/s]
model.safetensors: 100%|██████████| 331M/331M [00:01<00:00, 299MB/s]  
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
vocab.j

In [17]:
unmasker("This course will teach you all about <mask> models.", top_k=2)

[{'score': 0.1961982101202011,
  'token': 30412,
  'token_str': ' mathematical',
  'sequence': 'This course will teach you all about mathematical models.'},
 {'score': 0.040527306497097015,
  'token': 38163,
  'token_str': ' computational',
  'sequence': 'This course will teach you all about computational models.'}]

In [18]:
ner = pipeline("ner", grouped_entities=True)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
config.json: 100%|██████████| 998/998 [00:00<00:00, 7.01MB/s]
model.safetensors: 100%|██████████| 1.33G/1.33G [00:04<00:00, 304MB/s]
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identi

In [19]:
ner("My name is Abdulkarim and I work at Hugging Face in DC.")

[{'entity_group': 'PER',
  'score': 0.9975577,
  'word': 'Abdulkarim',
  'start': 11,
  'end': 21},
 {'entity_group': 'ORG',
  'score': 0.9868558,
  'word': 'Hugging Face',
  'start': 36,
  'end': 48},
 {'entity_group': 'LOC',
  'score': 0.9983911,
  'word': 'DC',
  'start': 52,
  'end': 54}]

In [20]:
question_answerer = pipeline("question-answering")

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
config.json: 100%|██████████| 473/473 [00:00<00:00, 3.37MB/s]
model.safetensors: 100%|██████████| 261M/261M [00:01<00:00, 247MB/s] 
tokenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 223kB/s]
vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 2.39MB/s]
tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 1.59MB/s]


In [21]:
question_answerer(
    question="Where do I work?",
    context="My name is Abdulkarim and I work at Hugging Face in DC",
)

{'score': 0.5561079978942871,
 'start': 36,
 'end': 54,
 'answer': 'Hugging Face in DC'}

In [22]:
summarizer = pipeline("summarization")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
config.json: 100%|██████████| 1.80k/1.80k [00:00<00:00, 12.5MB/s]
pytorch_model.bin: 100%|██████████| 1.22G/1.22G [00:04<00:00, 263MB/s] 
tokenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 175kB/s]
vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 9.80MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 2.51MB/s]


In [23]:
summarizer(
    """
    America has changed dramatically during recent years. Not only has the number of 
    graduates in traditional engineering disciplines such as mechanical, civil, 
    electrical, chemical, and aeronautical engineering declined, but in most of 
    the premier American universities engineering curricula now concentrate on 
    and encourage largely the study of engineering science. As a result, there 
    are declining offerings in engineering subjects dealing with infrastructure, 
    the environment, and related issues, and greater concentration on high 
    technology subjects, largely supporting increasingly complex scientific 
    developments. While the latter is important, it should not be at the expense 
    of more traditional engineering.

    Rapidly developing economies such as China and India, as well as other 
    industrial countries in Europe and Asia, continue to encourage and advance 
    the teaching of engineering. Both China and India, respectively, graduate 
    six and eight times as many traditional engineers as does the United States. 
    Other industrial countries at minimum maintain their output, while America 
    suffers an increasingly serious decline in the number of engineering graduates 
    and a lack of well-educated engineers.
"""
)

[{'summary_text': ' America has changed dramatically during recent years . The number of engineering graduates in the U.S. has declined in traditional engineering disciplines such as mechanical, civil,    electrical, chemical, and aeronautical engineering . Rapidly developing economies such as China and India continue to encourage and advance the teaching of engineering .'}]

In [4]:
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en")

source.spm: 100%|██████████| 802k/802k [00:00<00:00, 9.09MB/s]
target.spm: 100%|██████████| 778k/778k [00:00<00:00, 66.0MB/s]
vocab.json: 100%|██████████| 1.34M/1.34M [00:00<00:00, 36.7MB/s]


In [5]:
translator("Ce cours est produit par Hugging Face.")

[{'translation_text': 'This course is produced by Hugging Face.'}]

In [6]:
unmasker = pipeline("fill-mask", model="bert-base-uncased")

config.json: 100%|██████████| 570/570 [00:00<00:00, 3.91MB/s]
model.safetensors: 100%|██████████| 440M/440M [00:01<00:00, 337MB/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 209kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 58.2MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00

In [7]:
result = unmasker("This man works as a [MASK].")
print([r["token_str"] for r in result])
result = unmasker("This woman works as a [MASK].")
print([r["token_str"] for r in result])

['carpenter', 'lawyer', 'farmer', 'businessman', 'doctor']
['nurse', 'maid', 'teacher', 'waitress', 'prostitute']


In [4]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

In [6]:
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

In [7]:
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [8]:
# Building the config
config = BertConfig()

# Building the model from the config
model = BertModel(config)

In [9]:
print(config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [10]:
model = BertModel.from_pretrained("bert-base-cased")

config.json: 100%|██████████| 570/570 [00:00<00:00, 2.96MB/s]
model.safetensors: 100%|██████████| 436M/436M [00:01<00:00, 392MB/s] 


In [None]:
# model.save_pretrained("directory_on_my_computer")

In [16]:
tokenized_text = "Abdulkaream is an engineer".split()
print(tokenized_text)

['Abdulkaream', 'is', 'an', 'engineer']


In [17]:
tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 2478, 1037, 10938, 2121, 2897, 2003, 3722, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# tokenizer.save_pretrained("directory_on_my_computer")

In [18]:
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)

print(tokens)

['using', 'a', 'transform', '##er', 'network', 'is', 'simple']


In [19]:
ids = tokenizer.convert_tokens_to_ids(tokens)

print(ids)

[2478, 1037, 10938, 2121, 2897, 2003, 3722]


In [20]:
decoded_string = tokenizer.decode([2478, 1037, 10938, 2121, 2897, 2003, 3722])
print(decoded_string)

using a transformer network is simple


In [21]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [22]:
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_ids)).logits)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [23]:
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


In [28]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

model_inputs = tokenizer(sequences)

# Will pad the sequences up to the maximum sequence length
model_inputs = tokenizer(sequences, padding="longest")

# Will pad the sequences up to the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, padding="max_length")

# Will pad the sequences up to the specified max length
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)

In [29]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Will truncate the sequences that are longer than the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, truncation=True)

# Will truncate the sequences that are longer than the specified max length
model_inputs = tokenizer(sequences, max_length=8, truncation=True)

In [32]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Returns PyTorch tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")

# Returns TensorFlow tensors
# model_inputs = tokenizer(sequences, padding=True, return_tensors="tf")

# Returns NumPy arrays
model_inputs = tokenizer(sequences, padding=True, return_tensors="np")

In [33]:
sequence = "I've been waiting for a HuggingFace course my whole life."

model_inputs = tokenizer(sequence)
print(model_inputs["input_ids"])

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]


In [34]:
print(tokenizer.decode(model_inputs["input_ids"]))
print(tokenizer.decode(ids))

[CLS] i've been waiting for a huggingface course my whole life. [SEP]
i've been waiting for a huggingface course my whole life.


In [35]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)

In [37]:
# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
# This is new
batch["labels"] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()



In [40]:
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

Downloading builder script: 100%|██████████| 28.8k/28.8k [00:00<00:00, 18.1MB/s]
Downloading metadata: 100%|██████████| 28.7k/28.7k [00:00<00:00, 21.0MB/s]
Downloading readme: 100%|██████████| 27.9k/27.9k [00:00<00:00, 18.0MB/s]
Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]
Downloading data: 6.22kB [00:00, 2.95MB/s]
Downloading data files:  33%|███▎      | 1/3 [00:00<00:00,  6.86it/s]
Downloading data: 1.05MB [00:00, 96.1MB/s]
Downloading data files:  67%|██████▋   | 2/3 [00:00<00:00,  5.54it/s]
Downloading data: 441kB [00:00, 84.8MB/s]
Downloading data files: 100%|██████████| 3/3 [00:00<00:00,  5.39it/s]
Generating train split: 100%|██████████| 3668/3668 [00:00<00:00, 14437.98 examples/s]
Generating validation split: 100%|██████████| 408/408 [00:00<00:00, 3838.64 examples/s]
Generating test split: 100%|██████████| 1725/1725 [00:00<00:00, 15665.09 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [41]:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [42]:
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [43]:
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

In [44]:
inputs = tokenizer("This is the first sentence.", "This is the second one.")
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [45]:
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

In [46]:
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

In [47]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map: 100%|██████████| 3668/3668 [00:00<00:00, 6944.85 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 3736.45 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 6178.76 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [49]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [50]:
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]]

[50, 59, 47, 67, 59, 50, 62, 32]

In [51]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}