In [38]:
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification, DistilBertForTokenClassification, DistilBertTokenizerFast, pipeline
from datasets import Dataset
from evaluate import load
import numpy as np

In [2]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [3]:
snips_file = open('../data/snips.train.txt','rb')
snips_rows = snips_file.readlines()
snips_rows[:20]

[b'listen O\r\n',
 b'to O\r\n',
 b'westbam B-artist\r\n',
 b'alumb O\r\n',
 b'allergic B-album\r\n',
 b'on O\r\n',
 b'google B-service\r\n',
 b'music I-service\r\n',
 b'PlayMusic\r\n',
 b'\r\n',
 b'add O\r\n',
 b'step B-entity_name\r\n',
 b'to I-entity_name\r\n',
 b'me I-entity_name\r\n',
 b'to O\r\n',
 b'the O\r\n',
 b'50 B-playlist\r\n',
 b'cl\xc3\xa1sicos I-playlist\r\n',
 b'playlist O\r\n',
 b'AddToPlaylist\r\n']

In [4]:
utterances = []
tokenized_utterances = []
labels_for_tokens = []
sequence_labels = []

utterance, tokenized_utterance, label_for_utterances = '', [], []
for snip_row in snips_rows:
    if len(snip_row) == 2:  # skip over rows with no data
        continue
    if ' ' not in snip_row.decode():  # we've hit a sequence label
        sequence_labels.append(snip_row.decode().strip())
        utterances.append(utterance.strip())
        tokenized_utterances.append(tokenized_utterance)
        labels_for_tokens.append(label_for_utterances)
        utterance = ''
        tokenized_utterance = []
        label_for_utterances = []
        continue
    token, token_label = snip_row.decode().split(' ')
    token_label = token_label.strip()
    utterance += f'{token} '
    tokenized_utterance.append(token)
    label_for_utterances.append(token_label)

In [5]:
len(labels_for_tokens), len(tokenized_utterances), len(utterances), len(sequence_labels)

(13084, 13084, 13084, 13084)

In [6]:
unique_sequence_labels = list(set(sequence_labels))
unique_sequence_labels

['SearchCreativeWork',
 'PlayMusic',
 'RateBook',
 'AddToPlaylist',
 'BookRestaurant',
 'GetWeather',
 'SearchScreeningEvent']

In [7]:
from functools import reduce

unique_token_labels = list(set(reduce(lambda x, y: x + y, labels_for_tokens)))
labels_for_tokens = [[unique_token_labels.index(_) for _ in l] for l in labels_for_tokens]

print(f'There are {len(unique_token_labels)} unique token labels')

There are 72 unique token labels


In [10]:
snips_dataset = Dataset.from_dict(
    dict(
        utterance=utterances, 
        label=sequence_labels,
        tokens=tokenized_utterances,
        token_labels=labels_for_tokens
    )
)
snips_dataset = snips_dataset.train_test_split(test_size=0.2)

In [11]:
snips_dataset['train'][0]

{'utterance': 'give 3 out of 6 stars to this essay',
 'label': 'RateBook',
 'tokens': ['give', '3', 'out', 'of', '6', 'stars', 'to', 'this', 'essay'],
 'token_labels': [20, 36, 20, 20, 49, 10, 20, 37, 5]}

In [30]:
tokenized_inputs = tokenizer(snips_dataset['train'][0]["tokens"], truncation=True, is_split_into_words=True)

In [31]:
tokenized_inputs.word_ids(batch_index=0)

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]

In [28]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"token_labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:  # Set the special tokens to -100.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # CLS and SEP are labeled as -100
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [29]:
snips_dataset['train'][0]

{'utterance': 'give 3 out of 6 stars to this essay',
 'label': 'RateBook',
 'tokens': ['give', '3', 'out', 'of', '6', 'stars', 'to', 'this', 'essay'],
 'token_labels': [20, 36, 20, 20, 49, 10, 20, 37, 5]}

In [14]:
tok_clf_tokenized_snips = snips_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 10467/10467 [00:00<00:00, 23679.16 examples/s]
Map: 100%|██████████| 2617/2617 [00:00<00:00, 33890.20 examples/s]


In [32]:
tok_clf_tokenized_snips['train'][0]

{'utterance': 'give 3 out of 6 stars to this essay',
 'label': 'RateBook',
 'tokens': ['give', '3', 'out', 'of', '6', 'stars', 'to', 'this', 'essay'],
 'token_labels': [20, 36, 20, 20, 49, 10, 20, 37, 5],
 'input_ids': [101, 2507, 1017, 2041, 1997, 1020, 3340, 2000, 2023, 9491, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 20, 36, 20, 20, 49, 10, 20, 37, 5, -100]}

In [33]:
tok_clf_tokenized_snips['train'] = tok_clf_tokenized_snips['train'].remove_columns(
    ['utterance', 'label', 'tokens', 'token_labels']
)

tok_clf_tokenized_snips['test'] = tok_clf_tokenized_snips['test'].remove_columns(
    ['utterance', 'label', 'tokens', 'token_labels']
)

tok_clf_tokenized_snips

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10467
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2617
    })
})

In [34]:
tok_data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [35]:
tok_clf_model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=len(unique_token_labels)
)

tok_clf_model.config.id2label = {i: l for i, l in enumerate(unique_token_labels)}

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
tok_clf_model.config.id2label[0], tok_clf_model.config.id2label[1]

('I-entity_name', 'I-state')

In [39]:
epochs = 2

training_args = TrainingArguments(
    output_dir="./snips_tok_clf/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
        
    logging_steps=10,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=tok_clf_model,
    args=training_args,
    train_dataset=tok_clf_tokenized_snips['train'],
    eval_dataset=tok_clf_tokenized_snips['test'],
    data_collator=tok_data_collator
)



In [40]:
trainer.evaluate()


***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 4.318129539489746,
 'eval_model_preparation_time': 0.0014,
 'eval_runtime': 3.6319,
 'eval_samples_per_second': 720.562,
 'eval_steps_per_second': 22.578}

In [41]:
trainer.train()

***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 66,418,248


Epoch,Training Loss,Validation Loss,Model Preparation Time
1,0.2651,0.187452,0.0014
2,0.1032,0.138956,0.0014



***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_tok_clf/results/checkpoint-328
Configuration saved in ./snips_tok_clf/results/checkpoint-328/config.json
Model weights saved in ./snips_tok_clf/results/checkpoint-328/model.safetensors
Saving model checkpoint to ./snips_tok_clf/results/checkpoint-656
Configuration saved in ./snips_tok_clf/results/checkpoint-656/config.json
Model weights saved in ./snips_tok_clf/results/checkpoint-656/model.safetensors

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_tok_clf/results/checkpoint-656
Configuration saved in ./snips_tok_clf/results/checkpoint-656/config.json
Model weights saved in ./snips_tok_clf/results/checkpoint-656/model.safetensors


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./snips_tok_clf/results/checkpoint-656 (score: 0.13895584642887115).


TrainOutput(global_step=656, training_loss=0.43665800080066774, metrics={'train_runtime': 121.4593, 'train_samples_per_second': 172.354, 'train_steps_per_second': 5.401, 'total_flos': 115000673643072.0, 'train_loss': 0.43665800080066774, 'epoch': 2.0})

In [42]:
trainer.evaluate()


***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 0.13895584642887115,
 'eval_model_preparation_time': 0.0014,
 'eval_runtime': 2.8168,
 'eval_samples_per_second': 929.06,
 'eval_steps_per_second': 29.111,
 'epoch': 2.0}

In [43]:
pipe = pipeline("token-classification", tok_clf_model, tokenizer=tokenizer)
pipe('Please add Here We Go by Dispatch to my road trip playlist')

Device set to use mps:0
Disabling tokenizer parallelism, we're using DataLoader multithreading already


[{'entity': 'B-entity_name',
  'score': 0.9019391,
  'index': 3,
  'word': 'here',
  'start': 11,
  'end': 15},
 {'entity': 'I-entity_name',
  'score': 0.96080416,
  'index': 4,
  'word': 'we',
  'start': 16,
  'end': 18},
 {'entity': 'I-entity_name',
  'score': 0.9673803,
  'index': 5,
  'word': 'go',
  'start': 19,
  'end': 21},
 {'entity': 'I-entity_name',
  'score': 0.9623448,
  'index': 6,
  'word': 'by',
  'start': 22,
  'end': 24},
 {'entity': 'I-entity_name',
  'score': 0.89363253,
  'index': 7,
  'word': 'dispatch',
  'start': 25,
  'end': 33},
 {'entity': 'B-playlist_owner',
  'score': 0.98478633,
  'index': 9,
  'word': 'my',
  'start': 37,
  'end': 39},
 {'entity': 'B-playlist',
  'score': 0.98735225,
  'index': 10,
  'word': 'road',
  'start': 40,
  'end': 44},
 {'entity': 'I-playlist',
  'score': 0.9869207,
  'index': 11,
  'word': 'trip',
  'start': 45,
  'end': 49}]

In [45]:
pipe = pipeline("token-classification", tok_clf_model, tokenizer=tokenizer)
pipe('Rate the dog food 5 out of 5')

Device set to use mps:0


[{'entity': 'B-object_name',
  'score': 0.96338195,
  'index': 2,
  'word': 'the',
  'start': 5,
  'end': 8},
 {'entity': 'I-object_name',
  'score': 0.98337644,
  'index': 3,
  'word': 'dog',
  'start': 9,
  'end': 12},
 {'entity': 'I-object_name',
  'score': 0.9810492,
  'index': 4,
  'word': 'food',
  'start': 13,
  'end': 17},
 {'entity': 'B-rating_value',
  'score': 0.9939744,
  'index': 5,
  'word': '5',
  'start': 18,
  'end': 19},
 {'entity': 'B-best_rating',
  'score': 0.96585757,
  'index': 8,
  'word': '5',
  'start': 27,
  'end': 28}]