In [52]:
snips_file = open('data/snips_train.txt', 'rb')
snips_rows = snips_file.readlines()
snips_rows[:5]

[b'BOS listen to westbam alumb allergic on google music EOS\tO O O B-artist O B-album O B-service I-service PlayMusic\r\n',
 b'BOS add step to me to the 50 cl\xc3\xa1sicos playlist EOS\tO O B-entity_name I-entity_name I-entity_name O O B-playlist I-playlist O AddToPlaylist\r\n',
 b'BOS i give this current textbook a rating value of 1 and a best rating of 6 EOS\tO O O O B-object_select B-object_type O O O O B-rating_value O O O O O B-best_rating RateBook\r\n',
 b'BOS play the song little robin redbreast EOS\tO O O B-music_item B-track I-track I-track PlayMusic\r\n',
 b'BOS please add iris dement to my playlist this is selena EOS\tO O O B-artist I-artist O B-playlist_owner O B-playlist I-playlist I-playlist AddToPlaylist\r\n']

In [53]:
tokenized_utterances = []
labels_for_tokens = []
for snips_row in snips_rows:
    a = snips_row.decode()[4:].split(' EOS\t')
    tokenized_utterances.append(a[0].split())
    labels_for_tokens.append(a[1].split()[:-1])

In [54]:
tokenized_utterances[:2]

[['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music'],
 ['add', 'step', 'to', 'me', 'to', 'the', '50', 'clásicos', 'playlist']]

In [55]:
labels_for_tokens[:2]

[['O', 'O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service'],
 ['O',
  'O',
  'B-entity_name',
  'I-entity_name',
  'I-entity_name',
  'O',
  'O',
  'B-playlist',
  'I-playlist',
  'O']]

In [56]:
a[1].split()[:-1]

['O',
 'O',
 'B-object_name',
 'I-object_name',
 'B-rating_value',
 'O',
 'O',
 'B-best_rating']

In [57]:
a[0].split()

['rate', 'richard', 'carvel', '4', 'out', 'of', '6']

In [58]:
from transformers import DataCollatorForTokenClassification, DistilBertForTokenClassification, \
        DistilBertTokenizerFast, pipeline

In [59]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [60]:
from functools import reduce

unique_token_labels = list(set(reduce(lambda x, y: x + y, labels_for_tokens)))
labels_for_tokens = [[unique_token_labels.index(_) for _ in l] for l in labels_for_tokens]

print(f"There are {len(unique_token_labels)} unique token labels")

There are 72 unique token labels


In [61]:
labels_for_tokens[:2]

[[31, 31, 31, 24, 31, 49, 31, 10, 69], [31, 31, 59, 65, 65, 31, 31, 39, 9, 31]]

In [62]:
def tokenize_and_align_labels(examples):
    
    tokenized_inputs = tokenizer(examples['tokens'], truncation = True, is_split_into_words = True)
    
    labels = []
    for i, label in enumerate(examples[f"token_labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [63]:
import datasets

In [64]:
snips_dataset = datasets.Dataset.from_dict(
    dict(
        tokens = tokenized_utterances,
        token_labels = labels_for_tokens
    )
)

snips_dataset = snips_dataset.train_test_split(test_size = 0.2)

In [65]:
snips_dataset['train'][0]

{'tokens': ['what',
  's',
  'the',
  'weather',
  'for',
  'april',
  '10th',
  '2028',
  'in',
  'arkansas'],
 'token_labels': [31, 31, 31, 31, 31, 31, 20, 50, 50, 31, 12]}

In [66]:
tok_clf_tokenized_snips = snips_dataset.map(tokenize_and_align_labels, batched = True)

Map:   0%|          | 0/10467 [00:00<?, ? examples/s]

Map:   0%|          | 0/2617 [00:00<?, ? examples/s]

In [67]:
tokenizer.decode([1024])

':'

In [68]:
tok_clf_tokenized_snips['train'][0]

{'tokens': ['what',
  's',
  'the',
  'weather',
  'for',
  'april',
  '10th',
  '2028',
  'in',
  'arkansas'],
 'token_labels': [31, 31, 31, 31, 31, 31, 20, 50, 50, 31, 12],
 'input_ids': [101,
  2054,
  1055,
  1996,
  4633,
  2005,
  2258,
  6049,
  16798,
  2620,
  1999,
  6751,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 31, 31, 31, 31, 31, 31, 20, 50, -100, 50, 31, -100]}

In [69]:
tok_clf_tokenized_snips['train'] = tok_clf_tokenized_snips['train'].remove_columns(['tokens', 'token_labels'])

In [70]:
tok_clf_tokenized_snips['test'] = tok_clf_tokenized_snips['test'].remove_columns(['tokens', 'token_labels'])

In [71]:
tok_clf_tokenized_snips

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10467
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2617
    })
})

In [72]:
tok_data_collator = DataCollatorForTokenClassification(tokenizer = tokenizer)

In [73]:
tok_clf_model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels = len(unique_token_labels))

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [75]:
tok_clf_model.config.id2label = {i : l for i, l in enumerate(unique_token_labels)}

In [79]:
from transformers import Trainer, TrainingArguments

In [81]:
epochs = 2

training_args = TrainingArguments(
    output_dir = "./snips_tok_clf/results",
    num_train_epochs = epochs,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    load_best_model_at_end = True,
    
    # some deep learning params that the trainer is able to take in
    
#     warmup_steps = len(seq_clf_tokenized_snips['train']) // 5, # no. of warmup steps for learning rate scheduler
#     weight_decay = 0.05,
    
    logging_steps = 10,
    log_level = 'info',
    eval_strategy = 'epoch',
    save_strategy = 'epoch'
)


trainer = Trainer(
    model = tok_clf_model,
    args = training_args,
    train_dataset = tok_clf_tokenized_snips['train'],
    eval_dataset = tok_clf_tokenized_snips['test'],
#     compute_metrics = compute_metrics,
    data_collator = tok_data_collator
)

In [82]:
trainer.evaluate()


***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 4.319265365600586,
 'eval_model_preparation_time': 0.004,
 'eval_runtime': 90.9478,
 'eval_samples_per_second': 28.775,
 'eval_steps_per_second': 0.902}

In [83]:
trainer.train()

***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 66,418,248


Epoch,Training Loss,Validation Loss,Model Preparation Time
1,0.2891,0.23109,0.004
2,0.151,0.157431,0.004



***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_tok_clf/results\checkpoint-328
Configuration saved in ./snips_tok_clf/results\checkpoint-328\config.json
Model weights saved in ./snips_tok_clf/results\checkpoint-328\model.safetensors
Saving model checkpoint to ./snips_tok_clf/results\checkpoint-656
Configuration saved in ./snips_tok_clf/results\checkpoint-656\config.json
Model weights saved in ./snips_tok_clf/results\checkpoint-656\model.safetensors

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_tok_clf/results\checkpoint-656
Configuration saved in ./snips_tok_clf/results\checkpoint-656\config.json
Model weights saved in ./snips_tok_clf/results\checkpoint-656\model.safetensors


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./snips_tok_clf/results\checkpoint-656 (score: 0.15743082761764526).


TrainOutput(global_step=656, training_loss=0.49110321500679344, metrics={'train_runtime': 2048.7072, 'train_samples_per_second': 10.218, 'train_steps_per_second': 0.32, 'total_flos': 115873222589712.0, 'train_loss': 0.49110321500679344, 'epoch': 2.0})

In [84]:
trainer.evaluate()


***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 0.15743082761764526,
 'eval_model_preparation_time': 0.004,
 'eval_runtime': 73.4315,
 'eval_samples_per_second': 35.639,
 'eval_steps_per_second': 1.117,
 'epoch': 2.0}

In [85]:
trainer.save_model()

Saving model checkpoint to ./snips_tok_clf/results
Configuration saved in ./snips_tok_clf/results\config.json
Model weights saved in ./snips_tok_clf/results\model.safetensors


In [87]:
pipe = pipeline('token-classification', './snips_tok_clf/results', tokenizer = tokenizer)
pipe("Add Two Coins' by Dispatch to my road trip playlist")

loading configuration file ./snips_tok_clf/results\config.json
Model config DistilBertConfig {
  "_name_or_path": "./snips_tok_clf/results",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "I-geographic_poi",
    "1": "I-spatial_relation",
    "2": "I-cuisine",
    "3": "I-object_select",
    "4": "B-object_type",
    "5": "B-playlist_owner",
    "6": "I-party_size_description",
    "7": "I-movie_type",
    "8": "I-playlist_owner",
    "9": "I-playlist",
    "10": "B-service",
    "11": "I-state",
    "12": "B-state",
    "13": "B-served_dish",
    "14": "B-track",
    "15": "B-current_location",
    "16": "B-location_name",
    "17": "B-restaurant_type",
    "18": "I-poi",
    "19": "B-party_size_number",
    "20": "B-timeRange",
    "21": "I-served_dish",
    "22": "B-genre",
    "23": "I-facility",
    "24": "B-artist",
    "25": "I-albu

All the weights of DistilBertForTokenClassification were initialized from the model checkpoint at ./snips_tok_clf/results.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForTokenClassification for predictions without further training.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Disabling tokenizer parallelism, we're using DataLoader multithreading already


[{'entity': 'B-entity_name',
  'score': 0.91015667,
  'index': 3,
  'word': 'coins',
  'start': 8,
  'end': 13},
 {'entity': 'I-entity_name',
  'score': 0.841761,
  'index': 4,
  'word': "'",
  'start': 13,
  'end': 14},
 {'entity': 'I-entity_name',
  'score': 0.7664101,
  'index': 5,
  'word': 'by',
  'start': 15,
  'end': 17},
 {'entity': 'I-entity_name',
  'score': 0.7366723,
  'index': 6,
  'word': 'dispatch',
  'start': 18,
  'end': 26},
 {'entity': 'I-entity_name',
  'score': 0.68693733,
  'index': 7,
  'word': 'to',
  'start': 27,
  'end': 29},
 {'entity': 'B-playlist_owner',
  'score': 0.983563,
  'index': 9,
  'word': 'road',
  'start': 33,
  'end': 37},
 {'entity': 'B-playlist',
  'score': 0.98780084,
  'index': 10,
  'word': 'trip',
  'start': 38,
  'end': 42},
 {'entity': 'I-playlist',
  'score': 0.98700935,
  'index': 11,
  'word': 'play',
  'start': 43,
  'end': 47},
 {'entity': 'I-playlist',
  'score': 0.97960496,
  'index': 12,
  'word': '##list',
  'start': 47,
  'end'