In [52]:
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizerFast, DataCollatorWithPadding, pipeline
from datasets import Dataset
from evaluate import load
import numpy as np

In [34]:
snips_file = open('../data/snips.train.txt','rb')
snips_rows = snips_file.readlines()
snips_rows[:20]

[b'listen O\r\n',
 b'to O\r\n',
 b'westbam B-artist\r\n',
 b'alumb O\r\n',
 b'allergic B-album\r\n',
 b'on O\r\n',
 b'google B-service\r\n',
 b'music I-service\r\n',
 b'PlayMusic\r\n',
 b'\r\n',
 b'add O\r\n',
 b'step B-entity_name\r\n',
 b'to I-entity_name\r\n',
 b'me I-entity_name\r\n',
 b'to O\r\n',
 b'the O\r\n',
 b'50 B-playlist\r\n',
 b'cl\xc3\xa1sicos I-playlist\r\n',
 b'playlist O\r\n',
 b'AddToPlaylist\r\n']

The dataset in its raw form contains tokens itself with labels next to them.

Commonly used labels:

- ***O***: If its just a random word and not for any major entity.
- ***B***: Beginning of an entity. Like for westbam: B-artist so westban is an artist so it comes under artist entity.
- ***I***: Continuation token. Represents it consists of a previously begun entity.

In [35]:
utterances = []
tokenized_utterances = []
labels_for_tokens = []
sequence_labels = []

utterance, tokenized_utterance, label_for_utterances = '', [], []
for snip_row in snips_rows:
    if len(snip_row) == 2:  # skip over rows with no data
        continue
    if ' ' not in snip_row.decode():  # we've hit a sequence label
        sequence_labels.append(snip_row.decode().strip())
        utterances.append(utterance.strip())
        tokenized_utterances.append(tokenized_utterance)
        labels_for_tokens.append(label_for_utterances)
        utterance = ''
        tokenized_utterance = []
        label_for_utterances = []
        continue
    token, token_label = snip_row.decode().split(' ')
    token_label = token_label.strip()
    utterance += f'{token} '
    tokenized_utterance.append(token)
    label_for_utterances.append(token_label)

In [36]:
len(labels_for_tokens), len(tokenized_utterances), len(utterances), len(sequence_labels)

(13084, 13084, 13084, 13084)

In [37]:
unique_sequence_labels = list(set(sequence_labels))
unique_sequence_labels

['BookRestaurant',
 'GetWeather',
 'AddToPlaylist',
 'RateBook',
 'SearchCreativeWork',
 'PlayMusic',
 'SearchScreeningEvent']

In [38]:
sequence_labels = [unique_sequence_labels.index(l) for l in sequence_labels]

print(f'There are {len(unique_sequence_labels)} unique sequence labels')

There are 7 unique sequence labels


These 7 categories is what we will trian our BERT model to try to predict

In [39]:
from functools import reduce

unique_token_labels = list(set(reduce(lambda x, y: x + y, labels_for_tokens)))
labels_for_tokens = [[unique_token_labels.index(_) for _ in l] for l in labels_for_tokens]

print(f'There are {len(unique_token_labels)} unique token labels')

There are 72 unique token labels


In [40]:
print(tokenized_utterances[0])
print(labels_for_tokens[0])
print([unique_token_labels[l] for l in labels_for_tokens[0]])
print(utterances[0])
print(sequence_labels[0])
print(unique_sequence_labels[sequence_labels[0]])


['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
[12, 12, 4, 12, 32, 12, 55, 11]
['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service']
listen to westbam alumb allergic on google music
5
PlayMusic


In [41]:
snips_dataset = Dataset.from_dict(
    dict(
        utterance=utterances, 
        label=sequence_labels,
        tokens=tokenized_utterances,
        token_labels=labels_for_tokens
    )
)
snips_dataset = snips_dataset.train_test_split(test_size=0.2)

In [42]:
snips_dataset['train'][0]

{'utterance': 'add manuelita to my indiespensables playlist',
 'label': 2,
 'tokens': ['add', 'manuelita', 'to', 'my', 'indiespensables', 'playlist'],
 'token_labels': [12, 46, 12, 44, 57, 12]}

In [43]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [44]:
def preprocess_function(examples):
    return tokenizer(examples["utterance"], truncation=True)

In [45]:
seq_clf_tokenized_snips = snips_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 10467/10467 [00:00<00:00, 46583.55 examples/s]
Map: 100%|██████████| 2617/2617 [00:00<00:00, 55640.57 examples/s]


In [46]:
seq_clf_tokenized_snips['train'][0]

{'utterance': 'add manuelita to my indiespensables playlist',
 'label': 2,
 'tokens': ['add', 'manuelita', 'to', 'my', 'indiespensables', 'playlist'],
 'token_labels': [12, 46, 12, 44, 57, 12],
 'input_ids': [101,
  5587,
  7762,
  6590,
  2000,
  2026,
  9429,
  11837,
  19150,
  2015,
  2377,
  9863,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [47]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [49]:
sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', 
    num_labels=len(unique_sequence_labels),
)

sequence_clf_model.config.id2label = {i: l for i, l in enumerate(unique_sequence_labels)}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
sequence_clf_model.config.id2label[0]

'BookRestaurant'

In [53]:
metric = load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 7.81MB/s]


In [54]:
epochs = 2

training_args = TrainingArguments(
    output_dir="./snips_clf/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    warmup_steps=len(seq_clf_tokenized_snips['train']) // 5,
    weight_decay = 0.05,
    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    eval_steps=50,
    save_strategy='epoch'
)

trainer = Trainer(
    model=sequence_clf_model,
    args=training_args,
    train_dataset=seq_clf_tokenized_snips['train'],
    eval_dataset=seq_clf_tokenized_snips['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)



In [55]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.9451035261154175,
 'eval_model_preparation_time': 0.0008,
 'eval_accuracy': 0.14444019870080244,
 'eval_runtime': 5.1036,
 'eval_samples_per_second': 512.776,
 'eval_steps_per_second': 16.067}

In [56]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 66,958,855


Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy
1,0.1577,0.246668,0.0008,0.971341
2,0.0406,0.099221,0.0008,0.977455


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_clf/results/checkpoint-328
Configuration saved in ./snips_clf/results/checkpoint-328/config.json
Model weights saved in ./snips_clf/results/checkpoint-328/model.safetensors
Saving model checkpoint to ./snips_clf/results/checkpoint-656
Configuration saved in ./snips_clf/results/checkpoint-656/config.json
Model weights saved in ./snips_clf/results/checkpoint-656/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If 

TrainOutput(global_step=656, training_loss=0.720749528652693, metrics={'train_runtime': 127.9346, 'train_samples_per_second': 163.63, 'train_steps_per_second': 5.128, 'total_flos': 116918078231880.0, 'train_loss': 0.720749528652693, 'epoch': 2.0})

In [57]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 0.09922091662883759,
 'eval_model_preparation_time': 0.0008,
 'eval_accuracy': 0.9774551012609859,
 'eval_runtime': 3.1743,
 'eval_samples_per_second': 824.424,
 'eval_steps_per_second': 25.832,
 'epoch': 2.0}

In [58]:
pipe = pipeline("text-classification", sequence_clf_model, tokenizer=tokenizer)
pipe('Please add Here We Go by Dispatch to my road trip playlist')

Device set to use mps:0


[{'label': 'AddToPlaylist', 'score': 0.9864668250083923}]

In [59]:
trainer.save_model()

Saving model checkpoint to ./snips_clf/results
Configuration saved in ./snips_clf/results/config.json
Model weights saved in ./snips_clf/results/model.safetensors


In [60]:
pipe = pipeline("text-classification", "./snips_clf/results", tokenizer=tokenizer)
pipe('Please add Here We Go by Dispatch to my road trip playlist')

loading configuration file ./snips_clf/results/config.json
Model config DistilBertConfig {
  "_name_or_path": "./snips_clf/results",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "BookRestaurant",
    "1": "GetWeather",
    "2": "AddToPlaylist",
    "3": "RateBook",
    "4": "SearchCreativeWork",
    "5": "PlayMusic",
    "6": "SearchScreeningEvent"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transfor

[{'label': 'AddToPlaylist', 'score': 0.9864668250083923}]

In [61]:
frozen_sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', 
    num_labels=len(unique_sequence_labels),
)

loading configuration file config.json from cache at /Users/nt/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.48.3",
  "vocab_size": 30522
}

load

In [62]:
for param in frozen_sequence_clf_model.distilbert.parameters():
    param.requires_grad = False

In [63]:
epochs = 2

training_args = TrainingArguments(
    output_dir="./snips_clf/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    
    warmup_steps=len(seq_clf_tokenized_snips['train']) // 5,
    weight_decay = 0.05,
    
    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    eval_steps=50,
    save_strategy='epoch'
)


trainer = Trainer(
    model=frozen_sequence_clf_model,
    args=training_args,
    train_dataset=seq_clf_tokenized_snips['train'],
    eval_dataset=seq_clf_tokenized_snips['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [64]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.948607325553894,
 'eval_model_preparation_time': 0.0015,
 'eval_accuracy': 0.1322124570118456,
 'eval_runtime': 3.2492,
 'eval_samples_per_second': 805.43,
 'eval_steps_per_second': 25.237}

In [65]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 595,975


Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy
1,1.8614,1.872272,0.0015,0.513947
2,1.6936,1.584693,0.0015,0.852885


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_clf/results/checkpoint-328
Configuration saved in ./snips_clf/results/checkpoint-328/config.json
Model weights saved in ./snips_clf/results/checkpoint-328/model.safetensors
Saving model checkpoint to ./snips_clf/results/checkpoint-656
Configuration saved in ./snips_clf/results/checkpoint-656/config.json
Model weights saved in ./snips_clf/results/checkpoint-656/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If 

TrainOutput(global_step=656, training_loss=1.846965655321028, metrics={'train_runtime': 35.3806, 'train_samples_per_second': 591.68, 'train_steps_per_second': 18.541, 'total_flos': 116918078231880.0, 'train_loss': 1.846965655321028, 'epoch': 2.0})

In [66]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.584693193435669,
 'eval_model_preparation_time': 0.0015,
 'eval_accuracy': 0.8528849828047382,
 'eval_runtime': 3.1731,
 'eval_samples_per_second': 824.746,
 'eval_steps_per_second': 25.842,
 'epoch': 2.0}