In [1]:
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizerFast, \
    DataCollatorWithPadding, pipeline

In [2]:
import datasets

In [3]:
snips_file = open('data/snips_train.txt', 'rb')
snips_rows = snips_file.readlines()
snips_rows[:20]

[b'BOS listen to westbam alumb allergic on google music EOS\tO O O B-artist O B-album O B-service I-service PlayMusic\r\n',
 b'BOS add step to me to the 50 cl\xc3\xa1sicos playlist EOS\tO O B-entity_name I-entity_name I-entity_name O O B-playlist I-playlist O AddToPlaylist\r\n',
 b'BOS i give this current textbook a rating value of 1 and a best rating of 6 EOS\tO O O O B-object_select B-object_type O O O O B-rating_value O O O O O B-best_rating RateBook\r\n',
 b'BOS play the song little robin redbreast EOS\tO O O B-music_item B-track I-track I-track PlayMusic\r\n',
 b'BOS please add iris dement to my playlist this is selena EOS\tO O O B-artist I-artist O B-playlist_owner O B-playlist I-playlist I-playlist AddToPlaylist\r\n',
 b'BOS add slimm cutta calhoun to my this is prince playlist  EOS\tO O B-artist I-artist I-artist O B-playlist_owner B-playlist I-playlist I-playlist O AddToPlaylist\r\n',
 b'BOS i want to listen to seventies music EOS\tO O O O O O B-year O PlayMusic\r\n',
 b'BOS p

In [4]:
utterances = []
sequence_labels = []
for snips_row in snips_rows:
    a = snips_row.decode()[4:].split(' EOS\t')
    utterances.append(a[0])
    sequence_labels.append(a[1].split()[-1])

In [5]:
utterances[:5]

['listen to westbam alumb allergic on google music',
 'add step to me to the 50 clásicos playlist',
 'i give this current textbook a rating value of 1 and a best rating of 6',
 'play the song little robin redbreast',
 'please add iris dement to my playlist this is selena']

In [6]:
sequence_labels[:5]

['PlayMusic', 'AddToPlaylist', 'RateBook', 'PlayMusic', 'AddToPlaylist']

In [7]:
type(sequence_labels)

list

In [8]:
unique_sequence_labels = list(set(sequence_labels))

In [9]:
unique_sequence_labels

['AddToPlaylist',
 'SearchScreeningEvent',
 'PlayMusic',
 'SearchCreativeWork',
 'BookRestaurant',
 'RateBook',
 'GetWeather']

In [10]:
sequence_labels = [unique_sequence_labels.index(l) for l in sequence_labels]

In [11]:
print(utterances[0])
print(sequence_labels[0])
print(unique_sequence_labels[sequence_labels[0]])

listen to westbam alumb allergic on google music
2
PlayMusic


In [12]:
snips_dataset = datasets.Dataset.from_dict(
    dict(
        utterance = utterances,
        label = sequence_labels
    )
)

snips_dataset = snips_dataset.train_test_split(test_size = 0.2)

In [13]:
snips_dataset['train'][0]

{'utterance': 'add the chamillionaire track to lina s wedding classics playlist',
 'label': 0}

In [14]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [15]:
def preprocess_function(examples):
    return tokenizer(examples['utterance'], truncation = True)

In [16]:
seq_clf_tokenized_snips = snips_dataset.map(preprocess_function, batched = True)

Map:   0%|          | 0/10467 [00:00<?, ? examples/s]

Map:   0%|          | 0/2617 [00:00<?, ? examples/s]

In [17]:
seq_clf_tokenized_snips['train'][0]

{'utterance': 'add the chamillionaire track to lina s wedding classics playlist',
 'label': 0,
 'input_ids': [101,
  5587,
  1996,
  15775,
  19912,
  3258,
  14737,
  2650,
  2000,
  27022,
  1055,
  5030,
  10002,
  2377,
  9863,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [18]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels = len(unique_sequence_labels)
)
sequence_clf_model.config.id2label = {i: l for i, l in enumerate(unique_sequence_labels)}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
sequence_clf_model.config.id2label[0]

'AddToPlaylist'

In [21]:
!pip install evaluate
import evaluate



In [22]:
metric = evaluate.load("accuracy")

In [23]:
import numpy as np
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis = -1)
    return metric.compute(predictions = predictions, references = labels)

In [24]:
pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.


In [25]:
epochs = 2

training_args = TrainingArguments(
    output_dir = "./snips_clf/results",
    num_train_epochs = epochs,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    load_best_model_at_end = True,
    
    # some deep learning params that the trainer is able to take in
    
    warmup_steps = len(seq_clf_tokenized_snips['train']) // 5, # no. of warmup steps for learning rate scheduler
    weight_decay = 0.05,
    
    logging_steps = 1,
    log_level = 'info',
    eval_strategy = 'epoch',
    save_strategy = 'epoch'
)


trainer = Trainer(
    model = sequence_clf_model,
    args = training_args,
    train_dataset = seq_clf_tokenized_snips['train'],
    eval_dataset = seq_clf_tokenized_snips['test'],
    compute_metrics = compute_metrics,
    data_collator = data_collator
)

In [26]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance. If utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.9480856657028198,
 'eval_model_preparation_time': 0.0071,
 'eval_accuracy': 0.12533435231180742,
 'eval_runtime': 30.3565,
 'eval_samples_per_second': 86.209,
 'eval_steps_per_second': 2.701}

In [27]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance. If utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 66,958,855


Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy
1,0.1798,0.224038,0.0071,0.978984
2,0.0163,0.048174,0.0071,0.988536


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance. If utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_clf/results\checkpoint-328
Configuration saved in ./snips_clf/results\checkpoint-328\config.json
Model weights saved in ./snips_clf/results\checkpoint-328\model.safetensors
Saving model checkpoint to ./snips_clf/results\checkpoint-656
Configuration saved in ./snips_clf/results\checkpoint-656\config.json
Model weights saved in ./snips_clf/results\checkpoint-656\model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance. If utterance are not expected by `DistilBertForSequenceClassification

TrainOutput(global_step=656, training_loss=0.7247861083596945, metrics={'train_runtime': 1955.0215, 'train_samples_per_second': 10.708, 'train_steps_per_second': 0.336, 'total_flos': 117122748262158.0, 'train_loss': 0.7247861083596945, 'epoch': 2.0})

In [28]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance. If utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 0.04817378148436546,
 'eval_model_preparation_time': 0.0071,
 'eval_accuracy': 0.988536492166603,
 'eval_runtime': 74.4715,
 'eval_samples_per_second': 35.141,
 'eval_steps_per_second': 1.101,
 'epoch': 2.0}

In [29]:
pipe = pipeline('text-classification', sequence_clf_model, tokenizer=tokenizer)
pipe("Add Two Coins' by Dispatch to my road trip playlist")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'AddToPlaylist', 'score': 0.9908778071403503}]

In [30]:
trainer.save_model()

Saving model checkpoint to ./snips_clf/results
Configuration saved in ./snips_clf/results\config.json
Model weights saved in ./snips_clf/results\model.safetensors


In [31]:
pipe = pipeline('text-classification', './snips_clf/results', tokenizer = tokenizer)
pipe("Add Two Coins' by Dispatch to my road trip playlist")

loading configuration file ./snips_clf/results\config.json
Model config DistilBertConfig {
  "_name_or_path": "./snips_clf/results",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "AddToPlaylist",
    "1": "SearchScreeningEvent",
    "2": "PlayMusic",
    "3": "SearchCreativeWork",
    "4": "BookRestaurant",
    "5": "RateBook",
    "6": "GetWeather"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transfor

[{'label': 'AddToPlaylist', 'score': 0.9908778071403503}]

In [32]:
frozen_sequence_clf_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = len(unique_sequence_labels))

loading configuration file config.json from cache at C:\Users\Saket\.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\12040accade4e8a0f71eabdb258fecc2e7e948be\config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.46.3",
  "vocab_size": 30522
}


In [33]:
for param in frozen_sequence_clf_model.distilbert.parameters():
    param.requires_grad = False

In [34]:
epochs = 2

training_args = TrainingArguments(
    output_dir = "./snips_clf/results",
    num_train_epochs = epochs,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    load_best_model_at_end = True,
    
    # some deep learning params that the trainer is able to take in
    
    warmup_steps = len(seq_clf_tokenized_snips['train']) // 5, # no. of warmup steps for learning rate scheduler
    weight_decay = 0.05,
    
    logging_steps = 1,
    log_level = 'info',
    eval_strategy = 'epoch',
    save_strategy = 'epoch'
)


trainer = Trainer(
    model = frozen_sequence_clf_model,
    args = training_args,
    train_dataset = seq_clf_tokenized_snips['train'],
    eval_dataset = seq_clf_tokenized_snips['test'],
    compute_metrics = compute_metrics,
    data_collator = data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [35]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance. If utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.9445534944534302,
 'eval_model_preparation_time': 0.0,
 'eval_accuracy': 0.1815055406954528,
 'eval_runtime': 75.943,
 'eval_samples_per_second': 34.46,
 'eval_steps_per_second': 1.08}

In [36]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance. If utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 595,975


Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy
1,1.8718,1.8654,0.0,0.630875
2,1.612,1.578199,0.0,0.855942


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance. If utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_clf/results\checkpoint-328
Configuration saved in ./snips_clf/results\checkpoint-328\config.json
Model weights saved in ./snips_clf/results\checkpoint-328\model.safetensors
Saving model checkpoint to ./snips_clf/results\checkpoint-656
Configuration saved in ./snips_clf/results\checkpoint-656\config.json
Model weights saved in ./snips_clf/results\checkpoint-656\model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance. If utterance are not expected by `DistilBertForSequenceClassification

TrainOutput(global_step=656, training_loss=1.8450886694396413, metrics={'train_runtime': 788.1924, 'train_samples_per_second': 26.56, 'train_steps_per_second': 0.832, 'total_flos': 117122748262158.0, 'train_loss': 1.8450886694396413, 'epoch': 2.0})

In [37]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance. If utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.5781985521316528,
 'eval_model_preparation_time': 0.0,
 'eval_accuracy': 0.8559419182269774,
 'eval_runtime': 74.4121,
 'eval_samples_per_second': 35.169,
 'eval_steps_per_second': 1.102,
 'epoch': 2.0}