In [2]:
# install hugging face datasets module
%pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [3]:
# install transformers library
%pip install transformers

^C
Note: you may need to restart the kernel to use updated packages.




In [None]:
# import necessary packages
import transformers, pandas as pd
from datasets import Dataset, load_metric
import numpy as np
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, AutoModelForSequenceClassification, Trainer

  from .autonotebook import tqdm as notebook_tqdm


### Prepare the Dataset

In [None]:
# create the sample emails texts
sample_mails = ["I will share your email",
"I shall share your email",
"I've shared your email",
"May I share your email",
"Should I share your email",
"I already shared the email",
"I've just shared your email",
"Am I allowed to share your email",
"Am I able to share your email",
"I am able to share your email",
"Will you help my friends if I share your email with them?"]

In [None]:
# manually create a label match for each sample mail
mail_labels = ["Student wants to know if can share",
"Student wants to know if can share",
"Student has shared",
"Student wants to know if can share",
"Student wants to know if can share",
"Student has shared",
"Student has shared",
"Student wants to know if can share",
"Student wants to know if can share",
"Student has shared",
"Student wants to know if can share"]

In [None]:
# load dataset as pandas Dataframe
df = pd.DataFrame({'sample_mails': sample_mails, 'labels': mail_labels})

# check
df

SyntaxError: invalid syntax. Perhaps you forgot a comma? (368758319.py, line 2)

In [None]:
# load the dataframe in a hugging face compatible format
dataset = Dataset.from_pandas(df)

# check the type
type(dataset)

datasets.arrow_dataset.Dataset

### Preprocessing the dataset

In [None]:
# encode the dataset labels as integers
dataset = dataset.class_encode_column('labels')

Casting to class labels: 100%|██████████| 11/11 [00:00<00:00, 430.71 examples/s]


In [None]:
# view a sample of the dataset
dataset[2]

{'sample_mails': "I've shared your email", 'labels': 0}

From the above output, we see that label `0` indicates the label `Student has shared` therefore label `1` will indicate `Student wants to know if can share`

In [None]:
# verify the dataset features
dataset.features

{'sample_mails': Value(dtype='string', id=None),
 'labels': ClassLabel(names=['Student has shared', 'Student wants to know if can share'], id=None)}

### Tokenization

In [None]:
# declare the checkpoint
checkpoint = "bert-base-uncased"

# call the tokenizer for training
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# create a function for tokenizing the sample_mails
def tokenize_function(example):
    return tokenizer(example["sample_mails"], truncation=True)

In [None]:
# tokenize the dataset with the map function
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

Map: 100%|██████████| 11/11 [00:00<00:00, 275.02 examples/s]


Dataset({
    features: ['sample_mails', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 11
})

In [None]:
# apply dynamic padding -- pad all the sample_mails to the length of the longest element when we batch elements together
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

To test this new toy, we'll slice our dataset that we would like to batch together. Here, we remove the columns idx and sample_mails as they won’t be needed and contain strings (and we can’t create tensors with strings) and have a look at the lengths of each entry in the batch:

In [None]:
samples = tokenized_datasets[:]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sample_mails"]}
[len(x) for x in samples["input_ids"]]

[7, 7, 8, 7, 7, 7, 9, 9, 9, 9, 15]

No surprise, we get samples of varying length, from 7 to 15. Dynamic padding means the samples in this batch should all be padded to a length of 15, the maximum length inside the batch. Without dynamic padding, all of the samples would have to be padded to the maximum length in the whole dataset, or the maximum length the model can accept. Let’s double-check that our data_collator is dynamically padding the batch properly:

In [None]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([11]),
 'input_ids': torch.Size([11, 15]),
 'token_type_ids': torch.Size([11, 15]),
 'attention_mask': torch.Size([11, 15])}

In [None]:
# check if we're using a fast tokenizer
tokenizer.is_fast

True

In [None]:
# we can convert the tokenized dataset back to text as follows
tokenizer.convert_ids_to_tokens(tokenized_datasets['input_ids'][-1])

['[CLS]',
 'will',
 'you',
 'help',
 'my',
 'friends',
 'if',
 'i',
 'share',
 'your',
 'email',
 'with',
 'them',
 '?',
 '[SEP]']

### Training

The first step before we can define our Trainer is to define a TrainingArguments class that will contain all the hyperparameters the Trainer will use for training and evaluation. The only argument you have to provide is a directory where the trained model will be saved, in our case we want to also modify the number of epochs for training,  the checkpoints along the way are also saved in this directory. For all the rest, you can leave the defaults, which should work pretty well for a basic fine-tuning.

In [None]:
# define a metric to monitor during training
metric = load_metric("accuracy")

# create a function that helps compute the specified metric
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")


In [None]:
# define the training arguments
training_args = TrainingArguments('training_args',
                                  num_train_epochs=20)

The second step is to define our model. We will use the AutoModelForSequenceClassification class, with two labels:

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Downloading model.safetensors: 100%|██████████| 440M/440M [01:00<00:00, 7.34MB/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


You will notice that you get a warning after instantiating this pretrained model. This is because BERT has not been pretrained on classifying pairs of sentences, so the head of the pretrained model has been discarded and a new head suitable for sequence classification has been added instead. The warnings indicate that some weights were not used (the ones corresponding to the dropped pretraining head) and that some others were randomly initialized (the ones for the new head). It concludes by encouraging you to train the model, which is exactly what we are going to do now.

In [None]:
# define trainer object
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

To fine-tune the model on our dataset, we just have to call the train() method of our Trainer:

In [None]:
# train the model
trainer.train()

100%|██████████| 40/40 [02:53<00:00,  4.33s/it]

{'train_runtime': 172.9928, 'train_samples_per_second': 1.272, 'train_steps_per_second': 0.231, 'train_loss': 0.24077908992767333, 'epoch': 20.0}





TrainOutput(global_step=40, training_loss=0.24077908992767333, metrics={'train_runtime': 172.9928, 'train_samples_per_second': 1.272, 'train_steps_per_second': 0.231, 'train_loss': 0.24077908992767333, 'epoch': 20.0})

In [None]:
# save the trained model together with the tokenizer in a directory
trainer.save_model('custom_model')

### Evaluation

For this task, we will evaluate the model on the training set, given that the dataset is extremely small and could not be split into train-test sets

In [None]:
predictions = trainer.predict(tokenized_datasets)
print(predictions.predictions.shape, predictions.label_ids.shape, '\n')
print(predictions)

100%|██████████| 2/2 [00:01<00:00,  1.90it/s]

(11, 2) (11,) 

PredictionOutput(predictions=array([[-1.9196216,  1.3245827],
       [-1.949448 ,  1.3339628],
       [ 1.7652345, -1.1058381],
       [-2.0256405,  1.362749 ],
       [-2.0248377,  1.3859997],
       [ 1.6743466, -1.0574694],
       [ 1.7554849, -1.0992212],
       [-2.0404794,  1.3920761],
       [-1.9952593,  1.3621099],
       [ 1.7081995, -1.0845065],
       [-2.0348525,  1.3852504]], dtype=float32), label_ids=array([1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1], dtype=int64), metrics={'test_loss': 0.04296018183231354, 'test_accuracy': 1.0, 'test_runtime': 1.6477, 'test_samples_per_second': 6.676, 'test_steps_per_second': 1.214})





The output of the `predict()` method is another named tuple with three fields: predictions, `label_ids`, and `metrics`. The metrics field now contains the loss on the dataset passed, some time metrics (how long it took to predict, in total and on average), and the accuracy of training

As we can see, predictions is a two-dimensional array with shape 11 x 2 (11 being the number of elements in the dataset we used). Those are the logits for each element of the dataset we passed to `predict()`. To transform them into predictions that we can compare to our labels, we need to take the index with the maximum value on the second axis:

In [None]:
preds = np.argmax(predictions.predictions, axis=-1)

We can now compare those preds to the labels. To build our `compute_metric()` function, we will rely on the metrics from the 🤗 Datasets library. We can load the metrics associated with the MRPC dataset as easily as we loaded the dataset, this time with the `load_metric()` function. The object returned has a `compute()` method we can use to do the metric calculation. Wrapping everything together, we get our `compute_metrics_mrpc()` function:

In [None]:
def compute_metrics_mrpc(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds.predictions, eval_preds.label_ids
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
compute_metrics_mrpc(predictions)

Downloading builder script: 5.76kB [00:00, ?B/s]                       


{'accuracy': 1.0, 'f1': 1.0}

In [None]:
predictions.predictions

array([[-1.9196216,  1.3245827],
       [-1.949448 ,  1.3339628],
       [ 1.7652345, -1.1058381],
       [-2.0256405,  1.362749 ],
       [-2.0248377,  1.3859997],
       [ 1.6743466, -1.0574694],
       [ 1.7554849, -1.0992212],
       [-2.0404794,  1.3920761],
       [-1.9952593,  1.3621099],
       [ 1.7081995, -1.0845065],
       [-2.0348525,  1.3852504]], dtype=float32)

In [None]:
predictions.label_ids

array([1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1], dtype=int64)

From the above, we see that the model has a perfect prediction on the data it was trained on. This is highly flawed and can be ascribed to overfitting, but since we have no test set to evaluate on given the size of the sample data, we can assume that for the model to overfit at 20 epochs, it actually did well in learning the training dataset.

### Inference

In [None]:
# get the directory where the model was saved to
inf_model = AutoModelForSequenceClassification.from_pretrained('custom_model/')

I had to change the path of the custom model to 'custom_model/'

In [None]:
# load the tokenizer by pointing to the same directory as the pretrained model
inf_tokenizer = AutoTokenizer.from_pretrained('custom_model/')

In [None]:
# generate sequence for inference
sequences = ['I want to know if I should send your email', 'I sent your email a long time ago']

In [None]:
# create a pipeline for inference
from transformers import pipeline
classifier = pipeline(task='text-classification', model=inf_model, tokenizer=inf_tokenizer)

In [None]:
classifier(sequences)

[{'label': 'LABEL_1', 'score': 0.9348370432853699},
 {'label': 'LABEL_0', 'score': 0.7933637499809265}]

From the above output, we can confidently say the model is performing well on inference