In [1]:
!pip install datasets --quiet
!pip install sentencepiece --quiet
!pip install accelerate -U --quiet
!pip install scikit-learn --quiet
!pip install sacrebleu --quiet
!pip install tensorboard --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.8 kB[0m [31m26.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import torch
import sacrebleu
from datasets import load_dataset, DatasetDict
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from hyperparameters import *
import numpy as np
import tensorboard

In [3]:
def get_validation_dataset(dataset, val_size = .1):
    t,v = dataset['train'].train_test_split(test_size = val_size).values()
    return t, v

In [4]:
dataset = load_dataset(DATASET)
train_dataset, val_dataset = get_validation_dataset(dataset)
len(train_dataset), len(val_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/12.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/74 [00:00<?, ? examples/s]

(66, 8)

In [5]:
def preprocess_data(data):
    inputs = [pretext + d.lower() for d in data['query']]
    targets = [d.lower() for d in data['response']]
    model_inputs = tokenizer(inputs, max_length= max_source_length, truncation=True, padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length= max_target_length, truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [6]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenized_train_dataset = train_dataset.map(preprocess_data, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_data, batched=True)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]



Map:   0%|          | 0/8 [00:00<?, ? examples/s]

In [7]:
def compute_bleu(eval_pred):
    predictions, labels = eval_pred
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    labels = tokenizer.batch_decode(labels, skip_special_tokens = True)
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    predictions = tokenizer.batch_decode(predictions, skip_special_tokens = True)
    return {'bleu' : sacrebleu.corpus_bleu(predictions, labels).score}

In [8]:
training_args         = Seq2SeqTrainingArguments(**TRAINING_ARGS)
data_collator         = DataCollatorForSeq2Seq(tokenizer, model = model)
trainer               = Seq2SeqTrainer(
    model           = model,
    args            = training_args,
    train_dataset   = tokenized_train_dataset,
    eval_dataset    = tokenized_val_dataset,
    compute_metrics = compute_bleu,
    data_collator   = data_collator
)



#### Trained the model with the pretext : `Assure the customer and provide specific help`

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu
1,No log,0.554859,0.0
2,No log,0.336999,0.0
3,No log,0.240692,0.0
4,No log,0.22331,0.176954
5,No log,0.213184,0.215842
6,No log,0.204392,0.228775
7,No log,0.199596,0.215842
8,No log,0.196146,0.18531
9,No log,0.196068,0.226309
10,No log,0.193811,0.243882


Checkpoint destination directory ./models\checkpoint-17 already exists and is non-empty.Saving will proceed but saved results may be invalid.
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=170, training_loss=0.735866995418773, metrics={'train_runtime': 121.9824, 'train_samples_per_second': 5.411, 'train_steps_per_second': 1.394, 'total_flos': 401912207769600.0, 'train_loss': 0.735866995418773, 'epoch': 10.0})

In [None]:
def generate_response(query, model):
    query = pretext + query.lower()
    input_ids = tokenizer.encode(query, return_tensors='pt', max_length=max_source_length, truncation=True)
    if torch.cuda.is_available(): input_ids = input_ids.to('cuda')
    output_ids = model.generate(input_ids, max_length=50, num_beams=4, early_stopping=True)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

inds = np.random.choice(np.arange(len(dataset['train'])), 5)
queries = dataset['train']['query']
responses = dataset['train']['response']

for i in inds:
    q = queries[i]
    response = generate_response(q, model)
    print('Query    : ' + q)
    print('Response : ' + responses[i])
    print('Predicted: ' + response)
    print()

Query    : Where can I find your sizing chart?
Response : We'd be happy to help. Can you please provide the product name or SKU so we can direct you to the appropriate sizing chart?
Predicted: we'd be happy to help. can you please provide your order number and the product name or sku you're interested in?

Query    : I need to return an item.
Response : Certainly. Please provide your order number and reason for return, and we will provide you with instructions on how to proceed.
Predicted: we apologize for the inconvenience. can you please provide the product name or sku so we can assist you further?

Query    : How long does shipping take?
Response : We'd be happy to provide an estimate. Can you please provide your shipping destination and the product name or SKU?
Predicted: we'd be happy to help. can you please provide your shipping destination and the product name or sku?

Query    : Can I pre-order an item?
Response : Certainly. Can you please provide the product name or SKU and yo

#### Trained the model with the pretext : `Generate meaningful customer support response`

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu
1,No log,0.558074,0.0
2,No log,0.310823,0.0
3,No log,0.232266,0.0
4,No log,0.214262,0.229969
5,No log,0.198647,0.0
6,No log,0.193539,0.196662
7,No log,0.197325,0.191334
8,No log,0.200803,0.176072
9,No log,0.202593,0.184343
10,No log,0.202748,0.184343


Checkpoint destination directory ./models\checkpoint-17 already exists and is non-empty.Saving will proceed but saved results may be invalid.
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=170, training_loss=0.7351306242101333, metrics={'train_runtime': 123.9108, 'train_samples_per_second': 5.326, 'train_steps_per_second': 1.372, 'total_flos': 401912207769600.0, 'train_loss': 0.7351306242101333, 'epoch': 10.0})

In [None]:
def generate_response(query, model):
    query = pretext + query.lower()
    input_ids = tokenizer.encode(query, return_tensors='pt', max_length=max_source_length, truncation=True)
    if torch.cuda.is_available(): input_ids = input_ids.to('cuda')
    output_ids = model.generate(input_ids, max_length=50, num_beams=4, early_stopping=True)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

inds = np.random.choice(np.arange(len(dataset['train'])), 5)
queries = dataset['train']['query']
responses = dataset['train']['response']

for i in inds:
    q = queries[i]
    response = generate_response(q, model)
    print('Query    : ' + q)
    print('Response : ' + responses[i])
    print('Predicted: ' + response)
    print()

Query    : Where can I find your sizing chart?
Response : We'd be happy to help. Can you please provide the product name or SKU so we can direct you to the appropriate sizing chart?
Predicted: we'd be happy to help. can you please provide the product name or sku and the product name or sku you're interested in?

Query    : I need to return an item.
Response : Certainly. Please provide your order number and reason for return, and we will provide you with instructions on how to proceed.
Predicted: we apologize for the inconvenience. can you please provide the product name or sku so we can assist you?

Query    : How long does shipping take?
Response : We'd be happy to provide an estimate. Can you please provide your shipping destination and the product name or SKU?
Predicted: we'd be happy to help. can you please provide your shipping address so we can send you a quote?

Query    : Can I pre-order an item?
Response : Certainly. Can you please provide the product name or SKU and your emai

In [None]:
model.save_pretrained(SAVE_AS)
tokenizer.save_pretrained(SAVE_AS)

('t5-customer-support\\tokenizer_config.json',
 't5-customer-support\\special_tokens_map.json',
 't5-customer-support\\spiece.model',
 't5-customer-support\\added_tokens.json')