In [46]:
# !pip install accelerate

In [47]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [48]:
# !pip install transformers

In [49]:
# !pip install datasets

In [50]:
# !pip install evaluate

In [51]:
# !pip install rouge_score

## Read Dataset

In [5]:
from datasets import load_dataset
dataset = load_dataset("Kaludi/Customer-Support-Responses", split="train")
dataset

Downloading data:   0%|          | 0.00/12.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/74 [00:00<?, ? examples/s]

Dataset({
    features: ['query', 'response'],
    num_rows: 74
})

In [6]:
dataset = dataset.train_test_split(test_size=0.2)

In [7]:
dataset['train'][0]

{'query': 'How do I change my email preferences?',
 'response': 'We can help with that. Can you please provide your account email so we can guide you through updating your email preferences?'}

## Read the model tokenizer and preprocess data

In [8]:
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [9]:
prefix = "answer: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["query"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["response"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [10]:
tokenized_data = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [11]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

## Metrics


In [16]:
import evaluate

rouge = evaluate.load("rouge")

import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

## Model and training

In [17]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [31]:
training_args = Seq2SeqTrainingArguments(
    output_dir="customer_response_model",
    eval_strategy="epoch",
    logging_strategy = "epoch",
    learning_rate=2e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    # save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.can_return_loss = True

In [32]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.2527,1.891828,0.4005,0.2418,0.3679,0.3669,19.0
2,1.6587,1.328874,0.5549,0.4202,0.5208,0.5183,19.0
3,1.2155,1.255816,0.5064,0.3813,0.4852,0.4843,19.0
4,0.9916,1.243149,0.5127,0.387,0.4798,0.4766,19.0
5,0.8391,1.215239,0.5577,0.4279,0.5075,0.504,19.0
6,0.7338,1.208837,0.5405,0.3926,0.4874,0.484,19.0
7,0.6319,1.202521,0.5198,0.3762,0.4833,0.4796,19.0
8,0.554,1.204675,0.5745,0.4423,0.5376,0.5326,19.0
9,0.529,1.210082,0.5914,0.454,0.5491,0.5469,19.0
10,0.5029,1.211151,0.5707,0.4202,0.5204,0.5156,19.0




TrainOutput(global_step=20, training_loss=0.990927106142044, metrics={'train_runtime': 7.5105, 'train_samples_per_second': 78.556, 'train_steps_per_second': 2.663, 'total_flos': 2600042889216.0, 'train_loss': 0.990927106142044, 'epoch': 10.0})

In [39]:
trainer.save_model("customer_response_model")

In [40]:
# dir(trainer)

In [43]:
# zip the model directory and dowload
!zip -r ./customer_response_model.zip ./customer_response_model

  adding: content/customer_response_model/ (stored 0%)
  adding: content/customer_response_model/spiece.model (deflated 48%)
  adding: content/customer_response_model/tokenizer.json (deflated 74%)
  adding: content/customer_response_model/training_args.bin (deflated 51%)
  adding: content/customer_response_model/model.safetensors (deflated 10%)
  adding: content/customer_response_model/generation_config.json (deflated 29%)
  adding: content/customer_response_model/runs/ (stored 0%)
  adding: content/customer_response_model/runs/Jun24_05-22-44_88da2bfe1fcc/ (stored 0%)
  adding: content/customer_response_model/runs/Jun24_05-22-44_88da2bfe1fcc/events.out.tfevents.1719206568.88da2bfe1fcc.1538.4 (deflated 63%)
  adding: content/customer_response_model/runs/Jun24_05-14-20_88da2bfe1fcc/ (stored 0%)
  adding: content/customer_response_model/runs/Jun24_05-14-20_88da2bfe1fcc/events.out.tfevents.1719206079.88da2bfe1fcc.1538.0 (deflated 62%)
  adding: content/customer_response_model/runs/Jun24_05

## Run inference

In [45]:
from transformers import pipeline

query = input()
print("query:", query)
assistant = pipeline("text2text-generation", model="./customer_response_model")
assistant(query)

can I get a 30% off?
query: can I get a 30% off?




[{'generated_text': 'Certainly. Can you please provide your account email or account email so we can send you '}]