In [1]:
from datasets import load_dataset
# Fetch the DialogSum dataset by its Hub identifier.
# NOTE: despite its name, `df` is whatever load_dataset returns, not a pandas
# DataFrame; later cells index the mapped result by split name ("train", "test").
df = load_dataset(path="knkarthick/dialogsum")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# One name for the pretrained checkpoint so the tokenizer and the model
# weights are always loaded from the same place.
MODEL_CHECKPOINT = "t5-small"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)


In [3]:
def preprocess(batch, *, max_source_length=120, max_target_length=120,
               source_prefix="", tok=None):
    """Turn a batch of dialogue/summary pairs into seq2seq training features.

    Args:
        batch: Mapping with parallel lists of strings under ``"dialogue"``
            (model inputs) and ``"summary"`` (targets).
        max_source_length: Length every dialogue is truncated/padded to.
        max_target_length: Length every summary is truncated/padded to.
        source_prefix: Text prepended to each dialogue before tokenizing.
            The default ``""`` leaves the dialogues untouched; T5 checkpoints
            are commonly prompted with a task prefix such as ``"summarize: "``.
        tok: Tokenizer to use. When ``None`` the notebook-level ``tokenizer``
            is used, so ``df.map(preprocess, batched=True)`` keeps working.
            Non-default values can be supplied through ``map(fn_kwargs=...)``.

    Returns:
        dict with ``"input_ids"`` and ``"attention_mask"`` for the dialogues
        and ``"labels"`` for the summaries, where every padding token id has
        been replaced by -100.

    Raises:
        KeyError: If ``batch`` lacks ``"dialogue"`` or ``"summary"``.
    """
    # Only touch the global when no tokenizer was injected.
    tok = tokenizer if tok is None else tok

    sources = batch["dialogue"]
    targets = batch["summary"]
    if source_prefix:
        sources = [source_prefix + text for text in sources]

    source_enc = tok(sources, truncation=True, padding="max_length",
                     max_length=max_source_length)
    target_enc = tok(targets, truncation=True, padding="max_length",
                     max_length=max_target_length)

    # -100 marks positions the loss should skip, so padding in the targets
    # does not contribute to training. The pad id is looked up once, not per token.
    ignore_index = -100
    pad_id = tok.pad_token_id
    labels = [
        [ignore_index if token_id == pad_id else token_id for token_id in seq]
        for seq in target_enc["input_ids"]
    ]

    return {
        "input_ids": source_enc["input_ids"],
        "attention_mask": source_enc["attention_mask"],
        "labels": labels,
    }

# Run preprocess over the dataset in batches; the result is kept under a new
# name so the raw `df` stays available.
df_tokenized = df.map(function=preprocess, batched=True)


Map: 100%|██████████| 500/500 [00:00<00:00, 2348.14 examples/s]


In [4]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: a single epoch with four examples per device per step.
# NOTE(review): the checkpoint directory is called "bart_dialogsum" although the
# checkpoint loaded earlier in this notebook is "t5-small" — consider renaming.
training_args = TrainingArguments(
    output_dir="./bart_dialogsum",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    remove_unused_columns=True,
)

# Name the splits up front; the "test" split is the one scored by the later
# trainer.evaluate() call.
train_split = df_tokenized["train"]
eval_split = df_tokenized["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Kept as the cell's last expression so the returned training summary is displayed.
trainer.train()




Step,Training Loss
500,2.0988
1000,1.9244
1500,1.8842
2000,1.8699
2500,1.8487
3000,1.8613


TrainOutput(global_step=3115, training_loss=1.9116227886267305, metrics={'train_runtime': 4318.6254, 'train_samples_per_second': 2.885, 'train_steps_per_second': 0.721, 'total_flos': 395240354611200.0, 'train_loss': 1.9116227886267305, 'epoch': 1.0})

In [5]:
# Score the fine-tuned model on the evaluation split that was handed to the
# Trainer, then show the metrics as the cell result.
eval_metrics = trainer.evaluate()
eval_metrics



{'eval_loss': 1.85393226146698,
 'eval_runtime': 106.9135,
 'eval_samples_per_second': 14.03,
 'eval_steps_per_second': 1.758,
 'epoch': 1.0}

In [6]:
# Write the model weights and the tokenizer files into the same local folder
# so the directory can be reloaded as one unit.
EXPORT_DIR = "./dialogsum_model"

model.save_pretrained(EXPORT_DIR)
# Last expression: the tokenizer's written file paths are shown as the cell result.
tokenizer.save_pretrained(EXPORT_DIR)

('./dialogsum_model/tokenizer_config.json',
 './dialogsum_model/special_tokens_map.json',
 './dialogsum_model/tokenizer.json')

In [8]:
# Publish the model and the tokenizer to the same Hub repository.
# NOTE(review): the execution counter jumps from In [6] to In [8] and no login
# step is visible in this notebook; the upload presumably relies on a cached
# Hugging Face token — confirm before a fresh Restart & Run All.
HUB_REPO_ID = "Sachin-0001/dialogsum-t5-small"

model.push_to_hub(HUB_REPO_ID)
# Last expression: the tokenizer upload's commit info is shown as the cell result.
tokenizer.push_to_hub(HUB_REPO_ID)

Processing Files (1 / 1): 100%|██████████|  242MB /  242MB,  801kB/s  
New Data Upload: 100%|██████████|  242MB /  242MB,  801kB/s  


CommitInfo(commit_url='https://huggingface.co/Sachin-0001/dialogsum-t5-small/commit/34c8e85ceb7c46cb448e04f315e3dc102c40031f', commit_message='Upload tokenizer', commit_description='', oid='34c8e85ceb7c46cb448e04f315e3dc102c40031f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Sachin-0001/dialogsum-t5-small', endpoint='https://huggingface.co', repo_type='model', repo_id='Sachin-0001/dialogsum-t5-small'), pr_revision=None, pr_num=None)