In [None]:
!pip install transformers[torch]

In [None]:
!pip install transformers datasets evaluate

In [None]:
import torch
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('logs')

In [30]:
from datasets import load_dataset
raw_dataset = load_dataset("samsum")

Downloading data:   0%|          | 0.00/6.06M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/347k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/335k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

In [31]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [32]:
raw_dataset["train"]=raw_dataset["train"].select([i for i in range(10000)])
#raw_dataset["validation"]=raw_dataset["validation"].select([i for i in range(2000)])

In [33]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [34]:
raw_dataset["train"][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}

In [35]:
import re
def clean_data(x):
  x["dialogue"] = re.sub(r"[\r\n]","",x["dialogue"])
  x["dialogue"] = re.sub(r'\s+', ' ', x["dialogue"]).strip()
  return x
raw_dataset = raw_dataset.map(clean_data)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [36]:
raw_dataset["train"][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked cookies. Do you want some?Jerry: Sure!Amanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}

In [37]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
model_ckpt = 'facebook/bart-large-cnn'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
#tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

In [None]:
print(model)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): La

In [38]:
def get_feature(batch):
  encodings = tokenizer(batch['dialogue'], text_target=batch['summary'],
                        max_length=1024, truncation=True)

  encodings = {'input_ids': encodings['input_ids'],
               'attention_mask': encodings['attention_mask'],
               'labels': encodings['labels']}

  return encodings

In [39]:
raw_data_encoded = raw_dataset.map(get_feature, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [40]:
raw_data_encoded

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})

In [41]:
print(raw_data_encoded["train"][0])

{'id': '13818513', 'dialogue': "Amanda: I baked cookies. Do you want some?Jerry: Sure!Amanda: I'll bring you tomorrow :-)", 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.', 'input_ids': [0, 10127, 5219, 35, 38, 17241, 15269, 4, 1832, 47, 236, 103, 116, 39237, 35, 9136, 328, 10127, 5219, 35, 38, 581, 836, 47, 3859, 48433, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [0, 10127, 5219, 17241, 15269, 8, 40, 836, 6509, 103, 3859, 4, 2]}


In [42]:
columns = ['input_ids', 'labels', 'attention_mask']
raw_data_encoded.set_format(type='torch', columns=columns)

In [43]:
raw_data_encoded

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})

In [44]:
print(raw_data_encoded["train"][0])

{'input_ids': tensor([    0, 10127,  5219,    35,    38, 17241, 15269,     4,  1832,    47,
          236,   103,   116, 39237,    35,  9136,   328, 10127,  5219,    35,
           38,   581,   836,    47,  3859, 48433,     2]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1]), 'labels': tensor([    0, 10127,  5219, 17241, 15269,     8,    40,   836,  6509,   103,
         3859,     4,     2])}


In [45]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [46]:
data_collator

DataCollatorForSeq2Seq(tokenizer=BartTokenizerFast(name_or_path='facebook/bart-large-cnn', vocab_size=50265, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}, model=BartForConditionalGeneration(
  (model): BartModel(
   

In [47]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir = 'bart_samsum',
    num_train_epochs=5,
    warmup_steps = 500,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay = 0.01,
    logging_steps = 10,
    evaluation_strategy = 'steps',
    eval_steps=100,
    save_steps=1e6,
    gradient_accumulation_steps=16
)
trainer = Trainer(model=model,
                  args=training_args,
                  tokenizer=tokenizer,
                  data_collator=data_collator,
                  train_dataset = raw_data_encoded['train'],
                  eval_dataset = raw_data_encoded['validation'])

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer.train()

Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss
100,1.4232,1.442119
200,1.2834,1.392266
300,1.2925,1.414441
400,1.1277,1.436912
500,0.8823,1.492725
600,0.9124,1.516836


In [None]:
trainer.save_model('spidex_samsum')

In [None]:
from transformers import pipeline
pipe = pipeline('summarization', model='spidex_sasum')
gen_kwargs = {'length_penalty': 0.8, 'num_beams': 8, "max_length": 128}

custom_dialogue="""
Laxmi Kant what work you planning to give Tom?
Juli i was hoping to send him on a business trip first.
Laxmi Kant cool. is there any suitable work for him?
Juli he did excellent in last quarter. i will assign new project, once he is back.
"""
result = pipe(custom_dialogue, **gen_kwargs)

In [None]:
print(result)

In [None]:
summary_output = result[0]["summary_text"]
print(summary_output)

In [20]:
!zip spidex_samsum.zip -r spidex_samsum/

  adding: spidex/ (stored 0%)
  adding: spidex/vocab.json (deflated 59%)
  adding: spidex/generation_config.json (deflated 47%)
  adding: spidex/tokenizer_config.json (deflated 76%)
  adding: spidex/model.safetensors (deflated 7%)
  adding: spidex/training_args.bin (deflated 51%)
  adding: spidex/special_tokens_map.json (deflated 52%)
  adding: spidex/merges.txt (deflated 53%)
  adding: spidex/config.json (deflated 61%)
  adding: spidex/tokenizer.json (deflated 72%)


In [22]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [24]:
# Assuming 'model' is your PyTorch model
import torch
torch.save(model.state_dict(), '/content/drive/My Drive/spidex.pth')


In [None]:
def generate(ref):
  for i in ref:
    result1 = pipe(i, **gen_kwargs)
    summary_output1 = result1[0]["summary_text"]
    return summary_output1

In [29]:
referenced_summary = [raw_dataset["test"]["summary"][0:11]]
generated_summary = list()

Your max_length is set to 128, but your input_length is only 11. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)
Your max_length is set to 128, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_length is set to 128, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)


korea 's nec UNK announces computer sales tie-up with ukraine 's ultramaritalian computer giant UNK in new windows UNK windows software deal UNK talks with UNK on UNK UNK 's UNK software
["korea 's nec UNK announces computer sales tie-up with ukraine 's ultramaritalian computer giant UNK in new windows UNK windows software deal UNK talks with UNK on UNK UNK 's UNK software"]


In [None]:
generated = generate(referenced_summary)
generated_summary.append(generated)

In [None]:
print(generated_summary)

In [None]:
#bleauscore
import nltk
from nltk.translate.bleu_score import corpus_bleu
from nltk.tokenize import word_tokenize
reference_summary1 = raw_dataset["test"][0]["summary"]
result2 = pipe(refernce_summary1, **gen_kwargs)
generated_summary1 = result2[0]["summary_text"]
reference_summaries_tokenized = [word.lower() for word in reference_summary1]
generated_summary_tokenized = [word.lower() for word in generated_summary1]
bleu_score = corpus_bleu([reference_summaries_tokenized], [generated_summary_tokenized])

print("BLEU Score:", bleu_score)
