In [5]:
! pip install -q -U transformers[torch] datasets trl peft py7zr

In [6]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM, TrainingArguments,Trainer
from datasets import load_dataset
import warnings
warnings.filterwarnings("ignore")

In [7]:
tokenizer= AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model= AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

In [8]:
dataset= load_dataset("samsum")
dataset

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [9]:
sample= dataset['test'][0]['dialogue']
label= dataset['test'][0]['summary']

def generate_summary(input,llm):
    input_prompt= f""" 
                   Summarize the following conversation.
                   {input}
                   Summary:
                   """
    
    input_ids= tokenizer(sample,return_tensors='pt')
    tokenized_output= llm.generate(input_ids['input_ids'],min_length=30,max_length=200)
    output= tokenizer.decode(tokenized_output[0],skip_special_tokens=True)
    return output

output= generate_summary(sample,llm= model)
print('---------------------------------------------------------------')
print("sample:")
print(sample)
print('---------------------------------------------------------------')
print("model_summary:")
print(output)
print('---------------------------------------------------------------')
print("Correct Summary:")
print(label)

---------------------------------------------------------------
sample:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
---------------------------------------------------------------
model_summary:
Hannah: Hey, do you have Betty's number? Amanda: Lemme check. Hannah: Ask Larry. Amanda: He called her last time we were at the park together.
---------------------------------------------------------------
Correct Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


In [10]:
def tokenize(sample):
  start_prompt = "Summarize the following conversation.\n\n"
  end_prompt = "\n\nSummary:"
  prompt = [start_prompt + dialogue + end_prompt for dialogue in sample['dialogue']]

  sample['input_ids'] = tokenizer(prompt, padding=True, truncation=True,return_tensors="pt").input_ids
  sample['labels'] =  tokenizer(sample['summary'], padding=True, truncation=True,return_tensors="pt").input_ids

  return sample

In [11]:
tokenized_dataset= dataset.map(tokenize,batched=True)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [12]:
tokenized_dataset= tokenized_dataset.remove_columns(['id','dialogue','summary'])

In [13]:
 tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 818
    })
})

In [14]:
tokenized_dataset['train']=tokenized_dataset['train'].select(range(2000))
tokenized_dataset['validation']=tokenized_dataset['validation'].select(range(500))

In [15]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
training_args= TrainingArguments(
    output_dir='/kaggle/working/bart-cnn-samsum-finetuned',
    hub_model_id='Shorya22/bart-cnn-samsum-finetuned',
    learning_rate=2e-4,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    logging_steps=20)

trainer= Trainer(
    model= model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'])

In [None]:
trainer.train()

In [19]:
# import shutil
# import os
# # Path to the directory
# dir_path = '/kaggle/working/bart-cnn-samsum-finetuned'

# # Check if the directory exists
# if os.path.exists(dir_path):
#     # Delete the directory and its contents
#     shutil.rmtree(dir_path)
#     print(f'{dir_path} and all its contents have been deleted')
# else:
#     print(f'{dir_path} does not exist')


/kaggle/working/wandb does not exist


In [None]:
trainer.push_to_hub()

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

events.out.tfevents.1715970947.757410fecdbd.188.0:   0%|          | 0.00/5.91k [00:00<?, ?B/s]

events.out.tfevents.1715971012.757410fecdbd.188.1:   0%|          | 0.00/9.07k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

# Inference Model:

In [18]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Shorya22/bart-cnn-samsum-finetuned")
model = AutoModelForSeq2SeqLM.from_pretrained("Shorya22/bart-cnn-samsum-finetuned")

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/358 [00:00<?, ?B/s]

In [19]:
output= generate_summary(sample,llm= model)
print('---------------------------------------------------------------')
print("sample:")
print(sample)
print('---------------------------------------------------------------')
print("model_summary:")
print(output)
print('---------------------------------------------------------------')
print("Correct Summary:")
print(label)

---------------------------------------------------------------
sample:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
---------------------------------------------------------------
model_summary:
Hannah asks Amanda if she has Betty's number. Amanda can't find it. Larry called Betty last time they were at the park together.
---------------------------------------------------------------
Correct Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.
