In [10]:
!pip install datasets transformers



In [11]:
from datasets import load_dataset, DatasetDict

# Load the bloomberg/entsum dataset via `datasets.load_dataset`.
# The returned object is indexed by split name in the next cell (dataset['train']).
dataset = load_dataset("bloomberg/entsum")

Repo card metadata block was not found. Setting CardData to empty.


In [12]:
# Split the dataset into train and validation sets (10% held out for validation).
# A fixed seed makes the shuffle, and therefore the split, identical on every run.
# NOTE: this cell rebinds `dataset`; re-running it without reloading the data first
# would split the already-reduced 'train' split again.
SPLIT_SEED = 42
train_test_split = dataset['train'].train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Create a new DatasetDict with train and validation sets
dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test']
})

In [13]:
def preprocess_function(examples):
    """Build the model input text and the target summary for one example.

    Args:
        examples: Mapping with a 'salient_sentences' entry (an iterable of
            sentences, a single string, or None) and a 'summary' entry.

    Returns:
        dict with "text" (the salient sentences joined by single spaces) and
        "summary" (the 'summary' value, passed through unchanged).
    """
    salient = examples['salient_sentences']

    if salient is None:
        # Missing value: produce an empty input instead of failing on iteration.
        input_text = ""
    elif isinstance(salient, str):
        # Iterating a str would join its individual characters with spaces;
        # a single string is already the text we want.
        input_text = salient
    else:
        # Convert any non-string values to strings before joining.
        input_text = " ".join(str(sentence) for sentence in salient)

    # Use the provided summary as the target output
    return {"text": input_text, "summary": examples['summary']}



In [14]:
from transformers import T5Tokenizer

# Load the pretrained "t5-small" tokenizer. It is used as a module-level global by
# tokenize_function and summarize_article, and is saved next to the model later on.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def tokenize_function(examples, max_input_length=512, max_target_length=150):
    """Tokenize a batch of examples for sequence-to-sequence fine-tuning.

    The source text and the summary are encoded separately. The source becomes
    the encoder input; the summary becomes `labels`. Encoding them together as
    a text pair and reusing `input_ids` as labels would show the summary to the
    encoder and train the model to reproduce its own input.

    Args:
        examples: Batch mapping (as passed by `map(..., batched=True)`) with a
            'text' list and a 'summary' list of equal length.
        max_input_length: Token budget for the source text (truncate/pad).
        max_target_length: Token budget for the summary (truncate/pad). The
            default matches the `max_length` used by summarize_article.

    Returns:
        The tokenizer output for the source texts with an added 'labels' entry
        holding the summary token ids, where padding positions are -100.
    """
    # Same task prefix that summarize_article prepends at inference time.
    sources = ["summarize: " + text for text in examples['text']]

    model_inputs = tokenizer(
        sources,
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    targets = tokenizer(
        examples['summary'],
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )

    # -100 is the index ignored by the loss, so padding does not contribute to training.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in target_ids]
        for target_ids in targets['input_ids']
    ]
    return model_inputs

# Apply preprocessing and tokenization to each split of `dataset`.
# preprocess_function is mapped without `batched`, so it receives one example at a time;
# tokenize_function is mapped with batched=True, so it receives lists of values per column.
tokenized_datasets = dataset.map(preprocess_function).map(tokenize_function, batched=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/5018 [00:00<?, ? examples/s]

Map:   0%|          | 0/558 [00:00<?, ? examples/s]

Map:   0%|          | 0/5018 [00:00<?, ? examples/s]

Map:   0%|          | 0/558 [00:00<?, ? examples/s]

In [15]:
from transformers import T5ForConditionalGeneration

# Start from the pretrained "t5-small" checkpoint. This same object is handed to the
# Trainer, saved to disk, and used for generation in the later cells.
model = T5ForConditionalGeneration.from_pretrained("t5-small")


In [16]:
from transformers import Trainer, TrainingArguments

# Training configuration; the Trainer's output directory is ./results.
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate on eval_dataset once per epoch.
    # NOTE(review): recent transformers releases appear to rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# The Trainer consumes the tokenized splits built from the train/validation DatasetDict.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],  # the 10% hold-out carved from 'train'
)

# Run fine-tuning. The recorded run of this cell was stopped with a KeyboardInterrupt
# before any per-epoch results were logged.
trainer.train()




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate on the eval_dataset the Trainer was constructed with (the 'validation' split)
# and print the returned metrics.
eval_results = trainer.evaluate()
print(eval_results)

In [None]:
# Persist the model and the tokenizer. They are written to two different directories,
# so both paths are needed to reload them later.
model.save_pretrained("./fine_tuned_t5_model")
tokenizer.save_pretrained("./fine_tuned_t5_tokenizer")


In [None]:
def summarize_article(article_text):
    """Generate a summary of `article_text` with the module-level model and tokenizer.

    Args:
        article_text: The article to summarize, as a string. It is prefixed with
            "summarize: " and truncated to 512 tokens before generation.

    Returns:
        The decoded summary string (special tokens removed), produced with beam
        search (4 beams) and a length between 40 and 150 tokens.
    """
    inputs = tokenizer.encode("summarize: " + article_text, return_tensors="pt", max_length=512, truncation=True)
    # The encoded tensor is created on CPU; if the model lives on another device
    # (e.g. after Trainer-based training on a GPU), generate() needs them co-located.
    inputs = inputs.to(model.device)
    summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example usage: replace the placeholder string with a real article before running.
article_text = "Your article text here..."
summary = summarize_article(article_text)
print(summary)
