In [None]:
# Uncomment and run the lines below if these packages are not installed yet
# !pip install datasets
# !pip install transformers
# !pip install rouge


In [None]:
import os
from pathlib import Path

import numpy as np
from rouge import Rouge
from transformers import EncoderDecoderModel, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoModel
from transformers.trainer_utils import set_seed

# Configuration
# NOTE(review): this is a placeholder value -- replace it with a real model
# name or path before relying on it. No cell visible in this notebook reads
# PRETRAINED_MODEL; the tokenizer and config cells pass
# "ahmedJaafari/DarElectra" directly.
PRETRAINED_MODEL = "Set this to the name or path of the pretrained model"

## When loading the dataset
In case you face the following error: _"ModuleNotFoundError: No module named 'fsspec.asyn'"_ when loading the dataset, try the following commands: <br>  pip uninstall fsspec <br>  pip install fsspec==2022.7.1<br>
In case you face the following error: _"AttributeError: 'FloatProgress' object has no attribute 'style'"_ when loading the dataset, try the following command: <br>  pip install --upgrade ipywidgets

In [None]:
# Load the 'Goud/Goud-sum' dataset. Later cells read its "train",
# "validation" and "test" splits and its "article" / "headline" columns.
from datasets import load_dataset

dataset = load_dataset(path='Goud/Goud-sum')

In [None]:
# Show samples from the dataset
def show_samples(dataset, num_samples=3, seed=42, split="train"):
    """Print the headline and article of a few shuffled examples.

    Args:
        dataset: Mapping from split name to a split object that supports
            ``len()``, ``shuffle(seed=...)`` and ``select(indices)`` and
            yields dict-like rows with ``"headline"`` and ``"article"`` keys.
        num_samples: Maximum number of examples to print.
        seed: Seed forwarded to ``shuffle`` so the same examples are shown
            on every run.
        split: Name of the split to sample from. Defaults to ``"train"``,
            which is the split the function always used before this
            parameter existed.
    """
    examples = dataset[split]
    # Clamp the request to the split size so select() is never asked for
    # indices beyond the last row when the split is smaller than num_samples.
    count = min(num_samples, len(examples))
    for example in examples.shuffle(seed=seed).select(range(count)):
        print(f"'>> Headline: {example['headline']}'")
        print(f"\n'>> Article: {example['article']}'")
        
# Print a few shuffled examples from the loaded dataset as a quick sanity check.
show_samples(dataset)

In [None]:
# Load the tokenizer published under "ahmedJaafari/DarElectra".
# The Hugging Face access token is read from the HF_TOKEN environment variable
# instead of being embedded in the notebook. When the variable is unset, None
# is passed, i.e. no explicit token is supplied.
# NOTE(review): the token that was previously hardcoded on this line has been
# exposed to anyone with access to the notebook and should be revoked.
tokenizer = AutoTokenizer.from_pretrained(
    "ahmedJaafari/DarElectra",
    use_auth_token=os.environ.get("HF_TOKEN"),
)

In [None]:
from transformers import ElectraConfig, EncoderDecoderConfig

# Read the Hugging Face access token from the environment rather than keeping
# it in the notebook; None (no explicit token) is used when HF_TOKEN is unset.
# NOTE(review): the token that was previously hardcoded in this cell has been
# exposed and should be revoked.
hf_token = os.environ.get("HF_TOKEN")

# The same checkpoint configuration is loaded twice so that the encoder and the
# decoder each get their own config object before they are combined.
enc_config = ElectraConfig.from_pretrained("ahmedJaafari/DarElectra", use_auth_token=hf_token)
dec_config = ElectraConfig.from_pretrained("ahmedJaafari/DarElectra", use_auth_token=hf_token)
enc_dec_config = EncoderDecoderConfig.from_encoder_decoder_configs(enc_config, dec_config)

In [None]:
# Seed the random number generators before the model is built so that its
# weight initialisation is repeatable across kernel restarts. 42 matches the
# default `seed` of the Transformers training arguments.
set_seed(42)

# NOTE(review): building the model from a configuration object creates the
# architecture only; per the Transformers documentation no pretrained weights
# are loaded this way. Confirm that training from scratch is intended --
# otherwise EncoderDecoderModel.from_encoder_decoder_pretrained(...) is the
# API that loads pretrained encoder/decoder weights.
model = EncoderDecoderModel(config=enc_dec_config)

In [None]:
# Special-token ids and vocabulary size the encoder-decoder wrapper needs for
# training and generation.
model.config.decoder_start_token_id = tokenizer.cls_token_id
# Without an end-of-sequence id, generate() has no token on which to stop early.
# NOTE(review): this presumes the tokenizer terminates each encoded headline
# with sep_token (BERT-style special tokens) -- confirm for this tokenizer.
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

In [None]:
def preprocess_function(batch):
    """Tokenize a batch of articles and headlines into model-ready fields.

    Args:
        batch: Mapping with an ``"article"`` and a ``"headline"`` entry, each
            holding the texts to encode.

    Returns:
        The same ``batch`` object with ``input_ids``, ``attention_mask``,
        ``decoder_input_ids``, ``decoder_attention_mask`` and ``labels``
        added. Both text fields are padded/truncated to 512 tokens.
    """
    encoded_articles = tokenizer(
        batch["article"], padding="max_length", truncation=True, max_length=512
    )
    # NOTE(review): headlines are padded to the same 512-token length as the
    # articles; confirm that such a long target length is intended.
    encoded_headlines = tokenizer(
        batch["headline"], padding="max_length", truncation=True, max_length=512
    )

    batch["input_ids"] = encoded_articles.input_ids
    batch["attention_mask"] = encoded_articles.attention_mask
    batch["decoder_input_ids"] = encoded_headlines.input_ids
    batch["decoder_attention_mask"] = encoded_headlines.attention_mask

    # Labels are the headline ids with every padding id swapped for -100
    # (presumably the loss ignore index -- compute_metrics maps -100 back to
    # the pad id before decoding).
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in headline_ids]
        for headline_ids in encoded_headlines.input_ids
    ]

    return batch

In [None]:
# Tokenize the three splits with identical settings. The raw "article" and
# "headline" text columns are removed from each result.
tokenized_train_dataset, tokenized_eval_dataset, tokenized_test_dataset = [
    dataset[split_name].map(
        preprocess_function, batched=True, remove_columns=["article", "headline"]
    )
    for split_name in ("train", "validation", "test")
]

In [None]:
# ROUGE scorer; compute_metrics calls its get_scores() method.
metric = Rouge()

# Batch collator handed to the trainer; it receives the model in addition to
# the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
def compute_metrics(pred):
    """Compute ROUGE F-scores, as percentages, for generated summaries.

    Args:
        pred: Evaluation output exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids in which ignored positions
            are marked with -100).

    Returns:
        dict mapping each metric name returned by the scorer to its averaged
        F-score multiplied by 100 and rounded to two decimals.
    """
    pred_ids = pred.predictions
    # Predictions can arrive as a tuple of outputs; the official Transformers
    # summarization example keeps the first entry in that case.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a valid token id and cannot be decoded, so it is replaced by
    # the pad id in both predictions and labels. np.where returns new arrays,
    # which leaves the arrays owned by `pred` unmodified.
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    labels_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_id)

    # decoding predictions and labels
    candidates = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    references = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    scores = metric.get_scores(candidates, references, avg=True, ignore_empty=True)
    return {key: round(value['f'] * 100, 2) for key, value in scores.items()}

In [None]:
# Seq2Seq Trainer Arguments
training_args = Seq2SeqTrainingArguments(
    # Output location and checkpoint retention
    output_dir='DarElectraFinal/Summarization',
    overwrite_output_dir=True,
    save_total_limit=3,
    # Optimisation schedule: 2 examples per device x 4 accumulation steps
    # gives 8 examples per device for each optimizer update.
    num_train_epochs=20,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=1000,
    weight_decay=0.1,
    # Log and evaluate once per epoch; evaluation uses generate().
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    predict_with_generate=True,
)

In [None]:
# Wire the model, training arguments, tokenized train/validation splits,
# collator and ROUGE metric function into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Start training with the trainer built in the previous cell.
trainer.train()