# 1. Import the libraries

In [None]:
!pip install transformers datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
!pip install evaluate
!pip install rouge_score
!pip install bert_score

In [None]:
import evaluate
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, BartForConditionalGeneration
import nltk
from nltk import sent_tokenize
import datasets
import numpy as np
import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from bert_score import BERTScorer


# 2. Import the dataset

In [None]:
dataset_csv = pd.read_csv("/kaggle/input/news-summary/news_summary.csv", encoding = "latin-1")

In [None]:
# Drop every row that has a missing value in any column, so only fully
# populated rows are used from here on.
dataset_csv = dataset_csv.dropna()

In [None]:
# Use the "ctext" column as the model input ("document") and the "text"
# column as the target ("summary").
document, summary = dataset_csv["ctext"], dataset_csv["text"]
dataset = Dataset.from_dict(dict(document=document, summary=summary))

In [None]:
# Report the longest document and the longest summary, measured in
# whitespace-separated words.
for heading, column in (
    ("Max number of token in text: ", "document"),
    ("Max number of token in summary: ", "summary"),
):
    print(heading)
    print(max(len(x.split()) for x in dataset[column]))

Max number of token in text: 
12202
Max number of token in summary: 
62


In [None]:
# Shuffle once and carve both splits out of the same permutation so they are
# disjoint by construction. The first 1000 shuffled rows form the validation
# set; every remaining row is used for training (the previous bounds skipped
# shuffled row 1000 and hard-coded the dataset size).
shuffled_dataset = dataset.shuffle(seed=42)
validation_dataset = shuffled_dataset.select(range(1000))
training_dataset = shuffled_dataset.select(range(1000, len(shuffled_dataset)))

# 3. Create a strong baseline: lead-3 (the first three sentences)


In [None]:
rouge_score = evaluate.load("rouge")

2024-04-10 14:28:04.966969: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-10 14:28:04.967085: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-10 14:28:05.241203: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def three_sentences_summary(text):
    """Return the first three sentences of ``text``, one sentence per line."""
    leading_sentences = sent_tokenize(text)[:3]
    return "\n".join(leading_sentences)



In [None]:
def evaluate_baseline(dataset, metric):
    """Score the three-sentence baseline against the reference summaries.

    Args:
        dataset: mapping-like object exposing "document" and "summary" columns.
        metric: object whose ``compute(predictions=..., references=...)``
            returns the scores.

    Returns:
        Whatever ``metric.compute`` returns for the baseline predictions.
    """
    predictions = list(map(three_sentences_summary, dataset["document"]))
    references = dataset["summary"]
    return metric.compute(predictions=predictions, references=references)

In [None]:
score = evaluate_baseline(validation_dataset,rouge_score)
print("The base line score:\n",score)

The base line score:
 {'rouge1': 0.4389851343605487, 'rouge2': 0.2400983572098281, 'rougeL': 0.31238678399992753, 'rougeLsum': 0.3376194701143866}


# 4. Tokenize the text

In [None]:
model_name = "sshleifer/distilbart-xsum-12-3"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
encoder_max_length = 1024 # demo
decoder_max_length = 128

In [None]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """Tokenize a batch of document/summary pairs into model inputs and labels.

    Args:
        batch: mapping with parallel "document" and "summary" sequences.
        tokenizer: callable tokenizer exposing ``pad_token_id``.
        max_source_length: length every document is padded/truncated to.
        max_target_length: length every summary is padded/truncated to.

    Returns:
        dict holding every field the tokenizer produced for the documents,
        plus "labels": the summary token ids with padding replaced by -100.
    """
    documents, summaries = batch["document"], batch["summary"]

    def _encode(texts, length):
        # Both sides are padded and truncated to a fixed length.
        return tokenizer(texts, padding="max_length", truncation=True, max_length=length)

    encoded_documents = _encode(documents, max_source_length)
    encoded_summaries = _encode(summaries, max_target_length)

    features = dict(encoded_documents.items())
    pad_id = tokenizer.pad_token_id
    # Padding positions are set to -100 so they are ignored in the loss.
    features["labels"] = [
        [token if token != pad_id else -100 for token in summary_ids]
        for summary_ids in encoded_summaries["input_ids"]
    ]
    return features


def _tokenize_split(split):
    """Tokenize one raw split in batches, dropping its original columns."""
    return split.map(
        lambda batch: batch_tokenize_preprocess(
            batch, tokenizer, encoder_max_length, decoder_max_length
        ),
        batched=True,
        remove_columns=split.column_names,
    )


train_data_token = _tokenize_split(training_dataset)
validation_data_token = _tokenize_split(validation_dataset)



  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
train_data_token.set_format("torch")
validation_data_token.set_format("torch")

# 5. Fine-tune the model

In [None]:
# Reuse `model_name` (the checkpoint the tokenizer was loaded from) instead of
# repeating the literal, so the model and tokenizer cannot drift apart.
model = BartForConditionalGeneration.from_pretrained(model_name)

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): La

In [None]:
# NOTE(review): re-creates the `np.object` alias, presumably as a
# compatibility shim for a dependency that still references it on a NumPy
# version where the alias no longer exists — confirm which library needs it
# and drop this line once that dependency is updated.
np.object = np.object_

In [None]:

nltk.download("punkt", quiet=True)



def postprocess_text(preds, labels):
    """Strip each text and put every sentence on its own line.

    Args:
        preds: decoded prediction strings.
        labels: decoded reference strings.

    Returns:
        ``(preds, labels)`` as two lists of newline-separated sentences.
    """

    def _one_sentence_per_line(text):
        # rougeLSum expects newline after each sentence
        return "\n".join(nltk.sent_tokenize(text.strip()))

    return (
        [_one_sentence_per_line(pred) for pred in preds],
        [_one_sentence_per_line(label) for label in labels],
    )


def compute_metrics(eval_preds):
    """Compute ROUGE F-measures (as percentages) and the mean generated length.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. When
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        dict with one entry per ROUGE key returned by the metric (scaled by
        100) plus "gen_len", every value rounded to 4 decimals.
    """
    # `datasets.load_metric` is deprecated; the `evaluate` library (already
    # used for the baseline in this notebook) is its replacement.
    metric = evaluate.load("rouge")

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and cannot be decoded, so swap it for the
    # pad token in both predictions and labels before decoding.
    preds = np.array(preds)
    preds = np.where(preds != -100, preds, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.array(labels)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Accept both result shapes: plain float F-measures, or aggregate score
    # objects exposing `.mid.fmeasure`.
    result = {
        key: (value.mid.fmeasure if hasattr(value, "mid") else value) * 100
        for key, value in result.items()
    }

    # Generated length = number of non-pad tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}


In [None]:
# Training configuration: five epochs, evaluated once per epoch.
training_args = Seq2SeqTrainingArguments(
    # Output locations and logging.
    output_dir="results",
    logging_dir="logs",
    logging_steps=50,
    save_total_limit=3,
    # Run both the training and the evaluation loop.
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    # Optimisation settings.
    num_train_epochs=5,  # demo
    per_device_train_batch_size=4,  # demo
    per_device_eval_batch_size=4,
    learning_rate=3e-05,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    # Generation settings used while evaluating.
    predict_with_generate=True,
    generation_num_beams=3,
    generation_max_length=500,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_data_token,
    eval_dataset=validation_data_token,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate model before training
trainer.evaluate()

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


{'eval_loss': 5.84979772567749,
 'eval_rouge1': 26.0381,
 'eval_rouge2': 9.0818,
 'eval_rougeL': 18.6491,
 'eval_rougeLsum': 21.6728,
 'eval_gen_len': 26.511,
 'eval_runtime': 262.0025,
 'eval_samples_per_second': 3.817,
 'eval_steps_per_second': 0.122}

In [None]:
# Start training the model
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.4812,3.320873,47.7226,26.3282,35.5063,42.5426,66.523
2,3.2269,3.18379,50.4271,27.7047,37.2638,45.1897,77.115
3,2.9504,3.140087,50.6362,28.2773,37.6,45.4901,74.992
4,2.8014,3.134555,51.2942,28.4684,38.0877,46.0386,74.299
5,2.71,3.142618,51.2701,28.3575,37.9263,45.8934,75.777


Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


TrainOutput(global_step=2125, training_loss=3.143825220444623, metrics={'train_runtime': 5945.0054, 'train_samples_per_second': 2.855, 'train_steps_per_second': 0.357, 'total_flos': 2.102041116672e+16, 'train_loss': 3.143825220444623, 'epoch': 5.0})

In [None]:
#After training:
trainer.evaluate()

{'eval_loss': 3.142617702484131,
 'eval_rouge1': 51.2701,
 'eval_rouge2': 28.3575,
 'eval_rougeL': 37.9263,
 'eval_rougeLsum': 45.8934,
 'eval_gen_len': 75.777,
 'eval_runtime': 401.1911,
 'eval_samples_per_second': 2.493,
 'eval_steps_per_second': 0.312,
 'epoch': 5.0}

# 6. Inference


In [None]:
scorer = BERTScorer(lang="en")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def inference_label(index_document, data_tokenized, data_text, metric_rouge, bert_score):
    """Summarize one tokenized example and print its ROUGE and BERTScore.

    Uses the notebook-level ``trainer`` and ``tokenizer``.

    Args:
        index_document: position of the example inside ``data_tokenized``.
        data_tokenized: indexable collection of tokenized examples; each one
            must provide a "labels" entry.
        data_text: unused; kept so existing calls keep working.
        metric_rouge: object whose ``compute(predictions=..., references=...)``
            returns the ROUGE scores.
        bert_score: scorer whose ``score(candidates, references)`` returns
            ``(precision, recall, f1)``.
    """
    input_token = data_tokenized[index_document]
    prediction = trainer.predict([input_token], max_length=500, num_beams=4).predictions

    pad_id = tokenizer.pad_token_id
    # Drop special tokens by id instead of slicing fixed positions: slicing
    # removes a real token whenever generation stops without a trailing EOS.
    generated_ids = np.array(prediction[0])
    generated_ids = np.where(generated_ids != -100, generated_ids, pad_id)
    predicted_summary = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # -100 label positions cannot be decoded; replace them with the pad token.
    labels = np.array(input_token["labels"])
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.decode(labels, skip_special_tokens=True)

    # BERTScorer.score returns precision first, then recall, then F1.
    precision, recall, f1 = bert_score.score([predicted_summary], [decoded_labels])

    print("The truth summary:\n", decoded_labels)
    print("The predicted summary:\n", predicted_summary)
    print("----------------------------------------------")
    print("The rouge score:", metric_rouge.compute(predictions=[predicted_summary], references=[decoded_labels]))
    print(f"Bert score recall f1 : {float(f1)}")

In [None]:
inference_label(6, validation_data_token, validation_dataset, rouge_score, scorer)

The truth summary:
 Tagging Prime Minister Narendra Modi, Canadian Prime Minister Justin Trudeau on Friday re-tweeted a post about Indian tennis player Rohan Bopanna and Canadian player Gabriela Dabrowski. The two had won the mixed doubles title at the French Open. Modi replied, "India & Canada is an ace partnership, advantageous to both nations, beneficial for the world. Game, set and match always."
The predicted summary:
 Canadian Prime Minister Justin Trudeau has proposed an India-Canada partnership after Indian tennis player Rohan Bopanna and Canada's Gabriela Dabrowski won the mixed doubles French Open title in June. "India & Canada is an ace partnership, advantageous to both nations, beneficial for the world," tweeted Trudeau.
----------------------------------------------
The rouge score: {'rouge1': 0.6542056074766355, 'rouge2': 0.5142857142857143, 'rougeL': 0.6168224299065421, 'rougeLsum': 0.6168224299065421}
Bert score recall f1 : 0.9357349276542664


In [None]:
#inference with new text:
def inference(text, tokenizer, model, num_beams=5, max_generation_length=200, max_source_length=1024):
    """Summarize a raw text with ``model`` and print the text and its summary.

    Args:
        text: the document to summarize.
        tokenizer: tokenizer matching ``model``.
        model: sequence-to-sequence model exposing ``generate`` and ``device``.
        num_beams: beam width passed to ``generate``.
        max_generation_length: ``max_length`` passed to ``generate``.
        max_source_length: the input is truncated to this many tokens.

    Returns:
        The generated summary as a string, without special tokens.
    """
    # A single text needs no padding. The encoded tensors are moved to the
    # model's device because the model may live on the GPU.
    text_tokenized = tokenizer(
        [text], truncation=True, max_length=max_source_length, return_tensors="pt"
    ).to(model.device)

    # Pass the attention mask explicitly rather than leaving it to generate().
    prediction_token = model.generate(
        input_ids=text_tokenized["input_ids"],
        attention_mask=text_tokenized["attention_mask"],
        max_length=max_generation_length,
        num_beams=num_beams,
    )

    # Drop special tokens by id instead of slicing fixed positions: slicing
    # removes a real token whenever generation stops without a trailing EOS.
    prediction_summary = tokenizer.decode(prediction_token[0], skip_special_tokens=True)

    print("The result:")

    print("- The original text:\n")
    print(text)
    print("-" * 50)
    print("- The summary text:\n")
    print(prediction_summary)

    return prediction_summary





In [None]:
original_text1 = """
Workplace well-being is on an "upward trajectory" throughout Asia-Pacific, with Vietnam (65.09) leading the region, according to a recent study by Asia's largest mental health care company Intellect.
Other leading countries include Thailand (65.01), the Philippines (64.44), Malaysia (64.22), Singapore (63.98), Japan (63.77), China (63.61), and Indonesia (63.55).

Meanwhile, others such as India, Australia and the Republic of Korea are below average though there is potential for further development, the report found.

The study uncovered the strengths and areas of improvement of workforces across Asia-Pacific on both individual and organizational levels.

While individuals are self-aware, able to build workplace relationships, and likely to encourage workforce participation, they may need support with stress management, emotional regulation, and mental well-being. For organizations in these markets, organizational support is on the rise though companies may need support in implementing employee well-being programmes, Intellect noted.

The three sectors with the highest organizational health scores are healthcare and pharmaceuticals (67.48), government and nonprofit (66.81), and education (65.76). All of those are above the overall benchmark score of 63.85.

Other above-average industries include technology and telecommunications, professional services, and manufacturing, according to Intellect.
"""

In [None]:
inference(original_text1, tokenizer, trainer.model)

The result:
- The original text:


Workplace well-being is on an "upward trajectory" throughout Asia-Pacific, with Vietnam (65.09) leading the region, according to a recent study by Asia's largest mental health care company Intellect.
Other leading countries include Thailand (65.01), the Philippines (64.44), Malaysia (64.22), Singapore (63.98), Japan (63.77), China (63.61), and Indonesia (63.55).

Meanwhile, others such as India, Australia and the Republic of Korea are below average though there is potential for further development, the report found.

The study uncovered the strengths and areas of improvement of workforces across Asia-Pacific on both individual and organizational levels.

While individuals are self-aware, able to build workplace relationships, and likely to encourage workforce participation, they may need support with stress management, emotional regulation, and mental well-being. For organizations in these markets, organizational support is on the rise though companie

In [None]:
original_text2 = """
In the 86th minute of the game on Monday night, when Al Nassr were trailing 0-2 to Al Hilal, Ronaldo and defender Ali Al-Bulaihi went for the ball for a throw-in. Ronaldo got the ball and intended to make a quick throw, but the Saudi Arabian player rushed forward to take it because he thought the throw-in belonged to Al Hilal. But Ronaldo, captain of Al Nassr gritted his teeth and put his elbow out to prevent Al-Bulaihi from stealing the ball. Just as Bulaihi rushed forward, Ronaldo's elbow hit the 35-year-old defender’s neck and he fell down in pain.

Referee Mohammed Al-Hoaish immediately gave Ronaldo a red card and sent him off. The Portuguese superstar was surprised, raised his fist towards Al-Hoaish and intended to punch him. At that time, the referee turned away and did not see Ronaldo's threatening action.

This is the 12th red card in Ronaldo's career and the first time he has been sent off since 2018. On his way out, Ronaldo pointed at the referee, clapped mockingly and gave him a thumbs up. The 39-year-old striker is set to face suspension for this series of actions.

Al-Bulaihi has repeatedly provoked Ronaldo in the Saudi derby games between Al Nassr and Al Hilal. One time he dived then stood up and Ronaldo ran after him and applauded. Al-Bulaihi also provoked Lionel Messi several times, when Saudi Arabia unexpectedly beat Argentina 2-1 in the opening match of the 2022 World Cup.

Ronaldo lacked restraint when he encountered many difficulties against the Al Hilal, the team that hold the world record of winning 33 consecutive games in all competitions. He missed a relevant chance in the 17th minute, with a shot that went over the bar. According to Sofascore statistics, Ronaldo only had 33 touches on the ball in 86 minutes of play, missed six shots, had six wrong passes and lost the ball nine times.

Al Hilal are the team with the richest tradition in Asia with 66 official titles, including title records in the AFC Champions League and Saudi Pro League. Coach Jorge Jesus's team are ranked 39th in the world, above Marseille, Villarreal or Wolverhampton, according to the power index of Opta. Meanwhile, Al Nassr are 89th, behind Nottingham Forest.

The defeat against Al Hilal left Al Nassr with an only chance to get a title this season, which is the King Cup, a tournament similar to England's FA Cup. Al Nassr reached the semifinals and will meet Al Ittihad on April 30. Meanwhile, Al Hilal reached the final of the Saudi Super Cup and will play Al Ittihad on April 11.
"""

In [None]:
inference(original_text2, tokenizer, trainer.model)

The result:
- The original text:


In the 86th minute of the game on Monday night, when Al Nassr were trailing 0-2 to Al Hilal, Ronaldo and defender Ali Al-Bulaihi went for the ball for a throw-in. Ronaldo got the ball and intended to make a quick throw, but the Saudi Arabian player rushed forward to take it because he thought the throw-in belonged to Al Hilal. But Ronaldo, captain of Al Nassr gritted his teeth and put his elbow out to prevent Al-Bulaihi from stealing the ball. Just as Bulaihi rushed forward, Ronaldo's elbow hit the 35-year-old defender’s neck and he fell down in pain.

Referee Mohammed Al-Hoaish immediately gave Ronaldo a red card and sent him off. The Portuguese superstar was surprised, raised his fist towards Al-Hoaish and intended to punch him. At that time, the referee turned away and did not see Ronaldo's threatening action.

This is the 12th red card in Ronaldo's career and the first time he has been sent off since 2018. On his way out, Ronaldo pointed at the re

In [None]:
original_text3 = """
Holding a high school diploma from Canada and a university degree in Chinese commerce language from China, Tat Dat has faced difficulties securing a fulfilling job upon his return to Vietnam.
After moving back to his hometown of northern Quang Ninh province in 2022, it took him four months and over 20 job applications to land a position in e-commerce, with a starting salary of VND8 million ($320) per month - less than he hoped for.

"I diligently monitored job platforms every hour, in search of an e-commerce trade position, hoping for a monthly salary of VND12 million to VND20 million," Dat said. "Upon eventually securing such a position, the employers informed me that what they could offer me would be VND8 million, a figure not open to negotiation."

Dat said he believed that the salary he was offered would never compensate for the VND15 billion invested in his education, yet he remained at the position for six months.

However, the unsatisfactory salary was merely one of several challenges he encountered in the Vietnamese job market, including difficulties adjusting to workplace culture, being tasked with duties not outlined in his contract, and frequently working overtime without additional compensation.

"In practice, although the company’s policy stated an eight-hour workday, the actual hours frequently extended to 10-12 hours a day, with no additional overtime compensation," Dat said. "This was a stark contrast to my previous experiences where an eight-hour workday strictly meant eight hours, nothing more."

Dat discovered that being bilingual was no longer a distinctive advantage, facing competition from peers fluent in three or four languages.

"Encountering peers fluent in English, Chinese, Korean, and French made me feel less competent," he admitted.
"""

In [None]:
inference(original_text3, tokenizer, trainer.model)

The result:
- The original text:


Holding a high school diploma from Canada and a university degree in Chinese commerce language from China, Tat Dat has faced difficulties securing a fulfilling job upon his return to Vietnam.
After moving back to his hometown of northern Quang Ninh province in 2022, it took him four months and over 20 job applications to land a position in e-commerce, with a starting salary of VND8 million ($320) per month - less than he hoped for.

"I diligently monitored job platforms every hour, in search of an e-commerce trade position, hoping for a monthly salary of VND12 million to VND20 million," Dat said. "Upon eventually securing such a position, the employers informed me that what they could offer me would be VND8 million, a figure not open to negotiation."

Dat said he believed that the salary he was offered would never compensate for the VND15 billion invested in his education, yet he remained at the position for six months.

However, the unsatisfactory sa

# 7. Push the model to the Hugging Face Hub

In [None]:
!pip install huggingface_hub --q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# !huggingface-cli login --token <Token_API_key>


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# The first argument of `Trainer.push_to_hub` is the commit message, not the
# repository id. The target repository is taken from `args.hub_model_id` and
# otherwise falls back to the name of `output_dir` ("results"), so set the
# intended repository explicitly before pushing.
trainer.args.hub_model_id = "LA1512/fine-tuned-distilbart-xsum-12-3-news-summary"
trainer.push_to_hub(commit_message="Fine-tune distilbart-xsum-12-3 on the news summary dataset")

Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/LA1512/results/commit/053244e12579db0d33ce5acb1ec1e7e122207d79', commit_message='LA1512/fine-tuned-distilbart-xsum-12-3-news-summary', commit_description='', oid='053244e12579db0d33ce5acb1ec1e7e122207d79', pr_url=None, pr_revision=None, pr_num=None)