In [103]:
# ----------------------------------------------------------------------------------
#  Install required libraries for Transformers, Datasets, Evaluation, and Accelerate
# ----------------------------------------------------------------------------------

In [None]:
%pip install transformers datasets evaluate accelerate tensorflow
print("Libraries Installed Successfully!")

Libraries Installed Successfully!


In [102]:
# ---------------------------
#  Import Necessary Libraries
# ---------------------------

In [42]:
import torch
import transformers
import accelerate

print(torch.__version__)
print(transformers.__version__)
print(accelerate.__version__)


2.3.1+cu121
4.41.2
0.31.0


In [104]:
# ---------------------------------
#  Load ROUGE metric for evaluation
# ---------------------------------

In [63]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate

rouge = evaluate.load("rouge")

In [105]:
# -------------------------------------------------------
#  Load a curated summarization dataset from Hugging Face
# -------------------------------------------------------

In [106]:
dataset = load_dataset("sudhanshusinghaiml/curated-dataset-for-summarization")
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['Text', 'Summary'],
        num_rows: 71
    })
    validation: Dataset({
        features: ['Text', 'Summary'],
        num_rows: 20
    })
    test: Dataset({
        features: ['Text', 'Summary'],
        num_rows: 8
    })
})


In [107]:
# ----------------------------------------------------------
#  Explore sample text and summary from the training dataset
# ----------------------------------------------------------

In [65]:
dataset["train"]["Text"][0]

'Electrical supply company Crescent Electric (CESCO) study reveals that the state of Louisiana is the cheapest state in the US to mine Bitcoin.\n\nDigital currency mining requires a lot of electric power and the power rates differ in every state.\n\nBased on CESCOâ€™s latest study of the cost of cryptocurrency mining across the US, it is currently cheapest to mine Bitcoin in Louisiana -- electricity costs at 9.87 cents per watt puts the average cost of mining one Bitcoin at $3,224.\n\nThis is significantly cheaper than the current price of Bitcoin, which is currently trading at around $12,000 per coin, as of press time.\n\nWhere else in the US is it cheap to mine?\n\nIn their study, CESCO also estimated the cost of Bitcoin mining based on the wattage consumption of the three most popular mining rigs, namely, the AntMiner S9, the AntMiner S7, and the Avalon 6, as well as the average days each rig takes to mine a token. These figures were then multiplied by the average electricity rate i

In [66]:
dataset["train"]["Summary"][0]

'A new study has named Louisiana as the cheapest state in the US in which to mine bitcoin. Electrical supply company Crescent Electric based its calculation on the cost of electricity in each state, the power requirements of the equipment needed, and the average length of time taken to mine a token. This produced a figure of $3,224 per bitcoin for Louisiana, with the most expensive places being Hawaii, at $9,483, and Alaska at $7,059. All of these figures are notably less than the current trading price of bitcoin.\n'

In [108]:
# -----------------------------------------------------------------------------------
#   Define Model & Preprocessing function to tokenize input text and target summaries
# -----------------------------------------------------------------------------------

In [67]:
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

max_input_length = 512
max_target_length = 128

def preprocess_function(dataset):
  inputs = dataset["Text"]
  target = dataset["Summary"]

  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
  labels = tokenizer(target, max_length=max_target_length, truncation=True, padding="max_length")

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [109]:
# ---------------------------------------------------
#  Apply preprocessing function to the entire dataset
# ---------------------------------------------------

In [68]:
preprocessed_dataset = dataset.map(preprocess_function, batched=True)
print(preprocessed_dataset)

DatasetDict({
    train: Dataset({
        features: ['Text', 'Summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 71
    })
    validation: Dataset({
        features: ['Text', 'Summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 20
    })
    test: Dataset({
        features: ['Text', 'Summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 8
    })
})


In [69]:
print(preprocessed_dataset["train"]["input_ids"][10])

[86, 17421, 32, 63, 9, 6, 3411, 6, 3, 9, 690, 24, 728, 1213, 46, 1297, 7071, 2425, 12, 20407, 7, 6, 3, 9, 2833, 56, 1116, 617, 7567, 7, 1597, 57, 11509, 12, 165, 1035, 871, 5, 465, 6, 79, 751, 31, 17, 36, 3, 7, 75, 27315, 16, 21, 3730, 10, 86, 2083, 6, 8, 17421, 32, 63, 9, 636, 4457, 56, 17274, 662, 14761, 7, 12, 20395, 4404, 11, 794, 5977, 344, 8242, 21, 3, 9, 215, 5, 17421, 32, 63, 9, 2833, 12, 169, 7567, 7, 21, 23113, 13, 4845, 6, 1397, 10, 634, 282, 9, 107, 23, 4804, 51, 9617, 10, 634, 282, 9, 107, 23, 4804, 51, 9617, 4893, 1303, 17, 5, 509, 87, 115, 107, 40, 157, 172, 329, 632, 448, 29, 102, 3, 318, 282, 9, 107, 23, 4804, 51, 9617, 3, 22356, 518, 41, 1741, 22356, 518, 9, 7, 9, 107, 23, 61, 1762, 1914, 846, 37, 7567, 7, 33, 3, 8317, 1156, 11042, 7, 28, 3, 9, 2777, 18, 9842, 2614, 24, 3, 4610, 30, 16092, 11, 7724, 12, 15305, 190, 8, 2833, 5, 5066, 79, 661, 139, 6917, 6, 79, 31, 60, 2486, 26, 12, 26841, 135, 42, 1977, 6311, 120, 2249, 3, 31, 5420, 75, 1074, 140, 6, 754, 752, 140, 190

In [110]:
# ------------------------------------------------------------
#  Remove the original 'Text' column as it is no longer needed
# ------------------------------------------------------------

In [70]:
tokenized_dataset = preprocessed_dataset.remove_columns("Text")
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['Summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 71
    })
    validation: Dataset({
        features: ['Summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 20
    })
    test: Dataset({
        features: ['Summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 8
    })
})

In [112]:
# ------------------------------------------------------------
#  Load pre-trained T5 model for sequence-to-sequence learning
# ------------------------------------------------------------

In [71]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [113]:
# ---------------------------------------------
#  Define training arguments for Seq2SeqTrainer
# ---------------------------------------------

In [85]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./best_model",
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=12,
    logging_steps=10,
    predict_with_generate=True,
    report_to=None
)



In [86]:
tokenizer.pad_token_id

0

In [114]:
# ------------------------------------------------------------
#   Define function to compute ROUGE metrics during evaluation
# ------------------------------------------------------------

In [87]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # replace -100 in labels as tokenizer.pad_token_id
    labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    # return only ROUGE-L
    return {"rougeL": result["rougeL"]}

In [115]:
# ------------------------------------------------------------------------
#   Initialize Seq2SeqTrainer with model, tokenizer, datasets, and metrics
# ------------------------------------------------------------------------

In [88]:
trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["test"],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

In [117]:
# ---------------
# Train the model
# ---------------

In [89]:
trainer.train()

Step,Training Loss,Validation Loss,Rougel
50,2.5634,2.733088,0.0
100,2.4335,2.716584,0.0
150,2.1794,2.704226,0.0
200,2.1516,2.702156,0.0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=216, training_loss=2.3802934046144837, metrics={'train_runtime': 82.6977, 'train_samples_per_second': 10.303, 'train_steps_per_second': 2.612, 'total_flos': 115311214854144.0, 'train_loss': 2.3802934046144837, 'epoch': 12.0})

In [118]:
# ---------------------------------------------------
#  Test the model on a sample from the validation set
# ---------------------------------------------------

In [82]:
sample = tokenized_dataset["validation"][0]

input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0).to(model.device)
generated_ids = model.generate(input_ids, max_new_tokens=128, num_beams=4)

print("Generated Summary:")
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

print("\nReference Summary:")
print(sample["Summary"])  # remove the [0] here


Generated Summary:
JPMorgan Chase has created an internal tool to make sure that its ads don't end up next to unsavory content on YouTube. The company's proprietary algorithm plugs into YouTube's application programming interface (API) to select "safe" channels for its ads to advertise on. From more than 5 million channels the brand has wonowed the list down to 3,000 YouTube channels that its ads appear on.

Reference Summary:
US bank JPMorgan Chase has developed a solution that preventsÂ its ads being placed near unsuitable content on YouTube. A proprietary algorithm with 17 layers of filters that plugs into YouTube's programming systemÂ enablesÂ JPMorgan to whitelist or pre-approve the channels on which its ads are placed. Launched in October amid dissatisfaction with YouTube's own filters, the in-house software has reduced the number of pre-approved sites from five million to 3,000, with a 99.9% success rate. Aaron Smolick, executive director of paid-media analytics and optimisation

In [119]:
# --------------------------------------------------------------------------------------------------
# Save the best trained model and tokenizerxplore sample text and summary from the training dataset.
# --------------------------------------------------------------------------------------------------

In [90]:
trainer.save_model("./best_model/final")
tokenizer.save_pretrained("./best_model/final")
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [120]:
# ---------------------------------------------------
# Define a reusable function to summarize custom text
# ---------------------------------------------------

In [91]:
def summarize_text(text, max_input_length=512, max_output_length=150, num_beams=5):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_input_length
    ).to(device)

    generated_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_output_length,
        num_beams=num_beams,
        no_repeat_ngram_size=2,
        early_stopping=True
    )

    summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return summary


In [121]:
# -------------------------
# Custom text for inference
# -------------------------

In [99]:
custom_text = """The global shift toward renewable energy is accelerating at an unprecedented pace. Countries around the world are increasingly investing in solar, wind, and hydroelectric power to reduce dependence on fossil fuels and combat climate change. Solar energy has become particularly popular due to its declining costs and the ability to install panels on both residential and commercial properties. Wind farms, both onshore and offshore, are also expanding rapidly, providing a significant portion of electricity in countries like Germany, the United States, and China.
Despite these advances, several challenges remain. One of the biggest obstacles is energy storage. Renewable energy sources such as solar and wind are intermittent, producing electricity only when the sun shines or the wind blows. To address this, companies and governments are investing in battery technology and other storage solutions to ensure a steady and reliable supply of power. Additionally, updating and expanding the electrical grid to handle the influx of renewable energy is critical. Aging infrastructure in many countries poses a barrier to efficient energy distribution and requires substantial investment.
Public awareness and government policies also play a crucial role in driving the adoption of renewable energy. Incentives such as tax credits, subsidies, and feed-in tariffs encourage both individuals and businesses to switch to cleaner energy sources. International agreements like the Paris Climate Accord further emphasize the global commitment to reducing greenhouse gas emissions.
Technological innovation continues to improve the efficiency and affordability of renewable energy systems. Advances in photovoltaic cells, wind turbine design, and smart grid technology are making renewable energy more competitive with traditional fossil fuels. Experts predict that as renewable energy becomes more widespread, it will reshape energy markets, create new job opportunities, and reduce the environmental impact of power generation.
Overall, the transition to renewable energy represents a crucial step toward a sustainable future. While challenges exist, the combined efforts of governments, businesses, and individuals are driving meaningful progress. As innovation continues and investments grow, renewable energy is poised to play a central role in meeting the worldâ€™s energy needs in the coming decades."""

generated_summary = summarize_text(custom_text)
print("Generated Summary:\n", generated_summary)


Generated Summary:
 Global shift toward renewable energy is accelerating at an unprecedented pace. Countries around the world are investing in solar, wind, and hydroelectric power to reduce dependence on fossil fuels and combat climate change. Renewable energy sources such as solar and wind are intermittent, producing electricity only when the sun shines or the wind blows.


In [122]:
# ------------------------------
# Compare with reference summary
# ------------------------------

In [98]:
Reference_Summary = """The global shift toward renewable energy is accelerating at an unprecedented pace. Countries around the world are increasingly investing in solar, wind, and hydroelectric power to reduce dependence on fossil fuels and combat climate change. Solar energy has become particularly popular due to its declining costs and the ability to install panels on both residential and commercial properties. Wind farms, both onshore and offshore, are also expanding rapidly, providing a significant portion of electricity in countries like Germany, the United States, and China.
Despite these advances, several challenges remain. One of the biggest obstacles is energy storage. Renewable energy sources such as solar and wind are intermittent, producing electricity only when the sun shines or the wind blows. To address this, companies and governments are investing in battery technology and other storage solutions to ensure a steady and reliable supply of power. Additionally, updating and expanding the electrical grid to handle the influx of renewable energy is critical. Aging infrastructure in many countries poses a barrier to efficient energy distribution and requires substantial investment.
Public awareness and government policies also play a crucial role in driving the adoption of renewable energy. Incentives such as tax credits, subsidies, and feed-in tariffs encourage both individuals and businesses to switch to cleaner energy sources. International agreements like the Paris Climate Accord further emphasize the global commitment to reducing greenhouse gas emissions.
Technological innovation continues to improve the efficiency and affordability of renewable energy systems. Advances in photovoltaic cells, wind turbine design, and smart grid technology are making renewable energy more competitive with traditional fossil fuels. Experts predict that as renewable energy becomes more widespread, it will reshape energy markets, create new job opportunities, and reduce the environmental impact of power generation.
Overall, the transition to renewable energy represents a crucial step toward a sustainable future. While challenges exist, the combined efforts of governments, businesses, and individuals are driving meaningful progress. As innovation continues and investments grow, renewable energy is poised to play a central role in meeting the worldâ€™s energy needs in the coming decades"""

print("Reference Summary:",Reference_Summary)

Reference Summary: The global shift toward renewable energy is accelerating at an unprecedented pace. Countries around the world are increasingly investing in solar, wind, and hydroelectric power to reduce dependence on fossil fuels and combat climate change. Solar energy has become particularly popular due to its declining costs and the ability to install panels on both residential and commercial properties. Wind farms, both onshore and offshore, are also expanding rapidly, providing a significant portion of electricity in countries like Germany, the United States, and China.
Despite these advances, several challenges remain. One of the biggest obstacles is energy storage. Renewable energy sources such as solar and wind are intermittent, producing electricity only when the sun shines or the wind blows. To address this, companies and governments are investing in battery technology and other storage solutions to ensure a steady and reliable supply of power. Additionally, updating and ex