In [None]:
# Mount Google Drive inside the Colab VM so the notebook can read and write
# files under the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import os
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [None]:
# Define paths
# Folder that load_data() scans for "*_article.txt" / "*_summary.txt" file pairs.
data_path = "/content/drive/MyDrive/processed_data"  # Update with your folder path

In [None]:
# Prepare dataset
def load_data(folder_path):
    """Read paired article/summary text files from ``folder_path``.

    A pair is ``<name>_article.txt`` plus ``<name>_summary.txt`` in the same
    folder. Article files without a matching summary file are skipped, and
    files with any other name are ignored.

    Files are visited in sorted filename order. ``os.listdir`` returns entries
    in arbitrary order, which would otherwise make the order of the returned
    examples differ from run to run.

    Args:
        folder_path: Directory containing the ``*_article.txt`` and
            ``*_summary.txt`` files.

    Returns:
        A tuple ``(articles, summaries)`` of two equal-length lists of strings
        where ``summaries[i]`` is the summary belonging to ``articles[i]``.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
    """
    article_suffix = "_article.txt"
    summary_suffix = "_summary.txt"
    articles = []
    summaries = []

    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(article_suffix):
            continue

        article_path = os.path.join(folder_path, file_name)
        # Swap only the trailing suffix; str.replace would also rewrite an
        # earlier "_article.txt" occurring in the middle of the file name.
        summary_name = file_name[:-len(article_suffix)] + summary_suffix
        summary_path = os.path.join(folder_path, summary_name)

        if not os.path.exists(summary_path):
            continue

        # Read both files before appending so the two lists stay aligned.
        with open(article_path, 'r', encoding='utf-8') as art_file:
            article_text = art_file.read()
        with open(summary_path, 'r', encoding='utf-8') as sum_file:
            summary_text = sum_file.read()

        articles.append(article_text)
        summaries.append(summary_text)

    return articles, summaries

articles, summaries = load_data(data_path)

# Fail early with a clear message instead of an obscure error from the split
# below when the folder holds no usable article/summary pairs.
if not articles:
    raise ValueError(f"No article/summary pairs found in {data_path!r}")

# Create Hugging Face dataset
data_dict = {"article": articles, "summary": summaries}
dataset = Dataset.from_dict(data_dict)

# Split dataset into train (90%) and validation (10%).
# A fixed seed makes the shuffle behind the split reproducible, so validation
# numbers from different runs are measured on the same kind of split.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
val_dataset = dataset['test']


In [None]:
# Load pre-trained model and tokenizer.
# NOTE(review): T5Tokenizer / T5ForConditionalGeneration below are T5-specific
# classes, so changing only model_name to a BART checkpoint is presumably not
# enough — the two classes would need to change as well (confirm against the
# transformers documentation before switching).
model_name = "t5-small"  # T5 checkpoint; see the note above before using "facebook/bart-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Tokenization
def preprocess_data(examples):
    """Tokenize a batch of article/summary pairs for seq2seq fine-tuning.

    Uses the module-level ``tokenizer``.

    Args:
        examples: A batch with ``"article"`` and ``"summary"`` entries, each a
            list of strings (the shape ``Dataset.map(..., batched=True)``
            passes in).

    Returns:
        The tokenizer output for the ``"summarize: "``-prefixed articles
        (truncated/padded to 512 tokens) with an added ``"labels"`` entry: one
        list of 150 token ids per summary, in which every padding position is
        replaced by ``-100``.
    """
    inputs = [f"summarize: {text}" for text in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    # Tokenize targets. `text_target=` replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["summary"], max_length=150, truncation=True, padding='max_length')

    # Replace padding ids with -100 so the loss ignores padded positions.
    # Leaving the pad id in the labels makes the model get "credit" for
    # predicting padding, which deflates the reported loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize both splits with the same batched preprocessing (train first, then validation).
tokenized_train, tokenized_val = (
    split.map(preprocess_data, batched=True)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/909 [00:00<?, ? examples/s]



Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [None]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers
    # releases, and this notebook installs transformers unpinned.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
)



In [None]:
# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # `processing_class` is the replacement for the deprecated `tokenizer`
    # argument, which triggers a FutureWarning on recent transformers releases.
    processing_class=tokenizer,
)

# Fine-tune the model
trainer.train()


  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.3605,0.257376
2,0.2721,0.180178
3,0.2108,0.175512


TrainOutput(global_step=684, training_loss=1.2385864862573077, metrics={'train_runtime': 176.2068, 'train_samples_per_second': 15.476, 'train_steps_per_second': 3.882, 'total_flos': 369077092614144.0, 'train_loss': 1.2385864862573077, 'epoch': 3.0})

In [None]:
# Persist the fine-tuned model and its tokenizer files into one Drive folder.
model_dir = "/content/drive/My Drive/summarization_model"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)


('/content/drive/My Drive/summarization_model/tokenizer_config.json',
 '/content/drive/My Drive/summarization_model/special_tokens_map.json',
 '/content/drive/My Drive/summarization_model/spiece.model',
 '/content/drive/My Drive/summarization_model/added_tokens.json')

In [None]:
# Sanity check: summarize the first loaded article with the fine-tuned model.
test_text = f"summarize: {articles[0]}"
print(test_text, end="\n\n")

# Tokenize (truncated to 512 tokens) and place the ids on whichever device
# the model currently lives on.
input_ids = tokenizer.encode(
    test_text, return_tensors="pt", max_length=512, truncation=True
).to(model.device)

# Beam-search generation settings for the summary.
generation_kwargs = dict(max_length=150, min_length=30, length_penalty=2.0, num_beams=4)
summary_ids = model.generate(input_ids, **generation_kwargs)
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Generated Summary:", generated_summary)


summarize: <first> the nhs in wales has been underfunded compared with the health service in england for the last six years, the welsh assembly government has admitted. <body> a five-year strategic plan for the health service's future says growth in cash funding has been one-third lower than that for nhs england. the assembly government said the gap was a legacy of underfunding of wales as a whole. the lib dems called it "shocking". the tories blamed ministerial management. the strategic plan warns that nhs organisations in wales "enter this more challenging economic period from a less financially secure platform" than england. by hywel griffithbbc wales health correspondent few cross-border comparisons between england and wales paint the welsh nhs in a positive light. waiting lists in england are shorter, ambulance response times are faster. but for the welsh assembly government to openly admit they've not been as generous with their nhs funding is extraordinary. it is perhaps at leas

In [None]:
import re

# Sanity check: summarize an inline sample paragraph with the fine-tuned model.
text = "Artificial Intelligence is the intelligence possessed by the machines under which they can perform various functions with human help. With the help of A.I, machines will be able to learn, solve problems, plan things, think, etc. Artificial Intelligence, for example, is the simulation of human intelligence by machines. In the field of technology, Artificial Intelligence is evolving rapidly day by day and it is believed that in the near future, artificial intelligence is going to change human life very drastically and will most probably end all the crises of the world by sorting out the major problems. Our life in this modern age depends largely on computers. It is almost impossible to think about life without computers. We need computers in everything that we use in our daily lives. So it becomes very important to make computers intelligent so that our lives become easy. Artificial Intelligence is the theory and development of computers, which imitates the human intelligence and senses, such as visual perception, speech recognition, decision-making, and translation between languages. Artificial Intelligence has brought a revolution in the world of technology. "

# Normalize the text: collapse whitespace runs to single spaces, lowercase,
# then trim leading/trailing whitespace.
text = re.sub(r"\s+", " ", text).lower().strip()

test_text = f"summarize: {text}"

# Tokenize (truncated to 512 tokens) and place the ids on whichever device
# the model currently lives on.
input_ids = tokenizer.encode(
    test_text, return_tensors="pt", max_length=512, truncation=True
).to(model.device)

# Beam-search generation settings for the summary.
generation_kwargs = dict(max_length=150, min_length=30, length_penalty=2.0, num_beams=4)
summary_ids = model.generate(input_ids, **generation_kwargs)
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Generated Summary:", generated_summary)


Generated Summary: artificial intelligence is the intelligence possessed by the machines under which they can perform various functions with human help. in the field of technology, artificial intelligence is evolving rapidly day by day and will most probably end all the crises of the world by sorting out the major problems.


In [None]:
!pip install rouge-score