In [17]:
!pip install opendatasets transformers datasets peft accelerate bitsandbytes --upgrade --quiet

In [2]:
import opendatasets as od
od.download("https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: saiswe
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail
Downloading newspaper-text-summarization-cnn-dailymail.zip to ./newspaper-text-summarization-cnn-dailymail


100%|██████████| 503M/503M [00:13<00:00, 39.2MB/s]





In [3]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    GenerationConfig , TrainingArguments, Trainer
)
from peft import LoraConfig, get_peft_model
import pandas as pd
from datasets import Dataset
import re

In [4]:
quant_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-1b1")
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-1b1", quantization_config = quant_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/2.13G [00:00<?, ?B/s]

In [5]:
train_df = pd.read_csv("/content/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv")[["article", "highlights"]]
train_df = train_df.sample(10000)

In [6]:
def filter_text(text):
  text = text.lower()
  text = re.sub('[^A-Za-z0-9]+', ' ', text)
  return text

train_df["article"] = train_df["article"].apply(filter_text)
train_df["highlights"] = train_df["highlights"].apply(filter_text)

In [7]:
train_df.head()

Unnamed: 0,article,highlights
265168,two days after being sacked as england s one d...,england cricket team mates play each other in ...
55397,unmanned spy drones could patrol britain s sho...,european commission to spend 260 million on eu...
50981,los angeles california cnn a spokesman for the...,estate spokesman says belt deal was not approv...
281195,cnn a snowstorm that could last up to 18 hour...,winter storm warnings issued from new england ...
82658,by alex ward and mario ledwith published 05 54...,elderly couple refused to have their home demo...


In [8]:
train_df["final_statement"] = ""
for indx, row in train_df.iterrows():
  row["final_statement"] = "Summarize the following article.\n\n" +str(row["article"]) + "\Summary:\n" + str(row["highlights"])

train_df = train_df[["final_statement"]]

In [9]:
print(train_df["final_statement"].iloc[9])




In [10]:
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(example):
    example["input_ids"] = tokenizer(example["final_statement"], padding="max_length", max_length = 250, truncation=True, return_tensors="pt").input_ids
    example["labels"] = tokenizer(example["final_statement"], padding="max_length", max_length = 250, truncation=True, return_tensors="pt").input_ids
    return example

# Convert your DataFrame into a Dataset object
train_data = Dataset.from_pandas(train_df)

# # Apply the tokenize function
train_tokenized_datasets = train_data.map(tokenize_function, batched=True, remove_columns=train_data.column_names)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [11]:
print(tokenizer.decode(train_tokenized_datasets[5]["input_ids"], skip_special_tokens = True))




In [12]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
)

peft_model = get_peft_model(model, peft_params)
peft_model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 1,067,673,600 || trainable%: 0.2210


In [13]:
training_args = TrainingArguments(
output_dir = './model_checkpoints',
save_total_limit = 1,
auto_find_batch_size = True,
learning_rate = 1e-3,
num_train_epochs = 1,
report_to="none"
)

trainer = Trainer(
model = peft_model,
args = training_args,
train_dataset = train_tokenized_datasets,
)

trainer.train()

trainer.model.save_pretrained('./final_model')
tokenizer.save_pretrained('./final_model')



Step,Training Loss
500,0.0137
1000,0.0


('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/tokenizer.json')

In [18]:
news_article = """
All but one of the 100 cities with the world’s worst air pollution last year were in Asia, according to a new report, with the climate crisis playing a pivotal role in bad air quality that is risking the health of billions of people worldwide.

The vast majority of these cities — 83 — were in India and all exceeded the World Health Organization’s air quality guidelines by more than 10 times, according to the report by IQAir, which tracks air quality worldwide.

The study looked specifically at fine particulate matter, or PM2.5, which is the tiniest pollutant but also the most dangerous. Only 9% of more than 7,800 cities analyzed globally recorded air quality that met WHO’s standard, which says average annual levels of PM2.5 should not exceed 5 micrograms per cubic meter.

“We see that in every part of our lives that air pollution has an impact,” said IQAir Global CEO Frank Hammes. “And it typically, in some of the most polluted countries, is likely shaving off anywhere between three to six years of people’s lives. And then before that will lead to many years of suffering that are entirely preventable if there’s better air quality.”

"""

filtered_news_article = "Summarize the following article.\n\n" +filter_text(news_article) + "\nSummary:\n"
tokenizerd_news_article = tokenizer(filtered_news_article, max_length = 250, return_tensors="pt")
output = model.generate(tokenizerd_news_article.input_ids, max_new_tokens = 100)
summary = tokenizer.decode(output[0], skip_special_tokens = True)


In [21]:
print(summary.split("\nSummary:\n")[1])


