<a href="https://colab.research.google.com/github/WilliamHackspeare/headline-generator-app/blob/main/headline-generation-training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install rouge_score
!pip install -q transformers[torch] datasets
!pip install -q bitsandbytes trl peft
!pip install flash-attn --no-build-isolation

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=f49c4fd1f4e7ea2dedf14a1118a82fc6ac76f7dfd20deedf77bc35a7ca97a638
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━

In [2]:
!pip install huggingface_hub
from huggingface_hub import notebook_login
notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import pandas as pd
from transformers import AutoTokenizer, BitsAndBytesConfig, TrainingArguments, AutoModelForCausalLM
import torch
from multiprocessing import cpu_count
from trl import SFTTrainer
from peft import LoraConfig
from torch.utils.data import DataLoader, Dataset
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu

In [4]:
model_id = "Helsinki-NLP/opus-mt-en-mul"

In [5]:
splits = {'train': 'final_headline_train_12000.csv', 'validation': 'final_headline_valid_1200.csv'}
train_df = pd.read_csv("hf://datasets/valurank/News_headlines/" + splits["train"])
val_df = pd.read_csv("hf://datasets/valurank/News_headlines/" + splits["validation"])

In [6]:
train_df.head()

Unnamed: 0,article,headline
0,The logo of cryptocurrency exchange Binance di...,Binance pauses bitcoin withdrawals due to a 's...
1,"Police officers, some in riot gear, guard a gr...",White nationalist group members face riot-plan...
2,A woman walks past a row of cash machines outs...,"Lloyds to give staff 1,000 pounds to ease cost..."
3,The Amazon logo is seen outside its JFK8 distr...,"Amazon offers to share data, boost rivals to d..."
4,An unexploded shell from a multiple rocket lau...,Both sides using heavier weapons in war in Ukr...


In [7]:
# Data cleaning function
def clean_article(article):
    # Remove metadata such as dates
    return article.split(') ', 1)[-1] if ') ' in article else article

In [8]:
# Apply cleaning to datasets
train_df['article'] = train_df['article'].apply(clean_article)
train_df.head()

Unnamed: 0,article,headline
0,The logo of cryptocurrency exchange Binance di...,Binance pauses bitcoin withdrawals due to a 's...
1,- Thirty-one members of the white nationalist ...,White nationalist group members face riot-plan...
2,- Britain's biggest domestic bank Lloyds (LLOY...,"Lloyds to give staff 1,000 pounds to ease cost..."
3,- Amazon (AMZN.O) has offered to share marketp...,"Amazon offers to share data, boost rivals to d..."
4,- Finnish President Sauli Niinisto said on Mon...,Both sides using heavier weapons in war in Ukr...


In [9]:
val_df['article'] = val_df['article'].apply(clean_article)

In [10]:
# Define a custom dataset class
class HeadlineDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length, target=True):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.target = target

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        input_text = row['article']
        inputs = self.tokenizer(input_text, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")

        if self.target:
            target_text = row['headline']
            targets = self.tokenizer(target_text, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")
            return {
                'input_ids': inputs['input_ids'].squeeze(),
                'attention_mask': inputs['attention_mask'].squeeze(),
                'labels': targets['input_ids'].squeeze(),
                'target_text': target_text
            }
        else:
            return {
                'input_ids': inputs['input_ids'].squeeze(),
                'attention_mask': inputs['attention_mask'].squeeze()
            }

In [11]:
# Initialize tokenizer and dataset
tokenizer = AutoTokenizer.from_pretrained(model_id)
max_length = 256

train_dataset = HeadlineDataset(train_df, tokenizer, max_length)
val_dataset = HeadlineDataset(val_df, tokenizer, max_length)

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/707k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.42M [00:00<?, ?B/s]



In [12]:
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype="torch.bfloat16",
)
device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None

model_kwargs = dict(
    attn_implementation="flash_attention_2",
    torch_dtype="auto",
    use_cache=False,
    device_map=device_map,
    quantization_config=quantization_config,
)

AttributeError: module 'torch' has no attribute 'torch.bfloat16'

In [None]:
def compute_metrics(eval_preds):
    """Custom metrics computation function for text generation tasks."""
    predictions, labels = eval_preds

    # Decode predictions
    decoded_preds = tokenizer.batch_decode(
        predictions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )

    # Decode labels (ignore padding tokens)
    decoded_labels = tokenizer.batch_decode(
        labels.where(labels != -100, tokenizer.pad_token_id),
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )

    # Initialize scorers
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge1_scores = []
    rougeL_scores = []
    bleu_scores = []

    # Calculate scores for each prediction-label pair
    for pred, label in zip(decoded_preds, decoded_labels):
        # ROUGE scores
        scores = rouge.score(label, pred)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)

        # BLEU score
        label_tokens = label.split()
        pred_tokens = pred.split()
        bleu = sentence_bleu([label_tokens], pred_tokens)
        bleu_scores.append(bleu)

    return {
        'rouge1': np.mean(rouge1_scores),
        'rougeL': np.mean(rougeL_scores),
        'bleu': np.mean(bleu_scores)
    }

In [None]:
from trl import SFTTrainer
from peft import LoraConfig
from transformers import TrainingArguments

output_dir = 'headline-generator-model'

training_args = TrainingArguments(
    fp16=True,
    do_eval=True,
    evaluation_strategy="epoch",
    gradient_accumulation_steps=128,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=2.0e-05,
    log_level="info",
    logging_steps=5,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    max_steps=-1,
    num_train_epochs=1,
    output_dir=output_dir,
    overwrite_output_dir=True,
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    save_strategy="no",
    save_total_limit=None,
    seed=42,
)

peft_config = LoraConfig(
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

trainer = SFTTrainer(
        model=model_id,
        model_init_kwargs=model_kwargs,
        args=training_args,
        compute_metrics=compute_metrics(),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        dataset_text_field="text",
        tokenizer=tokenizer,
        packing=True,
        peft_config=peft_config,
        max_seq_length=tokenizer.model_max_length,
    )

In [None]:
train_result = trainer.train()

In [None]:
metrics = train_result.metrics
max_train_samples = training_args.max_train_samples if training_args.max_train_samples is not None else len(train_dataset)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

In [None]:
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir, load_in_4bit=True, device_map="auto")

In [None]:
model.push_to_hub('headline-generator-opus-mt-en-mul-qlora-sft')
tokenizer.push_to_hub('headline-generator-opus-mt-en-mul-qlora-sft')