In [1]:
!pip install peft bitsandbytes transformers datasets polars --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import polars as pl
import pandas as pd

# Load and select relevant columns
df_polars = pl.read_csv("/kaggle/input/news-summarization/data.csv", ignore_errors=True).lazy()
df_polars = df_polars.select(["Content", "Summary"]).limit(40000).collect()
df = df_polars.to_pandas()

df.drop_duplicates(subset=['Content', 'Summary'], inplace=True)
df.dropna(subset=['Content', 'Summary'], inplace=True)
df = df[df['Content'].str.len() > 150]
df = df[df['Summary'].str.len() > 75]
df.reset_index(drop=True, inplace=True)

df['input_text'] = 'Summarize the following news article: ' + df['Content']

In [3]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

model_name = 'google/flan-t5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

base_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16
)
base_model = prepare_model_for_kbit_training(base_model)
base_model.config.use_cache = False

lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q", "v", "k", "o", "wi", "wo"]
)
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

2025-07-07 08:01:12.491194: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751875272.699895      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751875272.752973      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 4,620,288 || all params: 252,198,144 || trainable%: 1.8320


In [5]:
def preprocess(examples):
    inputs = tokenizer(
        examples['input_text'],
        max_length=600,
        truncation=True,
        padding='max_length'
    )
    targets = tokenizer(
        examples['Summary'],
        max_length=300,
        truncation=True,
        padding='max_length'
    )
    inputs['labels'] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in targets['input_ids']
    ]
    return inputs

processed_dataset = dataset.map(
    preprocess,
    batched=True,
    batch_size=32,
    remove_columns=dataset.column_names,
    load_from_cache_file=False,
    num_proc=2
)
processed_dataset.set_format(
    type="torch",
    device="cuda" if torch.cuda.is_available() else "cpu",
    columns=["input_ids", "attention_mask", "labels"]
)

Map (num_proc=2):   0%|          | 0/38715 [00:00<?, ? examples/s]

In [6]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-base-peft",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-4, 
    warmup_steps=500,
    logging_steps=100,
    logging_dir="./logs",
    save_steps=1000,
    save_total_limit=2,
    eval_strategy="no",
    report_to="none",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    gradient_checkpointing=False,
    fp16=False,  
    torch_compile=False,
    dataloader_pin_memory=True,
    dataloader_num_workers=4,
    logging_strategy="steps",
    remove_unused_columns=False,
    label_names=["labels"],
    max_grad_norm=1.0  
)

In [None]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    tokenizer=tokenizer
)

import torch.multiprocessing as mp
mp.set_start_method('spawn', force=True)

import gc
torch.cuda.empty_cache()
gc.collect()

trainer.train()

  trainer = Seq2SeqTrainer(
2025-07-07 08:03:27.831147: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751875407.852526     156 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751875407.859184     156 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-07 08:03:35.490640: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751875415.512672     167 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751875415.5

Step,Training Loss


In [None]:
merged_model = model.merge_and_unload()

In [None]:
merged_model.save_pretrained("feedflash-flan-t5-base")
tokenizer.save_pretrained("feedflash-flan-t5-base")

In [None]:
from huggingface_hub import login
login()

In [None]:
model.push_to_hub("Arihant-Bhandari/feedflash-flan-t5")
tokenizer.push_to_hub("Arihant-Bhandari/feedflash-flan-t5")