In [1]:
# Colab-only: opens the browser upload widget. The CSV read later in the
# notebook (data_summarization.csv) is expected to be uploaded here.
from google.colab import files
files.upload()

Saving data_summarization.csv to data_summarization.csv




In [110]:
!pip install -q transformers datasets evaluate accelerate -U

In [111]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    LEDForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import evaluate
import torch
import os
import random

# Configure the PyTorch CUDA caching allocator. This is set before any tensor is
# moved to the GPU in this notebook.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# --- Model, paths and run-wide constants -------------------------------------
MODEL_CHECKPOINT = "allenai/led-base-16384"
OUTPUT_DIR = "./led_finetuned_summarizer"
LOGGING_DIR = "./logs"
CSV_PATH = "data_summarization.csv"
SEED = 42

# Token budgets used for truncation/padding of articles and summaries.
MAX_INPUT_LENGTH = 4096
MAX_TARGET_LENGTH = 500

# --- Reproducibility ----------------------------------------------------------
# Seed every RNG this notebook can touch: Python's `random`, NumPy's global
# generator (previously left unseeded) and PyTorch.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Exported for any subprocess started later; the hash seed of the already
# running interpreter is fixed at start-up and is not changed by this line.
os.environ["PYTHONHASHSEED"] = str(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
print("Model:", MODEL_CHECKPOINT)

Using device: cuda
Model: allenai/led-base-16384


In [112]:
# Read the uploaded CSV and keep only the (article, summary) text pairs,
# dropping rows where either field is missing.
raw_df = pd.read_csv(CSV_PATH)
pairs_df = (
    raw_df
    .rename(columns={'lesson_text': 'article', 'summary': 'summary'})
    .loc[:, ['article', 'summary']]
    .dropna()
)

# Discard rows containing the Unicode replacement character in either column.
has_replacement_char = (
    pairs_df["article"].str.contains("�")
    | pairs_df["summary"].str.contains("�")
)
df = pairs_df.loc[~has_replacement_char].reset_index(drop=True)

print("Cleaned dataset size:", len(df))

Cleaned dataset size: 1986


In [113]:
# Convert the cleaned frame to a Hugging Face dataset and carve out the splits:
# first a fixed 200-row test set, then 10% of the remainder for validation.
full_dataset = Dataset.from_pandas(df)

train_val_test_split = full_dataset.train_test_split(test_size=200, seed=SEED)
train_val_split = train_val_test_split["train"].train_test_split(
    test_size=0.1,
    seed=SEED,
)

dataset = DatasetDict(
    train=train_val_split["train"],
    validation=train_val_split["test"],
    test=train_val_test_split["test"],
)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['article', 'summary'],
        num_rows: 1607
    })
    validation: Dataset({
        features: ['article', 'summary'],
        num_rows: 179
    })
    test: Dataset({
        features: ['article', 'summary'],
        num_rows: 200
    })
})


In [114]:
# Load the tokenizer that belongs to the checkpoint named in MODEL_CHECKPOINT.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [115]:
def preprocess_function(examples):
    """Tokenize a batch of article/summary pairs for seq2seq fine-tuning.

    Args:
        examples: Batch mapping with an "article" list (model inputs) and a
            "summary" list (targets), as passed by ``Dataset.map(batched=True)``.

    Returns:
        The tokenized article encoding with an added "labels" entry. Articles
        are truncated/padded to MAX_INPUT_LENGTH and summaries to
        MAX_TARGET_LENGTH; padded label positions are replaced by -100, the
        same placeholder the data collator uses for label padding.
    """
    model_inputs = tokenizer(
        examples["article"],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length",
    )

    # `text_target=` replaces the deprecated `tokenizer.as_target_tokenizer()`
    # context manager for encoding the target side.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length",
    )

    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

In [116]:
# Tokenize every split in batches. The original text columns are removed so
# only the tokenizer outputs and labels remain.
raw_text_columns = dataset["train"].column_names
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_text_columns,
)

def add_global_attention(batch):
    """Add a "global_attention_mask" column to a batch.

    For each sequence in ``batch["input_ids"]`` the mask is 1 at the first
    position and 0 for every remaining position. The batch is modified in
    place and also returned.
    """
    masks = []
    for input_ids in batch["input_ids"]:
        mask = [1]
        mask.extend([0] * (len(input_ids) - 1))
        masks.append(mask)
    batch["global_attention_mask"] = masks
    return batch

# Attach the global-attention column to every split, then show the final schema.
tokenized_datasets = tokenized_datasets.map(
    function=add_global_attention,
    batched=True,
)

print(tokenized_datasets)

Map:   0%|          | 0/1607 [00:00<?, ? examples/s]



Map:   0%|          | 0/179 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/1607 [00:00<?, ? examples/s]

Map:   0%|          | 0/179 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'global_attention_mask'],
        num_rows: 1607
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'global_attention_mask'],
        num_rows: 179
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'global_attention_mask'],
        num_rows: 200
    })
})


In [117]:
from transformers import TrainerCallback

class LossPrinterCallback(TrainerCallback):
    """Trainer callback that prints the training loss whenever it is logged."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print ``Step <n> - loss: <value>`` for log records that carry a loss.

        Args:
            args, control, **kwargs: Accepted for the callback signature; unused.
            state: Trainer state; only ``state.global_step`` is read.
            logs: Mapping of logged values, or None. Records without a "loss"
                key (and a missing/empty ``logs``) are ignored.
        """
        # `logs` defaults to None, so guard before the membership test.
        if logs and "loss" in logs:
            print(f"Step {state.global_step} - loss: {logs['loss']:.4f}")

In [118]:
# Load the pretrained LED weights, then place the model on the selected device.
model = LEDForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)
model = model.to(device)

pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [119]:
# Seq2seq batch collator. Label padding uses -100, matching the placeholder
# written into padded label positions during preprocessing.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
)

In [124]:
rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Compute ROUGE scores (as percentages) for generated summaries.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        Dict of the ROUGE metrics returned by ``rouge.compute``, each multiplied
        by 100 and rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # Unwrap BEFORE any array operation; previously np.clip ran on the tuple.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    vocab_size = tokenizer.vocab_size

    def _to_decodable(token_ids):
        """Replace ids the tokenizer cannot decode (e.g. -100) with the pad id."""
        token_ids = np.asarray(token_ids)
        undecodable = (token_ids < 0) | (token_ids >= vocab_size)
        return np.where(undecodable, pad_id, token_ids)

    # The earlier version clipped -100 to 0 first, which turned the explicit
    # "-100 -> pad" replacement into dead code. Padding is now mapped to the
    # pad token directly for both predictions and labels.
    decoded_preds = tokenizer.batch_decode(
        _to_decodable(predictions), skip_special_tokens=True
    )
    decoded_labels = tokenizer.batch_decode(
        _to_decodable(labels), skip_special_tokens=True
    )

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {k: round(v * 100, 4) for k, v in result.items()}

In [125]:
# Fine-tuning configuration. Evaluation and checkpointing both happen once per
# epoch, and the best checkpoint is reloaded when training finishes.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    seed=SEED,  # use the notebook-wide seed instead of relying on the library default
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # 8 examples per optimizer step per device
    gradient_checkpointing=True,
    per_device_eval_batch_size=4,
    # NOTE(review): with 1607 training rows, batch size 1 and 8 accumulation
    # steps, 3 epochs give roughly 600 optimizer steps, so a 500-step warmup
    # spans most of training. Consider a smaller value or `warmup_ratio`.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=LOGGING_DIR,
    logging_strategy="steps",
    logging_steps=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=False,
    report_to="none",
    save_total_limit=3,
    predict_with_generate=True,
    # Match the label/inference budget; the previous cap of 256 cut evaluation
    # summaries short relative to references of up to MAX_TARGET_LENGTH tokens.
    generation_max_length=MAX_TARGET_LENGTH,
)

In [126]:
# Assemble the trainer: model, arguments, tokenized train/validation splits,
# ROUGE metric function and the loss-printing callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `processing_class` supersedes the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[LossPrinterCallback()],
)

  trainer = Seq2SeqTrainer(


In [None]:
# Run fine-tuning, then write the model and the tokenizer to OUTPUT_DIR so the
# directory can be reloaded on its own.
print("Starting training...")
train_result = trainer.train()

for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(OUTPUT_DIR)

Starting training...


Epoch,Training Loss,Validation Loss


Step 5 - loss: 2.5554
Step 10 - loss: 2.5312
Step 15 - loss: 2.4961
Step 20 - loss: 2.6132
Step 25 - loss: 2.5577
Step 30 - loss: 2.4182
Step 35 - loss: 2.4906
Step 40 - loss: 2.5122
Step 45 - loss: 2.5270
Step 50 - loss: 2.4789
Step 55 - loss: 2.5201
Step 60 - loss: 2.5172
Step 65 - loss: 2.4669
Step 70 - loss: 2.4449
Step 75 - loss: 2.5937
Step 80 - loss: 2.4105
Step 85 - loss: 2.4260
Step 90 - loss: 2.4899
Step 95 - loss: 2.2675
Step 100 - loss: 2.3024
Step 105 - loss: 2.3660
Step 110 - loss: 2.3055
Step 115 - loss: 2.3435
Step 120 - loss: 2.3485
Step 125 - loss: 2.3326
Step 130 - loss: 2.2623
Step 135 - loss: 2.2501
Step 140 - loss: 2.3185
Step 145 - loss: 2.2412
Step 150 - loss: 2.4577
Step 155 - loss: 2.3090
Step 160 - loss: 2.2669
Step 165 - loss: 2.1855
Step 170 - loss: 2.2691
Step 175 - loss: 2.3281
Step 180 - loss: 2.3790
Step 185 - loss: 2.3643
Step 190 - loss: 2.3184
Step 195 - loss: 2.3013
Step 200 - loss: 2.4265


In [None]:
# Score the held-out test split with the trained model and print the metrics.
test_split = tokenized_datasets["test"]
test_results = trainer.evaluate(eval_dataset=test_split)
print(test_results)

In [None]:
# Reload the fine-tuned weights from OUTPUT_DIR for inference; the tokenizer
# object already in memory is reused.
model_inf = LEDForConditionalGeneration.from_pretrained(OUTPUT_DIR)
model_inf = model_inf.to(device)
tokenizer_inf = tokenizer

In [None]:
def generate_summary(text, num_beams=4, max_length=None):
    """Generate a summary for ``text`` with the fine-tuned model.

    Args:
        text: Input document. It is truncated to MAX_INPUT_LENGTH tokens.
        num_beams: Beam width for beam search (default 4).
        max_length: Maximum generated length in tokens; defaults to
            MAX_TARGET_LENGTH when None.

    Returns:
        The decoded summary string for the first (or only) input, with special
        tokens removed.
    """
    if max_length is None:
        max_length = MAX_TARGET_LENGTH

    # Pad only to the longest sequence in the call rather than to
    # MAX_INPUT_LENGTH, so a short input is not expanded to 4096 tokens.
    inputs = tokenizer_inf(
        text,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding=True,
        return_tensors="pt",
    ).to(device)

    # Global attention on the first token only, mirroring the training setup.
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1

    summary_ids = model_inf.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        global_attention_mask=global_attention_mask,
        num_beams=num_beams,
        max_length=max_length,
        early_stopping=True,
    )

    return tokenizer_inf.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Qualitative check: compare the reference and generated summary for the first
# test example.
sample = dataset["test"][0]
generated = generate_summary(sample["article"])

for heading, summary_text in (
    ("True Summary:", sample["summary"]),
    ("Generated:", generated),
):
    print(heading, summary_text)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

DEST = "/content/drive/My Drive/Finetuned_Model_Saves"
!mkdir -p "$DEST"
!cp -r "$OUTPUT_DIR" "$DEST"