In [None]:

from google.colab import drive
import os
# Attach Google Drive so files under /content/drive become reachable.
drive.mount('/content/drive')

# Drive locations whose presence is checked below.
DATA_PATH = "/content/drive/MyDrive/tokenized_led_data/test"
p = "/content/drive/MyDrive/led_finetune_trained"

# Print True/False for each location, in the order listed above.
for _location in (DATA_PATH, p):
    print(os.path.exists(_location))



In [None]:
import pandas as pd
import re
import html
import warnings
import os
import torch

from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from transformers import LEDTokenizer
from sklearn.model_selection import train_test_split

# ===============================
# Suppress warnings
# ===============================
# Silence bs4's MarkupResemblesLocatorWarning for the rest of the session.
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

# ===============================
# Paths
# ===============================
DATA_PATH = "/content/drive/MyDrive/NewsSumm Dataset.xlsx"
SAVE_DIR  = "/content/drive/MyDrive/tokenized_led_data"

# One sub-directory of SAVE_DIR per split.
TRAIN_DIR, TEST_DIR = (os.path.join(SAVE_DIR, split) for split in ("train", "test"))

# Create both output directories up front; existing ones are left alone.
for split_dir in (TRAIN_DIR, TEST_DIR):
    os.makedirs(split_dir, exist_ok=True)

# ===============================
# Load dataset
# ===============================
df = pd.read_excel(DATA_PATH)
# Rows missing either the article or its reference summary cannot be used.
df = df.dropna(subset=["article_text", "human_summary"])
print("Rows after cleaning:", len(df))

# ===============================
# Resample to (at most) 15,000 rows
# ===============================
# DataFrame.sample (replace=False) raises ValueError when n is larger than
# the number of rows, so cap the sample size at what is actually available.
SAMPLE_SIZE = 15000
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
print("Rows after resampling:", len(df))

# ===============================
# Clean text function
# ===============================
def clean_text(text):
    """Return ``text`` as plain text with markup and contact-like tokens removed.

    Non-string input yields ``""``. Otherwise the text is parsed with
    BeautifulSoup's ``html.parser`` (text pieces joined by a space), HTML
    entities are unescaped, and every match of the URL, e-mail and long
    digit-run patterns below is deleted. Finally, whitespace runs are
    collapsed to a single space and the result is stripped.
    """
    if not isinstance(text, str):
        return ""

    plain = html.unescape(BeautifulSoup(text, "html.parser").get_text(" "))

    # Applied in this order; each pattern sees the output of the previous one.
    removal_patterns = (
        r"https?://\S+|www\.\S+",        # URLs
        r"\b[\w\.-]+@[\w\.-]+\.\w+\b",   # e-mail addresses
        r"\+?\d[\d\s\-]{8,}\d",          # long digit runs (phone-number-like)
    )
    for pattern in removal_patterns:
        plain = re.sub(pattern, "", plain)

    collapsed = re.sub(r"\s+", " ", plain)
    return collapsed.strip()

df["article_text"]  = df["article_text"].apply(clean_text)
df["human_summary"] = df["human_summary"].apply(clean_text)

# clean_text returns "" for non-string cells and can reduce a cell to "" when
# everything in it is stripped, so drop rows left without an article or a
# summary instead of tokenizing empty examples.
has_text = (df["article_text"] != "") & (df["human_summary"] != "")
df = df[has_text]
print("Rows after dropping empty texts:", len(df))

# ===============================
# Train / Test split
# ===============================
# 80/20 split with a fixed seed so the partition is reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print("Train rows:", len(train_df))
print("Test rows:", len(test_df))

# ===============================
# Tokenization settings (LED)
# ===============================
BATCH_SIZE = 2          # LED is heavy -> keep small
MAX_INPUT_LEN = 1024    # max_length used when encoding articles
MAX_TARGET_LEN = 256    # max_length used when encoding summaries

# ===============================
# Load LED tokenizer
# ===============================
MODEL_NAME = "allenai/led-base-16384"
tokenizer = LEDTokenizer.from_pretrained(MODEL_NAME)

# ===============================
# Batch tokenization function
# ===============================
def tokenize_and_save(articles, summaries, out_dir, name):
    """Tokenize article/summary pairs and save them as per-batch ``.pt`` files.

    The inputs are walked in chunks of the module-level ``BATCH_SIZE``. Each
    chunk is encoded with the module-level ``tokenizer`` (articles padded and
    truncated to ``MAX_INPUT_LEN``, summaries to ``MAX_TARGET_LEN``) and
    written with ``torch.save`` to ``<out_dir>/batch_<k>.pt`` as a dict with
    the keys ``input_ids``, ``attention_mask`` and ``labels``.

    Args:
        articles: sequence of source texts.
        summaries: sequence of target texts, aligned index-by-index with
            ``articles``.
        out_dir: directory that receives the batch files (created if missing).
        name: label used in the progress messages.

    Raises:
        ValueError: if ``articles`` and ``summaries`` differ in length, which
            would otherwise produce batch files whose inputs and labels do
            not line up.
    """
    if len(articles) != len(summaries):
        raise ValueError(
            f"{name}: got {len(articles)} articles but {len(summaries)} summaries"
        )

    os.makedirs(out_dir, exist_ok=True)

    batch_id = 0

    for start in range(0, len(articles), BATCH_SIZE):
        stop = start + BATCH_SIZE
        batch_articles = articles[start:stop]
        batch_summaries = summaries[start:stop]

        enc = tokenizer(
            batch_articles,
            truncation=True,
            padding="max_length",
            max_length=MAX_INPUT_LEN,
            return_tensors="pt"
        )

        dec = tokenizer(
            batch_summaries,
            truncation=True,
            padding="max_length",
            max_length=MAX_TARGET_LEN,
            return_tensors="pt"
        )

        # Replace padding positions in the targets with -100 (the default
        # ignore_index of PyTorch's cross-entropy loss).
        labels = dec["input_ids"]
        labels[labels == tokenizer.pad_token_id] = -100

        batch_data = {
            "input_ids": enc["input_ids"],
            "attention_mask": enc["attention_mask"],
            "labels": labels
        }

        save_path = os.path.join(out_dir, f"batch_{batch_id}.pt")
        torch.save(batch_data, save_path)

        # Progress message every 10th batch.
        if batch_id % 10 == 0:
            print(f"{name}: saved batch {batch_id}")

        batch_id += 1

    print(f"{name} tokenization done. Total batches: {batch_id}")

# ===============================
# Run tokenization
# ===============================
# Same call for both splits: train first, then test.
for split_name, split_df, split_dir in (
    ("TRAIN", train_df, TRAIN_DIR),
    ("TEST", test_df, TEST_DIR),
):
    tokenize_and_save(
        split_df["article_text"].astype(str).tolist(),
        split_df["human_summary"].astype(str).tolist(),
        split_dir,
        split_name,
    )

print("\nLED batch-wise tokenization completed successfully")


In [None]:
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.optim import AdamW
import torch
TRAIN_DIR = "/content/drive/MyDrive/tokenized_led_data/train/"
SAVE_DIR  = "/content/drive/MyDrive/led_finetune_trained"
MODEL_NAME = "allenai/led-base-16384"
MAX_TRAIN_BATCHES = 3000   # at most this many saved batch files are used
LOG_EVERY = 100            # print the running mean loss every N batches
# Number of leading tokens that receive global attention. Kept at 64 so the
# forward pass here uses the same mask the evaluation cell passes to generate().
GLOBAL_ATTENTION_TOKENS = 64

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=2e-5)

files = [f for f in os.listdir(TRAIN_DIR) if f.endswith(".pt")]
files.sort()   # lexicographic order of the file names
files = files[:MAX_TRAIN_BATCHES]
if not files:
    # Without this check the untouched pretrained weights would be saved
    # to SAVE_DIR as if they had been fine-tuned.
    raise FileNotFoundError(f"No .pt batch files found in {TRAIN_DIR}")

total_loss = 0
for i, fname in enumerate(files, 1):
  batch = torch.load(os.path.join(TRAIN_DIR, fname))
  input_id = batch["input_ids"].to(device)
  atten_id = batch["attention_mask"].to(device)
  label = batch["labels"].to(device)

  # Global attention on the leading tokens, mirroring the evaluation cell.
  global_atten = torch.zeros_like(atten_id)
  global_atten[:, :GLOBAL_ATTENTION_TOKENS] = 1

  optim.zero_grad()
  outputs = model(
      input_ids=input_id,
      attention_mask=atten_id,
      global_attention_mask=global_atten,
      labels=label,
  )
  loss = outputs.loss
  loss.backward()
  optim.step()

  total_loss = total_loss + loss.item()
  if i % LOG_EVERY == 0:
    # Mean loss over all batches processed so far.
    print("loss:", i, ":", total_loss / i)

  # Drop references to this step's tensors before loading the next batch.
  del batch, input_id, atten_id, label, global_atten, outputs, loss
  torch.cuda.empty_cache()

os.makedirs(SAVE_DIR, exist_ok=True)
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)


In [None]:
!pip install rouge-score


In [None]:
import os
import torch
from transformers import LEDTokenizer, LEDForConditionalGeneration
from rouge_score import rouge_scorer

# --------------------
# Paths
# --------------------
TEST_DIR  = "/content/drive/MyDrive/tokenized_led_data/test"
MODEL_DIR = "/content/drive/MyDrive/led_finetune_trained"

# --------------------
# Generation settings
# --------------------
GLOBAL_ATTENTION_TOKENS = 64   # leading tokens that receive global attention
MAX_NEW_TOKENS = 128
NUM_BEAMS = 4

# --------------------
# Test batches
# --------------------
# Listed before the model is loaded so that an empty or wrong TEST_DIR fails
# immediately with a clear message instead of a ZeroDivisionError at the end.
files = sorted([f for f in os.listdir(TEST_DIR) if f.endswith(".pt")])
if not files:
    raise FileNotFoundError(f"No .pt batch files found in {TEST_DIR}")

# --------------------
# Device
# --------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# --------------------
# Load model & tokenizer
# --------------------
tokenizer = LEDTokenizer.from_pretrained(MODEL_DIR)
model = LEDForConditionalGeneration.from_pretrained(MODEL_DIR).to(device)
model.eval()

# --------------------
# ROUGE scorer
# --------------------
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

# Running sums over all scored samples: r1/r2/rl accumulate F1,
# r11/r21/rl1 accumulate precision.
r1 = r2 = rl = 0.0
r11 = r21 = rl1 = 0.0
count = 0

# --------------------
# Evaluation loop
# --------------------
for fname in files:

    batch = torch.load(os.path.join(TEST_DIR, fname))

    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    labels = batch["labels"].clone()
    labels[labels == -100] = tokenizer.pad_token_id   # restore padding for decoding

    # Global attention on the leading tokens of every sequence in the batch.
    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, :GLOBAL_ATTENTION_TOKENS] = 1

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
            max_new_tokens=MAX_NEW_TOKENS,
            num_beams=NUM_BEAMS
        )

    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    refs  = tokenizer.batch_decode(labels, skip_special_tokens=True)

    for pred, ref in zip(preds, refs):
        # The reference is passed first and the prediction second.
        scores = scorer.score(ref, pred)

        r1  += scores["rouge1"].fmeasure
        r2  += scores["rouge2"].fmeasure
        rl  += scores["rougeL"].fmeasure
        r11 += scores["rouge1"].precision
        r21 += scores["rouge2"].precision
        rl1 += scores["rougeL"].precision

        count += 1


if count == 0:
    # Batch files existed but contained no samples; averages are undefined.
    raise ValueError(f"No samples were scored from {TEST_DIR}")

print("\nROUGE RESULTS (LED)")
print("ROUGE-1 F1:", round(r1 / count, 4))
print("ROUGE-2 F1:", round(r2 / count, 4))
print("ROUGE-L F1:", round(rl / count, 4))
print("ROUGE-1 Precision:", round(r11 / count, 4))
print("ROUGE-2 Precision:", round(r21 / count, 4))
print("ROUGE-L Precision:", round(rl1 / count, 4))
