## PDF Summarization

In [None]:
!pip install transformers datasets accelerate -U
!pip install evaluate rouge-score

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate
import numpy as np

# Task configuration shared by the preprocessing and training cells below.
PREFIX = "summarize: "      # prepended to every input text before tokenization
MAX_INPUT_LENGTH = 512      # token cap applied to the (prefixed) source text
MAX_TARGET_LENGTH = 128     # token cap applied to the reference summaries

# Checkpoint used as the starting point for fine-tuning.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Dialogue/summary dataset pulled from the Hugging Face Hub.
raw_datasets = load_dataset("knkarthick/samsum")
print("Dataset successfully loaded!")

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of dialogues and their reference summaries.

    Args:
        examples: A batch mapping with a "dialogue" list (source texts) and
            a "summary" list (target texts), as passed by ``Dataset.map``
            with ``batched=True``.

    Returns:
        The tokenized inputs, with the tokenized summaries' ids stored under
        the "labels" key.
    """
    # `or ""` keeps a missing (None) entry from crashing the string
    # concatenation / tokenizer call; such a row becomes an empty text.
    inputs = [PREFIX + (doc or "") for doc in examples["dialogue"]]
    targets = [summary or "" for summary in examples["summary"]]

    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for tokenizing labels.
    labels = tokenizer(
        text_target=targets, max_length=MAX_TARGET_LENGTH, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# Work on small, reproducibly shuffled subsets: 500 training rows and
# 50 validation rows.
_shuffled_train = tokenized_datasets["train"].shuffle(seed=42)
_shuffled_validation = tokenized_datasets["validation"].shuffle(seed=42)
small_train_dataset = _shuffled_train.select(range(500))
small_eval_dataset = _shuffled_validation.select(range(50))
print(f"Using {len(small_train_dataset)} samples for training.")

In [None]:
# ROUGE scorer loaded through the `evaluate` library; compute_metrics below
# calls metric.compute(...) on the decoded predictions and references.
metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: A (predictions, labels) pair of token-id arrays.

    Returns:
        A dict of the ROUGE results plus "gen_len" (mean count of non-pad
        tokens per prediction), every value rounded to 4 decimal places.
    """
    pad_id = tokenizer.pad_token_id
    raw_preds, raw_labels = eval_pred

    # -100 is not a real token id, so swap it for the pad id before decoding.
    pred_ids = np.where(raw_preds != -100, raw_preds, pad_id)
    label_ids = np.where(raw_labels != -100, raw_labels, pad_id)

    pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    ref_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    scores = metric.compute(
        predictions=pred_texts, references=ref_texts, use_stemmer=True
    )
    scores["gen_len"] = np.mean(
        [np.count_nonzero(row != pad_id) for row in pred_ids]
    )

    rounded = {}
    for name, value in scores.items():
        rounded[name] = round(value, 4)
    return rounded

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

import torch  # used only to decide whether mixed precision can be enabled

# Hyper-parameters for the fine-tuning run. Evaluation and checkpointing both
# happen once per epoch, which `load_best_model_at_end=True` requires to match.
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_summarization_results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # fp16 mixed precision needs a GPU; requesting it unconditionally makes
    # this cell fail on a CPU-only runtime, so enable it only with CUDA.
    fp16=torch.cuda.is_available(),
    report_to="none",
    # Evaluate with model.generate() so compute_metrics receives generated
    # token ids that can be decoded and scored with ROUGE.
    predict_with_generate=True,
)

In [None]:
# Collator that assembles tokenized examples into batches for the model
# (per the Transformers docs it pads inputs and labels per batch).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`;
    # the first cell upgrades transformers, so use the current keyword.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("\n" + "="*50)
print("  Starting T5 Summarization Fine-Tuning...")
print("  Watch the Loss and ROUGE metrics update below.")
print("="*50 + "\n")

trainer.train()

# Persist the model and tokenizer BEFORE announcing completion, so the
# "now saved" message is only printed once the files are actually on disk.
trainer.save_model("./final_t5_summarizer")
tokenizer.save_pretrained("./final_t5_summarizer")

print("\n" + "="*50)
print("Fine-Tuning Complete! Model is now saved.")
print("="*50)

In [None]:
!pip install PyMuPDF

In [None]:
from google.colab import files
import fitz

# Ask the user for a PDF through the Colab upload widget.
print("Please upload your PDF file now:")
uploaded = files.upload()

# `uploaded` is keyed by file name; only the first uploaded file is used.
if not uploaded:
    print("No file uploaded. Please re-run the cell and upload a PDF.")
    PDF_FILE_NAME = None
else:
    PDF_FILE_NAME = next(iter(uploaded))
    print(f"File '{PDF_FILE_NAME}' detected and uploaded.")

def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read. A falsy value (None or "")
            short-circuits without touching the file system.

    Returns:
        The text of all pages joined by newlines, with leading and trailing
        whitespace stripped, or None if no path was given or the file could
        not be read.
    """
    if not pdf_path:
        return None
    try:
        # The context manager closes the document even when reading a page
        # raises; previously the handle leaked on that path.
        with fitz.open(pdf_path) as doc:
            pages = [page.get_text() for page in doc]
    except Exception as e:
        # Best-effort boundary: report the problem and let the caller handle
        # the None result instead of aborting the notebook cell.
        print(f"Error reading PDF: {e}")
        return None
    # One join instead of repeated `+=` avoids quadratic string building.
    return "\n".join(pages).strip()

# Run the extraction only when a file was uploaded; otherwise there is no
# text to summarize.
long_text = extract_text_from_pdf(PDF_FILE_NAME) if PDF_FILE_NAME else None

if PDF_FILE_NAME:
    if not long_text:
        print("Could not extract text. Check file name and format.")
    else:
        print(f"Successfully extracted {len(long_text)} characters.")

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import torch  # used only to choose the inference device

MODEL_PATH = "./final_t5_summarizer"

# Defined outside the try block so the summarization cell below can still be
# defined (it reads MAX_INPUT_LENGTH) even when loading the model fails.
PREFIX = "summarize: "
MAX_INPUT_LENGTH = 512

# Use the GPU when there is one, otherwise fall back to CPU. A hard-coded
# "cuda" fails on CPU-only runtimes and was then reported as if the
# fine-tuning cell had not been run.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH).to(DEVICE)
    print(f"Model '{MODEL_PATH}' loaded successfully.")

except Exception as e:
    # Best-effort boundary: leave `model` as None so the summarization cell
    # can report the failure instead of raising.
    print(f"Error loading model: {e}")
    print("Please ensure you ran the fine-tuning cell and it completed successfully.")
    model = None

In [None]:
def generate_summary(text, model, tokenizer, max_input=MAX_INPUT_LENGTH):
    """Summarize ``text`` with the given seq2seq model.

    Args:
        text: Source text; PREFIX is prepended before tokenization.
        model: Loaded model, or a falsy value if loading failed.
        tokenizer: Tokenizer matching ``model``.
        max_input: Maximum number of input tokens; anything beyond this is
            truncated, so only the start of a long document is summarized.

    Returns:
        The decoded summary string, or a fixed message when either the model
        or the text is missing.
    """
    if not (model and text):
        return "Model not loaded or no text extracted."

    # Tokenize as a PyTorch batch of one and move it to the model's device.
    encoded = tokenizer(
        PREFIX + text,
        return_tensors="pt",
        max_length=max_input,
        truncation=True,
    ).to(model.device)

    # Beam search with 4 beams, output bounded to 30..150 tokens.
    generation_options = dict(
        num_beams=4,
        max_length=150,
        min_length=30,
        early_stopping=True,
    )
    output_ids = model.generate(encoded.input_ids, **generation_options)

    # squeeze() drops the batch dimension of the single generated sequence.
    return tokenizer.decode(
        output_ids.squeeze(),
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

# Report whichever prerequisite is missing first (the model takes priority);
# otherwise summarize the extracted text and print it.
if not model:
    print("\nCannot run summarization: Model failed to load.")
elif not long_text:
    print("\nCannot run summarization: No text was successfully extracted from the PDF.")
else:
    final_summary = generate_summary(long_text, model, tokenizer)
    banner = "="*70
    print("\n" + banner)
    print(f"       SUMMARY FOR THE DOCUMENT: {PDF_FILE_NAME}")
    print(banner)
    print(final_summary)
    print(banner)