# --- 1. Load and Prepare the Dataset ---

In [1]:
# Install necessary libraries
!pip install transformers[torch] datasets accelerate -q

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- 1. Configuration ---
MODEL_CHECKPOINT = "t5-small"
MODEL_SAVE_PATH = "t5-small-scientific-paper-summarizer"
DATASET_NAME = "franz96521/scientific_papers"
MAX_INPUT_LENGTH = 1024  # Max length for the paper text
MAX_TARGET_LENGTH = 128   # Max length for the summary (abstract)


print("Loading dataset...")
# Load a smaller subset for a quicker fine-tuning demo


Loading dataset...


In [2]:
# --- 2. Load and Prepare the Dataset ---
print("Loading dataset...")

# Define the specific CSV file we want to use from the Hub
DATA_FILE_URL = "hf://datasets/franz96521/scientific_papers/scientific_paper_en.csv"

# Load the dataset by pointing directly to the specific file
# We specify the type as 'csv' and pass the URL via data_files
dataset = load_dataset("csv", data_files={'train': DATA_FILE_URL}, split='train[:1000]')

# Now, we can split our loaded data as before
dataset = dataset.train_test_split(test_size=0.1)

print("Dataset loaded and split successfully:")
print(dataset)

Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


scientific_paper_en.csv:   0%|          | 0.00/135M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset loaded and split successfully:
DatasetDict({
    train: Dataset({
        features: ['id', 'full_text', 'abstract', 'text_no_abstract'],
        num_rows: 900
    })
    test: Dataset({
        features: ['id', 'full_text', 'abstract', 'text_no_abstract'],
        num_rows: 100
    })
})


In [3]:
print("Dataset loaded and split:")
print(dataset)

Dataset loaded and split:
DatasetDict({
    train: Dataset({
        features: ['id', 'full_text', 'abstract', 'text_no_abstract'],
        num_rows: 900
    })
    test: Dataset({
        features: ['id', 'full_text', 'abstract', 'text_no_abstract'],
        num_rows: 100
    })
})


# --- 2. Tokenization ---

In [4]:

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def preprocess_function(examples):
    """Tokenizes the text and abstracts."""
    # The 'text' column will be our model's input
    inputs = tokenizer(
        examples["full_text"],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    )

    # The 'abstract' column is our target label
    # We tokenize it separately
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["abstract"],
            max_length=MAX_TARGET_LENGTH,
            truncation=True,
            padding="max_length"
        )

    inputs["labels"] = labels["input_ids"]
    return inputs

print("Tokenizing dataset...")
tokenized_datasets = dataset.map(preprocess_function, batched=True)
# Remove columns we don't need for training
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'full_text', 'abstract'])
print("Tokenization complete.")


Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Tokenizing dataset...


Map:   0%|          | 0/900 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenization complete.


# --- 3. Fine-Tuning the Model ---

In [5]:

print("Loading model for fine-tuning...")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

# Data collator handles batching and padding dynamically
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=MODEL_SAVE_PATH,
    eval_strategy="epoch",   # Evaluate at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=4, # Lower if you get memory errors
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,         # Increase for better performance
    predict_with_generate=True,
    fp16=True,                  # Use mixed precision for speed (if on GPU)
    push_to_hub=False,          # Set to True to upload to Hugging Face Hub
    report_to="none",
)

# Create the Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("Starting training...")
trainer.train()
print("Training complete.")


Loading model for fine-tuning...


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(


Starting training...


Epoch,Training Loss,Validation Loss
1,No log,0.747341
2,No log,0.649505
3,1.419700,0.632351


Training complete.


# --- 4. Save the Fine-Tuned Model ---

In [6]:

print(f"Saving model to {MODEL_SAVE_PATH}...")
trainer.save_model(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)
print("Model and tokenizer saved.")




Saving model to t5-small-scientific-paper-summarizer...
Model and tokenizer saved.


# --- 5. Example: How to Use the Fine-Tuned Model ---

In [7]:

print("\n--- Testing the fine-tuned model ---")
from transformers import pipeline

# Load the saved model using a pipeline for easy inference
summarizer = pipeline("summarization", model=MODEL_SAVE_PATH, tokenizer=MODEL_SAVE_PATH)

# Grab a sample paper text from the original dataset to test
sample_text = dataset['test'][0]['full_text'][:2000] # Use first 2000 chars for demo

original_summary = dataset['test'][0]['abstract']
generated_summary = summarizer(sample_text)[0]['summary_text']

print("\nSAMPLE PAPER TEXT (truncated):")
print(sample_text)
print("\nORIGINAL ABSTRACT:")
print(original_summary)
print("\nGENERATED SUMMARY:")
print(generated_summary)


--- Testing the fine-tuned model ---


Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors



SAMPLE PAPER TEXT (truncated):
7 Enumerating limit groups
Daniel Groves and Henry Wilton
21st May 2007
Abstract
We prove that the set of limit groups is recursive, answering a
question of Delzant. One ingredient of the proof is the observation
that a finitely presented group with local retractions (à la Long and
Reid) is coherent and, furthermore, there exists an algorithm that
computes presentations for finitely generated subgroups. The other
main ingredient is the ability to algorithmically calculate centralizers
in relatively hyperbolic groups. Applications include the existence of
recognition algorithms for limit groups and free groups.
A limit group is a finitely generated, fully residually free group. Recent
research into limit groups has been motivated by their role in the theory of
the set of homomorphisms from a finitely presented group to a free group, and
in the logic of free groups. This research has culminated in the independent
solutions to Tarski’s problems on the elem