In [None]:
!pip install transformers datasets torch accelerate sentencepiece -q
!pip install opencv-python -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m116.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m86.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Install required libraries
!pip install transformers datasets torch scikit-learn --quiet

# Import necessary libraries
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments

# Pick a mixed-precision mode that matches the available accelerator.
# Both flags are plain booleans, which is what TrainingArguments expects.
import importlib.util

use_fp16 = torch.cuda.is_available()  # Use fp16 only if GPU (CUDA) is available
# bf16 is reserved for TPU runtimes, detected by the presence of the torch_xla package.
# torch.device("xla") cannot serve as a check: it builds an always-truthy device
# object whether or not a TPU exists.
use_bf16 = (not use_fp16) and importlib.util.find_spec("torch_xla") is not None

# PubMed (Hugging Face Hub): keep article/abstract pairs, drop incomplete rows,
# and expose them under the shared column names text/summary
pubmed_dataset = load_dataset("ccdv/pubmed-summarization", split="train")
pubmed_df = (
    pubmed_dataset.to_pandas()[['article', 'abstract']]
    .dropna()
    .rename(columns={'article': 'text', 'abstract': 'summary'})
)

# CompScholar.csv is read from local storage (upload manually in Colab)
# and normalised to the same two columns
comp_df = (
    pd.read_csv("/content/CompScholar.csv")[['Document', 'Summary']]
    .dropna()
    .rename(columns={'Document': 'text', 'Summary': 'summary'})
)

# Stack both corpora, then draw a reproducible 10,000-row sample to keep training fast
df = (
    pd.concat([comp_df, pubmed_df])
    .sample(n=10000, random_state=42)
    .reset_index(drop=True)
)

# Hold out 20% of the rows for evaluation; the remaining 80% are used for training
split_parts = train_test_split(
    df["text"],
    df["summary"],
    test_size=0.2,
    random_state=42,
)
train_texts, eval_texts, train_summaries, eval_summaries = split_parts

# Wrap each split in a Hugging Face Dataset with "text" and "summary" columns
train_dataset = Dataset.from_dict(dict(text=train_texts, summary=train_summaries))
eval_dataset = Dataset.from_dict(dict(text=eval_texts, summary=eval_summaries))

# Load PEGASUS Model & Tokenizer
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# Freeze the first 6 encoder layers to speed up training.
# requires_grad has to be cleared on every parameter tensor: assigning the
# attribute on the layer module itself leaves all of its weights trainable.
for layer in model.model.encoder.layers[:6]:
    for param in layer.parameters():
        param.requires_grad = False

# Tokenization Function (builds encoder inputs and loss-ready labels)
def tokenize_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: batch mapping with a "text" list (source documents) and a
            "summary" list (target summaries), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the documents (truncated/padded to 512 tokens)
        with an added "labels" entry: the summary token ids (truncated/padded
        to 128 tokens) where every padding position is replaced by -100.
    """
    inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    targets = tokenizer(examples["summary"], truncation=True, padding="max_length", max_length=128)

    # -100 is the index ignored by the cross-entropy loss; without this masking
    # the model is also trained (and evaluated) on predicting padding tokens.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in targets["input_ids"]
    ]
    return inputs

# Run the batched tokenizer over the training split first, then the evaluation split
train_dataset, eval_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

# Training Arguments (per-epoch evaluation/checkpointing & precision flags)
import inspect

# transformers renamed `evaluation_strategy` to `eval_strategy`; pick whichever
# keyword the installed release accepts so this cell runs on old and new versions.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=use_fp16,  # Enable fp16 only if GPU is available
    bf16=use_bf16,  # Enable bf16 for TPU
    report_to="none",
    **{_eval_strategy_key: "epoch"},  # Evaluate on the eval dataset after every epoch
)

# Assemble the Trainer from the model, its arguments and both tokenized splits
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # evaluated according to the configured strategy
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned model first, then the tokenizer, into the same directory
export_dir = "./pegasus_summarization_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.80k [00:00<?, ?B/s]

train-00000-of-00005.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

train-00001-of-00005.parquet:   0%|          | 0.00/208M [00:00<?, ?B/s]

train-00002-of-00005.parquet:   0%|          | 0.00/207M [00:00<?, ?B/s]

train-00003-of-00005.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

train-00004-of-00005.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/59.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/58.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/119924 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6633 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6658 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,3.0395,2.749694
2,2.8098,2.601429
3,2.6452,2.434433




NameError: name 'train_result' is not defined