In [27]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [28]:
import os
import tarfile
import nltk
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Download NLTK data
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
# ===== 2. IMPORTS =====
import torch
from transformers import (
    BertTokenizer,
    EncoderDecoderModel,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from datasets import load_from_disk
import numpy as np

In [30]:
from datasets import DatasetDict, load_from_disk
import json

# Path to the directory where the dataset is saved
base_dir = '/kaggle/input/dataset'

# Load the dataset_dict.json to get the splits information
json_file_path = os.path.join(base_dir, 'dataset_dict.json')
with open(json_file_path, 'r') as f:
    splits = json.load(f)

# Verify the content of splits.json
print(f"Loaded splits: {splits}")

# Load the dataset using Hugging Face's load_from_disk method
def load_custom_dataset(base_dir, splits):
    encoded_dataset = {}
    for split in splits['splits']:
        split_dir = os.path.join(base_dir, split)

        # Load the Arrow dataset using Hugging Face's Dataset library
        encoded_dataset[split] = load_from_disk(split_dir)
        print(f"Loaded {split} dataset with {len(encoded_dataset[split])} examples.")

    return DatasetDict(encoded_dataset)

# Load the full dataset
encoded_dataset = load_custom_dataset(base_dir, splits)

# Example: Verify the structure of the loaded dataset
print(f"Loaded dataset with {len(encoded_dataset['train'])} training examples")
print(f"Loaded dataset with {len(encoded_dataset['validation'])} validation examples")
print(f"Loaded dataset with {len(encoded_dataset['test'])} test examples")

Loaded splits: {'splits': ['train', 'validation', 'test']}
Loaded train dataset with 74063 examples.
Loaded validation dataset with 9258 examples.
Loaded test dataset with 9258 examples.
Loaded dataset with 74063 training examples
Loaded dataset with 9258 validation examples
Loaded dataset with 9258 test examples


In [31]:
# ===== 4. INITIALIZE MODEL =====
import warnings
warnings.filterwarnings("ignore")

import logging
logging.getLogger("transformers").setLevel(logging.ERROR)

from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

import torch
from transformers import BertTokenizer, EncoderDecoderModel

# Choose device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Initialize encoder-decoder model
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased",  # Encoder
    "bert-base-uncased"   # Decoder
).to(device)

# Configure for summarization
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.max_length = 128
model.config.min_length = 30
model.config.num_beams = 4
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.early_stopping = True


In [32]:
pip install -U transformers

Note: you may need to restart the kernel to use updated packages.


In [33]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./bert_summarizer_gpu",  # Output directory for the model checkpoints
    eval_strategy="steps",  # Perform evaluation every few steps
    eval_steps=2000,  # Evaluate the model every 2000 steps
    save_steps=2000,  # Save model checkpoints every 2000 steps
    learning_rate=5e-5,  # Slightly higher learning rate for GPU
    per_device_train_batch_size=4,  # Larger batch size possible with GPU (adjustable)
    per_device_eval_batch_size=4,  # Evaluation batch size
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
    num_train_epochs=2,  # Number of epochs to train the model
    warmup_steps=500,  # Number of warmup steps
    weight_decay=0.01,  # Weight decay for regularization
    logging_dir="./logs",  # Directory for TensorBoard logs
    logging_steps=100,  # Log every 100 steps for better visibility
    save_total_limit=3,  # Limit the number of saved checkpoints
    predict_with_generate=True,  # Use generate for predictions
    fp16=True,  # Enable mixed precision for faster training
    optim="adamw_torch",  # Optimizer for training
    report_to="tensorboard",  # Log metrics to TensorBoard
    metric_for_best_model="rougeL",  # Use ROUGE-L for evaluation metric
    load_best_model_at_end=True,  # Load the best model during training
    disable_tqdm=False,  # Enable progress bars
    dataloader_num_workers=4,  # Number of workers for data loading (tune based on GPU/CPU)
)

In [34]:
!pip install evaluate
!pip install rouge_score



In [35]:
import evaluate

# Load the ROUGE metric using the evaluate library
rouge = evaluate.load("rouge")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Decode predictions and labels
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

    # Convert label_ids to tensor if they are not already
    label_ids = torch.tensor(label_ids)

    # Replace padding token id with -100 for loss calculation
    label_ids = torch.where(label_ids == -100, torch.tensor(tokenizer.pad_token_id), label_ids)

    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Compute ROUGE metrics
    rouge_output = rouge.compute(
        predictions=pred_str,
        references=label_str,
        rouge_types=["rouge1", "rouge2", "rougeL"],
        use_stemmer=True
    )

    # New way to extract scores (for evaluate>=0.4.0)
    return {
        "rouge1": round(rouge_output["rouge1"] * 100, 4),
        "rouge2": round(rouge_output["rouge2"] * 100, 4),
        "rougeL": round(rouge_output["rougeL"] * 100, 4)
    }

In [36]:
# ===== 7. TRAINER =====
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics
)

In [37]:
# ===== 8. START TRAINING =====
print("üöÄ Starting GPU training...")
try:
    # Train the model
    trainer.train()
    print("üéâ Training completed!")

    # Save model
    trainer.save_model("./bert_summarizer_gpu_final")
    print("Model saved to ./bert_summarizer_gpu_final")

    # Evaluate
    results = trainer.evaluate(encoded_dataset["test"])
    print("\nüìä Final Test Results:")
    print(f"ROUGE-1: {results['eval_rouge1']:.2f}")
    print(f"ROUGE-2: {results['eval_rouge2']:.2f}")
    print(f"ROUGE-L: {results['eval_rougeL']:.2f}")

except KeyboardInterrupt:
    print("Training interrupted!")
except Exception as e:
    print(f"Training failed: {e}")
    if "CUDA out of memory" in str(e):
        print("\n‚ö†Ô∏è Reduce batch_size or sequence length!")


üöÄ Starting GPU training...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
2000,8.8271,4.278018,20.7054,3.4493,14.3592
4000,8.2641,3.986781,23.2476,4.3774,15.9212
6000,7.5023,3.778291,25.8542,5.4699,17.4215
8000,7.1893,3.653685,27.0071,6.1614,18.1613


üéâ Training completed!
Model saved to ./bert_summarizer_gpu_final



üìä Final Test Results:
ROUGE-1: 27.09
ROUGE-2: 6.12
ROUGE-L: 18.17
