In [1]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=89002e4d2348b2842ee1b8721f5ef896a2b02a5fbc30408268029a24bd82574c
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import os
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
import re
from rouge_score import rouge_scorer
import warnings

warnings.filterwarnings('ignore')
np.random.seed(42)
torch.manual_seed(42)

2025-07-24 07:12:37.930738: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753341158.092118      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753341158.139995      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


<torch._C.Generator at 0x7d931ddb8db0>

In [3]:
FILE_PATH = '/kaggle/input/indo-article-data-summarization/IndoData200.csv'
BEST_MODEL = "gaduhhartawan/indobart-base"
SAVE_MODEL_PATH = "./saved_indobart_model"
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 128
TRAIN_BATCH_SIZE = 4
EVAL_BATCH_SIZE = 4
NUM_TRAIN_EPOCHS = 4
RANDOM_STATE = 42

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Load data
df = pd.read_csv(FILE_PATH)
print(f"Dataset loaded with {len(df)} samples")

# Create train/validation/test splits
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=RANDOM_STATE)

print(f"Train set: {len(train_df)} samples")
print(f"Validation set: {len(val_df)} samples")
print(f"Test set: {len(test_df)} samples")

# Create HuggingFace datasets
train_dataset = Dataset.from_pandas(train_df[['article', 'summary']].reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df[['article', 'summary']].reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df[['article', 'summary']].reset_index(drop=True))

dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

def preprocess_function(examples, tokenizer, max_input_length, max_target_length):
    inputs = tokenizer(
        examples['article'],
        max_length=max_input_length,
        truncation=True,
        padding=False
    )

    with tokenizer.as_target_tokenizer():
        targets = tokenizer(
            examples['summary'],
            max_length=max_target_length,
            truncation=True,
            padding=False
        )

    inputs['labels'] = targets['input_ids']
    return inputs

# ROUGE metrics computation
def compute_rouge_metrics(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

    for pred, ref in zip(predictions, references):
        pred = str(pred).strip() if pred else " "
        ref = str(ref).strip() if ref else " "
        scores = scorer.score(ref, pred)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)

    return {
        'rouge1': np.mean(rouge1_scores),
        'rouge2': np.mean(rouge2_scores),
        'rougeL': np.mean(rougeL_scores)
    }

# Compute metrics function for trainer
def compute_metrics(eval_preds, tokenizer):
    predictions, labels = eval_preds

    if isinstance(predictions, tuple):
        predictions = predictions[0]
    
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    vocab_size = len(tokenizer)
    predictions = np.clip(predictions, 0, vocab_size - 1)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]
    
    rouge_metrics = compute_rouge_metrics(decoded_preds, decoded_labels)
    return rouge_metrics

def train_and_save_model(model_name, dataset_dict, save_path):
    print(f"\n{'='*60}\nTraining and Saving Model: {model_name}\n{'='*60}")

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name, legacy=False)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Handle vocab size mismatch
    if model.config.vocab_size != len(tokenizer):
        print(f"Alert: Vocab size mismatch found for {model_name}.")
        print(f"  - Model config vocab size: {model.config.vocab_size}")
        print(f"  - Tokenizer vocab size:    {len(tokenizer)}")
        print("Resizing model token embeddings to match tokenizer.")
        model.resize_token_embeddings(len(tokenizer))
        
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.pad_token_id

    model.to(device)

    # Tokenize datasets
    tokenized_datasets = dataset_dict.map(
        lambda examples: preprocess_function(examples, tokenizer, MAX_INPUT_LENGTH, MAX_TARGET_LENGTH),
        batched=True,
        remove_columns=dataset_dict['train'].column_names
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

    # Training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir='./results_training',
        logging_dir='./logs_training',
        num_train_epochs=NUM_TRAIN_EPOCHS,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        warmup_steps=0,
        weight_decay=0,
        logging_strategy="epoch",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="rougeL",
        greater_is_better=True,
        save_total_limit=1,
        predict_with_generate=True,
        generation_max_length=MAX_TARGET_LENGTH,
        generation_num_beams=2,
        fp16=torch.cuda.is_available(),
        report_to="none",
    )

    # Initialize trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=lambda eval_preds: compute_metrics(eval_preds, tokenizer),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0)]
    )

    print("--- Starting training ---")
    trainer.train()

    print("\n--- Evaluating on test set ---")
    test_results = trainer.evaluate(eval_dataset=tokenized_datasets['test'], metric_key_prefix="test")

    print(f"\nFinal Test Results for {model_name}:")
    print(f"  ROUGE-1: {test_results['test_rouge1']:.4f}")
    print(f"  ROUGE-2: {test_results['test_rouge2']:.4f}")
    print(f"  ROUGE-L: {test_results['test_rougeL']:.4f}")

    print("\n--- Generating sample predictions ---")
    sample_data = dataset_dict['test'].select(range(3))
    sample_tokenized = tokenized_datasets['test'].select(range(3))

    predictions = trainer.predict(sample_tokenized)
    decoded_preds = tokenizer.batch_decode(predictions.predictions, skip_special_tokens=True)

    print("\nSample Predictions:")
    for i, (article, reference, prediction) in enumerate(zip(
        sample_data['article'],
        sample_data['summary'],
        decoded_preds
    )):
        print(f"\nSample {i+1}:")
        print(f"| Article   |: {article[:150]}...")
        print(f"| Reference |: {reference}")
        print(f"| Prediction|: {prediction.strip()}")
        print("-" * 50)

    # 🎯 SAVE THE TRAINED MODEL AND TOKENIZER
    print(f"\n{'='*60}")
    print("SAVING TRAINED MODEL AND TOKENIZER")
    print(f"{'='*60}")
    
    try:
        # Create save directory if it doesn't exist
        os.makedirs(save_path, exist_ok=True)
        
        # Save the trained model
        model.save_pretrained(save_path)
        print(f"✅ Model saved to: {save_path}")
        
        # Save the tokenizer
        tokenizer.save_pretrained(save_path)
        print(f"✅ Tokenizer saved to: {save_path}")
        
        # Save training configuration for reference
        config_info = {
            'original_model': model_name,
            'max_input_length': MAX_INPUT_LENGTH,
            'max_target_length': MAX_TARGET_LENGTH,
            'num_train_epochs': NUM_TRAIN_EPOCHS,
            'train_batch_size': TRAIN_BATCH_SIZE,
            'test_rouge1': test_results['test_rouge1'],
            'test_rouge2': test_results['test_rouge2'],
            'test_rougeL': test_results['test_rougeL']
        }
        
        import json
        with open(f"{save_path}/training_config.json", "w") as f:
            json.dump(config_info, f, indent=2)
        print(f"✅ Training config saved to: {save_path}/training_config.json")
        
        print(f"\n🎉 MODEL SUCCESSFULLY SAVED!")
        print(f"📁 Location: {save_path}")
        print(f"📋 Files saved:")
        print(f"   - pytorch_model.bin (model weights)")
        print(f"   - config.json (model configuration)")
        print(f"   - tokenizer.json (tokenizer)")
        print(f"   - tokenizer_config.json (tokenizer config)")
        print(f"   - training_config.json (your training info)")
        
    except Exception as e:
        print(f"❌ Error saving model: {str(e)}")
        raise e

    # Clean up GPU memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return {
        'model_name': model_name,
        'save_path': save_path,
        'rouge1': test_results['test_rouge1'],
        'rouge2': test_results['test_rouge2'],
        'rougeL': test_results['test_rougeL']
    }

Dataset loaded with 200 samples
Train set: 160 samples
Validation set: 20 samples
Test set: 20 samples


In [5]:
print(f"Model: {BEST_MODEL}")
print(f"Save path: {SAVE_MODEL_PATH}")

result = train_and_save_model(BEST_MODEL, dataset_dict, SAVE_MODEL_PATH)

print(f"Model: {result['model_name']}")
print(f"Saved Path: {result['save_path']}")
print(f"ROUGE-1: {result['rouge1']:.4f}")
print(f"ROUGE-2: {result['rouge2']:.4f}")
print(f"ROUGE-L: {result['rougeL']:.4f}")

Model: gaduhhartawan/indobart-base
Save path: ./saved_indobart_model

Training and Saving Model: gaduhhartawan/indobart-base


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/356 [00:00<?, ?B/s]

Alert: Vocab size mismatch found for gaduhhartawan/indobart-base.
  - Model config vocab size: 50264
  - Tokenizer vocab size:    50265
Resizing model token embeddings to match tokenizer.


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

--- Starting training ---


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,2.1272,2.052587,0.244004,0.079714,0.184578
2,1.4383,2.048216,0.27717,0.105013,0.233914
3,1.0293,2.221187,0.273347,0.100168,0.212518
4,0.7624,2.409934,0.259002,0.089648,0.21286


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



--- Evaluating on test set ---


early stopping required metric_for_best_model, but did not find eval_rougeL so early stopping is disabled



Final Test Results for gaduhhartawan/indobart-base:
  ROUGE-1: 0.2635
  ROUGE-2: 0.0839
  ROUGE-L: 0.2162

--- Generating sample predictions ---

Sample Predictions:

Sample 1:
| Article   |: Apa yang terjadi saat manusia sekarat?.
Orang-orang sering mengira bahwa kehidupan adalah pertempuran melawan kematian. Tetapi apakah mungkin berdama...
| Reference |: Sejumlah orang mengatakan saat sekarat seseorang akan merasa bahagia. Apa itu benar?
| Prediction|: Sejumlah orang sering mengira bahwa kehidupan adalah pertempuran melawan kematian dan apakah mungkin berdamai dengan kemampuan menelan tablet dan minuman selama hampir dua minggu sebelum kita meninggal.
--------------------------------------------------

Sample 2:
| Article   |: Tes darah rutin dapat deteksi kanker ovarium.
Uji coba ini akan mengubah metode deteksi tes darah Hasil uji coba ini dapat mengubah metode pemeriksaa...
| Reference |: Tes darah rutin dapat mendeteksi 86% kanker ovarium lebih awal sebelum masa dimana perempua