# Setup environment

## Environment variables

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Only use 1 GPU
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Get secrets

In [2]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HUGGINGFACE_TOKEN")
WANDB_API_KEY = user_secrets.get_secret("wandb")

## Import modules

In [3]:
!pip install -qU transformers accelerate bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m82.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset

import wandb
import numpy as np
from datetime import datetime
import json
from tqdm.auto import tqdm
import gc
import math
import time

## Random seed & device

In [5]:
# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# Finetune config

In [6]:
class Config:
    # Model configuration
    #model_name = "Qwen/Qwen3-0.6B"
    model_name = "Qwen/Qwen3-1.7B"
    dataset_name = "vietgpt/wikipedia_vi"
    
    # Training configuration
    output_dir = "./qwen-vietnamese-wiki-finetuned"
    # output_dir = "./qwen-vietnamese-wiki-finetuned-2"
    num_train_epochs = 3
    per_device_train_batch_size = 2
    per_device_valid_batch_size = 2
    gradient_accumulation_steps = 8
    learning_rate = 5e-5
    weight_decay = 0.01
    warmup_ratio = 0.1
    max_length = 128

    # Optimization settings
    adam_epsilon = 1e-8
    max_grad_norm = 1.0
    
    # Logging and saving
    logging_steps = 40
    save_strategy = "epoch"
    valid_strategy = "epoch"
    
    # Other settings
    fp16 = True
    num_workers = os.cpu_count()
    
    # W&B configuration
    use_wandb = True
    wandb_run_id = None
    #wandb_project = "PARADIS-Qwen3_0.6B"
    wandb_project = "PARADIS_Qwen3_1.7B"
    wandb_run_name = "1GPU"

    # HuggingFace configuration
    use_hf = True
    # hf_repo = "kaggle/PARADIS_Qwen3_0.6B-1GPU(T4)"
    hf_repo = "Quoc59/PARADIS-Qwen3_1.7B-10kWikiVi-1GPU"
    
    # Dataset
    train_size = 10000
    valid_size = 10000
    test_size = 5000
    min_text_length = 50
    random_seed = 42

config = Config()

In [7]:
config_dict = {k: v for k, v in Config.__dict__.items() if not k.startswith("__") and not callable(v)}
config_dict

{'model_name': 'Qwen/Qwen3-1.7B',
 'dataset_name': 'vietgpt/wikipedia_vi',
 'output_dir': './qwen-vietnamese-wiki-finetuned',
 'num_train_epochs': 3,
 'per_device_train_batch_size': 2,
 'per_device_valid_batch_size': 2,
 'gradient_accumulation_steps': 8,
 'learning_rate': 5e-05,
 'weight_decay': 0.01,
 'warmup_ratio': 0.1,
 'max_length': 128,
 'adam_epsilon': 1e-08,
 'max_grad_norm': 1.0,
 'logging_steps': 40,
 'save_strategy': 'epoch',
 'valid_strategy': 'epoch',
 'fp16': True,
 'num_workers': 4,
 'use_wandb': True,
 'wandb_run_id': None,
 'wandb_project': 'PARADIS_Qwen3_1.7B',
 'wandb_run_name': '1GPU',
 'use_hf': True,
 'hf_repo': 'Quoc59/PARADIS-Qwen3_1.7B-10kWikiVi-1GPU',
 'train_size': 10000,
 'valid_size': 10000,
 'test_size': 5000,
 'min_text_length': 50,
 'random_seed': 42}

# Setup wandb

In [8]:
wandb.login(key=WANDB_API_KEY)
if config.use_wandb:
    if config.wandb_run_id is None:
        wandb.init( # New run
            project=config.wandb_project,
            name=config.wandb_run_name,
            config=config_dict,
        )
    else:
        wandb.init( # Resume to created run
            project=config.wandb_project,
            id=config.wandb_run_id,
            resume='allow',
        )

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mngoquochieu2002[0m ([33mngoquochieu2002-hanoi-university-of-science-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.19.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250621_071252-j0469c8t[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m1GPU[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/ngoquochieu2002-hanoi-university-of-science-and-technology/PARADIS_Qwen3_1.7B[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/ngoquochieu2002-h

# Setup HuggingFace

In [9]:
if config.use_hf:
    from huggingface_hub import login, HfApi
    login(HF_TOKEN)
    hf_api = HfApi()

# Model and tokenizer

## Download and quantization

In [10]:
print("Loading tokenizer and model...")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    config.model_name,
    trust_remote_code=True,
    padding_side="right"
)

# Add pad token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Cấu hình 4-bit quantization
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    device_map="auto", # automatically move to correct device
    quantization_config=quantization_config,
    torch_dtype=torch.float32,
    trust_remote_code=True
)

# Turn on gradient checkpointing to save memory
model.config.use_cache = False
model.gradient_checkpointing_enable()

# Num parameters
print(f"Model loaded. Parameters: {model.num_parameters():,}")

Loading tokenizer and model...


tokenizer_config.json:   0%|          | 0.00/9.73k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

2025-06-21 07:13:04.499991: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750489984.716934      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750489984.779197      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json:   0%|          | 0.00/25.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/622M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Model loaded. Parameters: 1,720,574,976


## Generation function

In [11]:
def generate_text(
    prompt,
    max_length=config.max_length,
    temperature=0.7,
    top_p=0.9,
    top_k=50
):
    """Generate text using the model."""
    
    model.eval()
    
    # Tokenize input
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(device)
    
    with torch.no_grad():
        # Generate
        outputs = model.generate(
            inputs,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1
        )
    
    # Decode generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Dataset

## Custom dataset

In [12]:
class WikiViDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Get data
        item = self.dataset[idx]
        combined_text = f"Tiêu đề: {item['title']}\n\nNội dung: {item['text']}"

        # Tokenize data
        tokenized_text = self.tokenizer(
            combined_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # # Print a tokenized sample
        # print(tokenized_text)

        # Prepare data from tokenizer output
        input_ids = tokenized_text["input_ids"].squeeze()
        attention_mask = tokenized_text["attention_mask"].squeeze()
        labels = input_ids.clone() # In causal LM, labels is the same with input_ids
        labels[attention_mask == 0] = -100 # Do not calculate loss on padding tokens

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

## Load wikipedia_vi dataset

In [13]:
print("Loading dataset...")
dataset = load_dataset(config.dataset_name, split="train")
print(f"Dataset loaded. Total samples: {len(dataset)}")

Loading dataset...


README.md:   0%|          | 0.00/632 [00:00<?, ?B/s]

(…)-00000-of-00003-6218d2963e302058.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

(…)-00001-of-00003-12e6c4fadbec91d4.parquet:   0%|          | 0.00/55.2M [00:00<?, ?B/s]

(…)-00002-of-00003-175fcfe1c45b0b85.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1284930 [00:00<?, ? examples/s]

Dataset loaded. Total samples: 1284930


In [14]:
dataset[0]

{'id': 2,
 'revid': '90949',
 'url': 'https://vi.wikipedia.org/wiki?curid=2',
 'title': 'Trang Chính',
 'text': '&lt;templatestyles src="Wiki2021/styles.css" /&gt;__NOEDITSECTION__\n \n \n \n '}

## Preprocess data

In [15]:
# keep only title and text column
dataset = dataset.select_columns(['title', 'text'])

In [16]:
dataset[0]

{'title': 'Trang Chính',
 'text': '&lt;templatestyles src="Wiki2021/styles.css" /&gt;__NOEDITSECTION__\n \n \n \n '}

In [17]:
def filter_function(example):
    """Filter out empty or very short texts"""
    
    return (
        example['text'] is not None and 
        example['title'] is not None and
        len(example['text'].strip()) > config.min_text_length
    )

dataset = dataset.filter(filter_function)
print(f"After filtering: {len(dataset)} samples")

Filter:   0%|          | 0/1284930 [00:00<?, ? examples/s]

After filtering: 1263196 samples


## Create splits

In [18]:
dataset = dataset.shuffle(seed=config.random_seed)

train_split = dataset.select(range(
    config.train_size
))

valid_split = dataset.select(range(
    config.train_size,
    config.train_size + config.valid_size
))

test_split = dataset.select(range(
    config.train_size + config.valid_size,
    config.train_size + config.valid_size + config.test_size
))

print(f'train split: {len(train_split)} samples')
print(f'valid split: {len(valid_split)} samples')
print(f'test split: {len(test_split)} samples')

train split: 10000 samples
valid split: 10000 samples
test split: 5000 samples


In [19]:
train_ds = WikiViDataset(train_split, tokenizer, config.max_length)
valid_ds = WikiViDataset(valid_split, tokenizer, config.max_length)
test_ds = WikiViDataset(test_split, tokenizer, config.max_length)

## Data loader

In [20]:
train_dataloader = DataLoader(
    train_ds,
    batch_size=config.per_device_train_batch_size,
    shuffle=True,
    num_workers=config.num_workers,
    pin_memory=True,
)

valid_dataloader = DataLoader(
    valid_ds,
    batch_size=config.per_device_valid_batch_size,
    shuffle=True,
    num_workers=config.num_workers,
    pin_memory=True,
)

In [21]:
print(f"Train batches: {len(train_dataloader)}")
print(f"Valid batches: {len(valid_dataloader)}")

Train batches: 5000
Valid batches: 5000


# Optimizer & scheduler

In [22]:
total_steps = len(train_dataloader) * config.num_train_epochs // config.gradient_accumulation_steps
warmup_steps = int(total_steps * config.warmup_ratio)

print(f"Total training steps: {total_steps}")
print(f"Warmup steps: {warmup_steps}")

Total training steps: 1875
Warmup steps: 187


In [23]:
# Setup optimizer
optimizer = optim.AdamW(
    model.parameters(),
    lr=config.learning_rate,
    weight_decay=config.weight_decay,
    eps=config.adam_epsilon
)

# Setup learning rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# Setup gradient scaler for mixed precision training
scaler = torch.amp.GradScaler(device) if config.fp16 else None

# Training function

In [24]:
def train_epoch(model, dataloader, optimizer, scheduler, scaler, epoch):
    """Train for one epoch."""
    
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    
    progress_bar = tqdm(dataloader, desc=f"Training Epoch {epoch + 1}")
    
    for step, batch in enumerate(progress_bar):
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Forward pass with mixed precision
        if config.fp16:
            # For mixed precision
            with torch.autocast(device_type=device.type):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                # Chia loss cho gradient_accumulation_steps
                # Nếu không nhận được loss sẽ gấp <gradient_accumulation_steps> lần loss thực sự
                loss = outputs.loss / config.gradient_accumulation_steps
        else:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss / config.gradient_accumulation_steps
        
        # Backward pass
        if config.fp16:
            scaler.scale(loss).backward()
        else:
            loss.backward()
        
        total_loss += loss.item()
        
        # Update weights every gradient_accumulation_steps
        if (step + 1) % config.gradient_accumulation_steps == 0:
            if config.fp16:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                scaler.step(optimizer)
                scaler.update()
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                optimizer.step()
            
            scheduler.step()
            optimizer.zero_grad()
        
        # Update progress bar
        progress_bar.set_postfix({
            'loss': f"{loss.item() * config.gradient_accumulation_steps:.4f}",
            'lr': f"{scheduler.get_last_lr()[0]:.2e}"
        })
        
        # Logging
        if (step + 1) % config.logging_steps == 0:
            
            avg_loss = total_loss / (step + 1) * config.gradient_accumulation_steps
            print(f"Step {step + 1}/{len(dataloader)}, Loss: {avg_loss:.4f}, LR: {scheduler.get_last_lr()[0]:.2e}")

            if config.use_wandb:
                wandb.log({
                    "train_loss": avg_loss,
                    "learning_rate": scheduler.get_last_lr()[0],
                    "train_step": epoch * len(dataloader) + step + 1
                })
    
    return total_loss / len(dataloader) * config.gradient_accumulation_steps

# Validation function

In [25]:
def validate(model, dataloader):
    """Validate the model."""
    
    model.eval()
    total_loss = 0
    total_steps = 0
    
    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc="Validating")
        
        for batch in progress_bar:
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # Forward pass
            if config.fp16:
                with torch.autocast(device_type=device.type):
                    outputs = model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
            else:
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
            
            loss = outputs.loss
            total_loss += loss.item()
            total_steps += 1
            
            progress_bar.set_postfix({'valid_loss': f"{loss.item():.4f}"})
    
    avg_loss = total_loss / total_steps
    perplexity = math.exp(avg_loss)
    
    return avg_loss, perplexity

# Training loop

## Main loop

In [26]:
print("Starting training...")

# Create output directory
os.makedirs(config.output_dir, exist_ok=True)

# Training history
training_history = {
    'train_losses': [],
    'train_times': [],
    'valid_losses': [],
    'valid_perplexities': [],
    'valid_times': [],
    'learning_rates': []
}

best_valid_loss = float('inf')
step_count = 0

for epoch in range(config.num_train_epochs):
    print(f"\n{'=' * 50}")
    print(f"Epoch {epoch + 1}/{config.num_train_epochs}")
    print(f"{'=' * 50}")
    
    # Training
    start_time = time.time()
    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, scaler, epoch)
    end_time = time.time()
    
    elapsed_time = end_time - start_time
    train_mins, train_secs = divmod(elapsed_time, 60)
    training_history['train_times'].append(train_mins)
    print(f"Training Time: {int(train_mins)} mins {int(train_secs)} seconds")
    
    training_history['train_losses'].append(train_loss)
    print(f"Training Loss: {train_loss:.4f}")
    
    # Validation
    start_time = time.time()
    valid_loss, perplexity = validate(model, valid_dataloader)
    end_time = time.time()
    
    elapsed_time = end_time - start_time
    valid_mins, valid_secs = divmod(elapsed_time, 60)
    training_history['valid_times'].append(valid_mins)
    print(f"Training Time: {int(valid_mins)} mins {int(valid_secs)} seconds")
    
    training_history['valid_losses'].append(valid_loss)
    training_history['valid_perplexities'].append(perplexity)
    print(f"Validation Loss: {valid_loss:.4f}")
    print(f"Perplexity: {perplexity:.2f}")
    
    # Log to wandb
    if config.use_wandb:
        wandb.log({
            "epoch": epoch + 1,
            "train_time (m)": train_mins,
            "valid_time (m)": valid_mins,
            "valid_loss": valid_loss,
            "perplexity": perplexity,
        })
    
    # Save best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        
        model.save_pretrained(config.output_dir)
        tokenizer.save_pretrained(config.output_dir)
        print(f"New best model! Saved to {config.output_dir}")
        
        if config.use_hf:
            model.push_to_hub(config.hf_repo)
            tokenizer.push_to_hub(config.hf_repo)
            print(f"Also saved to repo {config.hf_repo}")
        
    # Save training state
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'best_valid_loss': best_valid_loss,
        'training_history': training_history
    }, os.path.join(config.output_dir, 'training_state.pt'))
    print(f"Training state saved to {config.output_dir}!")

    if config.use_hf:
        hf_api.upload_file(
            path_or_fileobj=os.path.join(config.output_dir, 'training_state.pt'),
            path_in_repo="training_state.pt",
            repo_id=config.hf_repo,
            repo_type="model",
        )
    print(f"Training state pushed to repo {config.hf_repo}!")
    
    # Clean up GPU memory
    torch.cuda.empty_cache()
    gc.collect()

Starting training...

Epoch 1/3


Training Epoch 1:   0%|          | 0/5000 [00:00<?, ?it/s]



Step 40/5000, Loss: 2.7724, LR: 1.34e-06
Step 80/5000, Loss: 2.7651, LR: 2.67e-06
Step 120/5000, Loss: 2.7435, LR: 4.01e-06
Step 160/5000, Loss: 2.7063, LR: 5.35e-06
Step 200/5000, Loss: 2.7064, LR: 6.68e-06
Step 240/5000, Loss: 2.6932, LR: 8.02e-06
Step 280/5000, Loss: 2.6977, LR: 9.36e-06
Step 320/5000, Loss: 2.6769, LR: 1.07e-05
Step 360/5000, Loss: 2.6637, LR: 1.20e-05
Step 400/5000, Loss: 2.6457, LR: 1.34e-05
Step 440/5000, Loss: 2.6328, LR: 1.47e-05
Step 480/5000, Loss: 2.6112, LR: 1.60e-05
Step 520/5000, Loss: 2.5934, LR: 1.74e-05
Step 560/5000, Loss: 2.5751, LR: 1.87e-05
Step 600/5000, Loss: 2.5568, LR: 2.01e-05
Step 640/5000, Loss: 2.5386, LR: 2.14e-05
Step 680/5000, Loss: 2.5178, LR: 2.27e-05
Step 720/5000, Loss: 2.4980, LR: 2.41e-05
Step 760/5000, Loss: 2.4774, LR: 2.54e-05
Step 800/5000, Loss: 2.4592, LR: 2.67e-05
Step 840/5000, Loss: 2.4424, LR: 2.81e-05
Step 880/5000, Loss: 2.4251, LR: 2.94e-05
Step 920/5000, Loss: 2.4074, LR: 3.07e-05
Step 960/5000, Loss: 2.3898, LR: 3.2

Validating:   0%|          | 0/5000 [00:00<?, ?it/s]

Training Time: 12 mins 38 seconds
Validation Loss: 1.4353
Perplexity: 4.20
New best model! Saved to ./qwen-vietnamese-wiki-finetuned


model.safetensors:   0%|          | 0.00/2.04G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Also saved to repo Quoc59/PARADIS-Qwen3_1.7B-10kWikiVi-1GPU
Training state saved to ./qwen-vietnamese-wiki-finetuned!


training_state.pt:   0%|          | 0.00/4.53G [00:00<?, ?B/s]

Training state pushed to repo Quoc59/PARADIS-Qwen3_1.7B-10kWikiVi-1GPU!

Epoch 2/3


Training Epoch 2:   0%|          | 0/5000 [00:00<?, ?it/s]

Step 40/5000, Loss: 1.3750, LR: 3.69e-05
Step 80/5000, Loss: 1.3403, LR: 3.67e-05
Step 120/5000, Loss: 1.3477, LR: 3.66e-05
Step 160/5000, Loss: 1.3276, LR: 3.64e-05
Step 200/5000, Loss: 1.3126, LR: 3.63e-05
Step 240/5000, Loss: 1.3095, LR: 3.61e-05
Step 280/5000, Loss: 1.3067, LR: 3.60e-05
Step 320/5000, Loss: 1.2969, LR: 3.58e-05
Step 360/5000, Loss: 1.3044, LR: 3.57e-05
Step 400/5000, Loss: 1.3100, LR: 3.55e-05
Step 440/5000, Loss: 1.3165, LR: 3.54e-05
Step 480/5000, Loss: 1.3197, LR: 3.52e-05
Step 520/5000, Loss: 1.3139, LR: 3.51e-05
Step 560/5000, Loss: 1.3076, LR: 3.50e-05
Step 600/5000, Loss: 1.2960, LR: 3.48e-05
Step 640/5000, Loss: 1.2874, LR: 3.47e-05
Step 680/5000, Loss: 1.2789, LR: 3.45e-05
Step 720/5000, Loss: 1.2884, LR: 3.44e-05
Step 760/5000, Loss: 1.2902, LR: 3.42e-05
Step 800/5000, Loss: 1.2895, LR: 3.41e-05
Step 840/5000, Loss: 1.2859, LR: 3.39e-05
Step 880/5000, Loss: 1.2869, LR: 3.38e-05
Step 920/5000, Loss: 1.2823, LR: 3.36e-05
Step 960/5000, Loss: 1.2863, LR: 3.3

Validating:   0%|          | 0/5000 [00:00<?, ?it/s]

Training Time: 12 mins 38 seconds
Validation Loss: 1.3821
Perplexity: 3.98
New best model! Saved to ./qwen-vietnamese-wiki-finetuned


model.safetensors:   0%|          | 0.00/2.04G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Also saved to repo Quoc59/PARADIS-Qwen3_1.7B-10kWikiVi-1GPU
Training state saved to ./qwen-vietnamese-wiki-finetuned!


training_state.pt:   0%|          | 0.00/4.53G [00:00<?, ?B/s]

Training state pushed to repo Quoc59/PARADIS-Qwen3_1.7B-10kWikiVi-1GPU!

Epoch 3/3


Training Epoch 3:   0%|          | 0/5000 [00:00<?, ?it/s]

Step 40/5000, Loss: 1.1654, LR: 1.84e-05
Step 80/5000, Loss: 1.1446, LR: 1.82e-05
Step 120/5000, Loss: 1.1633, LR: 1.81e-05
Step 160/5000, Loss: 1.1540, LR: 1.79e-05
Step 200/5000, Loss: 1.1924, LR: 1.78e-05
Step 240/5000, Loss: 1.1852, LR: 1.76e-05
Step 280/5000, Loss: 1.1886, LR: 1.75e-05
Step 320/5000, Loss: 1.1870, LR: 1.73e-05
Step 360/5000, Loss: 1.1862, LR: 1.72e-05
Step 400/5000, Loss: 1.1765, LR: 1.70e-05
Step 440/5000, Loss: 1.1790, LR: 1.69e-05
Step 480/5000, Loss: 1.1792, LR: 1.67e-05
Step 520/5000, Loss: 1.1808, LR: 1.66e-05
Step 560/5000, Loss: 1.1863, LR: 1.64e-05
Step 600/5000, Loss: 1.1836, LR: 1.63e-05
Step 640/5000, Loss: 1.1816, LR: 1.61e-05
Step 680/5000, Loss: 1.1823, LR: 1.60e-05
Step 720/5000, Loss: 1.1818, LR: 1.58e-05
Step 760/5000, Loss: 1.1841, LR: 1.57e-05
Step 800/5000, Loss: 1.1843, LR: 1.56e-05
Step 840/5000, Loss: 1.1816, LR: 1.54e-05
Step 880/5000, Loss: 1.1806, LR: 1.53e-05
Step 920/5000, Loss: 1.1822, LR: 1.51e-05
Step 960/5000, Loss: 1.1864, LR: 1.5

Validating:   0%|          | 0/5000 [00:00<?, ?it/s]

Training Time: 12 mins 38 seconds
Validation Loss: 1.3724
Perplexity: 3.94
New best model! Saved to ./qwen-vietnamese-wiki-finetuned


model.safetensors:   0%|          | 0.00/2.04G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Also saved to repo Quoc59/PARADIS-Qwen3_1.7B-10kWikiVi-1GPU
Training state saved to ./qwen-vietnamese-wiki-finetuned!


training_state.pt:   0%|          | 0.00/4.53G [00:00<?, ?B/s]

Training state pushed to repo Quoc59/PARADIS-Qwen3_1.7B-10kWikiVi-1GPU!


# After training

## Save training log

In [27]:
# Save comprehensive training log
training_log = {
    'config': vars(config),
    'model_info': {
        'model_name': config.model_name,
        'num_parameters': model.num_parameters(),
        'dataset_name': config.dataset_name,
        'train_samples': len(train_ds),
        'valid_samples': len(valid_ds)
    },
    'training_results': {
        'best_valid_loss': best_valid_loss,
        'final_perplexity': training_history['valid_perplexities'][-1],
        'total_epochs': config.num_train_epochs,
        'total_steps': total_steps
    },
    'training_history': training_history,
    'training_date': datetime.now().isoformat()
}

with open(os.path.join(config.output_dir, 'training_log.json'), 'w', encoding='utf-8') as f:
    json.dump(training_log, f, indent=2, ensure_ascii=False)
print(f"\nTraining log saved to {config.output_dir}/training_log.json")

if config.use_hf:
    hf_api.upload_file(
        path_or_fileobj=os.path.join(config.output_dir, 'training_log.json'),
        path_in_repo="training_log.json",
        repo_id=config.hf_repo,
        repo_type="model",
    )
print(f"\nTraining log pushed to repo {config.hf_repo}")


Training log saved to ./qwen-vietnamese-wiki-finetuned/training_log.json

Training log pushed to repo Quoc59/PARADIS-Qwen3_1.7B-10kWikiVi-1GPU


## Clean up

In [28]:
if config.use_wandb:
    wandb.finish()

[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:          epoch ▁▅█
[34m[1mwandb[0m:  learning_rate ▁▂▄▄▄██▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▄▄▃▃▃▃▂▂▂▂▁▁▁▁
[34m[1mwandb[0m:     perplexity █▂▁
[34m[1mwandb[0m:     train_loss ██▇▇▇▅▅▅▅▄▄▄▄▄▄▄▄▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     train_step ▁▁▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇█████
[34m[1mwandb[0m: train_time (m) ▁▁▁
[34m[1mwandb[0m:     valid_loss █▂▁
[34m[1mwandb[0m: valid_time (m) ▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:          epoch 3
[34m[1mwandb[0m:  learning_rate 0
[34m[1mwandb[0m:     perplexity 3.94467
[34m[1mwandb[0m:     train_loss 1.17938
[34m[1mwandb[0m:     train_step 15000
[34m[1mwandb[0m: train_time (m) 42
[34m[1mwandb[0m:     valid_loss 1.37236
[34m[1mwandb[0m: valid_time (m) 12
[34m[1mwandb[0m: 
[34m[1mwandb[0m: 🚀 View r