In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from
# there and copy checkpoints back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Khmer next word prediction (unsegmented)

- **Model:** mT5-small
- **Task:** Predict only the next word-like span
- **Metric:** Loss + Perplexity

---

# Library

In [None]:
!pip install -q transformers datasets sentencepiece accelerate

import re
import math
import numpy as np
import torch
from datasets import Dataset
from transformers import (
    MT5ForConditionalGeneration,
    AutoTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    TrainerCallback)

# Config

In [None]:
# Folder on the mounted Drive that holds the raw corpus files.
DATASET_PATH = "/content/drive/MyDrive/dataset_for_spellcheck"
# NOTE(review): the second filename has extra non-ASCII characters in front of
# "kh_CC100.txt". The recorded run output shows the same name, so confirm the
# on-disk filename before normalizing it.
FILES = ["kh_oscars_Dataset.txt", "‚Äãkh_CC100.txt"]

# Pretrained checkpoint to fine-tune, and where Trainer output is written.
MODEL_NAME = "google/mt5-small"
OUTPUT_DIR = "./khmer_nextword_mt5"

TEST_LINES = 4000   # max cleaned lines used to build pairs; None for full dataset
EPOCHS = 3
BATCH_SIZE = 16     # per-device training batch size
LR = 2e-4

# Use the GPU when one is available, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# STEP 1: LOAD & CLEAN UNSEGMENTED KHMER

In [None]:
# Gather the raw lines (newlines included) of every corpus file into one list.
lines = []
for fname in FILES:
    with open(f"{DATASET_PATH}/{fname}", "r", encoding="utf-8") as f:
        file_lines = list(f)
    lines += file_lines
    print(f"Loaded {len(file_lines)} lines from {fname}")

def clean_khmer(text):
    """Keep only Khmer characters and basic punctuation, with tidy spacing.

    Every character outside the allowed set (the Khmer blocks U+1780-U+17FF
    and U+19E0-U+19FF, whitespace, and the punctuation listed in the pattern)
    is deleted. Whitespace is collapsed *after* that deletion, so removing a
    foreign token that sat between two spaces cannot leave a double space.

    Args:
        text: Raw line of text.

    Returns:
        The filtered text with each run of whitespace reduced to a single
        space and leading/trailing whitespace stripped.
    """
    text = re.sub(
        r"[^\u1780-\u17FF\u19E0-\u19FF\s.,!?;:'\"()\-·üó·üã·üé·üè·üê·üç·üë·üí]",
        "",
        text
    )
    # Collapse last: the filter above can make previously separated spaces adjacent.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Clean each line exactly once and keep only results longer than 12 characters.
cleaned = [c for c in map(clean_khmer, lines) if len(c) > 12]
print(f" Cleaned lines kept: {len(cleaned)}")

# STEP 2: CREATE NEXT WORD PAIRS (UNSEGMENTED)

In [None]:
def create_next_word_pairs(
    lines,
    min_prefix=8,
    max_prefix=48,
    target_chars=6,   # ~one Khmer word
    stride=2,
    max_lines=None
):
    """Build (prefix -> next span) training pairs from unsegmented text.

    For every line a split position ``p`` slides from ``min_prefix`` in steps
    of ``stride``. The text before ``p`` becomes the input and the following
    ``target_chars`` characters become the target.

    Args:
        lines: Sequence of cleaned text lines.
        min_prefix: Smallest prefix length (first split position).
        max_prefix: Exclusive upper bound on the prefix length.
        target_chars: Number of characters in each target span.
        stride: Step between consecutive split positions.
        max_lines: When truthy, only the first ``max_lines`` lines are used;
            ``None`` (or 0) uses all of them.

    Returns:
        A list of dicts with the keys ``"input_text"`` and ``"target_text"``.
        Lines shorter than ``min_prefix + target_chars`` contribute nothing.
    """
    if max_lines:
        lines = lines[:max_lines]

    pairs = []
    for text in lines:
        # Last split position that still leaves a full-length target.
        last_split = len(text) - target_chars
        if last_split < min_prefix:
            continue

        # +1 makes last_split itself reachable, so a line of exactly
        # min_prefix + target_chars characters yields one pair instead of none.
        stop = min(last_split + 1, max_prefix)
        for p in range(min_prefix, stop, stride):
            pairs.append({
                "input_text": text[:p],
                "target_text": text[p:p + target_chars]
            })

    return pairs

pairs = create_next_word_pairs(cleaned, max_lines=TEST_LINES)
# Shuffle with a fixed seed so a fresh "Restart & Run All" produces the same
# sample order (and therefore the same downstream split).
np.random.default_rng(42).shuffle(pairs)

print(f"‚úì Total next-word samples: {len(pairs)}")

# STEP 3: DATASET SPLIT

In [None]:
# Hold out 10% of the pairs for validation, with a fixed seed for a repeatable split.
# NOTE(review): the split is made per pair, so pairs cut from the same source
# line can end up on both sides; confirm that is acceptable for the reported
# validation loss.
dataset = Dataset.from_list(pairs).train_test_split(test_size=0.1, seed=42)

train_ds, val_ds = dataset["train"], dataset["test"]

print(f"‚úì Train: {len(train_ds)} | Val: {len(val_ds)}")

# STEP 4: MODEL & TOKENIZER

In [None]:
# Load the pretrained tokenizer and seq2seq model, then place the model on DEVICE.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = MT5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)

# Turn on gradient checkpointing and switch the decoding cache off with it.
model.gradient_checkpointing_enable()
model.config.use_cache = False

# STEP 5: TOKENIZATION

In [None]:
def preprocess(batch):
    """Tokenize a batch of prefix/target pairs for seq2seq training.

    Args:
        batch: Mapping with ``"input_text"`` and ``"target_text"`` lists of
            strings.

    Returns:
        The tokenizer output for the prompts (each input prefixed with
        ``"next: "`` and padded/truncated to 64 tokens) with an added
        ``"labels"`` entry: the target token ids padded/truncated to 8 tokens,
        where every padding id is replaced by -100.
    """
    prompts = ["next: " + t for t in batch["input_text"]]

    model_inputs = tokenizer(
        prompts,
        max_length=64,
        truncation=True,
        padding="max_length"
    )

    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=batch["target_text"],
        max_length=8,
        truncation=True,
        padding="max_length"
    )

    # Replace pad ids with -100 (PyTorch cross-entropy's default ignore_index)
    # so padded label positions do not contribute to the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in label_ids]
        for label_ids in labels["input_ids"]
    ]

    return model_inputs

# Tokenize both splits, dropping the raw text columns once they are encoded.
train_tok, val_tok = (
    split.map(preprocess, batched=True, remove_columns=split.column_names)
    for split in (train_ds, val_ds)
)


# STEP 6: PERPLEXITY CALLBACK

In [None]:
class PerplexityCallback(TrainerCallback):
    """Trainer callback that reports perplexity after each evaluation."""

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Store ``exp(eval_loss)`` in ``metrics["perplexity"]`` and print it.

        Does nothing when ``metrics`` is empty/None or lacks ``"eval_loss"``.
        """
        if not metrics or "eval_loss" not in metrics:
            return

        try:
            ppl = math.exp(metrics["eval_loss"])
        except OverflowError:
            # math.exp raises for very large losses (e.g. a diverged run);
            # report infinity rather than aborting training from a callback.
            ppl = float("inf")

        metrics["perplexity"] = ppl
        print(f"üìä Perplexity: {ppl:.4f}")

# STEP 7: TRAINING CONFIG

In [None]:
# Training hyper-parameters, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # Output location
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    # Optimization schedule
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=32,
    fp16=False,
    # Evaluate and checkpoint every 500 steps, keeping the 2 newest checkpoints
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    # Logging
    logging_steps=100,
    report_to="none",
    # Evaluation computes the loss only; no text is generated during eval
    predict_with_generate=False
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model),
    train_dataset=train_tok,
    eval_dataset=val_tok
)

# Registered after construction so it is appended to the trainer's existing callbacks.
trainer.add_callback(PerplexityCallback())

In [None]:
# STEP 8: TRAIN

print("\nüöÄ Training Khmer Next Word Model...")
trainer.train()

# STEP 9: SAVE (model weights and tokenizer files both go to OUTPUT_DIR)

trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# STEP 10: SIMPLE NEXT WORD TEST (SINGLE OUTPUT)

print("\n=== NEXT WORD PREDICTION TEST ===")

model.eval()

test_inputs = [
    "·ûü·ûò·üí·ûè·üÅ·ûÖ·ûñ·ûª·ûÄ",
    "·ûÄ·û∂·ûö·û¢·ûî·üã·ûö·üÜ",
    "·ûî·üí·ûö·ûë·üÅ·ûü·ûÄ·ûò·üí·ûñ·ûª·ûá·û∂"
]

for text in test_inputs:
    # Same "next: " task prefix that the training inputs were built with.
    inputs = tokenizer("next: " + text, return_tensors="pt").to(DEVICE)

    # Single-beam decoding with gradient tracking disabled.
    with torch.no_grad():
        output = model.generate(**inputs, max_length=16, num_beams=1)

    prediction = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"\nInput: {text}")
    print(f"Next word: {prediction}")

print("\n‚úÖ Khmer Next Word Prediction Ready")



=== STEP 1: Loading Data ===
‚úì Loaded 147208 lines from kh_oscars_Dataset.txt
‚úì Loaded 3012632 lines from ‚Äãkh_CC100.txt
‚úì Cleaned lines kept: 3087920

=== STEP 2: Creating Next Word Pairs ===
‚úì Total next-word samples: 79998
‚úì Train: 71998 | Val: 8000

=== STEP 4: Loading Model ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/71998 [00:00<?, ? examples/s]



Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(



üöÄ Training Khmer Next Word Model...


Step,Training Loss,Validation Loss
500,4.3173,3.71705
1000,3.8643,3.474234
1500,3.6399,3.248202
2000,3.5197,3.133784
2500,3.419,3.041406
3000,3.3122,2.919609
3500,3.2228,2.850423
4000,3.092,2.76645
4500,3.0663,2.717021
5000,2.933,2.660603


üìä Perplexity: 41.1428
üìä Perplexity: 32.2731
üìä Perplexity: 25.7440
üìä Perplexity: 22.9607
üìä Perplexity: 20.9346
üìä Perplexity: 18.5340
üìä Perplexity: 17.2951
üìä Perplexity: 15.9021
üìä Perplexity: 15.1352
üìä Perplexity: 14.3049
üìä Perplexity: 13.7650
üìä Perplexity: 13.2488
üìä Perplexity: 12.9884
üìä Perplexity: 12.3608
üìä Perplexity: 12.0608
üìä Perplexity: 11.6556
üìä Perplexity: 11.3446
üìä Perplexity: 11.2039
üìä Perplexity: 10.9360
üìä Perplexity: 10.7721
üìä Perplexity: 10.6699
üìä Perplexity: 10.5274


Step,Training Loss,Validation Loss
500,4.3173,3.71705
1000,3.8643,3.474234
1500,3.6399,3.248202
2000,3.5197,3.133784
2500,3.419,3.041406
3000,3.3122,2.919609
3500,3.2228,2.850423
4000,3.092,2.76645
4500,3.0663,2.717021
5000,2.933,2.660603


üìä Perplexity: 10.4069
üìä Perplexity: 10.3584
üìä Perplexity: 10.3467
üìä Perplexity: 10.2559
üìä Perplexity: 10.2532

=== NEXT WORD PREDICTION TEST ===

Input: ·ûü·ûò·üí·ûè·üÅ·ûÖ·ûñ·ûª·ûÄ
Next word: ·û∂·ûó·û∑·ûî·û∂·ûõ

Input: ·ûÄ·û∂·ûö·û¢·ûî·üã·ûö·üÜ
Next word: ·û¢·ûî·üã·ûö·üÜ

Input: ·ûî·üí·ûö·ûë·üÅ·ûü·ûÄ·ûò·üí·ûñ·ûª·ûá·û∂
Next word: ·ûá·ûî·üâ·ûª·ûì

‚úÖ Khmer Next Word Prediction Ready


In [None]:
# Copy the step-13500 checkpoint directory to the MT5_Models folder on Drive.
# NOTE(review): the checkpoint number is hard-coded; confirm it matches the
# run being backed up, and that the destination folder already exists.
!cp -r /content/khmer_nextword_mt5/checkpoint-13500 /content/drive/MyDrive/MT5_Models/

In [None]:
# Second spot-check of the in-memory model, with a longer generation limit.
model.eval()

test_inputs = [
    "·ûü·û∂·ûõ·û∂",
    "·ûü·ûª·ûê·û∂·ûô·ûü·üí·ûò·üÑ·üá",
    "·ûî·üí·ûö·ûë·üÅ·ûü·ûÖ·üÑ·ûö"
]

for text in test_inputs:
    # Same "next: " task prefix that the training inputs were built with.
    inputs = tokenizer("next: " + text, return_tensors="pt").to(DEVICE)

    # Single-beam decoding with gradient tracking disabled.
    with torch.no_grad():
        output = model.generate(**inputs, max_length=25, num_beams=1)

    prediction = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"\nInput: {text}")
    print(f"Next word: {prediction}")

print("\n‚úÖ Khmer Next Word Prediction Ready")


Input: ·ûü·û∂·ûõ·û∂
Next word: ·û¢·ûî·üã·ûö·üÜ

Input: ·ûü·ûª·ûê·û∂·ûô·ûü·üí·ûò·üÑ·üá
Next word: ·ûü·ûò·üí·ûö·û∂·ûî

Input: ·ûî·üí·ûö·ûë·üÅ·ûü·ûÖ·üÑ·ûö
Next word: ·ûõ·ûΩ·ûÖ·ûÖ·üí

‚úÖ Khmer Next Word Prediction Ready


In [None]:
# Copy the step-13500 checkpoint directory to the MT5_Checkpoints folder on Drive.
# NOTE(review): the checkpoint number is hard-coded; confirm it matches the
# run being backed up, and that the destination folder already exists.
!cp -r /content/khmer_nextword_mt5/checkpoint-13500 /content/drive/MyDrive/MT5_Checkpoints/

In [None]:
# Export the checkpoint weights as a plain PyTorch .pth file and copy it to Drive.
from safetensors.torch import load_file

# Step 1: Load the checkpoint's safetensors weights
checkpoint_path = "/content/khmer_nextword_mt5/checkpoint-13500/model.safetensors"
state_dict = load_file(checkpoint_path)

# Step 2: Save the loaded weights as a .pth file
torch.save(state_dict, "/content/khmer_nextword_mt5/khmer_nextword_mt5.pth")

# Step 3: Copy the .pth file into Google Drive
!cp /content/khmer_nextword_mt5/khmer_nextword_mt5.pth /content/drive/MyDrive/