In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -q transformers datasets accelerate sentencepiece

In [3]:
# =============================================================================
# KHMER NEXT-WORD PREDICTION - GPT STYLE (CAUSAL LANGUAGE MODEL)
# =============================================================================
# Decoder-only GPT training for Khmer autocomplete
# =============================================================================

import re
import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)


In [4]:
# =============================================================================
# STEP 0: ENVIRONMENT
# =============================================================================

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [5]:
# =============================================================================
# STEP 1: LOAD & CLEAN KHMER DATA
# =============================================================================

print("=" * 70)
print("STEP 1: Loading & Cleaning Khmer Text")
print("=" * 70)

data_path = "/content/drive/MyDrive/I5_WR_Project/Dataset/khmer_wiki_corpus_segmented_latest.txt"

with open(data_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

print(f"✓ Loaded {len(lines)} lines")

def clean_khmer_text(text):
    text = re.sub(r"\s+", " ", text)

    khmer_pattern = re.compile(
        r"[^\u1780-\u17FF\u19E0-\u19FF\s\d០១២៣៤៥៦៧៨៩,.?!;:()\[\]\"'ៗ់៎៏័៍៑្\-]",
        re.UNICODE
    )

    text = khmer_pattern.sub("", text)
    return text.strip()

cleaned_lines = []
for line in lines:
    cleaned = clean_khmer_text(line)
    if len(cleaned) > 10:
        cleaned_lines.append(cleaned)

print(f"✓ Cleaned lines: {len(cleaned_lines)}")


STEP 1: Loading & Cleaning Khmer Text
✓ Loaded 98005 lines
✓ Cleaned lines: 96610


In [6]:
# =============================================================================
# STEP 2: BUILD GPT TRAINING TEXT (AUTOREGRESSIVE)
# =============================================================================

print("\n" + "=" * 70)
print("STEP 2: Building GPT Training Sequences")
print("=" * 70)

TEST_LINES = 1000  # set None for full dataset
print(f"⚠️ TEST MODE: {TEST_LINES} lines")

def build_gpt_text(lines, max_lines=None, min_words=5):
    texts = []

    if max_lines:
        lines = lines[:max_lines]

    for line in lines:
        words = line.split()
        if len(words) < min_words:
            continue

        # Generate multiple prefixes
        for i in range(3, len(words)):
            prefix = " ".join(words[:i])
            texts.append(prefix)

    return texts

gpt_texts = build_gpt_text(cleaned_lines, TEST_LINES)
np.random.shuffle(gpt_texts)

print(f"✓ Total GPT sequences: {len(gpt_texts)}")

dataset = Dataset.from_dict({"text": gpt_texts})
dataset = dataset.train_test_split(test_size=0.1, seed=42)

train_dataset = dataset["train"]
val_dataset = dataset["test"]

print(f"✓ Train: {len(train_dataset)}")
print(f"✓ Validation: {len(val_dataset)}")


STEP 2: Building GPT Training Sequences
⚠️ TEST MODE: 1000 lines
✓ Total GPT sequences: 28826
✓ Train: 25943
✓ Validation: 2883


In [7]:
# =============================================================================
# STEP 3: LOAD XGLM MODEL
# =============================================================================

model_name = "facebook/xglm-564M"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

model.to(device)
print("Model loaded:", model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.92M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/276 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.13G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Model loaded: facebook/xglm-564M


In [8]:
# =============================================================================
# STEP 4: TOKENIZATION
# =============================================================================

MAX_LENGTH = 64

def tokenize_word_level(examples):
    tok = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )

    tok["labels"] = [
        [(t if t != tokenizer.pad_token_id else -100) for t in seq]
        for seq in tok["input_ids"]
    ]
    return tok

train_ds = dataset["train"].map(
    tokenize_word_level,
    batched=True,
    remove_columns=["text"],
    num_proc=2
)

val_ds = dataset["test"].map(
    tokenize_word_level,
    batched=True,
    remove_columns=["text"],
    num_proc=2
)

Map (num_proc=2):   0%|          | 0/25943 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/2883 [00:00<?, ? examples/s]

In [9]:

# =============================================================================
# STEP 5: TRAINING SETUP
# =============================================================================

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    output_dir="./xglm-khmer-word",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-4,
    warmup_ratio=0.05,
    weight_decay=0.01,
    fp16=True,
    # evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    save_total_limit=2,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
100,8.3219
200,5.6599
300,5.1183
400,4.7655
500,4.2499
600,3.9336
700,3.5261
800,3.2515
900,2.9464
1000,2.6442


TrainOutput(global_step=9729, training_loss=0.8756132886519792, metrics={'train_runtime': 9985.023, 'train_samples_per_second': 7.795, 'train_steps_per_second': 0.974, 'total_flos': 9034980794302464.0, 'train_loss': 0.8756132886519792, 'epoch': 3.0})

In [10]:
# =============================================================================
# STEP 6: WORD-LEVEL TEST
# =============================================================================

model.eval()

prompts = [
    "ខ្ញុំ ចង់ ទៅ",
    "នេះ ជា របស់",
    "ការ អប់រំ ប្រកប"
]

for p in prompts:
    inputs = tokenizer(p, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=5,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
        num_return_sequences=3
    )
    print(f"\nInput: {p}")
    for o in outputs:
        print(tokenizer.decode(o, skip_special_tokens=True))


Input: ខ្ញុំ ចង់ ទៅ
ខ្ញុំ ចង់ ទៅ ទៀត នៅ មាន កន្លែង សម
ខ្ញុំ ចង់ ទៅ ទៀត នៅ មាន កន្លែង សម
ខ្ញុំ ចង់ ទៅ ទៀត នៅ ចុង អត្ថបទ

Input: នេះ ជា របស់
នេះ ជា របស់ ខ្លួន ជា ទីស្រឡ
នេះ ជា របស់ ខ្លួន នោះ ឲ្យ ជា ទី
នេះ ជា របស់ ខ្លួន នោះ ឲ្យ ជា ទី

Input: ការ អប់រំ ប្រកប
ការ អប់រំ ប្រកបដោយ ប្រយោជន៍ សូម្បី
ការ អប់រំ ប្រកបដោយ សេចក្ដីត្រ
ការ អប់រំ ប្រកបដោយ សេចក្ដីត្រ


In [26]:
# =============================================================================
# STEP 7: SAVE MODEL
# =============================================================================

output_dir = "/content/drive/MyDrive/I5_WR_Project/xglm-khmer-word-final"

trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"\n✅ Model and tokenizer saved to: {output_dir}")



✅ Model and tokenizer saved to: /content/drive/MyDrive/I5_WR_Project/xglm-khmer-word-final


In [27]:
# =============================================================================
# SAVE MODEL AS .pth
# =============================================================================

pth_path = "/content/drive/MyDrive/I5_WR_Project/xglm-khmer-word-final/xglm_khmer_word.pth"

# Save only model weights
torch.save(model.state_dict(), pth_path)

print(f"✅ Model weights saved as: {pth_path}")


✅ Model weights saved as: /content/drive/MyDrive/I5_WR_Project/xglm-khmer-word-final/xglm_khmer_word.pth


In [30]:
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# MODEL_DIR = "/content/drive/MyDrive/I5_WR_Project/xglm-khmer-word/xglm-khmer-word-final"

# tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
# model = AutoModelForCausalLM.from_pretrained(MODEL_DIR)

# model.to(device)
# model.eval()

# print("✅ Model loaded successfully")


In [31]:
import torch.nn.functional as F

def predict_next_word_from_pth(prompt, top_k=10):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    next_token_logits = logits[0, -1, :]
    probs = F.softmax(next_token_logits, dim=-1)

    top_probs, top_indices = torch.topk(probs, top_k)

    results = []
    for p, idx in zip(top_probs, top_indices):
        token = tokenizer.decode([idx.item()]).strip()
        results.append((token, p.item()))

    return results

prompts = [
    "ខ្ញុំ រស់​​ នៅ",
    "ប្រទេស កម្ពុជា",
    "វិទ្យាល័យ"
]

for p in prompts:
    print(f"\nPrompt: {p}")
    preds = predict_next_word_from_pth(p, top_k=10)
    for token, prob in preds:
        print(f"  {token:<15} prob={prob:.4f}")



Prompt: ខ្ញុំ រស់​​ នៅ
  មួយ             prob=0.6144
  ទី              prob=0.1207
  ស្រុក           prob=0.1060
  ឯ               prob=0.0704
  ប្រចាំ          prob=0.0587
  ផ្ទះ            prob=0.0055
  ឆ្នាំ           prob=0.0029
  មាន             prob=0.0025
  ក្នុង           prob=0.0018
  ថ្ងៃ            prob=0.0017

Prompt: ប្រទេស កម្ពុជា
  ជា              prob=0.9877
                  prob=0.0047
  យើង             prob=0.0030
  មិន             prob=0.0009
  ទទួល            prob=0.0004
  វា              prob=0.0003
  ទ               prob=0.0002
  បុរស            prob=0.0002
  មាន             prob=0.0001
  គេ              prob=0.0001

Prompt: វិទ្យាល័យ
                  prob=0.8099
  ឯ               prob=0.0622
  ឯ               prob=0.0328
  ទាំង            prob=0.0120
  ត្រូវ           prob=0.0111
  នៅក្នុង         prob=0.0077
  ហើយ             prob=0.0072
  ្យ              prob=0.0046
  រ               prob=0.0038
  តា              prob=0.0036


In [32]:
import torch.nn.functional as F

def predict_next_word(prompt, top_k=10):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    next_token_logits = logits[0, -1]
    probs = F.softmax(next_token_logits, dim=-1)

    top_probs, top_ids = torch.topk(probs, top_k)

    results = []
    for p, idx in zip(top_probs, top_ids):
        token = tokenizer.decode([idx.item()]).strip()
        results.append((token, p.item()))

    return results


In [33]:
prompt = "នេះ ជា របស់"

preds = predict_next_word(prompt, top_k=10)
for token, prob in preds:
    print(f"{token:<15} {prob:.4f}")


ខ្លួន           0.9780
                0.0137
មនុស្ស          0.0035
ព               0.0007
ស្ត្រី          0.0004
ពួក             0.0004
ប               0.0003
រឿង             0.0002
ប្រជាពលរដ្ឋ     0.0002
ប្រទេស          0.0002


In [35]:
torch.save(model.state_dict(), "/content/drive/MyDrive/I5_WR_Project/xglm_khmer_word.pth")


In [36]:
pth_path = "/content/drive/MyDrive/I5_WR_Project/xglm_khmer_word.pth"

state_dict = torch.load(pth_path, map_location=device)

model.load_state_dict(state_dict)

model.to(device)
model.eval()

print("✅ XGLM model loaded from .pth")


✅ XGLM model loaded from .pth


In [37]:
prompt = "ខ្ញុំ ចង់ ទៅ"

inputs = tokenizer(prompt, return_tensors="pt").to(device)

outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.8
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


ខ្ញុំ ចង់ ទៅ ដល់ ចុង ព្រះរាជ រោង បុរស នោះ ព


In [38]:
predict_next_word("ខ្ញុំ ចង់ ទៅ", top_k=10)


[('ដល់', 0.5236069560050964),
 ('ទៀត', 0.24185344576835632),
 ('', 0.08214238286018372),
 ('ដ', 0.028394849970936775),
 ('ឃើញ', 0.025840027257800102),
 ('លោក', 0.015505442395806313),
 ('ក្នុង', 0.01237097941339016),
 ('ឲ្យ', 0.008648434653878212),
 ('នៅ', 0.008645152673125267),
 ('រក', 0.00686281081289053)]