In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
BASE_MODEL = "facebook/xglm-564M"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)

# Padding fix for GPT-style models
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id




Device: cpu


In [7]:
pth_path = "/content/drive/MyDrive/I5_WR_Project/xglm_khmer_word.pth"

state_dict = torch.load(pth_path, map_location=device)

model.load_state_dict(state_dict)

model.to(device)
model.eval()

print("✅ XGLM model loaded from .pth")

✅ XGLM model loaded from .pth


In [9]:
prompt = "ខ្ញុំ ចង់ ទៅ"

inputs = tokenizer(prompt, return_tensors="pt").to(device)

outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.8
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


ខ្ញុំ ចង់ ទៅ ដល់ សួរ គេ ថា លោក មាន ការ អ្វី បាន


In [13]:
import torch
import torch.nn.functional as F

def top10_next_tokens(prompt, top_k=10):
    model.eval()

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # logits shape: [batch, seq_len, vocab]
    logits = outputs.logits

    # take last token logits
    next_token_logits = logits[0, -1]

    probs = F.softmax(next_token_logits, dim=-1)

    top_probs, top_ids = torch.topk(probs, top_k)

    print(f"\nPrompt: {prompt}")
    print("-" * 40)

    for rank, (prob, token_id) in enumerate(zip(top_probs, top_ids), start=1):
        token = tokenizer.decode([token_id.item()])
        token = token.replace("▁", " ")  # SentencePiece fix
        print(f"{rank:02d}. '{token}'  →  {prob.item():.4f}")

top10_next_tokens("ខ្ញុំ ចង់ ទៅ")



Prompt: ខ្ញុំ ចង់ ទៅ
----------------------------------------
01. 'ដល់'  →  0.5236
02. 'ទៀត'  →  0.2419
03. ''  →  0.0821
04. 'ដ'  →  0.0284
05. 'ឃើញ'  →  0.0258
06. 'លោក'  →  0.0155
07. 'ក្នុង'  →  0.0124
08. 'ឲ្យ'  →  0.0086
09. 'នៅ'  →  0.0086
10. 'រក'  →  0.0069


In [15]:
import torch
import torch.nn.functional as F

def autosuggest(prompt, top_k=5):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits[0, -1]
    probs = F.softmax(logits, dim=-1)

    top_probs, top_ids = torch.topk(probs, top_k)

    suggestions = []
    for p, idx in zip(top_probs, top_ids):
        token = tokenizer.decode([idx.item()])
        token = token.replace("▁", " ").strip()
        suggestions.append((token, p.item()))

    return suggestions


In [16]:
print("=== Khmer Keyboard Auto-Suggestion Demo ===")
print("Type a sentence, press ENTER (type 'exit' to quit)\n")

while True:
    prompt = input("Input: ").strip()
    if prompt.lower() == "exit":
        break

    suggestions = autosuggest(prompt, top_k=5)

    print("Suggestions:")
    for i, (word, prob) in enumerate(suggestions, 1):
        print(f" {i}. {word}  ({prob:.3f})")
    print()


=== Khmer Keyboard Auto-Suggestion Demo ===
Type a sentence, press ENTER (type 'exit' to quit)

Input: ចង់
Suggestions:
 1. បញ្ជាក់  (0.212)
 2. ច  (0.166)
 3.   (0.153)
 4. ប្រើ  (0.082)
 5. សេ  (0.037)

Input: exit
