# <h1 align="center">N-GRAM</h1>

```Pipeline
Raw sentences
→ Train/Test split
→ Tokenization
→ <s>, </s>
→ Bigram / Trigram (TRAIN)
→ Evaluation (TEST)
```

## 1. Load Tokenizer Model 
(Local Pretrained Model)

In [1]:
import torch
import re

# Redefine Tokenizer class
class Tokenizer(object):
    def __init__(self, train_filepath=None, test_filepath=None):
        self.train_filepath = train_filepath
        self.chDict = dict()

    def clean_text(self, text):
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def tokenize(self, text):
        text = self.clean_text(text)
        return list(text)

    def char_to_idx(self, text):
        return [self.chDict.get(ch, 0) for ch in text]

# Now load
tokenizer = torch.load("../pre-train_model/Tokenizer.pt", weights_only=False)
print("Tokenizer loaded successfully!")

Tokenizer loaded successfully!


In [2]:
import torch.nn as nn

# --- Define only the model architecture (still needed) ---
class BiLSTM_Seg(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(BiLSTM_Seg, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.bilstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            bidirectional=True,
            batch_first=True
        )
        self.classifier = nn.Linear(hidden_dim * 2, 2)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.bilstm(embedded)
        logits = self.classifier(lstm_out)
        return logits

# --- Load model weights ---
vocab_size = len(tokenizer.chDict) + 1
embed_dim = 128
hidden_dim = 128

model = BiLSTM_Seg(vocab_size, embed_dim, hidden_dim)
model.load_state_dict(torch.load("../pre-train_model/segmentation_model.pth", map_location="cpu"))
model.eval()

BiLSTM_Seg(
  (embedding): Embedding(124, 128)
  (bilstm): LSTM(128, 128, batch_first=True, bidirectional=True)
  (classifier): Linear(in_features=256, out_features=2, bias=True)
)

In [3]:
def predict_segmentation(text, model, tokenizer):
    model.eval()
    with torch.no_grad():
        cleaned_text = tokenizer.clean_text(text)
        char_indices = tokenizer.char_to_idx(cleaned_text.replace(" ", ""))

        if not char_indices:
            return ""

        input_tensor = torch.tensor(char_indices).unsqueeze(0)
        logits = model(input_tensor)
        predictions = torch.argmax(logits, dim=-1).squeeze(0).tolist()

        segmented_words = []
        current_word = []

        for i, char_idx in enumerate(char_indices):
            char = list(cleaned_text.replace(" ", ""))[i]
            label = predictions[i]

            if label == 0 and current_word:
                segmented_words.append("".join(current_word))
                current_word = [char]
            else:
                current_word.append(char)

        if current_word:
            segmented_words.append("".join(current_word))

        return " ".join(segmented_words)

## 2. Data Preprocessing

In [4]:
from sklearn.model_selection import train_test_split
from itertools import chain

1️⃣ Read sentences line by line

Each line = one sentence (this is already true in `../dataset/clean_khmer.txt` file)

In [5]:
def load_sentences(path):
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]

2️⃣ Add sentence boundary tokens to each sentence lines

In [6]:
def add_sentence_markers(tokens):
    return ["<s>"] + tokens + ["</s>"]

3️⃣ Train Test Split

In [7]:
sentences = load_sentences("../dataset/clean_khmer.txt")

train_sents, test_sents = train_test_split(
    sentences,
    test_size=0.2,
    random_state=42
)

In [8]:
print(f"Number of training sentences: {len(train_sents)}")
print(f"Number of testing sentences: {len(test_sents)}")

Number of training sentences: 31646
Number of testing sentences: 7912


4️⃣ Full preprocessing loop

In [9]:
def preprocess_sentences(sentences, model, tokenizer):
    tokenized = []

    for sent in sentences:
        tokens = predict_segmentation(sent, model, tokenizer)

        if isinstance(tokens, str):
            tokens = tokens.split()

        tokens = ["<s>"] + tokens + ["</s>"]
        tokenized.append(tokens)

    return tokenized

In [10]:
train_tokenized = preprocess_sentences(train_sents, model, tokenizer)
test_tokenized  = preprocess_sentences(test_sents, model, tokenizer)

In [11]:
print("\tSample tokenized training sentences:")
for s in train_tokenized[:3]:
    print(s)
print("\tSample tokenized testing sentences:")
for s in test_tokenized[:3]:
    print(s)

	Sample tokenized training sentences:
['<s>', 'អ្នក', 'នាង', 'នី', '</s>']
['<s>', 'បាន', 'ជា', 'ជន', 'ជាតិ', 'ចិន', 'មាន', 'ឥទ្ធិពល', 'ខ្លាំង', 'នៅ', 'ម៉ាលេស៊ី', 'ដោយសារ', 'ជន', 'ជាតិ', 'ចិន', 'មាន', 'តួនាទី', 'ក្នុង', 'សេដ្ឋ', 'កិច្ច', 'ពិសេស', 'ពាណិជ្ជ', 'កម្ម', '</s>']
['<s>', 'បុរស', 'វ័យ', 'ចំណាស់', 'ម្នាក់', 'បាន', 'ស្លាប់', 'មួយ', 'រំពេច', 'ក្រោយ', 'យក', 'គ្រាប់', 'បែក', 'D', 'K', '75', 'ដំបំ', 'បែក', 'ធ្វើ', 'ជា', 'កណ្តឹង', 'គោ', '</s>']
	Sample tokenized testing sentences:
['<s>', 'នៅ', 'ឆ្នាំ', '១៩៧៥', 'តើ', 'នរណា', 'ខ្លះ', 'ជា', 'សមាជិក', 'សំខាន់', 'គណៈ', 'កម្មាធិការ', 'មជ្ឈិមបក្ស', 'កុំ', 'មុយនីស្ដ', 'កម្ពុជា', '</s>']
['<s>', 'អត្រា', 'អាករ', 'លើ', 'តម្លៃ', 'បន្ថែម', 'មាន', 'អត្រា', 'ពីរ', 'គឺ', 'អត្រា', 'ស្តង់', 'ដា', '១០', 'ភាគ', 'រយ', 'និង', 'អត្រា', 'សូន្យ', '០', 'ភាគ', 'រយ', '</s>']
['<s>', 'តើ', 'នៅ', 'ក្រោយ', 'សង្រ្គាម', 'លោក', 'លើក', 'ទី', '២', 'ចលនាត', 'ស៊ូ', 'ទាំង', 'នេះ', 'បាន', 'ទទួល', 'ផល', 'ដូច', 'ម្ដេច', 'ចូរ', 'ប្រាប់', 'ឈ្មោះ', 'ប្រទេស', 'ទាំង', 'បី', 'និ

5️⃣ Merge Tokens into one Corpus

In [12]:
train_corpus = list(chain.from_iterable(train_tokenized))
test_corpus  = list(chain.from_iterable(test_tokenized))

In [13]:
print(train_corpus[:25])
print(test_corpus[:15])

['<s>', 'អ្នក', 'នាង', 'នី', '</s>', '<s>', 'បាន', 'ជា', 'ជន', 'ជាតិ', 'ចិន', 'មាន', 'ឥទ្ធិពល', 'ខ្លាំង', 'នៅ', 'ម៉ាលេស៊ី', 'ដោយសារ', 'ជន', 'ជាតិ', 'ចិន', 'មាន', 'តួនាទី', 'ក្នុង', 'សេដ្ឋ', 'កិច្ច']
['<s>', 'នៅ', 'ឆ្នាំ', '១៩៧៥', 'តើ', 'នរណា', 'ខ្លះ', 'ជា', 'សមាជិក', 'សំខាន់', 'គណៈ', 'កម្មាធិការ', 'មជ្ឈិមបក្ស', 'កុំ', 'មុយនីស្ដ']


## 3. Cleaning

In [14]:
def clean_khmer_tokens(token_list):
    cleaned_list = []
    num_pattern = re.compile(r'[0-9០-៩]+')

    for token in token_list:
        token = token.replace('.', '').replace(',', '').strip()

        if not token:
            continue

        if num_pattern.fullmatch(token):
            token = '<NUM>'

        cleaned_list.append(token)

    return cleaned_list

In [15]:
train_corpus_cleaned = clean_khmer_tokens(train_corpus)
test_corpus_cleaned  = clean_khmer_tokens(test_corpus)
print(train_corpus_cleaned[:15])
print(test_corpus_cleaned[:15])

['<s>', 'អ្នក', 'នាង', 'នី', '</s>', '<s>', 'បាន', 'ជា', 'ជន', 'ជាតិ', 'ចិន', 'មាន', 'ឥទ្ធិពល', 'ខ្លាំង', 'នៅ']
['<s>', 'នៅ', 'ឆ្នាំ', '<NUM>', 'តើ', 'នរណា', 'ខ្លះ', 'ជា', 'សមាជិក', 'សំខាន់', 'គណៈ', 'កម្មាធិការ', 'មជ្ឈិមបក្ស', 'កុំ', 'មុយនីស្ដ']


In [58]:
print(len(set(train_corpus_cleaned)))
print(len(train_corpus_cleaned))
print(len(test_corpus_cleaned))

33252
1298835
331994


## 4. Bigram and Trigram

We estimate probabilities using **Maximum Likelihood Estimation (MLE)** with **add-one (Laplace) smoothing**.

In [59]:
from collections import defaultdict, Counter

### 4.1. **Bigram model**
  Predicts a word based on **1 previous word**
$$
  P(w_i \mid w_{i-1})
$$

In [60]:
def build_bigram_model(tokens):
    bigram_counts = defaultdict(Counter)
    unigram_counts = Counter()

    for i in range(len(tokens) - 1):
        w1, w2 = tokens[i], tokens[i+1]
        bigram_counts[w1][w2] += 1
        unigram_counts[w1] += 1

    bigram_model = {}
    for w1 in bigram_counts:
        total = unigram_counts[w1]
        bigram_model[w1] = {
            w2: count / total
            for w2, count in bigram_counts[w1].items()
        }
    return bigram_model

In [62]:
bigram_model = build_bigram_model(train_corpus_cleaned)

### 4.2. **Trigram model**
  Predicts a word based on **2 previous words**
$$
  P(w_i \mid w_{i-2}, w_{i-1})
$$

In [61]:
def build_trigram_model(tokens):
    trigram_counts = defaultdict(Counter)
    bigram_counts = Counter()

    for i in range(len(tokens) - 2):
        context = (tokens[i], tokens[i+1])
        target = tokens[i+2]
        trigram_counts[context][target] += 1
        bigram_counts[context] += 1

    trigram_model = {}
    for ctx in trigram_counts:
        total = bigram_counts[ctx]
        trigram_model[ctx] = {
            w: count / total
            for w, count in trigram_counts[ctx].items()
        }
    return trigram_model

In [63]:
trigram_model = build_trigram_model(train_corpus_cleaned)

In [64]:
print("Bigram contexts:", len(bigram_model))
print("Trigram contexts:", len(trigram_model))

Bigram contexts: 33252
Trigram contexts: 274064


In [77]:
print(list(bigram_model.items())[:5])
print(list(trigram_model.items())[:5])

[('<s>', {'អ្នក': 0.03125197497314037, 'បាន': 0.01993932882512798, 'បុរស': 0.0007267901156544271, 'ជួរ': 0.00012639828098337863, 'ដោយសារ': 0.00047399355368766985, 'ក្នុង': 0.012229033685141883, 'តើ': 0.19945648739177146, 'អធិការដ្ឋាន': 0.00015799785122922328, 'ស្មៀន': 0.00012639828098337863, 'គោល': 0.001737976363521456, 'ហេតុ': 0.02300448713897491, 'ឯក': 0.024521266510775453, 'គាត់': 0.02417367123807116, 'ទេវតា': 9.479871073753397e-05, 'ខ្ញុំ': 0.02448966694052961, 'បើ': 0.0029071604626177082, 'របប': 0.0017695759337673007, 'នៅ': 0.028534411931997725, 'ដើម្បី': 0.01621057953611831, 'ការ': 0.01933893699045693, 'លោក': 0.08999557606016559, 'ចូរ': 0.03596031093977122, 'IL': 3.159957024584466e-05, 'សីតុណ្ហភាព': 0.00012639828098337863, 'ប្រទេស': 0.011849838842191747, 'ដូច': 0.014567401883334387, 'ស្ត្រី': 0.00022119699172091259, 'តាម': 0.0044239398344182515, '-': 0.0013271819503254756, 'លុះ': 0.0007583896859002717, 'យោធា': 9.479871073753397e-05, 'គេ': 0.005119130379826834, 'និយាយ': 0.00135878

## 5. Handle Prediction

In [26]:
skip_tokens = {"។", "៕", "!", "?", "^", ".", ",", "%", "...", "៖", "«", "»", "<NUM>"}

In [75]:
def predict_top_k_ngram(context, model, skip_tokens, k=5):
    """
    context: str (bigram) or tuple (trigram)
    model: bigram or trigram dictionary
    """
    candidates = model.get(context, {})

    filtered = [
        (tok, prob)
        for tok, prob in candidates.items()
        if tok not in skip_tokens and tok not in ['<s>', '</s>']
    ]

    filtered.sort(key=lambda x: x[1], reverse=True)
    return filtered[:k]

In [78]:
print(predict_top_k_ngram("ខ្ញុំ", bigram_model, skip_tokens, k=5))
print(predict_top_k_ngram(("ខ្ញុំ", "ទៅ"), trigram_model, skip_tokens, k=5))

[('ឈ្មោះ', 0.5794025157232704), ('មាន', 0.025157232704402517), ('មិន', 0.0220125786163522), ('បាន', 0.018081761006289308), ('សូម', 0.01650943396226415)]
[('ផ្សារ', 0.25), ('សាលា', 0.25), ('ភូមិ', 0.125), ('កាន់', 0.125), ('លេងង', 0.125)]


## 6. Evaluation Matrices

In [70]:
import math

In [71]:
def ngram_metrics(model, test_tokens, n=2, k_values=[1,5,10], eps=1e-10):
    """
    Evaluate bigram or trigram models.
    
    Parameters:
    - model: dict, trained n-gram model
    - test_tokens: list of str, tokenized test corpus
    - n: int, 2 for bigram, 3 for trigram
    - k_values: list, Accuracy@k values to compute
    - eps: float, smoothing for log(0)
    
    Returns:
    - dict with cross_entropy, perplexity, and Accuracy@k
    """
    log_sum = 0
    total = 0
    accuracy_results = {k: 0 for k in k_values}

    for i in range(len(test_tokens) - (n-1)):
        context = tuple(test_tokens[i:i+n-1])
        target = test_tokens[i+n-1]
        candidates = model.get(context, {}) if n > 2 else model.get(context[0], {})
        prob = candidates.get(target, eps)
        
        log_sum += math.log(prob)
        total += 1

        # Accuracy@k
        if candidates:
            top_k_words = [w for w,_ in sorted(candidates.items(), key=lambda x: x[1], reverse=True)]
            for k in k_values:
                if target in top_k_words[:k]:
                    accuracy_results[k] += 1

    cross_entropy = -log_sum / total
    perplexity = math.exp(cross_entropy)
    accuracy_results = {k: v / total for k, v in accuracy_results.items()}

    return {
        "cross_entropy": cross_entropy,
        "perplexity": perplexity,
        "accuracy": accuracy_results
    }

In [72]:
# Bigram
bigram_eval = ngram_metrics(bigram_model, test_corpus_cleaned, n=2, k_values=[1,5,10])
print("Bigram Evaluation:", bigram_eval)

# Trigram
trigram_eval = ngram_metrics(trigram_model, test_corpus_cleaned, n=3, k_values=[1,5,10])
print("Trigram Evaluation:", trigram_eval)

Bigram Evaluation: {'cross_entropy': 6.088285039771365, 'perplexity': 440.66503974060606, 'accuracy': {1: 0.22544752449599842, 5: 0.40995743886166275, 10: 0.5001611479760115}}
Trigram Evaluation: {'cross_entropy': 8.514324773297513, 'perplexity': 4985.678459309372, 'accuracy': {1: 0.3441588953950698, 5: 0.5142563676233163, 10: 0.5717005229041664}}


## 7. Saving Model

In [80]:
import os
os.mkdir('../saved_ngram')

In [81]:
import pickle

# --- Saving the model ---
def save_model(model, filepath):
    with open(filepath, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model saved to {filepath}")

save_model(bigram_model, '../saved_ngram/bigram_model.pkl')
save_model(trigram_model, '../saved_ngram/trigram_model.pkl')

Model saved to ../saved_ngram/bigram_model.pkl
Model saved to ../saved_ngram/trigram_model.pkl
