In [1]:
import numpy as np
import os
import onnxruntime as rt
from onnxruntime import InferenceSession
# ReferenceEvaluator from onnx.reference is not used in the original
# so it's commented out for simplicity.
# from onnx.reference import ReferenceEvaluator 
import joblib
from pythainlp.corpus import thai_orst_words
import numpy as np

# list_word_use=[i for i in list_word if " " not in i]


class FastTextEncoder:
    """
    Loads pre-trained FastText-style embeddings and a vocabulary, computes
    word and sentence vectors from them, and queries an ONNX
    nearest-neighbour model to look up suggestion words.
    """

    # --- Initialization and Data Loading ---

    def __init__(self, model_dir, nn_model_path, words_list_path, bucket=2000000, nb_words=2000000, minn=5, maxn=5):
        """
        Initializes the encoder: loads the embeddings and vocabulary, the
        suggestion word list, and the ONNX nearest-neighbour session.

        Args:
            model_dir (str): Directory containing 'embeddings.npy' and 'vocabulary.txt'.
            nn_model_path (str): Path to the ONNX nearest neighbors model.
            words_list_path (str): Forwarded to `_load_suggestion_words`, which
                currently only prints it (see the note on that method).
            bucket (int): The size of the hash bucket for subword hashing.
            nb_words (int): Offset added to every subword hash, so subword rows
                start at index `nb_words` of the embedding matrix.
            minn (int): Minimum character length for subwords.
            maxn (int): Maximum character length for subwords.

        NOTE(review): subword indices range up to `nb_words + bucket - 1`, so
        the embedding matrix needs at least `nb_words + bucket` rows; this is
        not validated here and a smaller matrix surfaces as an IndexError in
        `get_word_vector`.
        """
        self.model_dir = model_dir
        self.nn_model_path = nn_model_path
        self.bucket = bucket
        self.nb_words = nb_words
        self.minn = minn
        self.maxn = maxn

        # Load data and models (order kept so the progress messages stay in sequence)
        self.vocabulary, self.embeddings = self._load_embeddings()
        self.words_for_suggestion = self._load_suggestion_words(words_list_path)
        self.nn_session = self._load_onnx_session(nn_model_path)
        self.embedding_dim = self.embeddings.shape[1]

    def _load_embeddings(self):
        """
        Loads the embedding matrix and the vocabulary list from `model_dir`.

        Returns:
            tuple: (list of vocabulary lines with trailing whitespace removed,
            array loaded from 'embeddings.npy').
        """
        print(f"Loading embeddings from {self.model_dir}...")
        input_matrix = np.load(os.path.join(self.model_dir, "embeddings.npy"))
        vocab_path = os.path.join(self.model_dir, "vocabulary.txt")
        with open(vocab_path, "r", encoding='utf-8') as f:
            # Iterate the file lazily instead of materialising readlines() first.
            words = [line.rstrip() for line in f]
        print("Embeddings loaded successfully.")
        return words, input_matrix

    def _load_suggestion_words(self, words_list_path):
        """
        Returns the suggestion words as a numpy array, sorted.

        NOTE(review): `words_list_path` is only echoed in the log line; the
        words actually come from `thai_orst_words()`, not from that file.
        Confirm which source the ONNX index was built from before changing
        either the source or the message.
        """
        print(f"Loading suggestion words from {words_list_path}...")
        words = np.array(sorted(thai_orst_words()))
        print("Suggestion words loaded successfully.")
        return words

    def _load_onnx_session(self, onnx_path):
        """Creates the ONNX Runtime inference session for `onnx_path` on the CPU provider."""
        print(f"Loading ONNX model from {onnx_path}...")
        # Note: Using providers=["CPUExecutionProvider"] for platform independence
        sess = rt.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
        print("ONNX session created successfully.")
        return sess

    # --- Helper Methods for Encoding ---

    def _vocab_index(self, word):
        """
        Returns the first position of `word` in `self.vocabulary`, or None.

        A word -> index dict is built on first use so that lookups do not scan
        the whole vocabulary list each time. The dict is rebuilt when
        `self.vocabulary` is replaced by another object or changes length.
        """
        vocab = self.vocabulary
        cached = self.__dict__.get("_vocab_lookup")
        if cached is None or cached[0] is not vocab or cached[1] != len(vocab):
            lookup = {}
            for position, token in enumerate(vocab):
                # setdefault keeps the first occurrence, matching list.index().
                lookup.setdefault(token, position)
            cached = (vocab, len(vocab), lookup)
            self._vocab_lookup = cached
        return cached[2].get(word)

    def _get_hash(self, subword):
        """
        Computes the FNV-1a style 32-bit hash of `subword` and maps it to an
        embedding row: `hash % bucket + nb_words`.

        NOTE(review): only the low 8 bits of each character's code point are
        mixed in. The reference fastText implementation appears to hash the
        UTF-8 bytes instead, which gives different rows for non-ASCII text
        (e.g. Thai). Left unchanged because the nearest-neighbour index may
        have been built with this exact encoding; confirm before altering.
        """
        h = 2166136261  # FNV-1a basis
        for c in subword:
            h = (h ^ (ord(c) & 0xFF)) & 0xFFFFFFFF
            h = (h * 16777619) & 0xFFFFFFFF  # FNV-1a prime
        return h % self.bucket + self.nb_words

    def _get_subwords(self, word):
        """
        Extracts the subwords of `word` and their embedding row indices.

        The word itself comes first when it is in the vocabulary (using its
        vocabulary index); then every distinct character n-gram of
        "<" + word + ">" with length in [minn, maxn], indexed by `_get_hash`.
        The token "</s>", when in the vocabulary, yields only itself.

        Returns:
            tuple: (list of subword strings, numpy array of their indices).
        """
        wrapped = "<" + word + ">"
        subwords = []
        subword_ids = []

        # 1. Full word, if known (one dict lookup instead of two list scans)
        vocab_position = self._vocab_index(word)
        if vocab_position is not None:
            subwords.append(word)
            subword_ids.append(vocab_position)
            if word == "</s>":
                return subwords, np.array(subword_ids)

        # 2. Character n-grams, skipping any string already collected
        seen = set(subwords)
        total = len(wrapped)
        for start in range(total):
            for length in range(self.minn, self.maxn + 1):
                end = start + length
                if end > total:
                    # Longer n-grams from this start cannot fit either.
                    break
                candidate = wrapped[start:end]
                if candidate not in seen:
                    seen.add(candidate)
                    subwords.append(candidate)
                    subword_ids.append(self._get_hash(candidate))

        return subwords, np.array(subword_ids)

    def get_word_vector(self, word):
        """
        Computes the L2-normalized mean of the embeddings of `word`'s subwords.

        Returns a zero vector of length `embedding_dim` when the word yields
        no subword indices at all.
        """
        subword_ids = self._get_subwords(word)[1]

        if subword_ids.size == 0:
            return np.zeros(self.embedding_dim)

        # Gather all rows in one indexing operation, then average them.
        vector = np.mean(self.embeddings[subword_ids], axis=0)

        # Normalize, leaving an all-zero mean untouched
        norm = np.linalg.norm(vector)
        if norm > 0:
            vector /= norm

        return vector

    def _tokenize(self, sentence):
        """
        Splits `sentence` on the characters space, newline, carriage return,
        tab, vertical tab, form feed and NUL. Each newline additionally emits
        the token "</s>".
        """
        separators = (' ', '\n', '\r', '\t', '\v', '\f', '\0')
        tokens = []
        current = ""
        for ch in sentence:
            if ch not in separators:
                current += ch
                continue
            if current:
                tokens.append(current)
                current = ""
            if ch == '\n':
                tokens.append("</s>")
        if current:
            tokens.append(current)
        return tokens

    def get_sentence_vector(self, line):
        """
        Computes the mean of the word vectors of the tokens in `line`.

        Returns a zero vector of length `embedding_dim` when `line` contains
        no tokens.
        """
        # get_word_vector already normalizes each vector, so no extra step here
        vectors = [self.get_word_vector(token) for token in self._tokenize(line)]

        if not vectors:
            return np.zeros(self.embedding_dim)

        return np.mean(vectors, axis=0)

    # --- Nearest Neighbor Method ---

    def get_word_suggestion(self, list_word):
        """
        Queries the ONNX model for the nearest suggestion words of the given
        word or words.

        Each input is encoded character by character: its characters are
        joined with spaces and passed to `get_sentence_vector`, so the query
        vector is the mean of the per-character word vectors.

        Args:
            list_word (str or iterable of str): A single word, or several
                words, to get suggestions for.

        Returns:
            list of str: for a single string input, the suggestion words the
            model returns for it.
            list of list of str: for a list input, one such list per input
            word, in order. An empty input yields an empty list without
            running the model.
        """
        return_single = isinstance(list_word, str)
        input_words = [list_word] if return_single else list(list_word)

        # An empty batch would become a 1-D array the model cannot consume.
        if not input_words:
            return []

        # ' '.join over a string separates its characters with spaces.
        word_input_vecs = [self.get_sentence_vector(' '.join(word)) for word in input_words]

        # Convert to numpy array for ONNX input (ensure float32)
        input_data = np.array(word_input_vecs, dtype=np.float32)

        # First model output is used as rows of indices into words_for_suggestion
        indices = self.nn_session.run(None, {"X": input_data})[0]

        suggestions = [self.words_for_suggestion[row].tolist() for row in indices]

        return suggestions[0] if return_single else suggestions

In [2]:
# Locations of the artefacts handed to FastTextEncoder in the next cell.
# Directory expected to contain 'embeddings.npy' and 'vocabulary.txt'.
MODEL_DIRECTORY = 'model/'
# ONNX nearest-neighbour model opened with onnxruntime.
ONNX_PATH = "nearest_neighbors.onnx"
# NOTE(review): passed as words_list_path, but FastTextEncoder._load_suggestion_words
# only prints this path and takes its words from thai_orst_words() instead.
WORDS_LIST_PATH = 'words.joblib'

In [3]:
# Build the encoder once; construction loads the embeddings, the vocabulary,
# the suggestion word list and the ONNX session, printing progress as it goes.
encoder = FastTextEncoder(
    model_dir=MODEL_DIRECTORY, 
    nn_model_path=ONNX_PATH, 
    words_list_path=WORDS_LIST_PATH
)

Loading embeddings from model/...
Embeddings loaded successfully.
Loading suggestion words from words.joblib...
Suggestion words loaded successfully.
Loading ONNX model from nearest_neighbors.onnx...
ONNX session created successfully.


In [4]:
# Batch query: a list input returns one list of suggestions per query word.
word = ["โรงเรีย","คมดี"]
# NOTE(review): despite its name, `word_vector` holds suggestion words, not vectors.
word_vector = encoder.get_word_suggestion(word)
# NOTE(review): the label says "Vector ... first 5 values", but `[:5]` slices the
# outer list (one entry per query word), so it keeps at most 5 queries rather
# than 5 suggestions per query.
print(f"Vector for '{word}' (first 5 values): {word_vector[:5]}")

Vector for '['โรงเรีย', 'คมดี']' (first 5 values): [['โรงเรียน', 'ระเรียง', 'โรงเรียนประจำ', 'เรียบเรียง', 'ระเดียง'], ['คนดีผีคุ้ม', 'มีดคอม้า', 'คดี', 'มีดสองคม', 'มูลคดี']]


In [5]:
# Display the full result of the batch query from the previous cell.
word_vector

[['โรงเรียน', 'ระเรียง', 'โรงเรียนประจำ', 'เรียบเรียง', 'ระเดียง'],
 ['คนดีผีคุ้ม', 'มีดคอม้า', 'คดี', 'มีดสองคม', 'มูลคดี']]

In [6]:
# Single-word query: a str input returns one flat list of suggestions.
# Rebinds `word` and `word_vector` from the batch example above.
word = "โรงเรีย"
word_vector = encoder.get_word_suggestion(word)
# NOTE(review): the label mentions a vector and "first 5 values", but the whole
# suggestion list is printed without slicing.
print(f"Vector for '{word}' (first 5 values): {word_vector}")

Vector for 'โรงเรีย' (first 5 values): ['โรงเรียน', 'ระเรียง', 'โรงเรียนประจำ', 'เรียบเรียง', 'ระเดียง']
