In [1]:
import numpy as np
import os

In [2]:
import fasttext
import joblib

# Load the two pre-built artifacts from the working directory.
# NOTE(review): joblib.load unpickles its input, so only load files from a
# trusted source.
# `words` is the array that neighbour indices are looked up in further down
# (words[indices]); `nbrs` is the object queried via nbrs.kneighbors(...) —
# presumably a fitted nearest-neighbours estimator, confirm against the
# notebook that produced it.
words = joblib.load('words.joblib')
nbrs = joblib.load('nbrs.joblib')

In [3]:
# Plain Python list copy of the `words` array.
wordlist=words.tolist()

In [4]:
def load_embeddings(output_dir):
  """Read the vocabulary and the embedding matrix stored under ``output_dir``.

  Two files are expected in that directory: ``embeddings.npy`` (loaded with
  ``np.load``) and ``vocabulary.txt`` (UTF-8 text, one entry per line).

  Args:
    output_dir: directory containing both files.

  Returns:
    A ``(words, input_matrix)`` tuple: the list of vocabulary lines with
    trailing whitespace stripped, and the array loaded from the ``.npy`` file.
  """
  matrix_path = os.path.join(output_dir, "embeddings.npy")
  vocab_path = os.path.join(output_dir, "vocabulary.txt")

  input_matrix = np.load(matrix_path)
  with open(vocab_path, "r", encoding='utf-8') as vocab_file:
    # rstrip() drops the newline and any other trailing whitespace.
    entries = [raw_line.rstrip() for raw_line in vocab_file]
  return entries, input_matrix

# Load the vocabulary list and embedding matrix from the relative 'model/'
# directory; both names are read as globals by get_sentence_vector.
vocabulary, embeddings = load_embeddings('model/')

In [5]:
def get_hash(subword, bucket=2000000, nb_words=2000000):
  """Map a subword string to a row index in the hashed-bucket range.

  A 32-bit FNV-1a style hash is folded over the characters of ``subword``,
  reduced modulo ``bucket`` and shifted by ``nb_words``, so the result lies in
  ``[nb_words, nb_words + bucket)``.

  Args:
    subword: string to hash.
    bucket: number of hash buckets.
    nb_words: offset added to the bucket index.

  Returns:
    Integer index ``hash % bucket + nb_words``.
  """
  # NOTE(review): only the low 8 bits of each character's code point are mixed
  # in (one step per character, not per encoded byte) — confirm this matches
  # how the embedding rows were produced for non-ASCII text.
  mask32 = 0xFFFFFFFF
  state = 0x811C9DC5  # 2166136261, the 32-bit FNV offset basis
  for ch in subword:
    # xor in the character, then multiply by the FNV prime (16777619),
    # keeping the state within 32 bits.
    state = ((state ^ (ord(ch) & 0xFF)) * 0x01000193) & mask32
  return state % bucket + nb_words
def get_subwords(word, vocabulary, minn=5, maxn=5):
  """List the lookup units of ``word`` together with their row indices.

  If ``word`` is present in ``vocabulary`` it is emitted first with its
  position in the vocabulary. The word is then wrapped as ``<word>`` and every
  distinct character n-gram whose length lies in ``[minn, maxn]`` is emitted
  with the index returned by ``get_hash``. The token ``"</s>"``, when found in
  the vocabulary, is returned on its own without any n-grams.

  Args:
    word: token to decompose.
    vocabulary: sequence of known words; its ``index`` gives the row id.
    minn: shortest n-gram length.
    maxn: longest n-gram length.

  Returns:
    ``(subwords, ids)``: a list of strings and a NumPy array of indices in the
    same order (an empty array when nothing was emitted).
  """
  padded = "<" + word + ">"
  pieces = []
  piece_ids = []

  if word in vocabulary:
    pieces.append(word)
    piece_ids.append(vocabulary.index(word))
    if word == "</s>":
      return pieces, np.array(piece_ids)

  total = len(padded)
  # Mirror of `pieces` for duplicate checks; it also contains the whole word
  # when that was emitted above, so an n-gram equal to the word is skipped.
  seen = set(pieces)
  for start in range(total):
    # Slice ends from start+minn up to start+maxn, never past the padded word.
    for stop in range(start + minn, min(start + maxn, total) + 1):
      gram = padded[start:stop]
      if gram in seen:
        continue
      seen.add(gram)
      pieces.append(gram)
      piece_ids.append(get_hash(gram))
  return pieces, np.array(piece_ids)
def get_word_vector(word, vocabulary, embeddings):
  """Average the embedding rows of ``word`` and its character n-grams.

  Args:
    word: token to embed.
    vocabulary: list of known words, forwarded to ``get_subwords``.
    embeddings: 2-D matrix indexed by the ids that ``get_subwords`` returns.

  Returns:
    The element-wise mean of the selected rows. When ``get_subwords`` yields
    no ids at all, a zero vector of length ``embeddings.shape[1]`` is returned
    so that every caller receives a vector of the same width.
  """
  # Second element of the pair is the array of row indices.
  _, row_ids = get_subwords(word, vocabulary)

  if row_ids.size:
    return np.mean([embeddings[row] for row in row_ids], axis=0)

  # Nothing to average: fall back to zeros sized from the matrix itself rather
  # than from a hard-coded dimension.
  return np.zeros(embeddings.shape[1])
def tokenize(sentence):
  """Split ``sentence`` into tokens on whitespace and NUL characters.

  Separators are space, newline, carriage return, tab, vertical tab, form feed
  and ``\\0``. Runs of separators never produce empty tokens. Every newline
  additionally emits the end-of-sentence marker ``"</s>"``.

  Args:
    sentence: text to split.

  Returns:
    List of token strings in order of appearance.
  """
  separators = frozenset(" \n\r\t\v\f\0")
  tokens = []
  pending = []  # characters of the token currently being built
  for ch in sentence:
    if ch not in separators:
      pending.append(ch)
      continue
    # A separator closes the token in progress, if there is one.
    if pending:
      tokens.append("".join(pending))
      pending = []
    if ch == '\n':
      tokens.append("</s>")
  # Flush a trailing token that was not followed by a separator.
  if pending:
    tokens.append("".join(pending))
  return tokens

def get_sentence_vector(line):
    """Embed a line of text as the mean of its normalised token vectors.

    The line is split with ``tokenize``; each token is embedded with
    ``get_word_vector`` using the module-level ``vocabulary`` and
    ``embeddings``, scaled to unit L2 norm when its norm is positive, and the
    resulting vectors are averaged element-wise.

    Args:
        line: text to embed.

    Returns:
        A 1-D vector of length ``embeddings.shape[1]``. For input that yields
        no tokens (empty or separator-only text) a zero vector is returned.
    """
    tokens = tokenize(line)
    if not tokens:
        # Without this guard np.mean([]) returns a NaN scalar (and emits a
        # RuntimeWarning) instead of a vector, which breaks any caller that
        # stacks the results into a matrix. The zero vector matches the
        # fallback used by get_word_vector.
        return np.zeros(embeddings.shape[1])

    vectors = []
    for t in tokens:
        vec = get_word_vector(t, vocabulary, embeddings)
        norm = np.linalg.norm(vec)
        # All-zero vectors are left untouched to avoid dividing by zero.
        if norm > 0:
            vec /= norm
        vectors.append(vec)
    return np.mean(vectors, axis=0)

In [6]:
# Sample query strings for the nearest-neighbour lookup in the cells below
# (presumably misspelled Thai words — the printed results list candidate
# dictionary entries for each).
words_input = ['การบด้าน', 'สวัดี', 'vออกเลอร์', 'ปละเทศไทยย', 'อรอย']

In [7]:
# Preview the form a query takes before embedding: its characters separated by
# single spaces, so tokenize() sees each character as its own token.
' '.join(list(words_input[-3]))

'v อ อ ก เ ล อ ร ์'

In [8]:
# Embed the space-separated characters of one sample query and display the
# resulting vector.
get_sentence_vector(' '.join(list(words_input[-3])))

array([ 0.07064344,  0.02868866,  0.0450275 , -0.09357667,  0.07799961,
       -0.01659612, -0.03994706,  0.02467421,  0.06386302,  0.01014723,
        0.00586393, -0.00511677, -0.04473881,  0.02240631, -0.00792691,
       -0.04050883, -0.01842077, -0.03816694,  0.03310383, -0.04765693,
       -0.01877511,  0.0173038 , -0.02365492, -0.0524214 , -0.00238156,
       -0.01424657,  0.0097129 , -0.03103621, -0.01206911, -0.02017615,
        0.05735345,  0.0350092 , -0.0109985 ,  0.01728996, -0.06626441,
       -0.0402539 , -0.03565674, -0.00495577,  0.02013647, -0.02001686,
       -0.00338332, -0.06862625,  0.04504699,  0.04516517,  0.04703264,
        0.00171725,  0.01698291, -0.03530983, -0.01771621,  0.00118982,
       -0.06714062,  0.03024822, -0.0108296 , -0.10299826, -0.06772696,
       -0.04453408,  0.01552465,  0.00672687,  0.03721033,  0.07133001,
       -0.05051875,  0.00600575,  0.01320744, -0.07520515, -0.03936205,
        0.04523896, -0.04980612,  0.04056922, -0.07245174,  0.00

In [9]:
# Embed every query as a "sentence" of single characters, look up its five
# nearest neighbours in the estimator and print the matching entries of `words`.
word_input_vec = [
    get_sentence_vector(' '.join(word))
    for word in words_input
]
# Only the neighbour indices are needed, not the distances.
indices = nbrs.kneighbors(word_input_vec, n_neighbors=5, return_distance=False)
suggestion = words[indices]

for w, s in zip(words_input, suggestion):
    print(f'{w} \n---> {s}')

การบด้าน 
---> ['การบ้าน' 'ภาษากึ่งแบบแผน' 'ทำการบ้าน' 'แก้วสารพัดนึก' 'การจัดรูปที่ดิน']
สวัดี 
---> ['วสวัดดี' 'สวัสดี' 'ดบัสวี' 'วิดัสดี' 'สรัสวดี']
vออกเลอร์ 
---> ['ออร์แกน' 'อาร์กอน' 'ออกเบอร์' 'เอทิลแอลกอฮอล์' 'คลอโรฟอร์ม']
ปละเทศไทยย 
---> ['กระเทียมโทน' 'ทะเลทราย' 'กะเทย' 'ทรงกระเทียม' 'กระเทียม']
อรอย 
---> ['รอย' 'อร่อย' 'ร่องรอย' 'เอร็ดอร่อย' 'ย้อนรอย']


In [10]:
# Display the loaded neighbour-search object (no output is captured for this cell).
nbrs

In [11]:
from onnxruntime import InferenceSession
from sklearn.datasets import load_diabetes
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    VotingRegressor,
)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from skl2onnx import to_onnx
from onnx.reference import ReferenceEvaluator
from skl2onnx.common.data_types import FloatTensorType

In [12]:
# Confirm the container type of one sentence vector before declaring the ONNX input.
type(word_input_vec[0])

numpy.ndarray

In [13]:
# Inspect the length of one sentence vector; this is the feature count the
# ONNX input signature has to declare.
word_input_vec[4].shape

(100,)

In [14]:
# Input signature for the ONNX conversion: a single float tensor named 'X'
# whose first dimension is left unspecified (None) and whose second dimension
# is N_FEATURES. 100 matches the vector shape displayed in the previous cell.
N_FEATURES = 100
initial_types = [('X', FloatTensorType([None, N_FEATURES]))]

In [15]:
# Convert the neighbour-search estimator to an ONNX model using the declared
# input signature, targeting opset 13.
onx = to_onnx(nbrs, initial_types=initial_types, target_opset=13)

In [16]:
# Create a CPU inference session from the serialized ONNX model.
sess = InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
# Query it with the first input vector, reshaped to a single row and cast to
# float32; [0] keeps only the first output of the session.
pred_ort = sess.run(None, {"X": word_input_vec[0].reshape(1, -1).astype(np.float32)})[0]

# The scikit-learn side of the comparison is produced in a later cell with
# nbrs.kneighbors(word_input_vec, 5, False). The commented-out
# nbrs.predict(X_test ...) / print snippet that used to sit here referenced
# X_test, which is not defined in the visible part of this notebook.

In [17]:
# Display the neighbour indices returned by the ONNX session for the first query.
pred_ort

array([[ 2243, 17429, 10652, 34789,  2231]], dtype=int64)

In [18]:
# Batch query: feed every input vector to the ONNX session in one call and
# display the first output.
# NOTE(review): word_input_vec is passed as a plain Python list with no
# explicit float32 cast, unlike the single-row call in the earlier cell —
# confirm this implicit conversion is intended.
pred_ort = sess.run(None, {"X": word_input_vec})[0]
pred_ort

array([[ 2243, 17429, 10652, 34789,  2231],
       [22315, 24316,  7695, 22787, 24147],
       [28623, 29199, 28596, 34464,  4452],
       [  988, 10436,  1889, 10097,   986],
       [19770, 28444, 20756, 34476, 19542]], dtype=int64)

In [19]:
# Same batch through the original estimator (indices only), for comparison
# with the ONNX output displayed in the previous cell.
nbrs.kneighbors(word_input_vec, 5, False)

array([[ 2243, 17429, 10652, 34789,  2231],
       [22315, 24316,  7695, 22787, 24147],
       [28623, 29199, 28596, 34464,  4452],
       [  988, 10436,  1889, 10097,   986],
       [19770, 28444, 20756, 34476, 19542]])

In [20]:
def get_pred(vetor):
    """Run the ONNX session on a single vector and return its first output.

    Args:
        vetor: NumPy array holding one input vector; it is reshaped to a
            single row and cast to float32 before being fed as input "X".

    Returns:
        The first element of the list returned by ``sess.run``.
    """
    single_row = vetor.reshape(1, -1).astype(np.float32)
    outputs = sess.run(None, {"X": single_row})
    return outputs[0]

In [21]:
# End-to-end lookup through the ONNX session instead of the scikit-learn
# estimator: embed each query, fetch neighbour indices, print the matching words.
word_input_vec = [get_sentence_vector(' '.join(list(word))) for word in words_input]
# Feed an explicit 2-D float32 array, matching the FloatTensorType input
# declared for 'X' and the single-row calls elsewhere in this notebook, rather
# than relying on the runtime to coerce a Python list of arrays.
# The session call takes no neighbour count; the result has one row of
# indices per query.
indices = sess.run(None, {"X": np.asarray(word_input_vec, dtype=np.float32)})[0]
suggestion = words[indices]

for w, s in zip(words_input, suggestion):
    print(f'{w} \n---> {s}')

การบด้าน 
---> ['การบ้าน' 'ภาษากึ่งแบบแผน' 'ทำการบ้าน' 'แก้วสารพัดนึก' 'การจัดรูปที่ดิน']
สวัดี 
---> ['วสวัดดี' 'สวัสดี' 'ดบัสวี' 'วิดัสดี' 'สรัสวดี']
vออกเลอร์ 
---> ['ออร์แกน' 'อาร์กอน' 'ออกเบอร์' 'เอทิลแอลกอฮอล์' 'คลอโรฟอร์ม']
ปละเทศไทยย 
---> ['กระเทียมโทน' 'ทะเลทราย' 'กะเทย' 'ทรงกระเทียม' 'กระเทียม']
อรอย 
---> ['รอย' 'อร่อย' 'ร่องรอย' 'เอร็ดอร่อย' 'ย้อนรอย']


In [22]:
# Persist the converted model: write its serialized bytes to a file in the
# working directory, then report the path that was written.
onnx_path = "nearest_neighbors.onnx"
with open(onnx_path, "wb") as f:
    f.write(onx.SerializeToString())

print(f"ONNX model saved successfully to {onnx_path}")

ONNX model saved successfully to nearest_neighbors.onnx


In [23]:
# Display the vocabulary array that neighbour indices are resolved against.
words

array(['ก', 'ก กา', 'ก ข', ..., 'ไฮโล', 'ไฮไฟ', 'ไฮ้'], dtype='<U50')