<a href="https://colab.research.google.com/github/OpenPecha-dev/models/blob/main/models/lm/Classical_Bo_Custom_LM_keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from typing import List, Dict
import pickle

import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
import numpy as np
from sklearn.model_selection import train_test_split

## Dataset

In [None]:
from pathlib import Path

def _mkdir(path: Path) -> Path:
  path.mkdir(exist_ok=True, parents=True)
  return path

# Root folder of the language-model work (presumably a Colab-mounted Google Drive, judging by the path).
BASE_PATH = Path("/content/drive/MyDrive/OpenPecha/ML/LM")
# Corpora are looked up as DATA_PATH / <corpus name>.
DATA_PATH = BASE_PATH / "data"
# Created on the spot by _mkdir if it does not exist yet.
# NOTE(review): the folder is called "BiLSTM", but the model sketched in this
# notebook stacks plain LSTM layers - confirm the name is intentional.
MODELS_PATH = _mkdir(BASE_PATH / "models" / "BiLSTM")

In [None]:
def get_text_paths(path) -> List[Path]:
  """Collect the tokenized text files found one level below ``path``.

  Args:
    path: Corpus directory (a ``pathlib.Path``); each of its sub-directories
      is scanned for files.

  Returns:
    The paths of all entries whose stem contains ``'tokenized'``, ordered by
    sub-directory and then by file name.
  """
  files = []
  # ``iterdir`` yields entries in arbitrary order; sorting keeps the assembled
  # corpus (and everything derived from it) identical from run to run.
  for pecha_path in tqdm(sorted(path.iterdir())):
    # Plain files at the top level (e.g. a cached ``sentences.txt``) cannot be
    # iterated and would raise NotADirectoryError.
    if not pecha_path.is_dir():
      continue
    for fn in sorted(pecha_path.iterdir()):
      if 'tokenized' in fn.stem:
        files.append(fn)
  return files

def normalize_sentences_length(sentences, sent_len=50):
  """Re-chunk whitespace-tokenized sentences into pieces of ``sent_len`` tokens.

  All sentences are split on whitespace and their tokens concatenated into one
  stream, which is then cut into consecutive groups of ``sent_len`` tokens.
  Each group is joined back with single spaces; only the final group may be
  shorter than ``sent_len``.
  """
  stream = []
  for sentence in sentences:
    stream.extend(sentence.split())
  return [
      ' '.join(stream[start:start + sent_len])
      for start in range(0, len(stream), sent_len)
  ]
   
def get_sentences(path, build=False):
  """Yield the corpus under ``path`` as length-normalized sentences.

  This is a generator: nothing (including the cache build) happens until it is
  iterated, e.g. with ``list(get_sentences(path))``.

  Args:
    path: Corpus directory (a ``pathlib.Path``). The cache file
      ``sentences.txt`` is read from / written to this directory.
    build: When True, rebuild ``sentences.txt`` even if it already exists.

  Yields:
    One sentence (a string of space-separated tokens) at a time.
  """
  sentences_fn = path / "sentences.txt"
  if sentences_fn.is_file() and not build:
    print("[INFO] loading sentences from last built...")
    for line in tqdm(sentences_fn.read_text(encoding="utf-8").splitlines()):
      if line:
        yield line
    return

  print("[INFO] Building sentences.txt...")
  if sentences_fn.is_file():
    sentences_fn.unlink()
  lines = []
  for text_path in tqdm(get_text_paths(path)):
    # The corpus is Tibetan text; read it as UTF-8 regardless of the platform default.
    lines.extend(
        line for line in text_path.read_text(encoding="utf-8").splitlines() if line
    )
  sentences = normalize_sentences_length(lines)
  sentences_fn.write_text('\n'.join(sentences), encoding="utf-8")
  # A ``return sentences`` here would be swallowed, because the ``yield`` above
  # makes this whole function a generator: callers would see an empty
  # iterator right after a build. Yield the freshly built sentences instead.
  yield from sentences

In [None]:
# corpus_name = "classical_bo"
# corpus_path = DATA_PATH / corpus_name
# sentences = list(get_sentences(corpus_path))
# len(sentences)

[INFO] loading sentences from last built...


100%|██████████| 64304/64304 [00:00<00:00, 694115.88it/s]


64304

In [None]:
# sentences[-5:]

['༔ འོད་ ལྔ འི་ ཚུལ་ འཕྲོ ས་ སྡུག་བསྔལ་ སེལ ༔ སྐུ་ སྟོད་ གཅེ ར་བ་ རུས་པ ས་ བརྒྱན ༔ སྐུ་ སྨད་ དར་ དམར་ ཤམ་ཐབས་ མཛེས ༔ ཕྱག་ གཉིས་ མཉམ་བཞག་ མཛད་པ འི་ སྟེང་ ༔ བདུད་རྩི་ སྨན་ གྱི་ བུམ་ བཟུང་ ཐོགས ༔ ཞབས་ གཉིས་ མཉམ་པ འི་ སྟབས་ ཀྱིས་ བཞེངས ༔ ཡེ་ཤེས་ མེ་འོད་ ཀློང་',
 'ན་ གསལ ༔ དེ་ནས་ བཟླས་པ འི་ རིམ་པ་ ནི ༔ ཐུགས་ཀ ར་ པད་ ཟླ འི་ སྟེང་ དུ་ འཇམ་དབྱངས་ དཀར་པོ་ ལོངས་སྐུ འི་ རྒྱན་ ཅན་ ཕྱག་ གཉིས་ བུམ་པ་ བདུད་རྩི ས་ གང་བ་ བསྣམས་པ འི་ ཐུགས་ ཙིཏྟ་ རིན་པོ་ཆེ འི་ ནང་ དུ ༔ ཟླ་ སྟེང་ ཧཱུྃ་ ཡིག་དཀར་པོ འི་ མཐ ར ༔ སྔགས་ ཕྲེང་ སྐར་མ འི་ ཕྲེང་བ་',
 'ལྟར ༔ འཁོར་བ འི་ འོད་ ཀྱིས་ དོན་ གཉིས་ བྱས ༔ ཁྱད་པར་ ཟླ་ ཞུན་ ལྟ་བུ འི་ འོད ༔ དྭངས་ བསིལ་ བདུད་རྩི་ དང་ བཅས་ འཕྲོ ས ༔ རང་གཞན་ ནད་རིམས་ རྒྱུ་རྐྱེན་ བཅས ༔ ཞི་ ཞིང་ དག་ ནས་ བདེ་སྟོང་ གི ༔ ཡེ་ཤེས་ རྒྱུད་ ལ་ སྐྱེས་ ནས་ ཀྱང་ ༔ ཟག་མེད་ རྡོ་རྗེ་ ལྟ་བུ འི་ སྐུ ༔',
 'བསྒྲེས་ རྒུད་ མེད་པ འི་ རང་བཞིན་ བསམ ༔ ཨོཾ་ མཉྫུ་ ཤྲཱི་ ཀྲོ་ དྷ་ ར་ དྷི་པ་ ན་ ཨ་ ཙ་ ཡེ་ ཧཱུྃ་ ཕཊ ༔ འབུམ་ཕྲག་ དྲུག་ གིས་ ནད་རིམས་ ཞི ༔ ཚེ་རབས་ ཀུན་ ཏུ་ ནད་མེད་པ འི ༔ བདེ་བ་ ཕུན་སུམ་ ཚོགས་ ཐོབ་ འགྱུར ༔ ས་ མ་ ཡ ༔ ལ ས་ 

In [None]:
# max_len = max([len(s.split()) for s in sentences])
# max_len

50

In [None]:
# tokenizer = Tokenizer(lower=False)
# tokenizer.fit_on_texts(sentences)
# vocab = tokenizer.word_index
# seqs = tokenizer.texts_to_sequences_generator(sentences)
# del sentences

In [None]:
# # add special token to vocab
# UNKNOWN = '<unk>'
# PADDING = '<pad>'
# SENT_START = '<s>'
# SENT_END = '</s>'
# for i, s_token in enumerate([UNKNOWN, PADDING, SENT_START, SENT_END], start=1):
#   if s_token in vocab:
#     continue
#   vocab[s_token] = len(vocab) + i
#   print(vocab[s_token])

In [None]:
def add_start_and_end_to_seqs(seqs, start, end):
  """Return a copy of ``seqs`` where every list is wrapped in ``start`` ... ``end``.

  The input lists are not modified; each result is a new list.
  """
  wrapped = []
  for token_ids in seqs:
    wrapped.append([start] + token_ids + [end])
  return wrapped

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
  """Keras ``Sequence`` that serves next-token prediction batches.

  Every input sequence of token ids is expanded into one example per token:
  the input is the prefix that precedes the token, padded with
  ``pad_sequences`` to ``max_len``, and the target is the token id itself.
  Targets are one-hot encoded per batch in ``__getitem__``.

  Args:
    seqs: Non-empty collection of token-id sequences.
    batch_size: Number of examples per batch.
    vocab_size: Number of classes used for the one-hot targets.
    shuffle: When True, the example order is reshuffled at construction time
      and after every epoch. Neighbouring examples are prefixes of the same
      sequence, so unshuffled batches are strongly correlated.
    max_len: Width of the padded inputs. Defaults to the length of the longest
      sequence in ``seqs``; pass it explicitly to keep several generators
      (e.g. train / validation) on the same input width.
  """

  def __init__(self, seqs, batch_size, vocab_size, shuffle=True, max_len=None):
    super().__init__()
    self.batch_size = batch_size
    self.vocab_size = vocab_size
    self.shuffle = shuffle
    self.max_len = max_len
    self.X, self.Y = [], []
    self.create_examples(seqs)
    # Batches are drawn through this index so shuffling never copies X / Y.
    self.indices = np.arange(len(self.X))
    self.on_epoch_end()

  def create_examples(self, seqs):
    """Fill ``self.X`` (padded prefixes) and ``self.Y`` (target ids) from ``seqs``.

    Raises:
      ValueError: If ``seqs`` is empty.
    """
    if len(seqs) == 0:
      raise ValueError("seqs is empty: cannot build training examples")

    max_len = getattr(self, "max_len", None)
    if max_len is None:
      max_len = max(len(seq) for seq in seqs)

    x_chunks, targets = [], []
    for seq in tqdm(seqs):
      if len(seq) == 0:
        continue
      # One pad_sequences call per sequence instead of one per prefix; the
      # resulting rows are the same because ``maxlen`` is fixed.
      prefixes = [seq[:i] for i in range(len(seq))]
      x_chunks.append(pad_sequences(prefixes, maxlen=max_len))
      targets.extend(seq)

    if x_chunks:
      self.X = np.concatenate(x_chunks)
    else:
      self.X = np.zeros((0, max_len), dtype="int32")
    self.Y = np.array(targets)

  def __len__(self):
    """Number of full batches per epoch; a trailing partial batch is dropped."""
    return len(self.X) // self.batch_size

  def __getitem__(self, index):
    """Return batch ``index`` as ``(inputs, one_hot_targets)``."""
    start = index * self.batch_size
    end = (index + 1) * self.batch_size
    batch_ids = self.indices[start:end]
    X = self.X[batch_ids]
    Y = self.Y[batch_ids]

    return X, to_categorical(Y, num_classes=self.vocab_size)

  def on_epoch_end(self):
    """Reshuffle the example order when ``shuffle`` is enabled."""
    if self.shuffle:
      np.random.shuffle(self.indices)

In [None]:
# train, valid = train_test_split(list(seqs), test_size=0.2, random_state=42)

In [None]:
# params = {
#     "batch_size": 1000,
#     "vocab_size": len(vocab) + 1,
#     "shuffle": True,
#     "max_len": max_len
# }

In [None]:
# training_generator = DataGenerator(train, params["batch_size"], params["vocab_size"], params["shuffle"])
# validation_generator = DataGenerator(valid, params["batch_size"], params["vocab_size"], params["shuffle"])

100%|██████████| 51443/51443 [01:30<00:00, 570.83it/s]
100%|██████████| 12861/12861 [00:21<00:00, 603.11it/s]


In [None]:
# training_generator.batch_size = params["batch_size"]
# validation_generator.batch_size = params["batch_size"]

## Model

In [None]:
# def get_model(params):
#   model = Sequential()
#   model.add(Embedding(input_dim=params["vocab_size"], output_dim=100, input_length=params["max_len"]))
#   model.add(LSTM(100, return_sequences=True))
#   model.add(LSTM(100))
#   model.add(Dense(params["vocab_size"], activation='softmax'))
#   model.compile('rmsprop', 'categorical_crossentropy')
  
#   return model

In [None]:
# model = get_model(params)

## Train

In [None]:
# ``Model.fit_generator`` is deprecated; ``Model.fit`` accepts ``Sequence``
# objects directly and takes the same arguments.
# NOTE: ``model``, ``training_generator`` and ``validation_generator`` are
# created in cells that are currently commented out above.
model.fit(
    training_generator,
    validation_data=validation_generator,
    epochs=10,
    use_multiprocessing=True,
    workers=6
)

  


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc69158b290>

In [None]:
def save_model(path, model, tokenizer=None):
  """Save ``model`` to ``path`` and, if given, pickle ``tokenizer`` next to it.

  Args:
    path: Target directory (a ``pathlib.Path``); it is passed to ``model.save``.
    model: Object exposing ``save(path)``.
    tokenizer: Optional picklable tokenizer, written to ``path / 'tokenizer.pkl'``.

  Returns:
    ``path``, unchanged.
  """
  model.save(path)
  if tokenizer is not None:
    # Context manager so the file is flushed and closed even if pickling fails.
    with (path / 'tokenizer.pkl').open('wb') as fh:
      pickle.dump(tokenizer, fh)
  return path

def load_model(path):
  """Load the Keras model stored at ``path`` together with its tokenizer.

  Args:
    path: Directory (a ``pathlib.Path``) previously written by ``save_model``.

  Returns:
    ``(model, tokenizer)``. ``tokenizer`` is ``None`` when the model was saved
    without one, i.e. ``path / 'tokenizer.pkl'`` does not exist.
  """
  model = tf.keras.models.load_model(path)
  tokenizer_fn = path / 'tokenizer.pkl'
  if not tokenizer_fn.is_file():
    # save_model() makes the tokenizer optional, so its absence is not an error.
    return model, None
  # SECURITY: unpickling runs arbitrary code; only load model directories that
  # come from a trusted source.
  with tokenizer_fn.open('rb') as fh:
    tokenizer = pickle.load(fh)
  return model, tokenizer

In [None]:
# Directory under MODELS_PATH that save_model / load_model use for the LSTM model.
model_path = MODELS_PATH / "lstm"
# save_model(model_path, model, tokenizer)

## Evaluate

In [None]:
# Reload the saved model and its pickled tokenizer from model_path.
l_model, l_tokenizer = load_model(model_path)

In [None]:
def continue_training(corpus_name="classical_bo", batch_size=500, epochs=10):
  """Resume training of the saved LSTM model on a corpus and save the result.

  The model and tokenizer are loaded from ``MODELS_PATH / "lstm"``, trained on
  an 80/20 train/validation split of the corpus sentences and written back to
  the same location.

  Args:
    corpus_name: Name of the corpus directory under ``DATA_PATH``.
    batch_size: Number of examples per training batch.
    epochs: Number of training epochs.

  Returns:
    ``(model_path, model, tokenizer)`` for the updated model.
  """
  # get sentences
  print("[INFO] Loading sentences...")
  corpus_path = DATA_PATH / corpus_name
  sentences = list(get_sentences(corpus_path))

  # load model
  model_path = MODELS_PATH / "lstm"
  model, tokenizer = load_model(model_path)

  # Tokenize
  print("[INFO] Tokenizing sentences...")
  vocab = tokenizer.word_index
  max_len = max(len(s.split()) for s in sentences)
  # Materialize the id sequences first so dropping ``sentences`` really frees
  # the text (a lazy generator would keep the list alive).
  seqs = tokenizer.texts_to_sequences(sentences)
  del sentences

  # Define Parameter
  params = {
    "batch_size": batch_size,
    "vocab_size": len(vocab) + 1,  # +1: index 0 is reserved for padding
    "shuffle": True,
    "max_len": max_len
  }

  # Create Dataset
  print("[INFO] Preparing training dataset...")
  train, valid = train_test_split(seqs, test_size=0.2, random_state=42)
  training_generator = DataGenerator(train, params["batch_size"], params["vocab_size"], params["shuffle"])
  validation_generator = DataGenerator(valid, params["batch_size"], params["vocab_size"], params["shuffle"])

  # Train model
  print("[INFO] Training model...")
  # ``fit_generator`` is deprecated; ``fit`` accepts Sequence objects directly.
  model.fit(
      training_generator,
      validation_data=validation_generator,
      epochs=epochs,
      use_multiprocessing=True,
      workers=6
  )

  # Report success only after the model has actually been written.
  save_model(model_path, model, tokenizer)
  print(f"[INFO] Model saved at: {model_path}")

  return model_path, model, tokenizer

In [None]:
# Run another round of training; keep the returned model and tokenizer and discard the returned path.
_, l_model, l_tokenizer = continue_training()

[INFO] Loading sentences...
[INFO] loading sentences from last built...


100%|██████████| 64304/64304 [00:00<00:00, 2343882.20it/s]


[INFO] Tokenizing sentences...
[INFO] Preparing training dataset...


100%|██████████| 51443/51443 [01:03<00:00, 815.14it/s]
100%|██████████| 12861/12861 [00:16<00:00, 791.87it/s]


[INFO] Training model...
Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[INFO] Model saved at: /content/drive/MyDrive/OpenPecha/ML/LM/models/BiLSTM/lstm




INFO:tensorflow:Assets written to: /content/drive/MyDrive/OpenPecha/ML/LM/models/BiLSTM/lstm/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/OpenPecha/ML/LM/models/BiLSTM/lstm/assets


## Sentence Score

In [None]:
def generate_xy_pairs(seq, max_len):
  """Expand one token-id sequence into next-token prediction examples.

  Args:
    seq: Sequence of token ids.
    max_len: Width the prefixes are padded to (``maxlen`` of ``pad_sequences``).

  Returns:
    ``(x, y)`` where ``x[i]`` is the padded prefix ``seq[:i]`` and ``y[i]`` is
    ``seq[i]``. Both lists are empty for an empty ``seq``.
  """
  if len(seq) == 0:
    return [], []
  prefixes = [seq[:i] for i in range(len(seq))]
  # A single pad_sequences call for all prefixes replaces one call per prefix.
  x = list(pad_sequences(prefixes, maxlen=max_len))
  y = list(seq)
  return x, y

In [None]:
def score_sentence(sentence, model, tokenizer):
  """Print the model's probability for each token of ``sentence`` and for the whole sentence.

  The sentence is converted to token ids with ``tokenizer``; for every token
  the model predicts a distribution given the preceding tokens, and the
  probability assigned to the actual token is printed. The sentence
  probability is the product of these values, accumulated in log space.

  Args:
    sentence: Pre-tokenized sentence (tokens separated by spaces).
    model: Trained Keras model whose first layer exposes ``input_length``.
    tokenizer: Tokenizer providing ``texts_to_sequences`` and ``word_index``.

  Raises:
    ValueError: If the tokenizer yields no token ids for ``sentence``.

  NOTE(review): tokens unknown to the tokenizer are presumably dropped by
  ``texts_to_sequences`` (no OOV token configured), so they would not count
  towards the score - confirm.
  """
  seq = tokenizer.texts_to_sequences([sentence])[0]
  if not seq:
    # Without this guard an empty array reaches model.predict and fails there
    # with a far less helpful error.
    raise ValueError("sentence has no tokens known to the tokenizer; nothing to score")
  x_test, y_test = generate_xy_pairs(seq, model.layers[0].input_length)
  x_test = np.array(x_test)
  y_test = np.array(y_test)
  p_pred = model.predict(x_test)
  vocab_inv = {v: k for k, v in tokenizer.word_index.items()}
  log_p_sentence = 0
  for i, prob in enumerate(p_pred):
    word = vocab_inv[y_test[i]]
    # Id 0 is the padding value and has no vocabulary entry.
    history = ' '.join(vocab_inv[w] for w in x_test[i, :] if w != 0)
    prob_word = prob[y_test[i]]
    log_p_sentence += np.log(prob_word)
    print('P(w={}|h={})={}'.format(word, history, prob_word))
  print('Prob. sentence: {}'.format(np.exp(log_p_sentence)))

In [None]:
# First example: score a space-separated, pre-tokenized sentence with the reloaded model.
sent = 'བདེ་ཆེན་ པདྨ་ འཁྱིལ་བ འི་ ཕོ་བྲང་ ནས'
score_sentence(sent, l_model, l_tokenizer)

P(w=བདེ་ཆེན་|h=)=0.0004911079886369407
P(w=པདྨ་|h=བདེ་ཆེན་)=0.0012315770145505667
P(w=འཁྱིལ་བ|h=བདེ་ཆེན་ པདྨ་)=5.0276112233405e-06
P(w=འི་|h=བདེ་ཆེན་ པདྨ་ འཁྱིལ་བ)=0.9756913781166077
P(w=ཕོ་བྲང་|h=བདེ་ཆེན་ པདྨ་ འཁྱིལ་བ འི་)=0.14474745094776154
P(w=ནས|h=བདེ་ཆེན་ པདྨ་ འཁྱིལ་བ འི་ ཕོ་བྲང་)=0.29027533531188965
Prob. sentence: 1.2466193152867404e-13


In [None]:
# Variant of the first example: the fourth and the last token are spelled differently.
sent = 'བདེ་ཆེན་ པདྨ་ འཁྱིལ་བ གི་ ཕོ་བྲང་ ནས་'
score_sentence(sent, l_model, l_tokenizer)

P(w=བདེ་ཆེན་|h=)=0.0004911079886369407
P(w=པདྨ་|h=བདེ་ཆེན་)=0.0012315770145505667
P(w=འཁྱིལ་བ|h=བདེ་ཆེན་ པདྨ་)=5.0276112233405e-06
P(w=གི་|h=བདེ་ཆེན་ པདྨ་ འཁྱིལ་བ)=1.1202250789210666e-06
P(w=ཕོ་བྲང་|h=བདེ་ཆེན་ པདྨ་ འཁྱིལ་བ གི་)=0.06572142988443375
P(w=ནས་|h=བདེ་ཆེན་ པདྨ་ འཁྱིལ་བ གི་ ཕོ་བྲང་)=0.0037065837532281876
Prob. sentence: 8.298249036817588e-22


In [None]:
# Another variant: compared with the previous sentence, the third and fourth tokens differ.
sent = 'བདེ་ཆེན་ པདྨ་ འཁྱིལ་བ་ ཡི་ ཕོ་བྲང་ ནས་'
score_sentence(sent, l_model, l_tokenizer)

P(w=བདེ་ཆེན་|h=)=0.0004911079886369407
P(w=པདྨ་|h=བདེ་ཆེན་)=0.0012315770145505667
P(w=འཁྱིལ་བ་|h=བདེ་ཆེན་ པདྨ་)=8.791105301497737e-07
P(w=ཡི་|h=བདེ་ཆེན་ པདྨ་ འཁྱིལ་བ་)=0.00028916046721860766
P(w=ཕོ་བྲང་|h=བདེ་ཆེན་ པདྨ་ འཁྱིལ་བ་ ཡི་)=0.0043965959921479225
P(w=ནས་|h=བདེ་ཆེན་ པདྨ་ འཁྱིལ་བ་ ཡི་ ཕོ་བྲང་)=0.00020606110047083348
Prob. sentence: 1.3929429852153917e-22


In [None]:
# Degenerate input: the same token repeated six times.
sent = 'བདེ་ཆེན་ བདེ་ཆེན་ བདེ་ཆེན་ བདེ་ཆེན་ བདེ་ཆེན་ བདེ་ཆེན་'
score_sentence(sent, l_model, l_tokenizer)

P(w=བདེ་ཆེན་|h=)=0.0004911079886369407
P(w=བདེ་ཆེན་|h=བདེ་ཆེན་)=0.0033430515322834253
P(w=བདེ་ཆེན་|h=བདེ་ཆེན་ བདེ་ཆེན་)=0.008468111045658588
P(w=བདེ་ཆེན་|h=བདེ་ཆེན་ བདེ་ཆེན་ བདེ་ཆེན་)=0.015528939664363861
P(w=བདེ་ཆེན་|h=བདེ་ཆེན་ བདེ་ཆེན་ བདེ་ཆེན་ བདེ་ཆེན་)=0.011780750937759876
P(w=བདེ་ཆེན་|h=བདེ་ཆེན་ བདེ་ཆེན་ བདེ་ཆེན་ བདེ་ཆེན་ བདེ་ཆེན་)=0.008753247559070587
Prob. sentence: 2.226335352696179e-14
