In [1]:
from typing import List, Dict
import pickle
from pathlib import Path

import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
import numpy as np
from sklearn.model_selection import train_test_split

import config

2022-04-22 07:35:46.389342: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-22 07:35:46.389376: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Corpus directory handed to train() further down.
corpus_path = config.DATA_DIR / "classical_bo"

# Output directory for the trained model; created up front so later saves
# have somewhere to write.
model_name = "BiLSTM"
model_path = config.MODELS_DIR / model_name
model_path.mkdir(parents=True, exist_ok=True)

## Dataset

In [3]:
def get_text_paths(path) -> List[Path]:
  """Collect the tokenized text files found one level below ``path``.

  Args:
    path: Corpus root (a ``pathlib.Path``); each sub-directory is scanned.

  Returns:
    Paths whose stem contains ``'tokenized'``, ordered by sub-directory and
    then by file name so repeated runs see the same order.
  """
  files = []
  # sorted(): iterdir() order is filesystem dependent, which would make the
  # downstream fixed-seed train/validation split irreproducible.
  for pecha_path in tqdm(sorted(path.iterdir())):
    # Plain files in the corpus root (e.g. a cached sentences.txt) cannot be
    # iterated and used to raise NotADirectoryError here.
    if not pecha_path.is_dir():
      continue
    files.extend(sorted(fn for fn in pecha_path.iterdir() if 'tokenized' in fn.stem))
  return files

def normalize_sentences_length(sentences, sent_len=50):
  """Re-chunk whitespace-separated tokens into sentences of ``sent_len`` tokens.

  All sentences are split on whitespace and concatenated into one token
  stream, which is then cut into consecutive windows of ``sent_len`` tokens
  (the last window may be shorter). Each window is joined back with single
  spaces.
  """
  stream = []
  for sentence in sentences:
    stream.extend(sentence.split())
  return [
    ' '.join(stream[start:start + sent_len])
    for start in range(0, len(stream), sent_len)
  ]
   
def get_sentences(path, build=False):
  """Yield the length-normalized sentences of the corpus at ``path``.

  If ``path / "sentences.txt"`` exists and ``build`` is false, its non-empty
  lines are yielded. Otherwise the file is rebuilt from the tokenized text
  files returned by ``get_text_paths`` (non-empty lines, re-chunked by
  ``normalize_sentences_length``), written to disk, and then yielded.

  This function is a generator in both modes; nothing happens until it is
  iterated (e.g. ``list(get_sentences(path))``).

  Args:
    path: Corpus root directory (a ``pathlib.Path``).
    build: Force a rebuild even if ``sentences.txt`` already exists.

  Yields:
    One sentence string at a time.
  """
  sentences_fn = path / "sentences.txt"
  if sentences_fn.is_file() and not build:
    print("[INFO] loading sentences from last built...")
    for line in tqdm(sentences_fn.read_text(encoding="utf-8").splitlines()):
      if line:
        yield line
    return

  print("[INFO] Building sentences.txt...")
  if sentences_fn.is_file():
    sentences_fn.unlink()
  sentences = []
  for text_path in tqdm(get_text_paths(path)):
    lines = text_path.read_text(encoding="utf-8").splitlines()
    sentences.extend(line for line in lines if line)
  sentences = normalize_sentences_length(sentences)
  sentences_fn.write_text('\n'.join(sentences), encoding="utf-8")
  # The body contains `yield`, so a plain `return sentences` would hand the
  # caller an empty iterator; the freshly built sentences must be yielded.
  yield from sentences

In [4]:
class DataGenerator(tf.keras.utils.Sequence):
  """Keras ``Sequence`` serving next-token prediction batches.

  Each token sequence is expanded into one example per token: the input is
  the prefix that precedes the token (left-padded with zeros to ``max_len``,
  keeping only the last ``max_len`` tokens if longer) and the target is the
  token id. All examples are materialized in memory as ``self.X`` (int32,
  one row per example) and ``self.Y`` (token ids).

  Args:
    seqs: Iterable of token-id sequences.
    batch_size: Number of examples per batch.
    vocab_size: Number of classes used for the one-hot targets.
    shuffle: If true, example order is re-permuted at construction and at
      the end of every epoch.
    max_len: Width of the input rows. Defaults to the longest sequence in
      ``seqs``; pass it explicitly to make several generators agree.

  Raises:
    ValueError: If ``seqs`` is empty.
  """

  def __init__(self, seqs, batch_size, vocab_size, shuffle=True, max_len=None):
    super().__init__()
    self.batch_size = batch_size
    self.vocab_size = vocab_size
    self.shuffle = shuffle
    self.X, self.Y = [], []
    self.create_examples(seqs, max_len)
    # Batches are cut from this permutation rather than from X/Y directly.
    self._order = np.arange(len(self.X))
    self.on_epoch_end()

  @staticmethod
  def _prefix_matrix(seq, max_len):
    """Return an int32 matrix whose row ``i`` is the padded prefix ``seq[:i]``."""
    rows = np.zeros((len(seq), max_len), dtype="int32")
    # Row 0 is the empty prefix and stays all zeros.
    for i in range(1, len(seq)):
      prefix = seq[max(0, i - max_len):i]
      rows[i, max_len - len(prefix):] = prefix
    return rows

  @staticmethod
  def generate_xy_pairs(seq, max_len):
    """Expand one sequence into (padded prefix, next token) pairs.

    Returns:
      ``(x, y)`` where ``x`` is a list of int32 arrays of length ``max_len``
      and ``y`` is the list of target token ids, one pair per token of ``seq``.
    """
    return list(DataGenerator._prefix_matrix(seq, max_len)), list(seq)

  def create_examples(self, seqs, max_len=None):
    """Populate ``self.X`` and ``self.Y`` from ``seqs``."""
    seqs = list(seqs)
    if not seqs:
      raise ValueError("DataGenerator needs at least one token sequence")
    if max_len is None:
      max_len = max(len(seq) for seq in seqs)

    # Fill one preallocated matrix instead of accumulating millions of small
    # arrays in a Python list and copying them at the end.
    total = sum(len(seq) for seq in seqs)
    examples = np.zeros((total, max_len), dtype="int32")
    targets = []
    row = 0
    for seq in tqdm(seqs):
      n = len(seq)
      examples[row:row + n] = self._prefix_matrix(seq, max_len)
      targets.extend(seq)
      row += n

    self.X = examples
    self.Y = np.array(targets, dtype="int64")

  def __len__(self):
    # Ceil so the trailing partial batch is served instead of silently dropped
    # (and so a dataset smaller than one batch still has one batch).
    return int(np.ceil(len(self.X) / self.batch_size))

  def __getitem__(self, index):
    start = index * self.batch_size
    batch_ids = self._order[start:start + self.batch_size]
    return self.X[batch_ids], to_categorical(self.Y[batch_ids], num_classes=self.vocab_size)

  def on_epoch_end(self):
    """Re-shuffle the example order between epochs when ``shuffle`` is set."""
    if self.shuffle:
      np.random.shuffle(self._order)

## Model

In [5]:
def get_model(params):
  """Build and compile the stacked bidirectional-LSTM language model.

  The network is: a 100-dimensional embedding over ``params["vocab_size"]``
  ids with input length ``params["max_len"]``, two bidirectional LSTM layers
  of 100 units each (the first returning full sequences), and a softmax layer
  over the vocabulary. It is compiled with RMSprop and categorical
  cross-entropy.

  Args:
    params: Mapping providing ``"vocab_size"`` and ``"max_len"``.

  Returns:
    The compiled ``Sequential`` model.
  """
  vocab_size = params["vocab_size"]
  stack = [
    Embedding(input_dim=vocab_size, output_dim=100, input_length=params["max_len"]),
    Bidirectional(LSTM(100, return_sequences=True)),
    Bidirectional(LSTM(100)),
    Dense(vocab_size, activation='softmax'),
  ]

  network = Sequential()
  for layer in stack:
    network.add(layer)
  network.compile(optimizer='rmsprop', loss='categorical_crossentropy')

  return network

In [6]:
# Build the network once with example sizes to inspect layer shapes and
# parameter counts; train() builds its own model from the real vocabulary.
model = get_model(dict(vocab_size=20000, max_len=50))
model.summary()

2022-04-22 07:35:52.411126: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-04-22 07:35:52.411164: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-04-22 07:35:52.411196: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (default): /proc/driver/nvidia/version does not exist
2022-04-22 07:35:52.412449: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 100)           2000000   
                                                                 
 bidirectional (Bidirectiona  (None, 50, 200)          160800    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 200)              240800    
 nal)                                                            
                                                                 
 dense (Dense)               (None, 20000)             4020000   
                                                                 
Total params: 6,421,600
Trainable params: 6,421,600
Non-trainable params: 0
_________________________________________________________________


In [7]:
def save_model(path, model, tokenizer=None):
  """Save ``model`` under ``path`` and, optionally, pickle the tokenizer.

  Args:
    path: Target directory (a ``pathlib.Path``); passed to ``model.save``.
    model: Object exposing ``save(path)``.
    tokenizer: Optional picklable tokenizer, written to
      ``path / 'tokenizer.pkl'`` when given.

  Returns:
    ``path``, unchanged.
  """
  model.save(path)
  # `is not None` rather than truthiness: whether to save should not depend
  # on how the tokenizer object evaluates as a boolean.
  if tokenizer is not None:
    # Context manager so the pickle is flushed and the handle closed.
    with (path / 'tokenizer.pkl').open('wb') as fh:
      pickle.dump(tokenizer, fh)
  return path

def load_model(path):
  """Load the Keras model and pickled tokenizer stored under ``path``.

  Args:
    path: Directory (a ``pathlib.Path``) previously written by ``save_model``
      with a tokenizer.

  Returns:
    ``(model, tokenizer)``.

  Raises:
    FileNotFoundError: If ``path / 'tokenizer.pkl'`` does not exist.
  """
  model = tf.keras.models.load_model(path)
  # NOTE: unpickling executes arbitrary code; only load tokenizer files that
  # this notebook (or another trusted source) produced.
  with (path / 'tokenizer.pkl').open('rb') as fh:
    tokenizer = pickle.load(fh)
  return model, tokenizer

## Train

In [8]:
def train(corpus_path, model_path, epochs=10, batch_size=500):
  """Train the BiLSTM language model on the corpus and save it.

  Steps: load sentences via ``get_sentences``, fit a case-preserving Keras
  ``Tokenizer``, split the token sequences 80/20 (fixed seed), wrap both
  splits in ``DataGenerator``, fit the model from ``get_model`` and persist
  model and tokenizer with ``save_model``.

  Args:
    corpus_path: Corpus root directory passed to ``get_sentences``.
    model_path: Directory the model and tokenizer are saved to.
    epochs: Number of training epochs.
    batch_size: Examples per batch for both generators.

  Returns:
    ``model_path``.

  Raises:
    ValueError: If the corpus yields no sentences.
  """
  # Get sentences
  print("[INFO] Loading sentences...")
  sentences = list(get_sentences(corpus_path))
  print("[INFO] Loaded no. of sentences (of length 50):", len(sentences))
  if not sentences:
    raise ValueError(f"No sentences found under {corpus_path}")

  # Tokenize
  print("[INFO] Tokenizing sentences...")
  tokenizer = Tokenizer(lower=False)
  tokenizer.fit_on_texts(sentences)
  # Materialize before dropping `sentences`; the lazy generator variant would
  # keep the texts alive anyway.
  seqs = tokenizer.texts_to_sequences(sentences)
  del sentences
  # Measure the tokenized sequences: the Tokenizer's filtering can make their
  # length differ from a plain whitespace split, and it is the tokenized
  # length that the data generator pads to.
  max_len = max(len(seq) for seq in seqs)

  # Define Parameters
  params = {
    "batch_size": batch_size,
    "vocab_size": len(tokenizer.word_index) + 1,  # +1: id 0 is the padding value
    "shuffle": True,
    "max_len": max_len
  }

  # Create Dataset
  print("[INFO] Preparing training dataset...")
  train_seqs, valid_seqs = train_test_split(seqs, test_size=0.2, random_state=42)
  training_generator = DataGenerator(train_seqs, params["batch_size"], params["vocab_size"], params["shuffle"])
  validation_generator = DataGenerator(valid_seqs, params["batch_size"], params["vocab_size"], params["shuffle"])

  # Train model
  print("[INFO] Training model...")
  model = get_model(params)
  # Model.fit accepts keras.utils.Sequence inputs directly; fit_generator is
  # the deprecated spelling of the same call.
  model.fit(
      training_generator,
      validation_data=validation_generator,
      epochs=epochs,
      use_multiprocessing=True,
      workers=6
  )

  # Save first, then report, so the message is only printed on success.
  save_model(model_path, model, tokenizer)
  print(f"[INFO] Model saved at: {model_path}")

  return model_path

In [None]:
# Run the training pipeline on the corpus; train() returns model_path.
train(corpus_path, model_path)

[INFO] Loading sentences...
[INFO] loading sentences from last built...


100%|██████████| 362132/362132 [00:00<00:00, 2899550.23it/s]


[INFO] Loaded no. of sentences (of length 50): 362132
[INFO] Tokenizing sentences...
[INFO] Preparing training dataset...


 92%|█████████▏| 265164/289705 [04:00<1:31:31,  4.47it/s]

## Evaluate

In [None]:
# Load the model and tokenizer stored under model_path for evaluation.
model, tokenizer = load_model(model_path)

In [None]:
def score_sentence(sentence, model, tokenizer):
  """Print per-token probabilities of ``sentence`` and return its log-probability.

  The sentence is converted to token ids with ``tokenizer``, expanded into
  (history, next token) pairs with ``DataGenerator.generate_xy_pairs`` using
  the input length of the model's first layer, and scored with
  ``model.predict``.

  Args:
    sentence: Whitespace-tokenized sentence string.
    model: Trained model; ``model.layers[0].input_length`` sets the padding width.
    tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and ``word_index``.

  Returns:
    The summed natural-log probability of the tokens as a float, or ``None``
    when the tokenizer produces no token ids for the sentence.
  """
  seq = tokenizer.texts_to_sequences([sentence])[0]
  if not seq:
    # Nothing to score; predicting on an empty array would fail.
    print("[INFO] No in-vocabulary tokens to score in sentence.")
    return None

  x_test, y_test = DataGenerator.generate_xy_pairs(seq, model.layers[0].input_length)
  x_test = np.array(x_test)
  y_test = np.array(y_test)
  p_pred = model.predict(x_test)
  vocab_inv = {v: k for k, v in tokenizer.word_index.items()}
  log_p_sentence = 0.0
  for i, prob in enumerate(p_pred):
    word = vocab_inv[y_test[i]]
    # Id 0 is padding and has no vocabulary entry.
    history = ' '.join(vocab_inv[w] for w in x_test[i, :] if w != 0)
    prob_word = prob[y_test[i]]
    log_p_sentence += np.log(prob_word)
    print('P(w={}|h={})={}'.format(word, history, prob_word))
  print('Prob. sentence: {}'.format(np.exp(log_p_sentence)))
  # The printed probability underflows for long sentences; the log value
  # stays usable for comparisons.
  return float(log_p_sentence)