In [68]:
import requests
raw_poe = requests.get('https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt').content
raw_frost = requests.get('https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt').content

In [483]:
import numpy as np
import dask.array as da
import spacy
nlp = spacy.load("en_core_web_sm")

def decode_text(text):
    return bytes(text.decode("utf-8"), "utf-8").decode("unicode_escape")

def make_arr_rectangular(jagged_array):
    width = max((len(row) for row in jagged_array))
    right_pad = lambda row: row + [""]*(width-len(row))
    return [right_pad(row) for row in jagged_array]

def tokenize_line(line):
    return [token.lemma_ for token in nlp(line)]

def process_text(text, chunksize=500, pad=False):
    enc_dec = decode_text(text)
    jagged_array = [tokenize_line(line) for line in enc_dec.split("\n")]
    array = make_arr_rectangular(jagged_array) if pad else jagged_array
    return np.array(array, dtype=str if pad else object)

In [297]:
from sklearn.model_selection import train_test_split
def test_split_text(text):
    array = process_text(text, pad=True)
    return train_test_split(array, random_state=1)
    
poe_train, poe_test = test_split_text(raw_poe)
frost_train, frost_test = test_split_text(raw_frost)

In [463]:
base = np.array(["$BEGINCHAR$", "$ENDCHAR$", "$NEWCHAR$"])
vocab_p = np.unique(poe_train)
vocab_f = np.unique(frost_train)
combined_vocab = np.unique(np.concatenate([vocab_p, vocab_f]))
total_vocab = np.concatenate([base, combined_vocab])
vocab_size = len(total_vocab)
print(f"Vocab counts - poe: {len(vocab_p)}, frost: {len(vocab_f)}, total: {vocab_size}")

Vocab counts - poe: 1045, frost: 1585, total: 2239


In [464]:
idx2word = total_vocab
word2idx = {word:idx for idx, word in enumerate(idx2word)}

In [474]:
def map_word2idx(val):
    return word2idx.get(val, 2)
vectorized_word2idx = np.vectorize(map_word2idx)

In [466]:
poe_vec_t = vectorized_word2idx(poe_train)
frost_vec_t = vectorized_word2idx(frost_train)

In [467]:
def count_corpora(corpora, vocab_size):
    count = np.zeros(vocab_size, dtype=int)
    all_values = np.concatenate([corpus.ravel() for corpus in corpora])
    np.add.at(count, all_values, 1)
    return count

corpora = [poe_vec_t, frost_vec_t]
word_counter = count_corpora(corpora, vocab_size)

In [468]:
def get_transition_matrix(vec_arr, vocab_size, epsilon_smoothing=0.2):

    # Account for unknown words
    vocab_size += 1
    
    # Initialize matrices with zeros
    trans_mat = np.zeros((vocab_size, vocab_size))
    init_mat = np.zeros(vocab_size)
    
    # For the initial states
    starts = vec_arr[:, 0]
    init_counts, _ = np.histogram(starts, bins=np.arange(vocab_size + 1))
    init_mat += init_counts

    # For the transitions
    y, x = vec_arr[:, :-1].ravel(), vec_arr[:, 1:].ravel()
    hist, _, _ = np.histogram2d(x, y, bins=(vocab_size, vocab_size))
    trans_mat += hist
    
    # Apply epsilon smoothing
    trans_mat += epsilon_smoothing
    init_mat += epsilon_smoothing

    # Normalize
    normalized_trans_mat = trans_mat / trans_mat.sum(axis=1, keepdims=True)
    normalized_init_mat = init_mat / init_mat.sum()
    
    return normalized_trans_mat, normalized_init_mat

In [469]:
poe_t_mat, summy = get_transition_matrix(poe_vec_t, vocab_size)
frost_t_mat, _ = get_transition_matrix(frost_vec_t, vocab_size)

In [470]:
import math
def get_logprob_mat(t_mat):
    vec_log = np.vectorize(math.log)
    return vec_log(t_mat)

In [480]:
poe_logprob_t = get_logprob_mat(poe_t_mat)
frost_logprob_t = get_logprob_mat(frost_t_mat)

In [472]:
def get_argprob(sequence, logprob_mat, vectorized_word2idx):
    encoded_sequence = vectorized_word2idx(sequence)
    argsum = 0
    for i in range(len(encoded_sequence)-1):
        argsum += logprob_mat[i, i+1]
    return argsum

In [487]:
line = "These were days when my heart was volcanic As the scoriac rivers that roll"
seq = np.array(tokenize_line(line))
ap_1 = get_argprob(seq, poe_logprob_t, vectorized_word2idx)
ap_2 = get_argprob(seq, frost_logprob_t, vectorized_word2idx)

In [488]:
print(ap_1)
print(ap_2)

-96.00104407028091
-100.89728561566


In [486]:
raw_poe

b'LO! Death hath rear\'d himself a throne\nIn a strange city, all alone,\nFar down within the dim west\nWhere the good, and the bad, and the worst, and the best,\nHave gone to their eternal rest.\n\xe2\x80\x89\nThere shrines, and palaces, and towers\nAre not like any thing of ours\nOh no! O no! ours never loom\nTo heaven with that ungodly gloom!\nTime-eaten towers that tremble not!\nResemble nothing that is ours.\nAround, by lifting winds forgot,\nResignedly beneath the sky\nThe melancholy waters lie.\n\xe2\x80\x89\nNo holy rays from heaven come down\nOn the long night-time of that town,\nBut light from out the lurid sea\nStreams up the turrets silently\nUp thrones up long-forgotten bowers\nOf scultur\'d ivy and stone flowers\nUp domes up spires up kingly halls\nUp fanes up Babylon-like walls\nUp many a melancholy shrine\nWhose entablatures intertwine\nThe mask the viol and the vine.\n\xe2\x80\x89\nThere open temples open graves\nAre on a level with the waves\nBut not the riches there 