In [33]:
import numpy as np
import string
from collections import defaultdict, Counter

In [34]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character deleted.

    The characters dropped are exactly those listed in
    ``string.punctuation``; every other character is kept, in its
    original order.
    """
    unwanted = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in unwanted)

In [35]:
def to_probabilities(word_dict):
    """Normalise a mapping of counts into relative frequencies.

    Parameters
    ----------
    word_dict : mapping
        Keys mapped to numeric counts (a ``dict`` or ``Counter``).

    Returns
    -------
    dict
        The same keys in the same order, each count divided by the sum of
        all counts. An empty mapping yields an empty dict.

    Raises
    ------
    ZeroDivisionError
        If the mapping is non-empty but its counts sum to zero.
    """
    denominator = sum(word_dict.values())
    probabilities = {}
    for word, count in word_dict.items():
        probabilities[word] = count / denominator
    return probabilities

In [36]:
def sample_word(probs):
    """Draw one key from ``probs`` at random, weighted by its value.

    Parameters
    ----------
    probs : dict
        Maps each candidate word to its probability.

    Returns
    -------
    The first key (in dict order) at which the running sum of values
    exceeds a single uniform draw from ``np.random.random()``. If no
    running sum exceeds the draw, the last key is returned.
    """
    threshold = np.random.random()  # uniform draw in [0, 1)
    running_total = 0
    for candidate in probs:
        running_total += probs[candidate]
        if running_total > threshold:
            return candidate
    # Only reached when the accumulated values never pass the draw (for
    # example when they add up to slightly less than 1); fall back to the
    # last key so the caller still receives a word.
    return list(probs)[-1]


In [37]:
# Relative file name of the training text; the same name is handed to
# train_markov further down.
path = 'robert_frost.txt'

In [38]:
def train_markov(path):
    """Estimate a second-order Markov model of word sequences from a text file.

    Each non-blank line of the file is stripped, lower-cased, cleaned with
    ``remove_punctuation`` and split on whitespace. Lines that end up with
    no words are ignored.

    Parameters
    ----------
    path : str
        File to read, opened in text mode with the default encoding.

    Returns
    -------
    tuple of (initial_probs, first_order, second_order)
        initial_probs : dict
            word -> probability that the word opens a line.
        first_order : dict
            first word -> {second word: probability}.
        second_order : dict
            (word before last, last word) -> {next word: probability}.
            The special follower ``'END'`` marks the end of a line.

    Notes
    -----
    A one-word line is counted in ``initial_probs`` only; it adds nothing
    to ``first_order`` or ``second_order``.
    """
    first_word_counts = Counter()
    after_first = defaultdict(list)
    after_pair = defaultdict(list)

    with open(path, 'r') as corpus:
        stripped_lines = (raw.strip() for raw in corpus)
        token_lists = [
            remove_punctuation(text.lower()).split()
            for text in stripped_lines
            if text
        ]

    for tokens in token_lists:
        if not tokens:
            # The line consisted of punctuation only.
            continue
        first_word_counts[tokens[0]] += 1

        if len(tokens) == 1:
            # Nothing follows the opening word, so no transition is stored.
            continue

        after_first[tokens[0]].append(tokens[1])
        # Slide a three-word window over the line: two words of context
        # followed by the word they lead to.
        for before_last, last, upcoming in zip(tokens, tokens[1:], tokens[2:]):
            after_pair[(before_last, last)].append(upcoming)
        # The closing pair of words leads to the end-of-line marker.
        after_pair[(tokens[-2], tokens[-1])].append('END')

    return (
        to_probabilities(first_word_counts),
        {
            start: to_probabilities(Counter(followers))
            for start, followers in after_first.items()
        },
        {
            pair: to_probabilities(Counter(followers))
            for pair, followers in after_pair.items()
        },
    )

In [39]:
def generator(initial_probs, first_order, second_order):
    """Generate one line of text from the trained Markov model.

    Parameters
    ----------
    initial_probs : dict
        word -> probability of opening a line.
    first_order : dict
        first word -> {second word: probability}.
    second_order : dict
        (word before last, last word) -> {next word: probability}, where
        the follower ``'END'`` terminates the line.

    Returns
    -------
    str
        The sampled words joined by single spaces. If the sampled opening
        word has no entry in ``first_order``, that word alone is returned.
    """
    w1 = sample_word(initial_probs)

    # An opening word that only ever occurred as a complete one-word line
    # is present in initial_probs but has no first-order transitions.
    # Emit it as a one-word line instead of failing with KeyError.
    followers = first_order.get(w1)
    if not followers:
        return w1

    w2 = sample_word(followers)
    line = [w1, w2]

    while True:
        next_word = sample_word(second_order[(w1, w2)])
        if next_word == 'END':
            break
        line.append(next_word)
        # Shift the two-word context window forward by one word.
        w1, w2 = w2, next_word
    return ' '.join(line)

In [40]:
def poem(initial_probs, first_order, second_order, num_of_lines = 4):
    """Print ``num_of_lines`` generated lines, one per row.

    The three model dictionaries are passed unchanged to ``generator`` for
    every line. Nothing is returned.
    """
    for _line_number in range(num_of_lines):
        verse = generator(initial_probs, first_order, second_order)
        print(verse)

In [41]:
# Train on the corpus named by `path` (assigned in an earlier cell) instead
# of repeating the file name, so the location is configured in one place.
initial_probs, first_order, second_order = train_markov(path)

In [43]:
# Print an eight-line poem sampled from the trained model.
poem(initial_probs, first_order, second_order, num_of_lines=8)

and scurf of plants and weary and overheated
he guessed theyd know what a name should matter between us
im waiting
of youthful forms and youthful faces
as one who clasps her heart of one that didnt thrive
nor never will be to have two towns fighting
take it as i say
my french indian esquimaux
