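## Markov chain text generator

- Estimate the probability distribution of the first word of a line
- Fit a 1st-order Markov model for the second word (conditioned on the first word)
- Fit a 2nd-order Markov model for the rest of the text (each word conditioned on the two preceding words)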



In [3]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt


--2025-05-18 21:08:12--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8003::154, 2606:50c0:8002::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56286 (55K) [text/plain]
Saving to: ‘robert_frost.txt’


2025-05-18 21:08:14 (110 KB/s) - ‘robert_frost.txt’ saved [56286/56286]



In [265]:
import string 
import numpy as np
import random

In [5]:
# Load the poem corpus, keeping one whitespace-trimmed string per line of the file.
with open("robert_frost.txt", mode="r", encoding="utf-8") as f:
    lines_text = list(map(str.strip, f))

In [13]:
# Keep only lines with visible content and lower-case them, in a single pass.
# An empty string is falsy, so `if i.strip()` drops blank / whitespace-only lines.
lines_text = [i.lower() for i in lines_text if i.strip()]

In [27]:
# Build the punctuation-stripping table once instead of once per line.
_punct_table = str.maketrans("", "", string.punctuation)

# Tokenize each line on whitespace after deleting the characters in string.punctuation.
# A line made up only of punctuation yields no tokens; such lines are dropped here
# because the counting cells index line[0] and would raise IndexError on an empty list.
lines_token = [
    tokens
    for tokens in (line.translate(_punct_table).split() for line in lines_text)
    if tokens
]

## Count: 1st, 2nd and next word

In [151]:
# Count how often each word opens a line.
initword_freq = {}

for line in lines_token:
    # .get() supplies 0 the first time a word is seen
    initword_freq[line[0]] = initword_freq.get(line[0], 0) + 1

In [153]:
# Second-word counts: secword_freq[w1][w2] = number of lines that start with "w1 w2".
secword_freq = {}
for line in lines_token:
    # one-word lines have no second word to record
    if len(line) < 2:
        continue
    first_wrd, second_wrd = line[0], line[1]
    followers = secword_freq.setdefault(first_wrd, {})
    followers[second_wrd] = followers.get(second_wrd, 0) + 1

In [155]:
# Next-word counts given the two preceding words:
# thrdword_freq[w1][w2][w3] = number of times w3 directly follows the pair (w1, w2) within a line.
thrdword_freq = {}
for words_token in lines_token:
    # a line needs at least two words to provide a starting pair
    if len(words_token) < 2:
        continue
    token1, token2 = words_token[0], words_token[1]
    for token in words_token[2:]:
        pair_counts = thrdword_freq.setdefault(token1, {}).setdefault(token2, {})
        pair_counts[token] = pair_counts.get(token, 0) + 1
        # slide the two-word context one position to the right
        token1, token2 = token2, token

## Probability: 1st, 2nd and next word

In [179]:
# Turn first-word counts into relative frequencies over all tokenized lines.
line_length = len(lines_token)
initword_prob = {}
for token, count in initword_freq.items():
    initword_prob[token] = count / line_length

In [195]:
# Normalize second-word counts per first word:
# secword_prob[w1][w2] = count(w1, w2) / total count of second words seen after w1.
secword_prob = {}
for token1, follower_counts in secword_freq.items():
    token1_sum = sum(follower_counts.values())
    secword_prob[token1] = {}
    for token2, count in follower_counts.items():
        secword_prob[token1][token2] = count / token1_sum

In [253]:
# Normalize next-word counts per two-word context:
# thrdword_prob[w1][w2][w3] = count(w1, w2, w3) / total count of words seen after (w1, w2).
thrdword_prob = {}
for token1, by_second in thrdword_freq.items():
    thrdword_prob[token1] = {}
    for token2, third_counts in by_second.items():
        token2_sum = sum(third_counts.values())
        thrdword_prob[token1][token2] = {}
        for token3, count in third_counts.items():
            thrdword_prob[token1][token2][token3] = count / token2_sum

In [261]:
# Display the first-word distribution: each value is that word's line-opening count divided by the number of tokenized lines
initword_prob

{'two': 0.005571030640668524,
 'and': 0.08983286908077995,
 'to': 0.034818941504178275,
 'then': 0.008356545961002786,
 'because': 0.0006963788300835655,
 'though': 0.004874651810584958,
 'had': 0.002785515320334262,
 'in': 0.0201949860724234,
 'oh': 0.002785515320334262,
 'yet': 0.0020891364902506965,
 'i': 0.08217270194986072,
 'somewhere': 0.0006963788300835655,
 'whose': 0.001392757660167131,
 'his': 0.004874651810584958,
 'he': 0.023676880222841225,
 'my': 0.004874651810584958,
 'between': 0.0020891364902506965,
 'the': 0.057103064066852366,
 'of': 0.0201949860724234,
 'but': 0.035515320334261836,
 'some': 0.003481894150417827,
 'from': 0.006963788300835654,
 'is': 0.003481894150417827,
 'natures': 0.0006963788300835655,
 'her': 0.001392757660167131,
 'so': 0.009052924791086351,
 'nothing': 0.001392757660167131,
 'when': 0.006267409470752089,
 'came': 0.0006963788300835655,
 'one': 0.00766016713091922,
 'proclaimed': 0.0006963788300835655,
 'smoothlaid': 0.0006963788300835655,
 'h

In [278]:
def select_word(prob_dict):
    """Randomly draw one key from ``prob_dict`` in proportion to its weight.

    Args:
        prob_dict: Mapping of word -> non-negative weight. The weights are
            normally probabilities summing to 1, but they do not have to be
            normalized (raw counts work as well) because ``random.choices``
            scales by the total weight.

    Returns:
        One key of ``prob_dict``. Keys whose weight is 0 are never returned.

    Raises:
        ValueError: If ``prob_dict`` is empty. ``random.choices`` also raises
            ValueError when the weights do not sum to a positive value
            (Python 3.9+).
    """
    # Fail with a clear message instead of the UnboundLocalError that an empty
    # dictionary would trigger in a hand-written cumulative scan.
    if not prob_dict:
        raise ValueError("select_word() needs a non-empty probability dictionary")

    words = list(prob_dict)
    weights = list(prob_dict.values())
    # random.choices performs the cumulative-weight lookup itself and copes
    # with floating-point totals that are not exactly 1.0.
    return random.choices(words, weights=weights, k=1)[0]


In [320]:
def generate_word(n_words=4):
    """Sample a short phrase from the fitted first/second/next-word tables.

    The first word is drawn from ``initword_prob``, the second from
    ``secword_prob[first]`` and every later word from
    ``thrdword_prob[w1][w2]``, where ``(w1, w2)`` are the two most recently
    generated words.

    Args:
        n_words: Maximum number of words to generate. Defaults to 4, the
            length that was previously hard-coded.

    Returns:
        The generated words, each followed by a single space (so a non-empty
        result ends with a trailing space, as before). The phrase is shorter
        than ``n_words`` when the current context has no recorded
        continuation, and empty when nothing can be generated.
    """
    words = []
    for _ in range(n_words):
        if not words:
            candidates = initword_prob
        elif len(words) == 1:
            # A first word that only occurred on one-word lines has no entry
            # here; .get() avoids the KeyError and lets us stop cleanly.
            candidates = secword_prob.get(words[0])
        else:
            # A pair that was only ever seen at the end of a line has no
            # recorded successor, so treat it as the end of the phrase.
            candidates = thrdword_prob.get(words[-2], {}).get(words[-1])

        if not candidates:
            break
        words.append(select_word(candidates))

    return "".join(word + " " for word in words)
            

In [344]:
# Draw one sample phrase; the result is random (no seed is set in the cells shown), so it differs between runs
generate_word()

'son it left the '