# Markov Chain Poetry Generator
 * This notebook implements a simple Markov chain–based model
 * that learns word patterns from Robert Frost’s poems and
 * generates new poetic lines in a similar style.


In [1]:
# Import required libraries
import random
import re
from collections import defaultdict, Counter


# 1. Load and Preprocess Text

This step reads the poem file **`robert_frost.txt`**, cleans and prepares it for the Markov model.

**Process overview:**
- Reads the poem line by line
- Removes punctuation and converts all text to lowercase
- Splits each line into words
- Adds an `"END"` token to mark the end of each line


In [2]:
def read_poem(path):
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    processed = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        line = re.sub(r"[^\w\s']", "", line).lower()
        words = line.split()
        if words:
            words.append("END")  # mark the end of each line
            processed.append(words)
    return processed


In [3]:
# Load and inspect the data
poem_lines = read_poem("robert_frost.txt")
print(f"Loaded {len(poem_lines)} lines.")
print("Example:", poem_lines[0][:10])


Loaded 1436 lines.
Example: ['two', 'roads', 'diverged', 'in', 'a', 'yellow', 'wood', 'END']


# 2. Train the Markov Model

In this section, we calculate the probability distributions that define the Markov chain.
The model learns how words tend to follow one another in Robert Frost’s poems.

**It computes:**
- **Initial word probabilities:** how likely each word is to start a line
- **First-order transitions:** P(w₂ | w₁) — the probability of a word given the previous one
- **Second-order transitions:** P(w₃ | w₁, w₂) — the probability of a word given the previous two


In [4]:
def train_model(lines):
    start_words = Counter()
    first_order = defaultdict(Counter)
    second_order = defaultdict(Counter)

    for words in lines:
        start_words[words[0]] += 1
        for i in range(len(words) - 1):
            w1, w2 = words[i], words[i + 1]
            first_order[w1][w2] += 1
            if i < len(words) - 2:
                w3 = words[i + 2]
                second_order[(w1, w2)][w3] += 1

    total_starts = sum(start_words.values())
    start_probs = {w: c / total_starts for w, c in start_words.items()}

    first_probs = {}
    for w1, nexts in first_order.items():
        total = sum(nexts.values())
        first_probs[w1] = {w2: c / total for w2, c in nexts.items()}

    second_probs = {}
    for pair, nexts in second_order.items():
        total = sum(nexts.values())
        second_probs[pair] = {w3: c / total for w3, c in nexts.items()}

    return start_probs, first_probs, second_probs


In [5]:
# Train the model
start_probs, first_probs, second_probs = train_model(poem_lines)

print("Initial words:", list(start_probs.keys())[:10])


Initial words: ['two', 'and', 'to', 'then', 'because', 'though', 'had', 'in', 'oh', 'yet']


# 3. Random Word Selection

This part defines how the next word is chosen based on its probability distribution.
It uses the **cumulative probability method**, which ensures that words with higher probabilities
have a proportionally higher chance of being selected.


In [6]:
def pick_word(prob_dict):
    r = random.random()
    total = 0
    for word, p in prob_dict.items():
        total += p
        if r <= total:
            return word
    return random.choice(list(prob_dict.keys()))



# 4. Generate new poetry


In [7]:
def generate_poem(start_probs, first_probs, second_probs, lines=4, max_len=12):
    poem = []
    for _ in range(lines):
        line = []

        # choose first word
        w1 = pick_word(start_probs)
        line.append(w1)

        # choose second word
        if w1 not in first_probs:
            continue
        w2 = pick_word(first_probs[w1])
        line.append(w2)

        # generate remaining words
        for _ in range(max_len - 2):
            pair = (w1, w2)
            if pair in second_probs:
                w3 = pick_word(second_probs[pair])
            elif w2 in first_probs:
                w3 = pick_word(first_probs[w2])
            else:
                break

            if w3 == "END":
                break

            line.append(w3)
            w1, w2 = w2, w3

        poem.append(" ".join(line))
    return "\n".join(poem)


In [8]:
print("\nGenerated Frost-Style Poem:\n")
print(generate_poem(start_probs, first_probs, second_probs))



Generated Frost-Style Poem:

i'll show you how she tended both or had them tended
there if you'll let a guide direct you
'don't make me get up forgive me
the chin
