In [1]:
import re
import random
from collections import Counter, defaultdict
import sys

# Path of the training corpus; passed to train_from_file() by the driver cell.
INPUT_FILE = "robert_frost.txt"
# Sentinel recorded after the last word of every training line. Sampling it
# during generation ends the line. It is upper-case, while corpus words are
# lower-cased by clean_line(), so it cannot be confused with a real word.
END_TOKEN = "END"
# Default number of lines produced by generate_poem().
LINES_TO_GENERATE = 4
# Upper bound on the number of words generate_line() emits when END_TOKEN is
# never sampled.
MAX_WORDS_PER_LINE = 50


In [2]:
def clean_line(line: str) -> str:
    """Normalize one line of text before it is split into words.

    Every character that is neither a regex word character nor whitespace is
    deleted (not replaced by a space, so "don't" becomes "dont"), the result
    is lower-cased, and leading/trailing whitespace is trimmed.

    Args:
        line: Raw text of a single line.

    Returns:
        The normalized line; an empty string if nothing but punctuation and
        whitespace was present.
    """
    not_word_or_space = re.compile(r"[^\w\s]")
    without_punctuation = not_word_or_space.sub("", line)
    lowered = without_punctuation.lower()
    return lowered.strip()


In [3]:
def cumulative_choice(prob_dict):
    """Draw one key of ``prob_dict`` at random, weighted by its value.

    The weights do not have to sum to 1; they are treated as relative weights.
    An entry with weight 0 is never returned.

    Args:
        prob_dict: Mapping from candidate to a non-negative weight.

    Returns:
        One key of ``prob_dict``.

    Raises:
        ValueError: If the mapping is empty, the weights sum to a
            non-positive value, or any individual weight is negative.
    """
    if not prob_dict:
        raise ValueError("Empty probability distribution in cumulative_choice.")
    items = list(prob_dict.items())
    total = sum(weight for _, weight in items)
    if total <= 0:
        raise ValueError("Non-positive total probability in cumulative_choice.")
    # A negative weight would make the running sum below non-monotonic and
    # silently skew the draw, so reject it explicitly.
    if any(weight < 0 for _, weight in items):
        raise ValueError("Negative probability in cumulative_choice.")

    # random.random() lies in [0.0, 1.0); scaling by the total avoids dividing
    # every weight individually.
    threshold = random.random() * total
    running = 0.0
    for word, weight in items:
        running += weight
        # Strict comparison: with "<=" a zero-weight entry at the front would
        # be selected whenever the draw is exactly 0.0.
        if threshold < running:
            return word

    # Only reachable through floating-point rounding of the running sum.
    # Fall back to the last entry that can legitimately be drawn.
    for word, weight in reversed(items):
        if weight > 0:
            return word
    return items[-1][0]


In [4]:
def train_from_file(filename: str):
    """Build a line-level word model (initial, bigram, trigram, unigram) from a text file.

    Each line of the file is normalized with ``clean_line`` and split on
    whitespace; lines that yield no words are skipped. For every remaining
    line the function counts:

    * the first word (initial distribution),
    * every word plus one ``END_TOKEN`` (unigram distribution),
    * next-word counts given the previous word (``END_TOKEN`` follows the
      last word),
    * next-word counts given the previous two words (likewise terminated by
      ``END_TOKEN``).

    Args:
        filename: Path of a UTF-8 text file.

    Returns:
        ``None`` if the file cannot be read or contains no usable line (a
        message is printed to stderr). Otherwise a dict with the keys
        ``"initial_probs"``, ``"first_probs"``, ``"second_probs"``,
        ``"unigram_probs"`` (relative frequencies) and ``"counts"`` (the raw
        counters plus ``"total_lines"``).
    """
    try:
        with open(filename, "r", encoding="utf-8") as f:
            raw_lines = f.readlines()
    except FileNotFoundError:
        # Report the path that was actually requested instead of a fixed name.
        print(
            f"Error: file '{filename}' not found. "
            "Check the path or place the file in the working directory.",
            file=sys.stderr,
        )
        return None
    except Exception as e:
        print(f"Error reading '{filename}': {e}", file=sys.stderr)
        return None

    initial_counts = Counter()
    unigram_counts = Counter()
    first_counts = defaultdict(Counter)
    second_counts = defaultdict(Counter)
    total_nonempty_lines = 0

    for raw in raw_lines:
        # clean_line() already trims the line, and split() returns [] for an
        # empty string, so a single emptiness check is enough.
        words = clean_line(raw).split()
        if not words:
            continue

        total_nonempty_lines += 1
        initial_counts[words[0]] += 1

        # Appending the end marker lets one pass over consecutive pairs and
        # triples cover both the in-line transitions and the line ending.
        tokens = words + [END_TOKEN]
        unigram_counts.update(tokens)
        for prev, nxt in zip(tokens, tokens[1:]):
            first_counts[prev][nxt] += 1
        for before, prev, nxt in zip(tokens, tokens[1:], tokens[2:]):
            second_counts[(before, prev)][nxt] += 1

    if total_nonempty_lines == 0:
        print("Error: no non-empty lines found in the input file.", file=sys.stderr)
        return None

    return {
        "initial_probs": _relative_frequencies(initial_counts),
        "first_probs": {
            prev: _relative_frequencies(counter)
            for prev, counter in first_counts.items()
            if sum(counter.values()) > 0
        },
        "second_probs": {
            prevbig: _relative_frequencies(counter)
            for prevbig, counter in second_counts.items()
            if sum(counter.values()) > 0
        },
        "unigram_probs": _relative_frequencies(unigram_counts),
        "counts": {
            "initial_counts": initial_counts,
            "first_counts": first_counts,
            "second_counts": second_counts,
            "unigram_counts": unigram_counts,
            "total_lines": total_nonempty_lines
        }
    }


def _relative_frequencies(counter):
    """Return ``{key: count / total}`` for a counter whose total is positive."""
    total = sum(counter.values())
    return {key: count / total for key, count in counter.items()}


In [5]:
def generate_line(model, max_words=None):
    """Sample one line of text from a trained model.

    The first word is drawn from the initial distribution. Each later word is
    drawn from the most specific distribution available for the current
    context: previous two words, then previous word, then the unigram
    distribution. Generation stops when ``END_TOKEN`` is drawn or the word
    limit is reached.

    Args:
        model: Dict with the keys ``"initial_probs"``, ``"first_probs"``,
            ``"second_probs"`` and ``"unigram_probs"``, as returned by
            ``train_from_file``.
        max_words: Maximum number of words in the line. ``None`` (default)
            uses ``MAX_WORDS_PER_LINE`` as it is set at call time.

    Returns:
        The generated words joined by single spaces; an empty string if no
        word was produced.
    """
    if max_words is None:
        max_words = MAX_WORDS_PER_LINE

    initial = model["initial_probs"]
    first_probs = model["first_probs"]
    second_probs = model["second_probs"]
    unigram_probs = model["unigram_probs"]

    words = []
    # A single loop enforces the limit for every position, including the
    # first two words.
    while len(words) < max_words:
        if not words:
            distribution = initial
        elif len(words) >= 2 and (words[-2], words[-1]) in second_probs:
            distribution = second_probs[(words[-2], words[-1])]
        elif words[-1] in first_probs:
            distribution = first_probs[words[-1]]
        else:
            # Unseen context: back off to overall word frequencies.
            distribution = unigram_probs

        next_word = cumulative_choice(distribution)
        if next_word == END_TOKEN:
            break
        words.append(next_word)

    return " ".join(words)


In [6]:
def generate_poem(model, n_lines=LINES_TO_GENERATE):
    """Generate ``n_lines`` lines by calling ``generate_line`` once per line.

    Args:
        model: Trained model dict, passed unchanged to ``generate_line``.
        n_lines: Number of lines to produce.

    Returns:
        A list of ``n_lines`` strings, in generation order.
    """
    poem_lines = []
    for _ in range(n_lines):
        poem_lines.append(generate_line(model))
    return poem_lines


In [7]:
# Train on the configured corpus, then report a few statistics and print a
# sample poem.
# NOTE(review): no random.seed() call is visible in this notebook, so the
# generated poem presumably differs on every run.
model = train_from_file(INPUT_FILE)
if model is None:
    # Refer to the configured path so the hint stays correct if INPUT_FILE changes.
    print(f"Model training failed. Make sure '{INPUT_FILE}' is available.")
else:
    print(f"Trained on {model['counts']['total_lines']} non-empty lines.\n")
    print("Sample initial word probabilities (top 10):")
    # Sort by descending probability; ties keep their first-seen order.
    sorted_initial = sorted(model["initial_probs"].items(), key=lambda x: -x[1])
    for w, p in sorted_initial[:10]:
        print(f"  {w!r}: {p:.3f}")
    print("\nGenerated poem:\n")
    poem = generate_poem(model, LINES_TO_GENERATE)
    for line in poem:
        print(line)


Trained on 1436 non-empty lines.

Sample initial word probabilities (top 10):
  'and': 0.090
  'i': 0.082
  'the': 0.057
  'but': 0.036
  'to': 0.035
  'you': 0.028
  'he': 0.024
  'a': 0.021
  'in': 0.020
  'of': 0.020

Generated poem:

or vase or picture
i have no doubt its grown up some to woods around it
over
once told me
