In [1]:
import random
import re

In [None]:
def sample_paragraph_splits(
    file_path: str = "input.txt",
    num_samples: int = 10,
    k: int = 5,
    eot_token: str = "<|endoftext|>",
    train_pct: int = 80,
    seed: "int | None" = None,
) -> tuple[tuple[list[str], list[str]], tuple[list[str], list[str]]]:
    """
    Build (prompt, reference) string pairs from randomly sampled paragraphs
    and split them into a training part and a test part.

    The text in `file_path` is split into paragraphs (blocks separated by
    blank lines) and `num_samples` paragraphs are drawn at random: without
    replacement when the file has enough paragraphs, with replacement
    otherwise. For each drawn paragraph:
      - the paragraph is split on whitespace into tokens
      - the prompt is the first `k` tokens joined by single spaces; it never
        contains the EOT marker, even when the paragraph has fewer than `k`
        tokens
      - the reference is the whole paragraph (prompt included) followed by
        `eot_token`, joined by single spaces

    Args:
      file_path: UTF-8 text file to read.
      num_samples: number of (prompt, reference) pairs to produce.
      k: number of leading tokens that make up each prompt.
      eot_token: end-of-text marker appended to every reference.
      train_pct: percentage of the samples placed in the training part;
        the count is rounded down and clamped to [0, num_samples].
      seed: when given, sampling uses a private `random.Random(seed)` so the
        result is reproducible; when None, the module-level `random` state
        is used.

    Returns:
      ((train_prompts, train_references), (test_prompts, test_references)),
      four lists of strings; prompts and references are aligned by index.

    Raises:
      IndexError: if the file contains no paragraphs and `num_samples` > 0.
      ValueError: if `num_samples` is negative.
    """
    # 1) Load and split into paragraphs (blocks separated by blank lines)
    with open(file_path, "r", encoding="utf-8") as f:
        raw = f.read()
    paragraphs = [p.strip() for p in re.split(r"\n\s*\n", raw) if p.strip()]

    # 2) Sample paragraphs (without replacement if possible)
    rng = random if seed is None else random.Random(seed)
    if len(paragraphs) >= num_samples:
        selected = rng.sample(paragraphs, num_samples)
    else:
        selected = [rng.choice(paragraphs) for _ in range(num_samples)]

    # The marker is tokenized like the text so the joined output is uniform.
    eot_parts = eot_token.split()

    prompts: list[str] = []
    references: list[str] = []

    for para in selected:
        # 3) Tokenize on whitespace and append the EOT marker
        words = para.split()
        tokens = words + eot_parts

        # 4) The prompt is the first k tokens, capped at the paragraph's own
        #    words so a short paragraph cannot leak the EOT marker into it.
        prompt_tokens = tokens[:k][: len(words)]

        # 5) The reference is the full text; joining all tokens at once
        #    avoids stray leading/trailing spaces when either part is empty.
        prompts.append(" ".join(prompt_tokens))
        references.append(" ".join(tokens))

    # Split into training and testing. Multiply before dividing so integer
    # percentages are exact (float math can truncate e.g. 29% of 100 to 28),
    # then clamp so a negative percentage cannot wrap around via slicing.
    total = len(prompts)
    train_size = max(0, min(total, int(train_pct * total // 100)))

    train_prompts = prompts[:train_size]
    train_references = references[:train_size]
    test_prompts = prompts[train_size:]
    test_references = references[train_size:]

    return (train_prompts, train_references), (test_prompts, test_references)

In [13]:
# sample_paragraph_splits() returns (train, test), where each element is a
# (prompts, references) pair of lists, so `x` holds the training pair and `y`
# holds the test pair.
x, y = sample_paragraph_splits()

In [15]:
# `x` is the training split: a (prompts, references) pair of index-aligned
# lists. The first example is therefore x[0][0] with its reference x[1][0];
# plain x[0] / y[0] would be the whole train-prompt and test-prompt lists.
x0 = x[0][0]
y0 = x[1][0]

print(f"x0 = {x0}")
print(f"y0 = {y0}")

x0 = “Yes, confound it! Yes,” answered
y0 = “Yes, confound it! Yes,” answered Ned Land, “it is superb! I am mad at being obliged to admit it. No one has ever seen anything like it; but the sight may cost us dear. And, if I must say all, I think we are seeing here things which God never intended man to see.” <|endoftext|>
