In [1]:
import tiktoken

In [2]:
def format_text(text: str) -> str:
    """
    Collapse multi-line text into a single line.

    Each line is stripped of surrounding whitespace and blank lines are
    dropped. The remaining lines are joined with a single space, except
    that no space is inserted after a line ending in a hyphen, so a word
    split across lines as "exam-" / "ple" is rejoined as "exam-ple" (the
    hyphen itself is kept).

    Args:
        text (str): The input multi-line text

    Returns:
        str: Formatted single-line text ("" if the input has no content)
    """
    # Strip once per line, then discard lines that were only whitespace.
    stripped_lines = (line.strip() for line in text.split("\n"))
    lines = [line for line in stripped_lines if line]

    # Collect the pieces and join once at the end instead of growing a
    # string with "+=" on every iteration.
    pieces = []
    previous_ends_with_hyphen = False
    for index, line in enumerate(lines):
        # A separating space is only needed between lines, and only when
        # the previous line did not end with a hyphen.
        if index > 0 and not previous_ends_with_hyphen:
            pieces.append(" ")
        pieces.append(line)
        previous_ends_with_hyphen = line.endswith("-")

    return "".join(pieces)

In [3]:
# Read the whole source Markdown file into memory. The encoding is pinned to
# UTF-8 so decoding does not depend on the platform's default locale encoding.
with open("wolf359_episodes_1_to_30.md", encoding="utf-8") as f:
    input_text = f.read()

In [4]:
# Collapse the loaded text to a single line and save it next to the original.
formatted = format_text(input_text)
# Write as UTF-8 explicitly so the output does not depend on the platform's
# default locale encoding.
with open("wolf359_episodes_1_to_30_formatted.md", "w", encoding="utf-8") as f:
    f.write(formatted)

In [5]:
def num_tokens_from_string(string, encoding):
    """
    Returns the number of tokens in a text string, formatted with K/M suffixes.
    Examples: 1500 -> 1.5K, 1500000 -> 1.5M

    Args:
        string: The text to tokenize.
        encoding: Any object whose ``encode(string)`` returns a sized
            sequence of tokens (e.g. a tiktoken encoding).

    Returns:
        str: The plain count below 1000, otherwise the count scaled to
        thousands ("K") or millions ("M") with one decimal place.
    """
    num_tokens = len(encoding.encode(string))

    if num_tokens < 1_000:
        return str(num_tokens)

    if num_tokens < 1_000_000:
        thousands = f"{num_tokens / 1_000:.1f}"
        # Counts just under one million (e.g. 999,960) round up to
        # "1000.0"; those are promoted to the M suffix below instead of
        # being reported as "1000.0K".
        if float(thousands) < 1_000:
            return f"{thousands}K"

    return f"{num_tokens / 1_000_000:.1f}M"

In [6]:
# Look up the tiktoken encoding associated with the "gpt-4o" model name; it is
# passed to num_tokens_from_string for the counts below.
encoding = tiktoken.encoding_for_model("gpt-4o")

In [7]:
# Token count of the text exactly as read from the source file.
num_tokens_from_string(input_text, encoding)

'186.7K'

In [8]:
# Token count of the same text after format_text collapsed it to one line.
num_tokens_from_string(formatted, encoding)

'161.5K'