In [11]:
# Count tokens in the Zarf docs (.md / .mdx files)

import os
import tiktoken
from pathlib import Path

def count_tokens(text, model="gpt-4o-mini"):
    """Count the number of tokens in ``text`` using the tokenizer for ``model``.

    Args:
        text: String to tokenize.
        model: Model name handed to ``tiktoken.encoding_for_model`` to pick
            the encoding.

    Returns:
        The length of the token list produced by encoding ``text``.

    Raises:
        Whatever ``tiktoken.encoding_for_model`` raises for a model name it
        cannot map to an encoding.
    """
    encoding = tiktoken.encoding_for_model(model)
    # ``encode`` by default raises ValueError when the text contains the
    # literal spelling of a special token (e.g. "<|endoftext|>"). The input
    # here is arbitrary file content, so turn that check off and tokenize such
    # strings as ordinary text instead of aborting the whole count.
    return len(encoding.encode(text, disallowed_special=()))

def process_file(file_path, model="gpt-4o-mini"):
    """Read one UTF-8 text file and return how many tokens it contains.

    Args:
        file_path: Path of the file to open.
        model: Model name forwarded to ``count_tokens``.

    Returns:
        The value ``count_tokens`` reports for the file's full text.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    return count_tokens(text, model)

def process_folder(folder_path, model="gpt-4o-mini"):
    """Tally tokens for every ``.md`` / ``.mdx`` file under ``folder_path``.

    The tree is traversed recursively with ``os.walk``; files whose names do
    not end in one of the two suffixes are ignored.

    Args:
        folder_path: Root directory to scan.
        model: Model name forwarded to ``process_file`` for each file.

    Returns:
        A ``(total_tokens, file_counts)`` tuple. ``file_counts`` maps each
        matching file's path (as a string) to its token count, and
        ``total_tokens`` is the sum of those counts.
    """
    doc_suffixes = ('.md', '.mdx')
    file_counts = {}

    for dirpath, _dirnames, filenames in os.walk(folder_path):
        base = Path(dirpath)
        for name in filenames:
            if not name.endswith(doc_suffixes):
                continue
            doc_path = base / name
            file_counts[str(doc_path)] = process_file(doc_path, model)

    return sum(file_counts.values()), file_counts

# Root of the Zarf documentation tree and the model whose tokenizer is used.
folder_path = "../zarf/site/src/content/docs"
model = "gpt-4o-mini"

total_tokens, file_counts = process_folder(folder_path=folder_path, model=model)

print(f"\nTotal tokens across all files: {total_tokens}")
print("\nToken count for each file:")
# Largest files first; ties keep the order in which the walk found them.
for file in sorted(file_counts, key=file_counts.get, reverse=True):
    count = file_counts[file]
    print(f"{file}: {count} tokens")


Total tokens across all files: 86761

Token count for each file:
../zarf/site/src/content/docs/ref/init-package.mdx: 4219 tokens
../zarf/site/src/content/docs/ref/components.mdx: 3986 tokens
../zarf/site/src/content/docs/tutorials/0-creating-a-zarf-package.mdx: 2929 tokens
../zarf/site/src/content/docs/ref/actions.mdx: 2623 tokens
../zarf/site/src/content/docs/contribute/style-guide.mdx: 2575 tokens
../zarf/site/src/content/docs/tutorials/5-big-bang.mdx: 2096 tokens
../zarf/site/src/content/docs/ref/deploy.mdx: 2031 tokens
../zarf/site/src/content/docs/ref/packages.mdx: 1663 tokens
../zarf/site/src/content/docs/ref/values.mdx: 1630 tokens
../zarf/site/src/content/docs/tutorials/6-publish-and-deploy.mdx: 1629 tokens
../zarf/site/src/content/docs/faq.mdx: 1582 tokens
../zarf/site/src/content/docs/tutorials/7-custom-init-packages.mdx: 1545 tokens
../zarf/site/src/content/docs/contribute/testing.mdx: 1477 tokens
../zarf/site/src/content/docs/tutorials/2-deploying-zarf-packages.mdx: 1454 t

In [9]:
# Count tokens in the deployment log and event files

import os
import tiktoken
from pathlib import Path

def count_tokens(text, model="gpt-4"):
    """Count the number of tokens in ``text`` using the tokenizer for ``model``.

    NOTE(review): the default here is "gpt-4", while ``process_file`` and
    ``process_files`` in this cell default to "gpt-4o-mini". Both of them pass
    ``model`` explicitly, so the default only matters for direct calls;
    confirm whether the mismatch is intentional.

    Args:
        text: String to tokenize.
        model: Model name handed to ``tiktoken.encoding_for_model`` to pick
            the encoding.

    Returns:
        The length of the token list produced by encoding ``text``.

    Raises:
        Whatever ``tiktoken.encoding_for_model`` raises for a model name it
        cannot map to an encoding.
    """
    encoding = tiktoken.encoding_for_model(model)
    # ``encode`` by default raises ValueError when the text contains the
    # literal spelling of a special token (e.g. "<|endoftext|>"). Log and
    # event dumps are arbitrary text, so turn that check off and tokenize such
    # strings as ordinary text instead of aborting the whole count.
    return len(encoding.encode(text, disallowed_special=()))

def process_file(file_path, model="gpt-4o-mini"):
    """Read one UTF-8 text file and return its token count with its text.

    Args:
        file_path: Path of the file to open.
        model: Model name forwarded to ``count_tokens``.

    Returns:
        A ``(token_count, content)`` tuple: the value ``count_tokens`` reports
        for the file's full text, followed by that text.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    return count_tokens(text, model), text

def process_files(log_file, event_file, model="gpt-4o-mini"):
    """Tokenize the log file and the event file and combine the results.

    Args:
        log_file: Path of the log file.
        event_file: Path of the event file.
        model: Model name forwarded to ``process_file`` for both files.

    Returns:
        A 5-tuple ``(total_tokens, log_tokens, event_tokens, log_content,
        event_content)`` where ``total_tokens`` is the sum of the two counts.
    """
    # The log file is read first, then the event file.
    (log_tokens, log_content), (event_tokens, event_content) = [
        process_file(path, model) for path in (log_file, event_file)
    ]
    return (
        log_tokens + event_tokens,
        log_tokens,
        event_tokens,
        log_content,
        event_content,
    )

# Input files (relative to the notebook's working directory) and the model
# whose tokenizer is used for counting.
log_file = "core-deployed-logs-2.txt"
event_file = "core-deployed-events-2.txt"
model = "gpt-4o-mini"

(
    total_tokens,
    log_tokens,
    event_tokens,
    log_content,
    event_content,
) = process_files(log_file=log_file, event_file=event_file, model=model)

# Combined count first, then the per-file breakdown.
print(f"\nTotal tokens: {total_tokens}")
print(f"Log file tokens: {log_tokens}")
print(f"Event file tokens: {event_tokens}")


Total tokens: 105715
Log file tokens: 68183
Event file tokens: 37532


In [4]:
# Count tokens in the UDS docs (uds-core docs plus the Pepr READMEs)

import os
import tiktoken
from pathlib import Path

def count_tokens(text, model="gpt-4o-mini"):
    """Count the number of tokens in ``text`` using the tokenizer for ``model``.

    Args:
        text: String to tokenize.
        model: Model name handed to ``tiktoken.encoding_for_model`` to pick
            the encoding.

    Returns:
        The length of the token list produced by encoding ``text``.

    Raises:
        Whatever ``tiktoken.encoding_for_model`` raises for a model name it
        cannot map to an encoding.
    """
    encoding = tiktoken.encoding_for_model(model)
    # ``encode`` by default raises ValueError when the text contains the
    # literal spelling of a special token (e.g. "<|endoftext|>"). The input
    # here is arbitrary file content, so turn that check off and tokenize such
    # strings as ordinary text instead of aborting the whole count.
    return len(encoding.encode(text, disallowed_special=()))

def process_file(file_path, model="gpt-4o-mini"):
    """Read one UTF-8 text file and return how many tokens it contains.

    Args:
        file_path: Path of the file to open.
        model: Model name forwarded to ``count_tokens``.

    Returns:
        The value ``count_tokens`` reports for the file's full text.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    return count_tokens(text, model)

def process_folder(folder_path, model="gpt-4o-mini"):
    """Tally tokens for every ``.md`` / ``.mdx`` file under ``folder_path``.

    The tree is traversed recursively with ``os.walk``; files whose names do
    not end in one of the two suffixes are ignored.

    Args:
        folder_path: Root directory to scan.
        model: Model name forwarded to ``process_file`` for each file.

    Returns:
        A ``(total_tokens, file_counts)`` tuple. ``file_counts`` maps each
        matching file's path (as a string) to its token count, and
        ``total_tokens`` is the sum of those counts.
    """
    doc_suffixes = ('.md', '.mdx')
    file_counts = {}

    for dirpath, _dirnames, filenames in os.walk(folder_path):
        base = Path(dirpath)
        for name in filenames:
            if not name.endswith(doc_suffixes):
                continue
            doc_path = base / name
            file_counts[str(doc_path)] = process_file(doc_path, model)

    return sum(file_counts.values()), file_counts

# Two trees are scanned: the published uds-core docs and the Pepr source tree
# (only its .md / .mdx files are counted by process_folder).
folder_path = "../uds/uds-core/docs/"
operator_docs_folder = "../uds/uds-core/src/pepr"
model = "gpt-4o-mini"

docs_tokens, docs_file_counts = process_folder(folder_path=folder_path, model=model)
operator_docs_tokens, operator_file_counts = process_folder(
    folder_path=operator_docs_folder, model=model
)
# Merge the per-file maps; on a key clash the second tree's entry would win.
file_counts = {**docs_file_counts, **operator_file_counts}

print(f"\nTotal tokens across all files: {docs_tokens + operator_docs_tokens}")
print("\nToken count for each file:")
# Largest files first; ties keep the merged map's insertion order.
for file in sorted(file_counts, key=file_counts.get, reverse=True):
    count = file_counts[file]
    print(f"{file}: {count} tokens")


Total tokens across all files: 11379

Token count for each file:
../uds/uds-core/src/pepr/policies/README.md: 2942 tokens
../uds/uds-core/src/pepr/operator/README.md: 1952 tokens
../uds/uds-core/docs/configuration/uds-operator.md: 1919 tokens
../uds/uds-core/docs/deployment/uds-deploy.md: 1218 tokens
../uds/uds-core/docs/configuration/uds-monitoring-metrics.md: 993 tokens
../uds/uds-core/docs/application-baseline.md: 603 tokens
../uds/uds-core/docs/configuration/uds-user-groups.md: 576 tokens
../uds/uds-core/docs/_index.md: 409 tokens
../uds/uds-core/docs/configuration/uds-configure-policy-exemptions.md: 273 tokens
../uds/uds-core/docs/deployment/distribution-support.md: 266 tokens
../uds/uds-core/src/pepr/README.md: 99 tokens
../uds/uds-core/docs/development/uds-development-maintenance.md: 50 tokens
../uds/uds-core/src/pepr/istio/README.md: 24 tokens
../uds/uds-core/docs/deployment/_index.md: 19 tokens
../uds/uds-core/docs/configuration/_index.md: 18 tokens
../uds/uds-core/docs/devel