In [1]:
from collections import OrderedDict

# -------- CONFIG --------
# Path of the text file to analyse; it is opened as UTF-8 and read line by line.
TEXT_FILE = "combined.txt"
# Width of each histogram bin, measured in words per line.
BIN_SIZE = 25
# Largest word count tracked; lines with more words are clamped to this value,
# so they are counted in the last bin.
MAX_WORDS = 274
# ------------------------

def count_words(line: str) -> int:
    """Return the number of whitespace-separated words in *line*.

    Leading/trailing whitespace and runs of spaces or tabs never produce
    empty words, so a blank or whitespace-only line yields 0.
    """
    # split() without a separator already ignores surrounding whitespace
    # and collapses consecutive whitespace characters.
    words = line.split()
    return len(words)

# Histogram keyed by inclusive (start, end) word-count ranges, in ascending
# order: 0-24, 25-49, ... The final range is cut off at MAX_WORDS.
bins = OrderedDict(
    ((lower, min(lower + BIN_SIZE - 1, MAX_WORDS)), 0)
    for lower in range(0, MAX_WORDS + 1, BIN_SIZE)
)

# Running totals over the whole input file.
total_lines = 0
empty_lines = 0

# Stream the file line by line so it is never loaded into memory at once.
with open(TEXT_FILE, "r", encoding="utf-8") as f:
    for line in f:
        total_lines += 1

        wc = count_words(line)

        # A line with no words is blank or whitespace-only. It is still
        # tallied in the first bin, but through the same clamped key
        # computation as every other line, so the key always exists even
        # when BIN_SIZE - 1 exceeds MAX_WORDS.
        if wc == 0:
            empty_lines += 1

        # Clamp anything above MAX_WORDS so it lands in the last bin.
        wc = min(wc, MAX_WORDS)

        # Round down to the start of the bin; the end mirrors how the bin
        # keys were built (inclusive, cut off at MAX_WORDS).
        bin_start = (wc // BIN_SIZE) * BIN_SIZE
        bin_end = min(bin_start + BIN_SIZE - 1, MAX_WORDS)

        bins[(bin_start, bin_end)] += 1

# -------- OUTPUT --------
# Summary counters first, followed by a blank line.
print(f"Total lines   : {total_lines}")
print(f"Empty lines  : {empty_lines}\n")

# One row per bin, in the order the bins were created.
print("Word-count distribution:")
for word_range, count in bins.items():
    start, end = word_range
    print(f"{start:>3}-{end:<3} words : {count}")


Total lines   : 26557763
Empty lines  : 5

Word-count distribution:
  0-24  words : 15558549
 25-49  words : 6464184
 50-74  words : 2390993
 75-99  words : 1069337
100-124 words : 505657
125-149 words : 258404
150-174 words : 140274
175-199 words : 78075
200-224 words : 45603
225-249 words : 28673
250-274 words : 18014
