# Tokenization compression ratios

In [1]:
import requests
from urllib.parse import urlparse

import string
import tiktoken

In [2]:
# GPT-4's Tokenizer
tokenizer = tiktoken.get_encoding('cl100k_base')

# get the books

In [3]:
# all books have the same url format;
# they are unique by numerical code
baseurl = 'https://www.gutenberg.org/cache/epub/'

bookurls = [
    # code       title
    ['84',    'Frankenstein'    ],
    ['64317', 'GreatGatsby'     ],
    ['11',    'AliceWonderland' ],
    ['1513',  'RomeoJuliet'     ],
    ['76',    'HuckFinn'        ],
    ['219',   'HeartDarkness'   ],
    ['2591',  'GrimmsTales'     ],
    ['2148',  'EdgarAllenPoe'   ],
    ['36',    'WarOfTheWorlds'  ],
    ['829',   'GulliversTravels']
]

In [4]:
print('  Book title     |  Chars  |  Tokens | Compression')
print('-'*50)

for code, title in bookurls:
    # get the text
    fullurl = baseurl + code + '/pg' + code + '.txt'
    text = requests.get(fullurl).text
    num_chars = len(text)

    # tokenize
    tokens = tokenizer.encode(text)
    num_tokens = len(tokens)

    # compression ratio
    compress = (num_tokens / num_chars) * 100

    print(f"{title:16} | {num_chars:>7,d} | {num_tokens:>7,d} | {compress:>3.2f}%")


  Book title     |  Chars  |  Tokens | Compression
--------------------------------------------------
Frankenstein     | 446,544 | 102,419 | 22.94%
GreatGatsby      | 296,858 |  70,343 | 23.70%
AliceWonderland  | 167,674 |  41,457 | 24.72%
RomeoJuliet      | 167,429 |  43,761 | 26.14%
HuckFinn         | 602,714 | 159,125 | 26.40%
HeartDarkness    | 232,885 |  56,483 | 24.25%
GrimmsTales      | 549,736 | 137,252 | 24.97%
EdgarAllenPoe    | 632,131 | 144,315 | 22.83%
WarOfTheWorlds   | 363,420 |  84,580 | 23.27%
GulliversTravels | 611,742 | 143,560 | 23.47%


# let's see some websites

In [5]:
weburls = [
    'http://python.org/',
    'https://pytorch.org/',
    'https://en.wikipedia.org/wiki/List_of_English_words_containing_Q_not_followed_by_U',
    'https://sudoku.com/',
    'https://reddit.com/',
    'https://visiteurope.com/en/',
    'https://sincxpress.com/',
    'https://openai.com/',
    'https://theuselessweb.com/',
    'https://maps.google.com/',
    'https://pigeonsarentreal.co.uk/',
]

In [6]:
print('    Website        |  Chars  |  Tokens | Compression')
print('-'*53)

for url in weburls:

    # get the text
    text = requests.get(url).text
    num_chars = len(text)

    # tokenize
    tokens = tokenizer.encode(text)
    num_tokens = len(tokens)

    # compression ratio
    compress = 100*num_tokens/num_chars

    print(f'{urlparse(url).hostname[:-4]:18} | {num_chars:>7,d} | {num_tokens:>7,d} |  {compress:>3.2f}%')

    Website        |  Chars  |  Tokens | Compression
-----------------------------------------------------
python             |  50,332 |  12,791 |  25.41%
pytorch            | 388,061 | 111,398 |  28.71%
en.wikipedia       |      92 |      26 |  28.26%
sudoku             | 139,673 |  51,094 |  36.58%
reddit             | 479,437 | 148,207 |  30.91%
visiteurope        | 116,010 |  31,668 |  27.30%
sincxpress         |  25,580 |   6,843 |  26.75%
openai             |  11,617 |   6,461 |  55.62%
theuselessweb      |   4,756 |   1,329 |  27.94%
maps.google        | 212,313 | 107,456 |  50.61%
pigeonsarentreal.c | 243,854 |  71,232 |  29.21%


# using the 'string' library

In [8]:
print('  Attribute     |  Chars  |  Tokens | Compression')
print('-'*50)

for k,v in string.__dict__.items():
    if isinstance(v,str) and (len(v)>0):

        # get the text
        num_chars = len(v)

        # tokenize
        tokens = tokenizer.encode(v)
        num_tokens = len(tokens)

        # compression ratio
        compress = 100*num_tokens/num_chars

        print(f'{k:15} | {num_chars:>7,d} | {num_tokens:>7,d} |  {compress:>5.2f}%')

  Attribute     |  Chars  |  Tokens | Compression
--------------------------------------------------
__name__        |       6 |       1 |  16.67%
__doc__         |     622 |     109 |  17.52%
__package__     |       6 |       1 |  16.67%
__file__        |      84 |      22 |  26.19%
__cached__      |     109 |      31 |  28.44%
whitespace      |       6 |       4 |  66.67%
ascii_lowercase |      26 |       1 |   3.85%
ascii_uppercase |      26 |       1 |   3.85%
ascii_letters   |      52 |       2 |   3.85%
digits          |      10 |       4 |  40.00%
hexdigits       |      22 |       7 |  31.82%
octdigits       |       8 |       3 |  37.50%
punctuation     |      32 |      21 |  65.62%
printable       |     100 |      31 |  31.00%


In [9]:
# e.g.,
string.__dict__['__doc__']
# string.ascii_lowercase

'A collection of string constants.\n\nPublic module variables:\n\nwhitespace -- a string containing all ASCII whitespace\nascii_lowercase -- a string containing all ASCII lowercase letters\nascii_uppercase -- a string containing all ASCII uppercase letters\nascii_letters -- a string containing all ASCII letters\ndigits -- a string containing all ASCII decimal digits\nhexdigits -- a string containing all ASCII hexadecimal digits\noctdigits -- a string containing all ASCII octal digits\npunctuation -- a string containing all ASCII punctuation characters\nprintable -- a string containing all ASCII characters considered printable\n\n'