|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[5] Is tokenization compression?</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
from urllib.parse import urlparse

import numpy as np
import requests

# **Part 1: Text compression in English books**

In [None]:
# GPT2 tokenizer: load the pretrained 'gpt2' tokenizer by name.
# Later cells call tokenizer.encode(text) and take the length of the
# result to get the number of tokens in a text.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
# Every book lives at <baseurl><code>/pg<code>.txt, so a book is fully
# identified by its numerical code; the title is only a display label.
baseurl = 'https://www.gutenberg.org/cache/epub/'

# code -> title (insertion order is the order the books are processed in)
_titles_by_code = {
    '84':    'Frankenstein',
    '64317': 'GreatGatsby',
    '11':    'AliceWonderland',
    '1513':  'RomeoJuliet',
    '76':    'HuckFinn',
    '219':   'HeartDarkness',
    '2591':  'GrimmsTales',
    '2148':  'EdgarAllenPoe',
    '36':    'WarOfTheWorlds',
    '829':   'GulliversTravels',
}

# list of [code, title] pairs
bookurls = [[code, title] for code, title in _titles_by_code.items()]

In [None]:
print('  Book title     |  Chars  |  Words  |  Tokens |  Compress')
print('-----------------+---------+---------+---------+-----------')

# one row per book:
#   column 0 = tokens per 100 characters
#   column 1 = tokens per 100 whitespace-separated words
compression = np.zeros((len(bookurls),2))

for i,(code,title) in enumerate(bookurls):

  # get the text; the timeout prevents an unresponsive server from hanging
  # the cell, and raise_for_status() stops with an error on an HTTP failure
  # instead of counting the characters of an error page as if it were a book
  fullurl = baseurl + code + '/pg' + code + '.txt'
  response = requests.get(fullurl, timeout=30)
  response.raise_for_status()
  text = response.text

  # counts
  num_chars  = len(text)
  num_words  = len(text.split())
  num_tokens = len(tokenizer.encode(text))

  # compression ratios
  compression[i,0] = 100*num_tokens/num_chars
  compression[i,1] = 100*num_tokens/num_words

  print(f'{title:16} | {num_chars:>7,d} | {num_words:>7,d} | {num_tokens:>7,d} | {compression[i,0]:>3.0f} / {compression[i,1]:>3.0f}')


# **Part 2: Compression in websites**

In [None]:
# sites to analyze, written without the scheme;
# each entry becomes 'https://<entry>/'
_sites = (
    'python.org',
    'pytorch.org',
    'duckduckgo.com',
    'sudoku.com',
    'oreilly.com',
    'visiteurope.com/en',
    'sincxpress.com',
    'openai.com',
    'theuselessweb.com',
    'maps.google.com',
    'pigeonsarentreal.co.uk',
)

weburls = [f'https://{site}/' for site in _sites]

In [None]:
# fetch one page and show its raw text (the cell output is the full HTML)
python_home = requests.get('https://python.org/')
python_home.text

In [None]:
print('    Website      |  Chars  |  Bytes  |  Tokens |  Compression (%)')
print('-----------------+---------+---------+---------+-----------------')


# one row per website:
#   column 0 = tokens per 100 characters
#   column 1 = tokens per 100 bytes of the UTF-8 encoded text
compression = np.zeros((len(weburls),2))
urlnames = []

for i,url in enumerate(weburls):

  # get the text; the timeout keeps one unresponsive site from hanging the
  # cell. The HTTP status is not checked, so whatever body the site returns
  # (including an error page) is what gets measured.
  text = requests.get(url, timeout=30).text

  # count characters, bytes, tokens
  num_chars  = len(text)
  num_bytes  = len(text.encode('utf-8'))
  num_tokens = len(tokenizer.encode(text))

  # compression ratio (NaN for an empty response instead of a ZeroDivisionError)
  compression[i,0] = 100*num_tokens/num_chars if num_chars else np.nan
  compression[i,1] = 100*num_tokens/num_bytes if num_bytes else np.nan

  # url name: first label of the host name, e.g. 'python' for 'https://python.org/'
  # (parsed from the URL rather than sliced at a fixed offset, so it does not
  # depend on the scheme being exactly 'https://')
  urlnames.append(urlparse(url).netloc.split('.')[0])

  print(f'{urlnames[i]:>16} | {num_chars:>7,d} | {num_bytes:>7,d} | {num_tokens:>7,d} |  {compression[i,0]:>3.2f} / {compression[i,1]:>3.2f}')
