In [18]:
# Note: a space is usually grouped with the start of the word that follows it (e.g. b' is', b' great').
!pip install --upgrade tiktoken
!pip install --upgrade openai



In [19]:
import tiktoken

In [20]:
# Load a tokenizer directly by encoding name.
encoding=tiktoken.get_encoding("cl100k_base")

In [21]:
# Alternatively, look up the encoding from a model name; this rebinds `encoding`.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [22]:
# Encode a sample sentence; the result is a list of integer token ids.
encoding.encode("tiktoken is great!")

[83, 1609, 5963, 374, 2294, 0]

In [23]:
#Count tokens by counting the length of the list returned by .encode().
def num_tokens_from_string(string:str, encoding_name:str)->int:
    """Count the tokens that `string` occupies under the named encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Encoding name handed to ``tiktoken.get_encoding``
            (for example "cl100k_base").

    Returns:
        The length of the token list produced by encoding `string`.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))

In [24]:
# Count the tokens of the same sample sentence using the helper defined above.
num_tokens_from_string("tiktoken is great!", "cl100k_base")

6

In [25]:
# Decode the token ids back into text.
encoding.decode([83, 1609, 5963, 374, 2294, 0])

'tiktoken is great!'

In [26]:
# Show the bytes behind each individual token id.
# The leading b on each output item marks it as a bytes literal rather than a str.
[encoding.decode_single_token_bytes(token) for token in [83, 1609, 5963, 374, 2294, 0]]

[b't', b'ik', b'token', b' is', b' great', b'!']

In [27]:
def compare_encodings(example_string:str)->None:
    """Print a comparison of three tiktoken encodings of one string.

    For each of r50k_base, p50k_base and cl100k_base this prints the token
    count, the integer token ids, and the raw bytes of every token.

    Args:
        example_string: Text to encode with each encoding.

    Returns:
        None. All results are written to standard output.
    """
    # The f prefix must sit outside the quotes; when it is inside, a literal
    # "f" and the unexpanded "{example_string}" placeholder are printed.
    print(f'\nExample String:"{example_string}"')
    for encoding_name in ["r50k_base", "p50k_base", "cl100k_base"]:
        encoding=tiktoken.get_encoding(encoding_name)
        token_integers=encoding.encode(example_string)
        num_tokens=len(token_integers)
        # Decode one token at a time so each token's raw bytes are shown separately.
        token_bytes=[encoding.decode_single_token_bytes(token) for token in token_integers]
        print()
        print(f"{encoding_name}:{num_tokens} tokens")
        print(f" token integers:{token_integers}")
        print(f" token bytes:{token_bytes}")
    

In [28]:
# A single long English word: each encoding splits it into several sub-word tokens.
compare_encodings("antidisestablishmentarianism")

f
Example String:"{example_string}"

r50k_base:5 tokens
 token integers:[415, 29207, 44390, 3699, 1042]
 token bytes:[b'ant', b'idis', b'establishment', b'arian', b'ism']

p50k_base:5 tokens
 token integers:[415, 29207, 44390, 3699, 1042]
 token bytes:[b'ant', b'idis', b'establishment', b'arian', b'ism']

cl100k_base:6 tokens
 token integers:[519, 85342, 34500, 479, 8997, 2191]
 token bytes:[b'ant', b'idis', b'establish', b'ment', b'arian', b'ism']


In [29]:
# A short arithmetic expression: every character ends up as its own token.
compare_encodings("2+2=4")

f
Example String:"{example_string}"

r50k_base:5 tokens
 token integers:[17, 10, 17, 28, 19]
 token bytes:[b'2', b'+', b'2', b'=', b'4']

p50k_base:5 tokens
 token integers:[17, 10, 17, 28, 19]
 token bytes:[b'2', b'+', b'2', b'=', b'4']

cl100k_base:5 tokens
 token integers:[17, 10, 17, 28, 19]
 token bytes:[b'2', b'+', b'2', b'=', b'4']


In [30]:
# Non-ASCII input: tokens can be fragments of multi-byte UTF-8 characters.
compare_encodings("お誕生日おめでとう")

f
Example String:"{example_string}"

r50k_base:14 tokens
 token integers:[2515, 232, 45739, 243, 37955, 33768, 98, 2515, 232, 1792, 223, 30640, 30201, 29557]
 token bytes:[b'\xe3\x81', b'\x8a', b'\xe8\xaa', b'\x95', b'\xe7\x94\x9f', b'\xe6\x97', b'\xa5', b'\xe3\x81', b'\x8a', b'\xe3\x82', b'\x81', b'\xe3\x81\xa7', b'\xe3\x81\xa8', b'\xe3\x81\x86']

p50k_base:14 tokens
 token integers:[2515, 232, 45739, 243, 37955, 33768, 98, 2515, 232, 1792, 223, 30640, 30201, 29557]
 token bytes:[b'\xe3\x81', b'\x8a', b'\xe8\xaa', b'\x95', b'\xe7\x94\x9f', b'\xe6\x97', b'\xa5', b'\xe3\x81', b'\x8a', b'\xe3\x82', b'\x81', b'\xe3\x81\xa7', b'\xe3\x81\xa8', b'\xe3\x81\x86']

cl100k_base:9 tokens
 token integers:[33334, 45918, 243, 21990, 9080, 33334, 62004, 16556, 78699]
 token bytes:[b'\xe3\x81\x8a', b'\xe8\xaa', b'\x95', b'\xe7\x94\x9f', b'\xe6\x97\xa5', b'\xe3\x81\x8a', b'\xe3\x82\x81', b'\xe3\x81\xa7', b'\xe3\x81\xa8\xe3\x81\x86']


In [31]:
# Count the tokens used by a list of chat messages.

def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"):
    """Estimate how many tokens a list of chat messages consumes for `model`.

    Args:
        messages: Iterable of dict-like messages. Every value of every message
            is passed to the encoder; a "name" key additionally applies the
            per-name adjustment (+1, or -1 for gpt-3.5-turbo-0301).
        model: Model name used to select the encoding and the per-message
            overhead. Unpinned "gpt-3.5-turbo" / "gpt-4" names are counted as
            gpt-3.5-turbo-0613 / gpt-4-0613 after printing a warning.

    Returns:
        Per-message overhead plus encoded value lengths plus per-name
        adjustments, plus a constant 3 added once per call.

    Raises:
        NotImplementedError: If `model` matches none of the handled names.
    """
    # Resolve the tokenizer first so the "model not found" warning is printed
    # before any model-family warning below.
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        tokenizer = tiktoken.get_encoding("cl100k_base")

    uses_default_overhead = model in {
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
    }

    # Guard clause: anything that is not an explicitly handled snapshot is
    # either redirected to a pinned snapshot or rejected.
    if not uses_default_overhead and model != "gpt-3.5-turbo-0301":
        if "gpt-3.5-turbo" in model:
            print("Warning: gpt-3.5-turbo may update over time. Returning num-tokens assuming gpt-3.5-turbo-0613.")
            return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
        if "gpt-4" in model:
            print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
            return num_tokens_from_messages(messages, model="gpt-4-0613")
        raise NotImplementedError(
            f""" num_tokens_from_messages() is not implemented for model {model}."""
        )

    # (overhead per message, adjustment when a "name" key is present)
    per_message, per_name = (3, 1) if uses_default_overhead else (4, -1)

    # Constant overhead added once per call (presumably for priming the
    # assistant reply -- confirm against the OpenAI documentation).
    total = 3
    for message in messages:
        total += per_message + sum(
            len(tokenizer.encode(value)) + (per_name if key == "name" else 0)
            for key, value in message.items()
        )
    return total
    