# Example Tiktoken

In [1]:
# !pip install tiktoken
import tiktoken

In [2]:
# Tokenizer configuration used by the cells below.
encoding_name = "cl100k_base"      # tiktoken encoding looked up by name
llm_model_name = "gpt-3.5-turbo"   # model whose encoding is looked up

In [3]:
#Load encoding:
encoding = tiktoken.get_encoding(encoding_name)
encoding = tiktoken.encoding_for_model(llm_model_name)

#Load encoding:
texto = "¡Hola! Esto es un ejemplo de texto =)"

In [4]:
# Simple example - text to tokens: encode the sample string into a list of token ids.
encoding.encode(texto)

[40932, 69112, 0, 93125, 1560, 653, 58300, 409, 33125, 284, 8]

In [5]:
# With function example:
def num_tokens_from_string(string: str, encoding_name: str) -> "tuple[list[int], int]":
    """Tokenize ``string`` and report both the tokens and how many there are.

    A short banner is printed before tokenizing.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to use
            (for example ``"cl100k_base"``).

    Returns:
        A ``(tokens, len_tokens)`` tuple: the list of token ids and its length.
    """
    print("Returns the number of tokens in a text string:")
    print("*"*50)
    # Resolve the encoding from the argument rather than from whatever
    # `encoding` global happens to be defined in the kernel, so the result
    # always matches the encoding the caller asked for.
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    len_tokens = len(tokens)

    return tokens, len_tokens

# Tokenize the sample text; `response` holds whatever the helper returns
# and is indexed by the following cells.
response = num_tokens_from_string(texto, encoding_name)
print(response)

Returns the number of tokens in a text string:
**************************************************
([40932, 69112, 0, 93125, 1560, 653, 58300, 409, 33125, 284, 8], 11)


In [6]:
# Simple example - tokens to text: decode the token ids (first element of `response`) back into a string.
encoding.decode(response[0])

'¡Hola! Esto es un ejemplo de texto =)'

In [7]:
# Show the raw bytes behind each individual token id:
list(map(encoding.decode_single_token_bytes, response[0]))

[b'\xc2\xa1',
 b'Hola',
 b'!',
 b' Esto',
 b' es',
 b' un',
 b' ejemplo',
 b' de',
 b' texto',
 b' =',
 b')']

In [8]:
# Compare encodings:
def compare_encodings(string_to_evaluate, encoding_names):
    """Print how each named encoding tokenizes the same string.

    For every name in ``encoding_names`` the encoding is loaded with
    ``tiktoken.get_encoding`` and a short report is printed: the token
    count, the token ids, and the bytes behind each token.
    Nothing is returned.
    """
    separator = "*" * 100
    print(f'\nExample string: "{string_to_evaluate}"')

    for name in encoding_names:
        codec = tiktoken.get_encoding(name)
        ids = codec.encode(string_to_evaluate)
        pieces = list(map(codec.decode_single_token_bytes, ids))

        # One report per encoding, emitted line by line.
        report = (
            separator,
            f"{name}: {len(ids)} tokens",
            f"token integers: {ids}",
            f"token bytes: {pieces}",
        )
        print(*report, sep="\n")
        

In [9]:
# Encodings to compare on the sample text.
encoding_names = [
    "r50k_base",
    "p50k_base",
    "cl100k_base",
]
compare_encodings(texto, encoding_names)



Example string: "¡Hola! Esto es un ejemplo de texto =)"
****************************************************************************************************
r50k_base: 18 tokens
token integers: [126, 94, 39, 5708, 0, 10062, 78, 1658, 555, 304, 73, 18856, 78, 390, 2420, 78, 796, 8]
token bytes: [b'\xc2', b'\xa1', b'H', b'ola', b'!', b' Est', b'o', b' es', b' un', b' e', b'j', b'empl', b'o', b' de', b' text', b'o', b' =', b')']
****************************************************************************************************
p50k_base: 18 tokens
token integers: [126, 94, 39, 5708, 0, 10062, 78, 1658, 555, 304, 73, 18856, 78, 390, 2420, 78, 796, 8]
token bytes: [b'\xc2', b'\xa1', b'H', b'ola', b'!', b' Est', b'o', b' es', b' un', b' e', b'j', b'empl', b'o', b' de', b' text', b'o', b' =', b')']
****************************************************************************************************
cl100k_base: 11 tokens
token integers: [40932, 69112, 0, 93125, 1560, 653, 58300, 409, 33125, 

In [10]:
# Run the same comparison on a short arithmetic expression.
compare_encodings("2 + 2 = 4", encoding_names)


Example string: "2 + 2 = 4"
****************************************************************************************************
r50k_base: 5 tokens
token integers: [17, 1343, 362, 796, 604]
token bytes: [b'2', b' +', b' 2', b' =', b' 4']
****************************************************************************************************
p50k_base: 5 tokens
token integers: [17, 1343, 362, 796, 604]
token bytes: [b'2', b' +', b' 2', b' =', b' 4']
****************************************************************************************************
cl100k_base: 7 tokens
token integers: [17, 489, 220, 17, 284, 220, 19]
token bytes: [b'2', b' +', b' ', b'2', b' =', b' ', b'4']


# Another option: token counts for a DataFrame column

In [17]:
import pandas as pd
import tiktoken

# Function that counts how many tokens a text will consume:
def num_tokens_from_string(text, encodig_name):
    """Count the tokens that ``text`` produces under a given encoding.

    Args:
        text: String to tokenize.
        encodig_name: Name of the tiktoken encoding to load. The parameter
            spelling is kept as-is so keyword callers keep working.

    Returns:
        The number of tokens as an ``int``.

    NOTE(review): this redefines the ``num_tokens_from_string`` declared
    earlier in the notebook, which returns ``(tokens, count)``; once this
    cell has run, only the count is returned.
    """
    token_ids = tiktoken.get_encoding(encodig_name).encode(text)
    return len(token_ids)


In [14]:
# Load the sample dataset (generic food names) from a CSV file and preview the first rows.
df = pd.read_csv('../../Info_data/generic-food.csv')
df.head()

Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP
0,Angelica,Angelica keiskei,Herbs and Spices,Herbs
1,Savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages
2,Silver linden,Tilia argentea,Herbs and Spices,Herbs
3,Kiwi,Actinidia chinensis,Fruits,Tropical fruits
4,Allium (Onion),Allium,Vegetables,Onion-family vegetables


In [15]:
# Add a column holding the token count of each value in the FOOD NAME column.
encoding_name = "cl100k_base"
df['total_tokens'] = df['FOOD NAME'].apply(
    lambda food_name: num_tokens_from_string(food_name, encoding_name)
)
df.head()

Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP,total_tokens
0,Angelica,Angelica keiskei,Herbs and Spices,Herbs,2
1,Savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages,4
2,Silver linden,Tilia argentea,Herbs and Spices,Herbs,3
3,Kiwi,Actinidia chinensis,Fruits,Tropical fruits,2
4,Allium (Onion),Allium,Vegetables,Onion-family vegetables,6


In [16]:
# Total number of tokens needed to process the whole column.
# Series.sum() is the vectorized pandas reduction; the builtin sum() would
# iterate over the Series element by element in Python.
df['total_tokens'].sum()

2947