# Tokenizers Experimentation

The goal is to experiment with different Tokenizers and evaluate the results.

# Setup Notebook

## Imports

In [1]:
# Import third-party libraries
import tiktoken
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Experimentation

## TikToken

`tiktoken` is OpenAI's BPE tokenizer library. It ships several predefined encodings, such as `gpt2` and `cl100k_base`.

In [2]:
# Define the sentence to encode. The leading spaces make the whitespace
# handling of each encoding visible in the printed token IDs.
sentence = '    Hello World!!!'

# Human-readable label -> tiktoken encoding name.
# 'cl100k_base' is the encoding used by GPT-4.
encodings = {
    'GPT-2': 'gpt2',
    'GPT-4': 'cl100k_base',
}

# Encode the same sentence with every encoding so the results can be compared.
# Each result is printed under its own label (the original printed both as "GPT-2").
for label, encoding_name in encodings.items():
    tokenizer = tiktoken.get_encoding(encoding_name)

    print('Sentence:', sentence)
    print(f'{label} Tokenizer: ', tokenizer.encode(sentence))

Sentence:     Hello World!!!
GPT-2 Tokenizer:  [220, 220, 220, 18435, 2159, 10185]
Sentence:     Hello World!!!
GPT-2 Tokenizer:  [262, 22691, 4435, 12340]


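- With the GPT-4 tokenizer (`cl100k_base`), the run of leading white spaces is merged into a single token (`262`), whereas GPT-2 emits one token (`220`) per space.
- This is because the regex used to split the text into chunks before the BPE merges changed between the two encodings.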

## AutoModelForSequenceClassification

In [10]:
# Sentence used throughout this cell
sentence = "Using a Transformer network is simple"

# The model and its tokenizer are loaded from the same pretrained checkpoint name
model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)  # not used further in this cell
tokenizer = AutoTokenizer.from_pretrained(model_name)

print('Sentence:', sentence)

# One-step path: calling the tokenizer on the raw text returns the token IDs
# and the attention mask together
output = tokenizer(sentence)
input_ids = output['input_ids']
attention_mask = output['attention_mask']

print('Token IDs:', input_ids)
print('Attention Mask:', attention_mask, "[1] = Important / [0] = Ignore")

# Two-step path: text -> tokens -> IDs, then decode the IDs back to text
tokens = tokenizer.tokenize(sentence)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
decoded_sentence = tokenizer.decode(token_ids)

print('Tokens:', tokens)
print('Token IDs (Without Special Tokens)', token_ids)
print('Decoded Sentence:', decoded_sentence)

Sentence: Using a Transformer network is simple
Token IDs: [101, 2478, 1037, 10938, 2121, 2897, 2003, 3722, 102]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1] [1] = Important / [0] = Ignore
Tokens: ['using', 'a', 'transform', '##er', 'network', 'is', 'simple']
Token IDs (Without Special Tokens) [2478, 1037, 10938, 2121, 2897, 2003, 3722]
Decoded Sentence: using a transformer network is simple
