In [1]:
from datasets import load_dataset
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, DistilBertTokenizer

### Character tokenization

In [2]:
# Character tokenization
# A Python string is a sequence of characters, so walking over it yields one
# single-character token per position (spaces and punctuation included).
text = "Tokenizing text is a cora task of NLP."
tokenized_text = [char for char in text]
print(tokenized_text)

['T', 'o', 'k', 'e', 'n', 'i', 'z', 'i', 'n', 'g', ' ', 't', 'e', 'x', 't', ' ', 'i', 's', ' ', 'a', ' ', 'c', 'o', 'r', 'a', ' ', 't', 'a', 's', 'k', ' ', 'o', 'f', ' ', 'N', 'L', 'P', '.']


In [3]:
# Build the vocabulary: every distinct character, taken in sorted order,
# is assigned its position as an integer id.
token2idx = dict(
    (ch, idx) for idx, ch in enumerate(sorted(set(tokenized_text)))
)
print("Token to index mapping: ")
print(token2idx)

# Numericalize the text by looking each character up in token2idx
input_ids = [token2idx[token] for token in tokenized_text]
print("Character to index mapping: ")
print(input_ids)

Token to index mapping: 
{' ': 0, '.': 1, 'L': 2, 'N': 3, 'P': 4, 'T': 5, 'a': 6, 'c': 7, 'e': 8, 'f': 9, 'g': 10, 'i': 11, 'k': 12, 'n': 13, 'o': 14, 'r': 15, 's': 16, 't': 17, 'x': 18, 'z': 19}
Character to index mapping: 
[5, 14, 12, 8, 13, 11, 19, 11, 13, 10, 0, 17, 8, 18, 17, 0, 11, 16, 0, 6, 0, 7, 14, 15, 6, 0, 17, 6, 16, 12, 0, 14, 9, 0, 3, 2, 4, 1]


In [4]:
# Toy frame with a categorical "Name" column and an integer "Label ID" column;
# it serves as the input for the pandas one-hot encoding example.
df = pd.DataFrame(
    data={"Name": ["Alice", "Bob", "Charlie"], "Label ID": [0, 1, 2]}
)
df

Unnamed: 0,Name,Label ID
0,Alice,0
1,Bob,1
2,Charlie,2


In [5]:
# One-hot encode the "Name" column: one 0/1 indicator column per distinct name
one_hot = pd.get_dummies(df["Name"], dtype=int)
one_hot

Unnamed: 0,Alice,Bob,Charlie
0,1,0,0
1,0,1,0
2,0,0,1


In [6]:
# One-hot encode the character ids with PyTorch.
# num_classes is set explicitly to the vocabulary size: when it is omitted,
# F.one_hot infers it from the largest id present, which yields too few
# columns whenever the highest-id tokens do not occur in input_ids.
# torch.as_tensor (rather than torch.tensor) keeps this cell re-runnable:
# on a second execution input_ids is already a tensor, and torch.tensor
# would warn about copy-constructing from an existing tensor.
input_ids = torch.as_tensor(input_ids)
print(f"Input IDs: {input_ids}")
one_hot_encodings = F.one_hot(input_ids, num_classes=len(token2idx))
print(f"One-hot Encodings shape: {one_hot_encodings.shape}")

Input IDs: tensor([ 5, 14, 12,  8, 13, 11, 19, 11, 13, 10,  0, 17,  8, 18, 17,  0, 11, 16,
         0,  6,  0,  7, 14, 15,  6,  0, 17,  6, 16, 12,  0, 14,  9,  0,  3,  2,
         4,  1])
One-hot Encodings shape: torch.Size([38, 20])


In [7]:
# Follow the first character end to end: the raw token, its integer id,
# and the matching row of the one-hot matrix.
first_token = tokenized_text[0]
first_id = input_ids[0]
first_one_hot = one_hot_encodings[0]
print(f"Token: {first_token}")
print(f"Tensor index: {first_id}")
print(f"One-hot encoding: {first_one_hot}")

Token: T
Tensor index: 5
One-hot encoding: tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


### Word tokenization

In [8]:
# Word tokenization helps the model by skipping the process of learning words from characters.
# Word tokenization problems: the vocab size can increase dramatically, and punctuation and misspelled words can be problematic.
# Some solutions are stemming and lemmatization.
# A common approach is to limit the vocab size to the most common words and throw the rest into an unknown token.
# Splitting on whitespace keeps punctuation attached to the neighbouring word.
tokenized_text_w = text.split()
print(tokenized_text_w)

['Tokenizing', 'text', 'is', 'a', 'cora', 'task', 'of', 'NLP.']


### Subword tokenization

In [9]:
# Subword tokenization combines the best of both worlds: common words remain as is,
# and rare words are split into subwords to be dealt with by the model.
# This keeps the length of the vocab reasonable.

# AutoTokenizer loads the tokenizer associated with a given model checkpoint
model_ckpt = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Loading it manually through the model-specific tokenizer class
distilbert_tokenizer = DistilBertTokenizer.from_pretrained(model_ckpt)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
# Encode the raw string with the pretrained tokenizer
encoded_text = tokenizer(text)
print(encoded_text)
# Map the ids back to their token strings to see how the text was split
encoded_ids = encoded_text['input_ids']
tokens = tokenizer.convert_ids_to_tokens(encoded_ids)
print(tokens)

{'input_ids': [101, 19204, 6026, 3793, 2003, 1037, 17195, 4708, 1997, 17953, 2361, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'token', '##izing', 'text', 'is', 'a', 'cora', 'task', 'of', 'nl', '##p', '.', '[SEP]']


In [11]:
# The ## prefix means that the preceding string is not whitespace-separated,
# so the piece is joined to the previous token when converting back to a string
print(tokenizer.convert_tokens_to_string(tokens))
# Tokenizer attributes: vocabulary size, maximum model length, and model input names
print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Max model length: {tokenizer.model_max_length}")
print(f"Model input names: {tokenizer.model_input_names}")

[CLS] tokenizing text is a cora task of nlp. [SEP]
Vocab size: 30522
Max model length: 512
Model input names: ['input_ids', 'attention_mask']


In [12]:
# Load back the emotions dataset; the result is indexed by split name (e.g. 'train')
# NOTE(review): trust_remote_code=True presumably allows the dataset's own loading
# script to run locally - confirm it is still needed for the installed `datasets` version.
emotions = load_dataset("emotion", trust_remote_code=True)

# Batch-level tokenization helper, later mapped over the whole dataset
def tokenize(batch):
    """Tokenize the 'text' entries of a batch with the notebook-level tokenizer.

    Args:
        batch: Mapping with a 'text' key holding the strings to encode.

    Returns:
        Whatever the tokenizer returns when called on those strings with
        padding=True and truncation=True.
    """
    texts = batch['text']
    return tokenizer(texts, padding=True, truncation=True)

# Try the helper on the first two training examples
sample_batch = emotions['train'][:2]
print(tokenize(sample_batch))

{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [13]:
# Run tokenize over every split of the dataset.
# batched=True passes batches of examples to the function; with batch_size=None
# each split is handed over as one single batch.
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
train_encoded = emotions_encoded['train']
print("Emotions dataset columns: ")
print(train_encoded.column_names)
print("Emotions dataset example: ")
print(train_encoded[0])
print("Emotions dataset features: ")
print(train_encoded.features)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Emotions dataset columns: 
['text', 'label', 'input_ids', 'attention_mask']
Emotions dataset example: 
{'text': 'i didnt feel humiliated', 'label': 0, 'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
Emotions dataset features: 
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(f

In [14]:
# Extra: check that the padded length of input_ids equals the token count of
# the longest text in the original training split.
len_input_ids = len(emotions_encoded['train'][0]['input_ids'])
print(f"Length of input_ids from first item: {len_input_ids}")
# Tokenize all training texts in one batched call. No padding is requested, so
# every id list keeps its own length. The number of ids already equals the
# number of tokens, so converting ids back to tokens is not needed.
train_texts = list(emotions['train']['text'])
len_max_text = max(len(ids) for ids in tokenizer(train_texts)['input_ids'])
print(f"Length of the tokens from the longest text: {len_max_text}")

Length of input_ids from first item: 87
Length of the tokens from the longest text: 87
