# Deep Learning with Python
# 6.1 - One-Hot Encoding

- A tokenization technique for text-based data.
- Tokenization is the process of breaking text-based data into individual units such as words or characters (called tokens) that can then be encoded as vectors.
- Tokenization allows us to convert text-based data into numeric tensors which can then be passed to a Deep Learning model.

## Word-Level One-Hot Encoding
Every **word** in the sentence is considered an individual building block for text.

In [2]:
import numpy as np

In [3]:
# Initial data: one entry per sample 
# In this example, each sample is a sentence
# But practically, a sample could be an entire document
samples = ['The cat sat on the mat', 'The dog ate my homework']

In [4]:
# token_index is a dictionary mapping words to index numbers
token_index = {}

# For every sample in the corpus/collection of samples/documents
for sample in samples:
    for word in sample.split(): 
        if word not in token_index:
            token_index[word] = len(token_index) + 1

In [5]:
# Will only consider the first `max_length` words in each sample
max_length = 10

# [number of samples, 
# the max number of words that will be considered per sample 
# number of unique words in our dictionary]
results = np.zeros(shape=(len(samples), 
                         max_length, 
                         max(token_index.values()) + 1))

# For each sample in the collection of documents
for i, sample in enumerate(samples):
    # For every word in each sample up to the defined number of words
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        results[i, j, index] = 1.

In [7]:
token_index

{'The': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'the': 5,
 'mat': 6,
 'dog': 7,
 'ate': 8,
 'my': 9,
 'homework': 10}

In [41]:
results[1].shape

(10, 11)

In [34]:
results[1]

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

## Character-level One-Hot Encoding

In [14]:
import string

In [17]:
samples = ['The cat sat on the mat', 'The dog ate my homework']
characters = string.printable # All printable ASCII characters
token_index = dict(zip(range(1, len(characters) + 1), characters))
max_length = 50
results = np.zeros((len(samples), max_length, 
                    max(token_index.keys()) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample):
        index = token_index.get(character)
        results[i, j, index] = 1.

In [18]:
results 

array([[[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

## `keras` for 1-Hot Encoding

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [22]:
samples = ['The cat sat on the mat', 'The dog ate my homework']

In [23]:
# Create tokenizer configured to take into account only the 1000 most common words
tokenizer = Tokenizer(num_words=1000)

In [24]:
# Build the word index
tokenizer.fit_on_texts(samples)

In [25]:
# Turn strings into lists of integer indices
sequences = tokenizer.texts_to_sequences(samples)

In [26]:
# Could also directly get 1-hot binary representations
# Other vectorization modes than 1-Hot also supported
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

In [27]:
# Recovering the word index that was computed
word_index = tokenizer.word_index
print('Found %s unique tokens.'%len(word_index))

Found 9 unique tokens.
