# 单词级的one-hot编码

单词级的one-hot编码就是将文本进行分词，给文本中出现的每个单词一个编号，将单词和编号对应关系保存

最终的结果：每个单词被表示为一个向量，该单词编号对应的位置置1，其余位置置0

In [1]:
import numpy as np

# Sample data: two short sentences.
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Tokenize each sample on whitespace with str.split() and print the tokens.
for tokens in map(str.split, samples):
    print(tokens)

['The', 'cat', 'sat', 'on', 'the', 'mat.']
['The', 'dog', 'ate', 'my', 'homework.']


In [2]:
# Build an index of every distinct token that appears in the data.
token_index = {}
for sentence in samples:
    for token in sentence.split():
        # The first time a token is seen it receives the next free index.
        # Indices start at 1, so index 0 is never assigned to any token.
        token_index.setdefault(token, len(token_index) + 1)

print('token index: \n', token_index)

# Only the first max_length tokens of each sample are encoded.
max_length = 10

# The last axis needs (largest index + 1) slots: token indices start at 1
# while array columns start at 0, so column 0 stays all zeros.
vocab_size = max(token_index.values()) + 1
results = np.zeros((len(samples), max_length, vocab_size))
print('results shape: ', results.shape)

for row, sentence in enumerate(samples):
    for col, token in enumerate(sentence.split()[:max_length]):
        # Every token was indexed above, so this lookup always succeeds.
        results[row, col, token_index[token]] = 1.

print('results: \n', results)

token index: 
 {'The': 1, 'cat': 2, 'sat': 3, 'on': 4, 'the': 5, 'mat.': 6, 'dog': 7, 'ate': 8, 'my': 9, 'homework.': 10}
results shape:  (2, 10, 11)
results: 
 [[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]


# 字符级的one-hot编码

和单词级的one-hot原理一致，但是把编码的单位从单词变成ASCII字符

省去了从语料中统计单词来构建索引的步骤（字符表是固定的），但每个样本的序列更长，需要更多的存储空间

In [3]:
import string

# Two sample sentences to encode character by character.
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
# Vocabulary: all printable ASCII characters.
characters = string.printable
print('characters length: ', len(characters))

# Map each character to a unique index starting at 1; index 0 is never assigned.
token_index = dict(zip(characters, range(1, len(characters) + 1)))

print('token index: \n', token_index)

# Only the first max_length characters of each sample are encoded.
max_length = 50
# The last axis needs (largest index + 1) slots: indices start at 1 while
# array columns start at 0, so column 0 stays all zeros.
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
print('results shape: ', results.shape)
for i, sample in enumerate(samples):
    for j, character in enumerate(sample[:max_length]):
        index = token_index.get(character)
        if index is None:
            # Character is outside the printable-ASCII vocabulary. Using None
            # as a NumPy index acts as np.newaxis, which would set the entire
            # row to 1, so leave this position as an all-zero vector instead.
            continue
        results[i, j, index] = 1.

print('results: \n', results)

characters length:  100
token index: 
 {'0': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8, '8': 9, '9': 10, 'a': 11, 'b': 12, 'c': 13, 'd': 14, 'e': 15, 'f': 16, 'g': 17, 'h': 18, 'i': 19, 'j': 20, 'k': 21, 'l': 22, 'm': 23, 'n': 24, 'o': 25, 'p': 26, 'q': 27, 'r': 28, 's': 29, 't': 30, 'u': 31, 'v': 32, 'w': 33, 'x': 34, 'y': 35, 'z': 36, 'A': 37, 'B': 38, 'C': 39, 'D': 40, 'E': 41, 'F': 42, 'G': 43, 'H': 44, 'I': 45, 'J': 46, 'K': 47, 'L': 48, 'M': 49, 'N': 50, 'O': 51, 'P': 52, 'Q': 53, 'R': 54, 'S': 55, 'T': 56, 'U': 57, 'V': 58, 'W': 59, 'X': 60, 'Y': 61, 'Z': 62, '!': 63, '"': 64, '#': 65, '$': 66, '%': 67, '&': 68, "'": 69, '(': 70, ')': 71, '*': 72, '+': 73, ',': 74, '-': 75, '.': 76, '/': 77, ':': 78, ';': 79, '<': 80, '=': 81, '>': 82, '?': 83, '@': 84, '[': 85, '\\': 86, ']': 87, '^': 88, '_': 89, '`': 90, '{': 91, '|': 92, '}': 93, '~': 94, ' ': 95, '\t': 96, '\n': 97, '\r': 98, '\x0b': 99, '\x0c': 100}
results shape:  (2, 50, 101)
results: 
 [[[0. 0. 0. ... 0.

# 使用keras构建单词级one-hot编码

In [4]:
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Create a tokenizer whose vocabulary is capped by word frequency. Per the
# Keras documentation, only the (num_words - 1) most common words are kept.
vocabulary_limit = 10
tokenizer = Tokenizer(num_words=vocabulary_limit)
# Build the word index from the sample texts.
tokenizer.fit_on_texts(samples)

# Turn each string into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)

print('transfer string to integer index (store in list): \n', sequences)

Using TensorFlow backend.


transfer string to integer index (store in list): 
 [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]


In [5]:
# The tokenizer can also produce a binary matrix directly: one row per sample,
# with a 1 in the column of every word index that occurs in that sample.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
print('transfer string to binary index (store in matrix): \n', one_hot_results)

# The word -> index mapping that fit_on_texts built.
word_index = tokenizer.word_index
print(word_index)
unique_token_count = len(word_index)
print('Found %s unique tokens.' % unique_token_count)

transfer string to binary index (store in matrix): 
 [[0. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 1. 1. 1. 1.]]
{'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'ate': 7, 'my': 8, 'homework': 9}
Found 9 unique tokens.
