# Text to 1-hot vectors

Text is mostly fed to a neunal network in the form of 1-hot vectors. 
This can be done at word level and at character level

In this notebook, we will do
    1. Word level 1-hot vectors from first principals
    2. Character level 1-hot vectors from first principals
    3. Word level 1-hot vectors using keras library
    4. Character level 1-hot vectors using keras library
    5. Hashing trick

### Basic Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np

In [3]:
import pixiedust

Pixiedust database opened successfully


### Toy Data

In [4]:
sample_set = ['The cat sat on the mat.', 'The dog is eating food The dog is eating food']
len(sample_set)

2

# --------------------------

1. Word level 1-hot vectors from first principals

# --------------------------

### Map words to ids

In [5]:
word_to_id = {}

for sentence in sample_set:
    for word in sentence.split():
        
        if word not in word_to_id:
            word_to_id[word] = len(word_to_id) + 1
            
        

In [6]:
print(len(word_to_id))

print(word_to_id.keys())
print(word_to_id.values())

10
['on', 'eating', 'food', 'is', 'dog', 'cat', 'mat.', 'The', 'the', 'sat']
[4, 9, 10, 8, 7, 2, 6, 1, 5, 3]


### Create data-structures for data transformation

In [7]:
seq_max_len = 10

ds = np.zeros(shape=(len(sample_set), seq_max_len, len(word_to_id)+1))

### Convert sentences to input_tensor

In [8]:
# populate the ds

# %pixie_debugger

for i, sentence in enumerate(sample_set):
    for j, word in enumerate(sentence.split()[:seq_max_len]):
        
        #print(word)
        k = word_to_id[word]
        
        ds[i,j,k] = 1

In [9]:
ds.shape

(2, 10, 11)

In [10]:
ds[0]

array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [11]:
ds[1]

array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])

# --------------------------

2. Character level 1-hot vectors from first principals

# --------------------------

In [12]:
import string
print(string.printable)
print(len(string.printable))

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	

100


In [13]:
char_to_int = {}

char_to_int = dict(zip(string.printable, xrange(1, len(string.printable) + 1)))

In [14]:
print(char_to_int.keys())

print(char_to_int.values())

['\x0c', ' ', '$', '(', ',', '0', '4', '8', '<', '@', 'D', 'H', 'L', 'P', 'T', 'X', '\\', '`', 'd', 'h', 'l', 'p', 't', 'x', '|', '\x0b', '#', "'", '+', '/', '3', '7', ';', '?', 'C', 'G', 'K', 'O', 'S', 'W', '[', '_', 'c', 'g', 'k', 'o', 's', 'w', '{', '\n', '"', '&', '*', '.', '2', '6', ':', '>', 'B', 'F', 'J', 'N', 'R', 'V', 'Z', '^', 'b', 'f', 'j', 'n', 'r', 'v', 'z', '~', '\t', '\r', '!', '%', ')', '-', '1', '5', '9', '=', 'A', 'E', 'I', 'M', 'Q', 'U', 'Y', ']', 'a', 'e', 'i', 'm', 'q', 'u', 'y', '}']
[100, 95, 66, 70, 74, 1, 5, 9, 80, 84, 40, 44, 48, 52, 56, 60, 86, 90, 14, 18, 22, 26, 30, 34, 92, 99, 65, 69, 73, 77, 4, 8, 79, 83, 39, 43, 47, 51, 55, 59, 85, 89, 13, 17, 21, 25, 29, 33, 91, 97, 64, 68, 72, 76, 3, 7, 78, 82, 38, 42, 46, 50, 54, 58, 62, 88, 12, 16, 20, 24, 28, 32, 36, 94, 96, 98, 63, 67, 71, 75, 2, 6, 10, 81, 37, 41, 45, 49, 53, 57, 61, 87, 11, 15, 19, 23, 27, 31, 35, 93]


In [15]:
max_len = 50

ds = np.zeros(shape=(len(sample_set), max_len, len(string.printable)))

In [16]:
ds.shape

(2, 50, 100)

In [17]:
for i, sentence in enumerate(sample_set):
    for j, char in enumerate(sentence):
        
        k = char_to_int[char]
        
        ds[i,j,k] = 1

In [18]:
ds[0]

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

# --------------------------

3. Word level 1-hot vectors from using Keras

# --------------------------

In [19]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [20]:
max_words = 15 #we take 100 most frequent words in the text corpus
max_len = 10 #max number of words per sentence

In [21]:
tokenizer = Tokenizer(num_words=max_words)

In [22]:
# This builds the word index
tokenizer.fit_on_texts(sample_set)


In [23]:
# conver data to matrix directly 

ds1 = tokenizer.texts_to_sequences(sample_set)
ds1

[[1, 6, 7, 8, 1, 9], [1, 2, 3, 4, 5, 1, 2, 3, 4, 5]]

In [24]:
ds1[0]

[1, 6, 7, 8, 1, 9]

In [25]:
# conver data to matrix directly 

ds2 = tokenizer.texts_to_matrix(sample_set, mode='binary')
ds2.shape

(2, 15)

In [26]:
ds2[1]

array([ 0.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.])

In [27]:
# note in the above case ^, kears gives bag of words

# --------------------------

4. Character level 1-hot vectors from using Keras

# --------------------------

In [28]:
max_chars = 50

In [29]:
tokenizer2 = Tokenizer(num_words=max_chars, char_level=True)

In [30]:

tokenizer2.fit_on_texts(sample_set)

In [32]:
ds3 = tokenizer2.texts_to_sequences(sample_set)
print(ds3)

[[2, 6, 4, 1, 13, 5, 2, 1, 10, 5, 2, 1, 3, 11, 1, 2, 6, 4, 1, 14, 5, 2, 15], [2, 6, 4, 1, 7, 3, 8, 1, 9, 10, 1, 4, 5, 2, 9, 11, 8, 1, 12, 3, 3, 7, 1, 2, 6, 4, 1, 7, 3, 8, 1, 9, 10, 1, 4, 5, 2, 9, 11, 8, 1, 12, 3, 3, 7]]


In [33]:
ds3[0]

[2, 6, 4, 1, 13, 5, 2, 1, 10, 5, 2, 1, 3, 11, 1, 2, 6, 4, 1, 14, 5, 2, 15]

In [34]:
ds4 = tokenizer2.texts_to_matrix(sample_set)
print(ds4)

[[ 0.  1.  1.  1.  1.  1.  1.  0.  0.  0.  1.  1.  0.  1.  1.  1.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]


In [35]:
tokenizer2.word_counts

OrderedDict([('t', 9),
             ('h', 4),
             ('e', 6),
             (' ', 14),
             ('c', 1),
             ('a', 5),
             ('s', 3),
             ('o', 7),
             ('n', 3),
             ('m', 1),
             ('.', 1),
             ('d', 4),
             ('g', 4),
             ('i', 4),
             ('f', 2)])

--------------------------


# --------------------------

5. Hashing Trick

# --------------------------

In [41]:
max_len = 10

In [42]:
# This is like saying We consider only 1000 most important words. the space is defined by these
dimension = 1000 

In [43]:
ds = np.zeros(shape=(len(sample_set), max_len, dimension))

In [44]:
for i, sentence in enumerate(sample_set):
    for j, word in enumerate(sentence.split()[:max_len]):
        
        k = abs(hash(word))%dimension
        
        ds[i,j,k] = 1