# Text to basic vectorization (1-hot)

Text is usually fed to a neural network in the form of vectors, most simply 1-hot vectors.
This can be done at the word level or at the character level; splitting text into such units is often called tokenization.

In this notebook, we will do
    1. Word level 1-hot vectors from first principles
    2. Character level 1-hot vectors from first principles
    3. Word level 1-hot vectors using keras library
    4. Character level 1-hot vectors using keras library
    5. Hashing trick

In [1]:
%load_ext autoreload
%autoreload 2

### Basic Imports

In [2]:
import hashlib
import string

import numpy as np

### Sample data

In [3]:
# Two toy sentences that serve as the corpus for every section below.
sample_set = [
    'The cat sat on the mat.',
    'The dog is eating food The dog is eating food',
]
len(sample_set)

2

# --------------------------

1. Word level vectorization. 1-hot vectors from first principles

# --------------------------

#### Build word to id mapping/dictionary

In [4]:
max_len = 10  # only the first max_len whitespace-separated tokens of a sentence are used

# Vocabulary: token -> integer id, numbered in order of first appearance.
word_to_id = {}

for text in sample_set:
    for token in text.split()[:max_len]:
        # setdefault only inserts when the token is new, so an existing id is
        # never overwritten and new ids are handed out consecutively from 0.
        word_to_id.setdefault(token, len(word_to_id))

In [5]:
# Number of distinct whitespace-separated tokens found in the corpus.
print(len(word_to_id))

# The tokens themselves. Matching is exact, so 'The' and 'the' are separate
# entries and 'mat.' keeps its trailing period.
print(word_to_id.keys())

# The integer id assigned to each token (0 .. len(word_to_id) - 1).
print(word_to_id.values())

10
['on', 'eating', 'food', 'is', 'dog', 'cat', 'mat.', 'The', 'the', 'sat']
[3, 8, 9, 7, 6, 1, 5, 0, 4, 2]


#### Data structure for data

In [6]:
# One zero-filled (max_len x vocabulary) matrix per sentence.
# The "+ 1" adds one column beyond the ids 0 .. len(word_to_id) - 1.
DS = np.zeros((len(sample_set), max_len, len(word_to_id) + 1))

In [7]:
# Axes: (sentence, word position within the sentence, word id).
DS.shape

(2, 10, 11)

#### Populate DS with data

In [8]:
# Switch on exactly one entry per (sentence, word position): the column
# given by the word's id.
for row, text in enumerate(sample_set):
    tokens = text.split()[:max_len]
    for col, token in enumerate(tokens):
        hot = word_to_id[token]
        try:
            DS[row, col, hot] = 1
        except IndexError:
            # DS has no slot for this (position, id) pair; report the sentence index.
            print(row)

In [9]:
# First sentence: one row per word position, one column per word id.
# The sentence has only 6 tokens, so the remaining rows stay all-zero.
print(DS[0])

[[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]


In [10]:
# Second sentence: it has 10 tokens, so every row is used; repeated words
# produce identical rows.
print(DS[1])

[[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.]]


# --------------------------

2. Character level vectorization. 1-hot vectors from first principles

# --------------------------

#### Character to id mapping/dictionary

In [11]:
# Alphabet: every character in string.printable.
character_set = string.printable

# Each character's id is its position within character_set.
char_to_id = {symbol: position for position, symbol in enumerate(character_set)}

In [12]:
# Size of the character vocabulary.
print(len(char_to_id))

# The characters that have an id.
print(char_to_id.keys())

# Their ids: positions within character_set, starting at 0.
print(char_to_id.values())

100
['\x0c', ' ', '$', '(', ',', '0', '4', '8', '<', '@', 'D', 'H', 'L', 'P', 'T', 'X', '\\', '`', 'd', 'h', 'l', 'p', 't', 'x', '|', '\x0b', '#', "'", '+', '/', '3', '7', ';', '?', 'C', 'G', 'K', 'O', 'S', 'W', '[', '_', 'c', 'g', 'k', 'o', 's', 'w', '{', '\n', '"', '&', '*', '.', '2', '6', ':', '>', 'B', 'F', 'J', 'N', 'R', 'V', 'Z', '^', 'b', 'f', 'j', 'n', 'r', 'v', 'z', '~', '\t', '\r', '!', '%', ')', '-', '1', '5', '9', '=', 'A', 'E', 'I', 'M', 'Q', 'U', 'Y', ']', 'a', 'e', 'i', 'm', 'q', 'u', 'y', '}']
[99, 94, 65, 69, 73, 0, 4, 8, 79, 83, 39, 43, 47, 51, 55, 59, 85, 89, 13, 17, 21, 25, 29, 33, 91, 98, 64, 68, 72, 76, 3, 7, 78, 82, 38, 42, 46, 50, 54, 58, 84, 88, 12, 16, 20, 24, 28, 32, 90, 96, 63, 67, 71, 75, 2, 6, 77, 81, 37, 41, 45, 49, 53, 57, 61, 87, 11, 15, 19, 23, 27, 31, 35, 93, 95, 97, 62, 66, 70, 74, 1, 5, 9, 80, 36, 40, 44, 48, 52, 56, 60, 86, 10, 14, 18, 22, 26, 30, 34, 92]


#### Data store for data

In [13]:
max_len = 50  # only the first max_len characters of a sentence are used

# One zero-filled (max_len x alphabet) matrix per sentence; the "+ 1" adds
# one column beyond the ids stored in char_to_id.
DS = np.zeros((len(sample_set), max_len, len(char_to_id) + 1))

#### Populate Data

In [14]:
# Switch on exactly one entry per (sentence, character position): the column
# given by the character's id.
for i, sentence in enumerate(sample_set):
    for j, char in enumerate(sentence[:max_len]):

        # Raises KeyError for a character that is not a key of char_to_id.
        k = char_to_id[char]

        try:
            DS[i, j, k] = 1
        except IndexError:
            # Fixed: the original `print(i.j,k)` performed attribute access on
            # an int and raised AttributeError instead of reporting the indices.
            print("IndexError at i=%d, j=%d, k=%d" % (i, j, k))

In [15]:
# Axes: (sentence, character position, character id).
print(DS.shape)

# 1-hot vector of the first character of the first sentence ('T').
print(DS[0][0])

# 1-hot vector of the second character of the second sentence ('h').
print(DS[1][1])

(2, 50, 101)
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


# --------------------------

3. Word level vectorization. 1-hot vectors using keras

# --------------------------

In [16]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [17]:
max_words = 15 # passed to Tokenizer as num_words, the vocabulary cap (the value is 15, not 100)
max_len = 10 # assigned here but not read by any of the Keras calls in this section

# Tokenizer that will keep only word indices below num_words.
tok = Tokenizer(num_words=max_words)

# Learn the vocabulary and the word/document counts from the corpus.
tok.fit_on_texts(sample_set)

In [18]:
# Summarize what the tokenizer learned from the corpus.

# Vocabulary size.
print(len(tok.word_index))

# Occurrences of each word over the whole corpus.
print(tok.word_counts)

# Number of texts the tokenizer was fitted on.
print(tok.document_count)

# word -> integer index; indices start at 1.
print(tok.word_index)
# Number of texts in which each word appears.
print(tok.word_docs)

9
OrderedDict([('the', 4), ('cat', 1), ('sat', 1), ('on', 1), ('mat', 1), ('dog', 2), ('is', 2), ('eating', 2), ('food', 2)])
2
{'on': 8, 'eating': 4, 'mat': 9, 'food': 5, 'is': 3, 'dog': 2, 'cat': 6, 'the': 1, 'sat': 7}
defaultdict(<type 'int'>, {'on': 1, 'eating': 1, 'mat': 1, 'food': 1, 'is': 1, 'dog': 1, 'cat': 1, 'the': 2, 'sat': 1})


In [19]:
# Convert the sentences directly to a matrix with one row per sentence and
# one column per word index.

encoding = tok.texts_to_matrix(sample_set)

In [20]:
# (number of sentences, num_words)
print(encoding.shape)

# Row for the first sentence: a 1 in the column of every word index that
# occurs in it, regardless of position or repetition.
print(encoding[0])

(2, 15)
[ 0.  1.  0.  0.  0.  0.  1.  1.  1.  1.  0.  0.  0.  0.  0.]


In [21]:
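# Note: in the above case ^, Keras gives a bag-of-words vector per sentence (one row per sentence, with a 1 for every word present), not a sequence of 1-hot vectors.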

# --------------------------

4. Character level 1-hot vectors using Keras

# --------------------------

In [22]:
max_words = 15 # passed to Tokenizer as num_words; with char_level=True the tokens are characters, not words
max_len = 10 # assigned here but not read by any of the Keras calls in this section

# Character-level tokenizer: every character is treated as a token.
# NOTE(review): the fit on this corpus finds 15 distinct characters with
# indices 1..15, while the matrix produced from this tokenizer has only
# num_words = 15 columns (0..14), so the character with index 15 ('.') gets
# no column. Confirm whether num_words should be 16 here.
tok2 = Tokenizer(num_words=max_words, char_level=True)

# Learn the character vocabulary and counts from the corpus.
tok2.fit_on_texts(sample_set)

In [23]:
# Summarize what the character-level tokenizer learned.

# Number of distinct characters.
print(len(tok2.word_index))

# Occurrences of each character over the whole corpus.
print(tok2.word_counts)

# Number of texts the tokenizer was fitted on.
print(tok2.document_count)

# character -> integer index; indices start at 1.
print(tok2.word_index)
# Number of texts in which each character appears.
print(tok2.word_docs)

15
OrderedDict([('t', 9), ('h', 4), ('e', 6), (' ', 14), ('c', 1), ('a', 5), ('s', 3), ('o', 7), ('n', 3), ('m', 1), ('.', 1), ('d', 4), ('g', 4), ('i', 4), ('f', 2)])
2
{'a': 5, ' ': 1, 'c': 13, 'e': 4, 'd': 7, 'g': 8, 'f': 12, 'i': 9, 'h': 6, 'm': 14, 'o': 3, 'n': 11, 's': 10, 't': 2, '.': 15}
defaultdict(<type 'int'>, {'a': 2, ' ': 2, 'c': 1, 'e': 2, 'd': 1, 'g': 1, 'f': 1, 'i': 1, 'h': 2, 'm': 1, 'o': 2, 'n': 2, 's': 2, 't': 2, '.': 1})


In [24]:
# One row per sentence, one column per character index; rebinds `encoding`
# from the word-level section.
encoding = tok2.texts_to_matrix(sample_set)

In [25]:
# (number of sentences, num_words)
print(encoding.shape)

# Characters present in the first sentence (1 = present).
print(encoding[0])

# Characters present in the second sentence.
print(encoding[1])


(2, 15)
[ 0.  1.  1.  1.  1.  1.  1.  0.  0.  0.  1.  1.  0.  1.  1.]
[ 0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  0.]


--------------------------


# --------------------------

5. Hashing Trick

# --------------------------

In [26]:
max_len = 10

dimension = 1000 #hashing space
# Every word is mapped to one of `dimension` buckets by hashing it, so no
# vocabulary dictionary is stored. Two different words can land in the same
# bucket (a collision).

In [27]:
# One zero-filled (max_len x dimension) matrix per sentence.
DS = np.zeros((len(sample_set), max_len, dimension))

In [28]:
# Map each word to one of `dimension` buckets and switch on that column.
# The built-in hash() of a string is salted per interpreter process on
# Python 3, so bucket ids would change on every kernel restart; an MD5 digest
# gives the same bucket for the same word on every run.
for i, sentence in enumerate(sample_set):
    for j, word in enumerate(sentence.split()[:max_len]):

        # hashlib needs bytes: encode text strings, pass byte strings through.
        raw = word if isinstance(word, bytes) else word.encode('utf-8')
        k = int(hashlib.md5(raw).hexdigest(), 16) % dimension

        DS[i, j, k] = 1

In [29]:
# Axes: (sentence, word position, hash bucket).
print(DS.shape)

# Per-sentence matrices.
print(DS[0])

print(DS[1])

# Full bucket vector for the first word of the first sentence.
print(DS[0][0])

(2, 10, 1000)
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  