## text data encoding

In [15]:
#imports
from IPython.display import display, Markdown #just to display markdown
import numpy as np  #for vector operation
import string   # provides strings variations for character embedding
from keras.preprocessing.text import Tokenizer


In [18]:
display(Markdown("## samples texts"))

#creating sample text
samples = ["The cat sat on the mat", " The dog ate my homework"] 

#initializing word-encoding dictionary
token_index = {} 

samples

## samples texts

['The cat sat on the mat', ' The dog ate my homework']

In [31]:
display(Markdown("## word level one hot encoding"))

def word_one_hot_embeddings(text_samples,max_sentence_length=8):
    """
        # Creating sentence vectors by represent each word as a vector
        # word as a vector: vector of length of vocabulary, with "1" for that specified word and zero elsewhere
    """

    # Encoding words in the corpus
    print("\nStarting to create token_index for every distinct vocabulary")
    for sample in text_samples:
        for word in sample.split():
            if word not in token_index.keys():
                token_index[word] = len(token_index) + 1
    print("token_index created.\n")

    # Word level one hot encoding
    print("\nStarting to create sentence vector using token_index")
    results = np.zeros(shape=(len(text_samples),max_sentence_length,len(token_index)+1))
    for i , sample in enumerate(text_samples):
        for j , word in enumerate(sample.split()):
            index_ = token_index[word]
            results[i,j,index_] = 1
    print("sentences vectorised.\n")
    return (results)

word_level_vectorised_sample = word_one_hot_embeddings(text_samples=samples,max_sentence_length=8)
shape = (word_level_vectorised_sample.shape)
print("Output shape {}".format(word_level_vectorised_sample.shape))
print("# sample:{},  specified max length:{},  # vocabulary:{}".format(shape[0],shape[1],shape[2]))
print(word_level_vectorised_sample)

## word level one hot encoding


Starting to create token_index for every distinct vocabulary
token_index created.


Starting to create sentence vector using token_index
sentences vectorised.

Output shape (2, 8, 11)
# sample:2,  specified max length:8,  # vocabulary:11
[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]


In [42]:
display(Markdown("## character level one hot encoding"))

def character_one_hot_embeddings(characters):
    """
        # Creating sentence vectors by using character level encoding
    """
    token_index =  dict(zip((range(1,len(characters)+1)),characters))
    max_length  = 50

    results = np.zeros(shape=(len(samples),max_length,len(characters)+1))

    for i , sample in enumerate(samples):
        for j , character in enumerate(samples):
            results[i,j,token_index.get(character)] = 1
    return (results)
            
characters = string.printable
char_level_vectorised_sample = character_one_hot_embeddings(characters)
shape = (char_level_vectorised_sample.shape)

display(Markdown("### characters used for encoding"))
print ("{} distinct character vocab present".format(len(characters)))
print (characters)

print("Output shape {}".format(char_level_vectorised_sample.shape))
print("# sample:{},  specified max length:{},  # vocabulary:{}".format(shape[0],shape[1],shape[2]))
print(char_level_vectorised_sample)

## character level one hot encoding

### characters used for encoding

100 distinct character vocab present
0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	

Output shape (2, 50, 101)
# sample:2,  specified max length:50,  # vocabulary:101
[[[1. 1. 1. ... 1. 1. 1.]
  [1. 1. 1. ... 1. 1. 1.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 1. 1. ... 1. 1. 1.]
  [1. 1. 1. ... 1. 1. 1.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]


In [48]:
display(Markdown("## encoding - KERAS"))
# One-hot encoding using keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10)
tokenizer.fit_on_texts(samples)

sequences = tokenizer.texts_to_sequences(samples)
one_hot_results = tokenizer.texts_to_matrix(samples,mode='binary')
word_index = tokenizer.word_index
one_hot_results

## encoding - KERAS

array([[0., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 1., 1., 1., 1.]])