# One Hot Encoding

## Manual One-Hot Encoding

In [6]:
def onehot_word(word):
    lookup = {v[1]: v[0] for v in enumerate(set(word))}

    word_vector = []
    for c in word:
        one_hot_vector = [0] * len(lookup)
        one_hot_vector[lookup[c]] = 1
        word_vector.append(one_hot_vector)
    return word_vector

In [7]:
onehot_word('data')

[[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]]

## One Hot Encoding Using Keras

In [8]:
import keras
keras.preprocessing.text.one_hot('dawn of man', n=5)

[3, 3, 3]

In [9]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.text import one_hot
import pandas as pd


In [11]:
with open('../data/100lines.txt') as lines_file:
    movie_lines = [ line.strip()  for line in lines_file.readlines()]
    
movie_lines[:10]

['They do not!',
 'They do to!',
 'I hope so.',
 'She okay?',
 "Let's go.",
 'Wow',
 "Okay -- you're gonna need to learn how to lie.",
 'No',
 'I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'Like my fear of wearing pastels?']

In [12]:
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(movie_lines)

In [13]:
lines_as_integers = tokenizer.texts_to_sequences(movie_lines)

In [14]:
lines_as_integers[:4]

[[4, 8, 2, 12, 1, 13, 3, 1, 6, 3, 4, 33],
 [4, 8, 2, 12, 1, 13, 3, 1, 4, 3, 33],
 [7, 1, 8, 3, 24, 2, 1, 9, 3, 15],
 [9, 8, 2, 1, 3, 21, 5, 12, 25]]

In [15]:
vocabulary = set(int_value for line_values in lines_as_integers for int_value in line_values)

In [16]:
vocabulary_size = len(vocabulary)

In [17]:
movie_lines_one_hot = [keras.utils.to_categorical(line, vocabulary_size+1) for line in lines_as_integers]

In [22]:
movie_lines = pd.read_csv('../data/100lines.txt', sep='\t', header=None)
movie_lines.columns = ['line']

In [23]:
movie_lines_one_hot[0:4]

[array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       

In [24]:
tokenizer.fit_on_texts(movie_lines)

In [25]:
int_sequence = tokenizer.texts_to_sequences(movie_lines)

In [26]:
int_sequence 

[[14, 7, 6, 2]]

In [27]:
vocabulary = set(int_value for line_values in int_sequence for int_value in line_values)

In [28]:
vocabulary_size = len(vocabulary)

In [29]:
keras.utils.to_categorical([1,2,3], vocabulary_size)

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]], dtype=float32)

In [30]:
to_categorical([1,2,3,1, 0], 5)

array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.]], dtype=float32)

In [31]:
import numpy as np
np.array(int_sequence)

array([[14,  7,  6,  2]])

In [32]:
movie_lines

Unnamed: 0,line
0,They do not!
1,They do to!
2,I hope so.
3,She okay?
4,Let's go.
5,Wow
6,Okay -- you're gonna need to learn how to lie.
7,No
8,I'm kidding. You know how sometimes you just ...
9,Like my fear of wearing pastels?


### Character Level Encoding Using Keras

In [33]:
text = 'One small step for man'

In [34]:
from keras.preprocessing.text import Tokenizer
import numpy as np
char_tokenizer = Tokenizer(char_level=True)

In [35]:
char_tokenizer.fit_on_texts(text)

In [36]:
char_tokenizer.texts_to_sequences(text)

[[2],
 [3],
 [4],
 [1],
 [5],
 [6],
 [7],
 [8],
 [8],
 [1],
 [5],
 [9],
 [4],
 [10],
 [1],
 [11],
 [2],
 [12],
 [1],
 [6],
 [7],
 [3]]

In [37]:
char_tokenizer.index_word

{1: ' ',
 2: 'o',
 3: 'n',
 4: 'e',
 5: 's',
 6: 'm',
 7: 'a',
 8: 'l',
 9: 't',
 10: 'p',
 11: 'f',
 12: 'r'}

In [38]:
char_tokenizer.word_index

{' ': 1,
 'o': 2,
 'n': 3,
 'e': 4,
 's': 5,
 'm': 6,
 'a': 7,
 'l': 8,
 't': 9,
 'p': 10,
 'f': 11,
 'r': 12}

In [43]:
char_vectors = char_tokenizer.texts_to_matrix(text)
char_vectors

array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 

In [44]:
char_vectors.shape

(22, 13)

In [45]:
char_tokenizer.index_word[np.argmax(char_vectors[0])]

'o'

## One Hot Encoding Words

In [49]:
with open('../data/100lines.txt') as lines_file:
    movie_lines = [ line.strip()  for line in lines_file.readlines()]
    
movie_lines[:10]

['They do not!',
 'They do to!',
 'I hope so.',
 'She okay?',
 "Let's go.",
 'Wow',
 "Okay -- you're gonna need to learn how to lie.",
 'No',
 'I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'Like my fear of wearing pastels?']

In [50]:
lines_array = np.array(movie_lines)
lines_array.reshape(-1,1)
lines_array.shape

(100,)

In [51]:
from sklearn import preprocessing

In [54]:
wordOneHotEncoder = preprocessing.OneHotEncoder()
labelEncoder = preprocessing.LabelEncoder()
movie_labels = labelEncoder.fit_transform(lines_array)

In [55]:
movie_labels

array([72, 73, 30, 58, 41, 87, 53, 48, 34, 42, 67, 80, 28, 64, 45, 79, 99,
       51, 69, 88,  3, 70, 77, 94, 33, 31, 74, 15, 23, 91, 44, 19, 83, 60,
       39, 59, 35, 32, 63,  8, 21, 98, 90, 24, 89, 82, 71, 78, 22, 11, 66,
       27, 56,  4, 40, 13, 65, 75, 85, 57, 68,  1, 49, 12, 96, 54, 52, 76,
        5, 25, 95, 26, 29,  2, 62,  0,  6, 36, 16, 86, 14, 38, 84, 81, 93,
       50,  7, 92, 43, 97, 18, 17, 46, 47, 37, 55, 61, 20,  9, 10],
      dtype=int64)

In [56]:
movie_labels.reshape(-1,1)

array([[72],
       [73],
       [30],
       [58],
       [41],
       [87],
       [53],
       [48],
       [34],
       [42],
       [67],
       [80],
       [28],
       [64],
       [45],
       [79],
       [99],
       [51],
       [69],
       [88],
       [ 3],
       [70],
       [77],
       [94],
       [33],
       [31],
       [74],
       [15],
       [23],
       [91],
       [44],
       [19],
       [83],
       [60],
       [39],
       [59],
       [35],
       [32],
       [63],
       [ 8],
       [21],
       [98],
       [90],
       [24],
       [89],
       [82],
       [71],
       [78],
       [22],
       [11],
       [66],
       [27],
       [56],
       [ 4],
       [40],
       [13],
       [65],
       [75],
       [85],
       [57],
       [68],
       [ 1],
       [49],
       [12],
       [96],
       [54],
       [52],
       [76],
       [ 5],
       [25],
       [95],
       [26],
       [29],
       [ 2],
       [62],
       [ 0],
       [ 6],

In [57]:
movie_onehot = wordOneHotEncoder.fit_transform(movie_labels.reshape(-1,1))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [58]:
movie_onehot.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])