### One-hot Encoding of Text for use in an LSTM Neural Network

One-hot encoding is a representation of **categorical** variables as binary vectors.

The first step in this is to map the categorical variable to integer values.

Each integer is then represented as a binary vector that is all zeros except for a 1 at the index corresponding to that integer.

In [116]:
# Example - red / green

categorical = ['red', 'green','red','red','green','green']

# Integer-encode: 'red' -> 1, every other value (here only 'green') -> 0

integers = [1 if colour == 'red' else 0 for colour in categorical]

# One-hot encode: 0 -> [0,1], any other integer (here only 1) -> [1,0]

one_hot = [[0,1] if integer == 0 else [1,0] for integer in integers]

# print one_hot

# one_hot is now a one-hot representation of `categorical`

# Hand-coding the mapping like this is only really practical for a binary variable



In [115]:
# Example 2 - Hello World

from numpy import argmax

data = 'hello world'

# Every character that may appear in the input: lowercase letters plus space

alphabet = 'abcdefghijklmnopqrstuvwxyz '

# Lookup tables: character -> index and index -> character

char_to_int = {c: i for i, c in enumerate(alphabet)}

int_to_char = dict(enumerate(alphabet))

# Integer encode input data

integer_encoded = [char_to_int[char] for char in data]

# One-hot encode: one row per character, all zeros except a 1 at the character's index

one_hot = [[1 if position == value else 0 for position in range(len(alphabet))]
           for value in integer_encoded]

# Invert encoding: argmax finds the position of the 1, which maps back to a character

inverted = ''.join(int_to_char[argmax(row)] for row in one_hot)

# print data
# print
# print integer_encoded
# print
# print one_hot





In [102]:
# scikit-learn

# Encoding "cold, warm, hot" as integers and then as one-hot vectors.
# NOTE: LabelEncoder numbers the labels in sorted (alphabetical) order
# (cold=0, hot=1, warm=2), so the integers do not follow the temperature order.

import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Example
data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']
values = np.array(data)

# Integer Encoding
label_encoder = LabelEncoder()

integer_encoded = label_encoder.fit_transform(values)

# One-hot Encoding

# The dense-output switch is named `sparse_output` from scikit-learn 1.2 on
# (the old `sparse` keyword was removed in 1.4). Try the new name first and
# fall back to the old one so the cell runs on both old and new releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# OneHotEncoder expects a 2-D array: one row per sample, one column per feature
integer_encoded = integer_encoded.reshape(len(integer_encoded),1)

one_hot_encoded = onehot_encoder.fit_transform(integer_encoded)

# Invert one example

# np.argmax returns the index of the largest value, i.e. the position of the 1
# in the first row, which is the integer label to hand back to the LabelEncoder
inverted = label_encoder.inverse_transform([np.argmax(one_hot_encoded[0,:])])



In [114]:
# One-hot with Keras

# to_categorical is applied directly to values that are already integer-encoded

import numpy as np
from keras.utils import to_categorical
import random

# Example: ten pseudo-random integers in 0..10 (inclusive), seeded for repeatability
random.seed(5)

data = np.array([random.randint(0,10) for _ in range(10)])

# one-hot encoded

one_hot = to_categorical(data)

# inverted: the position of the largest entry in the first row

inverted = np.argmax(one_hot[0])

#print data
#print inverted


In [131]:
import pandas as pd

# Load the text stored at 'austen/pride.txt' into a pandas object.
# NOTE(review): read_fwf is pandas' reader for tables with fixed-width columns;
# this file appears to be free-form prose, so confirm this is the intended loader.
df = pd.read_fwf('austen/pride.txt')



﻿The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen    This eBook is for the use of anyone anywhere a...
Name: 0, dtype: object


In [177]:
def convert_from_alphabet(a):
    """Map a character code point onto a compact integer code.

    :param a: integer code point of one character (e.g. the result of ord())
    :return: 1 for tab (9), 97 for line feed (10), ``a - 30`` for printable
             ASCII (32..126 map to 2..96), and 0 for any other value
    """
    if a == 9:  # TAB
        return 1
    if a == 10:  # LF is placed directly after the printable range
        return 97
    # Printable ASCII is shifted down by 30; everything else is "unknown"
    return a - 30 if 32 <= a <= 126 else 0
    
def encode_text(s):
    """Encode every character of a string.

    :param s: a text string
    :return: list with one encoded value per character of ``s``, obtained by
             applying convert_from_alphabet to each character's ord() value
    """
    return [convert_from_alphabet(ord(ch)) for ch in s]


def decode_to_text(c, avoid_tab_and_lf=False):
    """Turn a list of encoded values back into a text string.

    Each value is passed through convert_to_alphabet and the resulting code
    point is converted to a character with chr().

    :param c: encoded list of code points
    :param avoid_tab_and_lf: forwarded unchanged to convert_to_alphabet; per
        the original note, if True, tab and line feed characters are replaced
        by a backslash
    :return: the decoded text string
    """
    # NOTE(review): convert_to_alphabet is not defined in the visible part of
    # this file -- confirm it is available before calling this function.
    return "".join([chr(convert_to_alphabet(code, avoid_tab_and_lf)) for code in c])


#for i in range(97):
#    print decode_to_text([i])
    
#[decode_to_text(x) for x in [range(97)]]





array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])