# Word embeddings for Beginners

## One Hot-Encoding

In [94]:
from keras.utils import to_categorical
from keras.preprocessing.text import text_to_word_sequence
import numpy as np
from glob import glob


In [1]:
text0="Lucy in the sky with diamonds."

text1="Guess who is back, back again \
Shady is back, tell a friend \
Guess who is back, guess who is back? \
Guess who is back, guess who is back? \
Guess who is back, guess who is back? \
Guess who is back?"

text2="Speculate who retruned, returned once more \
Shady has returned, announce it to your pal \
Speculate who returned, speculate who returned! \
Speculate who returned, speculate who returned! \
Speculate who returned, speculate who returned! \
Speculate who returned!"


In [67]:

def one_hot(list_of_texts):
    """input: takes a list of strings 
    returns:
    1. one-hot-encoding of entire corpus
    2. dict: dict_word_reps[token]=array(one-hot-encoding)
    3. vocabulary
    """
    corpus= str([string[0:-1] for string in list_of_texts])
    vocabulary= list(set(text_to_word_sequence(corpus)))
    integers=[idx for idx, word in enumerate(vocabulary)]
    one_hot= to_categorical(integers)
    each_word= list(zip(vocabulary, one_hot))
    dict_word_reps={}
    for pair in each_word:
        dict_word_reps[pair[0]] = pair[1]

    return one_hot, dict_word_reps, vocabulary 



### One-hot-encoding of *text0*

In [68]:

one_hot_text0, wv_text0, vocab_text0 = one_hot([text0])
print("\u2022 This is the one-hot representation of text0")
print(one_hot_text0)
print(" ")
print('\u2022 This is the one-hot representation of the token "sky"')
print(wv_text0["sky"]) 
print(" ")
print('\u2022 Text0 has a vocabulary of size', str(len(vocab_text0)) )


• This is the one-hot representation of text0
[[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]]
 
• This is the one-hot representation of the token "sky"
[0. 0. 0. 1. 0. 0.]
 
• Text0 has a vocabulary of size 6


### One-hot-encoding with *text1* and *text2* as corpus

In [69]:
one_hot_text12, wv_text12, vocab_text12= one_hot([text1, text2])

In [109]:
print("The corpus consists of", len(vocab_text12), "tokens")

The corpus consists of 24 tokens


In [110]:
print(wv_text12["shady"])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [113]:
print("Comparing representations of tell and announce:")
print(wv_text12["tell"])
print(wv_text12["announce"])

Comparing representations of tell and announce:
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [114]:
print("Comparing representations of friend and pal:")
print(wv_text12["friend"])
print(wv_text12["pal"])

Comparing representations of friend and pal:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [115]:
pre_processed_text1= text_to_word_sequence(text1)
pre_processed_text2= text_to_word_sequence(text2)

representation_text1=[wv_text12[word] for word in pre_processed_text1]
representation_text2=[wv_text12[word] for word in pre_processed_text2]

In [116]:
print("This is the one_hot_encoding of text1")
for vector in representation_text1:
    print(vector)

print(" ")
print("This is the one_hot_encoding of text2")
for vector in representation_text1:
    print(vector)

This is the one_hot_encoding of text1
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]


In [117]:
i=0
for idx, vector1 in enumerate(representation_text1): 
    if idx <len(representation_text2):
        if np.array_equal(vector1, representation_text2[idx]):
            i += 1
print("Number of times where the same token appears at the same place in both texts :", i)

Number of times where the same token appears at the same place in both texts : 4
