# Word embeddings

This notebook demonstrates how to use word embeddings as a feature extraction technique. It uses the implementations of the word embedding algorithms (word2vec and FastText) from the `gensim` package.



In [None]:
!pip install -q gensim


In [75]:
from gensim.models import word2vec

In [None]:
# build the corpus from the source code of a program: any text file works,
# because all we need is a collection of tokens to analyze
# (this example uses a file from an open source component - Azure NetX)
path = './nx_icmp_checksum_compute.c'

# load the file as a list with one entry per line
with open(path, 'r') as source_file:
    lines = list(source_file)

# report the size of the corpus
print(f'The file (and thus our corpus) contains {len(lines)} lines')

The file (and thus our corpus) contains 184 lines


In [76]:
# the model expects every sentence as a list of tokens, so split each line
# of the file on whitespace
tokenized_sentences = [sentence.split() for sentence in lines]

# train a small word2vec model:
#   vector_size=10 - length of each embedding vector
#   window=1       - only the directly neighbouring tokens count as context
#   min_count=0    - keep every token, even the ones that occur only once
#   seed=1         - gensim's default seed, spelled out for reproducibility
#   workers=1      - a single training thread; with several threads the OS
#                    scheduling changes the order of updates, so the vectors
#                    (and the similarities shown below) differ from run to run
# NOTE: according to the gensim documentation, reproducibility across kernel
# restarts additionally requires a fixed PYTHONHASHSEED
model = word2vec.Word2Vec(tokenized_sentences,
                          vector_size=10,
                          window=1,
                          min_count=0,
                          seed=1,
                          workers=1)

In [77]:
# inspect the vocabulary learned by the model:
# a dictionary that maps every token to its integer index
vocabulary = model.wv.key_to_index
vocabulary

{'*/': 0,
 '/*': 1,
 'the': 2,
 '=': 3,
 'checksum': 4,
 '->': 5,
 'packet': 6,
 'if': 7,
 'of': 8,
 '/**************************************************************************/': 9,
 '}': 10,
 '{': 11,
 '+': 12,
 'checksum.': 13,
 'to': 14,
 'word_ptr': 15,
 'length': 16,
 'in': 17,
 'a': 18,
 '/**': 19,
 'at': 20,
 'and': 21,
 'ULONG': 22,
 'is': 23,
 'Determine': 24,
 'byte': 25,
 'into': 26,
 '(checksum': 27,
 '&': 28,
 'word': 29,
 'Microsoft': 30,
 'Add': 31,
 'ICMP': 32,
 'pointer.': 33,
 'current_packet': 34,
 'Setup': 35,
 '0;': 36,
 'nx_packet_append_ptr)': 37,
 'we': 38,
 'word.': 39,
 '>>': 40,
 'packet.': 41,
 'NX_LOWER_16_MASK);': 42,
 'Move': 43,
 'pointer': 44,
 'Yuxin': 45,
 '-': 46,
 '(UCHAR': 47,
 'current': 48,
 '>=': 49,
 'else': 50,
 'end': 51,
 '#include': 52,
 'nx_packet_prepend_ptr;': 53,
 'RELEASE': 54,
 'None': 55,
 'zero': 56,
 'short_temp;': 57,
 'nx_packet_last)': 58,
 'there': 59,
 '6.1': 60,
 'packet_ptr': 61,
 'Pickup': 62,
 'We': 63,
 'DESCRIPTION': 6

In [78]:
# ask the model for the tokens it considers most similar to "add"
neighbours_of_add = model.wv.most_similar(positive=['add'])
neighbours_of_add

[('NX_LOWER_16_MASK;', 0.8372778296470642),
 ('Mask', 0.8019374012947083),
 ('DESCRIPTION', 0.7171915173530579),
 ('Version', 0.7050908803939819),
 ('Pickup', 0.6866066455841064),
 ('are', 0.6395519971847534),
 ('Do', 0.6153941750526428),
 ('BY', 0.6148180365562439),
 ('NX_LITTLE_ENDIAN', 0.5829669833183289),
 ('under', 0.5750270485877991)]

In [None]:
# word2vec is not a tokenizer, so it cannot tokenize our simple C program,
# but its vectors can answer other questions, e.g. which words are similar

# the query below can be read as vector arithmetic:
# file + function - found = similar
added_words = ['file', 'function']
subtracted_words = ['found']
similar = model.wv.most_similar(positive=added_words, negative=subtracted_words)

similar

[('again', 0.24998697638511658),
 ('word', 0.21356187760829926),
 ('05-19-2020', 0.21174617111682892),
 ('*current_packet;', 0.2079058289527893),
 ('current_packet', 0.2042725533246994),
 ('short_temp', 0.1908235102891922),
 ('NX_SHIFT_BY_16)', 0.18824389576911926),
 ('packet.', 0.18712595105171204),
 ('length;', 0.18539084494113922),
 ('computed', 0.18160541355609894)]

In [None]:
# word2vec only has vectors for the tokens that were part of its training data,
# so the tokens of our C program must be in the corpus; asking for a token that
# is missing raises a KeyError

# check the similar words to "return"; the exception is the point of this
# demonstration, so catch it and show it instead of aborting "Run All"
try:
    similar = model.wv.most_similar(positive=['return'])
except KeyError as error:
    # reset the result so that the outcome of the previous query is not shown
    similar = None
    print(f'word2vec cannot look up this word: {error}')

similar

## FastText

To address the problem of words that are missing from the vocabulary, we can use the FastText model instead, which is able to approximate vectors for words that it has not seen during training.

In [None]:
from gensim.models import FastText

# create an untrained FastText model
# NOTE: this rebinds `model`, so from here on `model` is the FastText model
# and no longer the word2vec model trained above
#   seed=1    - gensim's default seed, spelled out for reproducibility
#   workers=1 - a single training thread; with several threads the OS
#               scheduling changes the order of updates, so the results
#               differ from run to run
model = FastText(vector_size=4,
                 window=3,
                 min_count=1,
                 seed=1,
                 workers=1)

# build the vocabulary from the tokenized corpus
model.build_vocab(corpus_iterable=tokenized_sentences)

# train the model; total_examples is the number of sentences in the corpus
model.train(corpus_iterable=tokenized_sentences,
            total_examples=len(tokenized_sentences),
            epochs=10)

(16, 130)

In [None]:
# query a word that is not in the vocabulary
query_word = 'return'
similar = model.wv.most_similar(positive=[query_word])

# this time we get an answer: the model can approximate
# the words that are not part of the vocabulary
similar

[('void', 0.5913326740264893),
 ('int', 0.43626993894577026),
 ('{', 0.2602742612361908),
 ('"Hello', 0.23500549793243408),
 ('}', 0.21387670934200287),
 ('argc,', 0.1757005751132965),
 ('World', 0.17528733611106873),
 ('");', 0.15775901079177856),
 ('0;', 0.1009957417845726),
 ('**argc)', -0.29264017939567566)]