https://jaketae.github.io/study/word2vec/

In [2]:
text = '''Machine learning is the study of computer algorithms that \
improve automatically through experience. It is seen as a \
subset of artificial intelligence. Machine learning algorithms \
build a mathematical model based on sample data, known as \
training data, in order to make predictions or decisions without \
being explicitly programmed to do so. Machine learning algorithms \
are used in a wide variety of applications, such as email filtering \
and computer vision, where it is difficult or infeasible to develop \
conventional algorithms to perform the needed tasks.'''

In [7]:
import re

def tokenize(text):
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text)

In [8]:
tokens = tokenize(text)

In [9]:
def mapping(tokens):
    word_to_id = {}
    id_to_word = {}

    for i, token in enumerate(set(tokens)):
        word_to_id[token] = i
        id_to_word[i] = token

    return word_to_id, id_to_word

In [10]:
word_to_id, id_to_word = mapping(tokens)
word_to_id

{'make': 0,
 'the': 1,
 'difficult': 2,
 'based': 3,
 'through': 4,
 'artificial': 5,
 'training': 6,
 'filtering': 7,
 'known': 8,
 'computer': 9,
 'build': 10,
 'being': 11,
 'in': 12,
 'applications': 13,
 'needed': 14,
 'Machine': 15,
 'It': 16,
 'develop': 17,
 'as': 18,
 'that': 19,
 'algorithms': 20,
 'experience': 21,
 'such': 22,
 'of': 23,
 'a': 24,
 'order': 25,
 'used': 26,
 'where': 27,
 'it': 28,
 'so': 29,
 'sample': 30,
 'study': 31,
 'are': 32,
 'to': 33,
 'learning': 34,
 'predictions': 35,
 'decisions': 36,
 'mathematical': 37,
 'model': 38,
 'explicitly': 39,
 'email': 40,
 'infeasible': 41,
 'subset': 42,
 'seen': 43,
 'improve': 44,
 'or': 45,
 'vision': 46,
 'programmed': 47,
 'do': 48,
 'data': 49,
 'tasks': 50,
 'and': 51,
 'variety': 52,
 'perform': 53,
 'is': 54,
 'without': 55,
 'on': 56,
 'wide': 57,
 'automatically': 58,
 'conventional': 59,
 'intelligence': 60}

In [15]:
import numpy as np

def concat(*iterables):
    for iterable in iterables:
        yield from iterable

def one_hot_encode(id, vocab_size):
    res = [0] * vocab_size
    res[id] = 1
    return res

np.random.seed(42)

def generate_training_data(tokens, word_to_id, window):
    x = []
    y = []
    n_tokens = len(tokens)

    for i in range(n_tokens):
        idx = concat(
            range(max(0, i - window), i),
            range(i + 1, min(n_tokens, i + window + 1))
        )
        for j in idx:
            if i == j:
                continue
            x.append(one_hot_encode(word_to_id[tokens[i]], len(word_to_id)))
            y.append(one_hot_encode(word_to_id[tokens[j]], len(word_to_id)))

    return x, y

In [16]:
x, y = generate_training_data(tokens, word_to_id, 2)