https://jaketae.github.io/study/word2vec/

In [1]:
text = '''Machine learning is the study of computer algorithms that \
improve automatically through experience. It is seen as a \
subset of artificial intelligence. Machine learning algorithms \
build a mathematical model based on sample data, known as \
training data, in order to make predictions or decisions without \
being explicitly programmed to do so. Machine learning algorithms \
are used in a wide variety of applications, such as email filtering \
and computer vision, where it is difficult or infeasible to develop \
conventional algorithms to perform the needed tasks.'''

In [2]:
import re

def tokenize(text):
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text)

In [3]:
tokens = tokenize(text)

In [4]:
def mapping(tokens):
    word_to_id = {}
    id_to_word = {}

    for i, token in enumerate(set(tokens)):
        word_to_id[token] = i
        id_to_word[i] = token

    return word_to_id, id_to_word

In [5]:
word_to_id, id_to_word = mapping(tokens)
word_to_id

{'mathematical': 0,
 'being': 1,
 'subset': 2,
 'make': 3,
 'based': 4,
 'of': 5,
 'learning': 6,
 'sample': 7,
 'to': 8,
 'build': 9,
 'without': 10,
 'are': 11,
 'is': 12,
 'computer': 13,
 'on': 14,
 'variety': 15,
 'wide': 16,
 'data': 17,
 'intelligence': 18,
 'a': 19,
 'automatically': 20,
 'in': 21,
 'algorithms': 22,
 'and': 23,
 'seen': 24,
 'explicitly': 25,
 'Machine': 26,
 'conventional': 27,
 'artificial': 28,
 'through': 29,
 'predictions': 30,
 'experience': 31,
 'do': 32,
 'where': 33,
 'known': 34,
 'used': 35,
 'develop': 36,
 'needed': 37,
 'It': 38,
 'order': 39,
 'that': 40,
 'or': 41,
 'perform': 42,
 'improve': 43,
 'tasks': 44,
 'filtering': 45,
 'the': 46,
 'so': 47,
 'applications': 48,
 'training': 49,
 'vision': 50,
 'infeasible': 51,
 'email': 52,
 'it': 53,
 'decisions': 54,
 'difficult': 55,
 'model': 56,
 'as': 57,
 'study': 58,
 'such': 59,
 'programmed': 60}

In [15]:
import numpy as np

def concat(*iterables):
    for iterable in iterables:
        yield from iterable

def one_hot_encode(id, vocab_size):
    res = [0] * vocab_size
    res[id] = 1
    return res

np.random.seed(42)


def generate_training_data(tokens, word_to_id, window):
    X = []
    y = []
    n_tokens = len(tokens)

    for i in range(n_tokens):
        idx = concat(
            range(max(0, i - window), i),
            range(i, min(n_tokens, i + window + 1))
        )
        for j in idx:
            if i == j:
                continue
            X.append(one_hot_encode(word_to_id[tokens[i]], len(word_to_id)))
            y.append(one_hot_encode(word_to_id[tokens[j]], len(word_to_id)))

    return np.asarray(X), np.asarray(y)

In [16]:
x, y = generate_training_data(tokens, word_to_id, 2)

In [17]:
x.shape

(330, 61)

In [18]:
y.shape

(330, 61)

In [19]:
x[60]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])