In [9]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [10]:
corpus = "a quick brown fox jumps over the lazy dog"

In [11]:
def preprocess_data(corpus):
    corpus_lower = corpus.lower()
    return (corpus_lower.split(" "))

In [12]:
preprocess_data(corpus)

['a', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

In [13]:
def one_hot_encoding(word_list):
    encoder = OneHotEncoder()
    encoded_data = encoder.fit_transform((np.array(word_list).reshape(-1,1)))
    return encoded_data

In [14]:
# one hot encoding
encoding = one_hot_encoding(preprocess_data(corpus))

In [15]:
def dot_product (vec1, vec2):
    return sum(v1 * v2 for v1,v2 in zip(vec1, vec2))

In [16]:
def softmax(score, all_scores):
    exp_score = [np.exp(score) for score in all_scores]
    sum_exp_scores = sum(exp_score)
    return np.exp(score)/sum_exp_scores

In [17]:
def compute_gradient(probability, target, word_embedding, context_word_embedding):
    gradient = (probability - target) * context_word_embedding
    return gradient

In [18]:
def get_context_words(sentence, source_word_index, Window_size):
    context_words = []
    start = max(0, source_word_index - Window_size)
    end = min(len(sentence), source_word_index + Window_size + 1)
    for j in range(start, end):
        if j != source_word_index:
            context_words.append(sentence[j])
    return context_words

In [19]:
V = len(preprocess_data(corpus)) #length of vocabulary
embedding_size = 5
window_size = 2
epochs = 10
learning_rate = 0.001

In [20]:
# ramdonly initialize W and W_ between -0.5 and 0.5
W = np.random.uniform(-0.5,0.5, (V, embedding_size))
W_ = np.random.uniform(-0.5, 0.5, (V, embedding_size))

In [21]:
word_list = preprocess_data(corpus)
for epoch in range(epochs):
    for i in range(len(word_list)):
        source_word = word_list[i]
        context_words = get_context_words(word_list, i, window_size)
        source_word_embedding = W[word_list.index(source_word)]
        all_scores = []
        for word in word_list:
            context_word_embedding = W_[word_list.index(word)]
            score = dot_product(source_word_embedding, context_word_embedding)
            all_scores.append(score)
        for context_word in context_words:
            context_word_embedding = W_[word_list.index(context_word)]
            score = dot_product(source_word_embedding, context_word_embedding)
            probability = softmax(score, all_scores)
            target = 1 if context_word in context_words else 0
            gradient_W = compute_gradient(probability, target, source_word_embedding, context_word_embedding)
            gradient_W_ = compute_gradient(probability, target, context_word_embedding, source_word_embedding)
            W[word_list.index(source_word)] = W[word_list.index(source_word)] - learning_rate * gradient_W
            W_[word_list.index(context_word)] = W_[word_list.index(context_word)] - learning_rate * gradient_W_

        
W, W_


(array([[ 0.35675099, -0.19165689, -0.04307702,  0.26098336,  0.35200362],
        [-0.0842466 , -0.15733899,  0.4232296 ,  0.42572331,  0.06262268],
        [ 0.22329653, -0.04691784, -0.00349711,  0.49174253, -0.18585359],
        [ 0.20109137, -0.00946429,  0.37030383, -0.4542065 , -0.13572945],
        [ 0.35492043,  0.08679703, -0.21207638,  0.25893624,  0.44234139],
        [-0.25336049,  0.01079983,  0.48926434,  0.04995064,  0.35617572],
        [-0.24701855, -0.46682775,  0.02733163,  0.46211667, -0.1452077 ],
        [ 0.01328174, -0.27103376,  0.47498057,  0.16986278, -0.18414042],
        [-0.2775434 ,  0.17604357,  0.28247009,  0.46732447, -0.34650421]]),
 array([[-0.15888281, -0.47322279,  0.39039164,  0.09378718,  0.20708696],
        [ 0.50332802, -0.29785753, -0.17298933,  0.42819127, -0.27032192],
        [-0.49078463, -0.04745013,  0.14573869,  0.4200493 ,  0.18152726],
        [ 0.21978838,  0.13941535, -0.24852388, -0.09108882,  0.25747329],
        [-0.04627517, -

In [22]:
def test (word, W, W_, word_list, window_size):
    source_word_index = word_list.index(word)
    source_word_embedding = W[source_word_index]
    all_scores = []
    for context_word_index, context_word in enumerate(word_list):
        context_word_embedding = W_[context_word_index]
        score = np.dot(source_word_embedding, context_word_embedding)
        all_scores.append(score)
    exp_scores = np.exp(all_scores)
    sum_exp_scores = np.sum(exp_scores)
    probabilities = exp_scores/sum_exp_scores
    
    predicted_targets = {}
    for i,prob in enumerate(probabilities):
        target_word = word_list[i]
        if source_word_index - window_size <= i <= source_word_index + window_size and i != source_word_index:
            predicted_targets[target_word] = prob
    return predicted_targets

In [24]:
test('brown', W, W_, word_list, window_size)

{'a': 0.10350231719615158,
 'quick': 0.1536035133646259,
 'fox': 0.0992224988029641,
 'jumps': 0.10225346923393337}