In [1]:
import numpy as np

In [24]:
sentences = ['this tutorial wordvec scratch chapter toxic comment classification', 'this chapter focuses toxic comment classification aditionally wordvec tutorial']


In [25]:
all_words = []
for each in sentences:
    all_words.extend(each.split(' '))
len(all_words), all_words

(17,
 ['this',
  'tutorial',
  'wordvec',
  'scratch',
  'chapter',
  'toxic',
  'comment',
  'classification',
  'this',
  'chapter',
  'focuses',
  'toxic',
  'comment',
  'classification',
  'aditionally',
  'wordvec',
  'tutorial'])

In [26]:
word_index = {}
word_index['UNK'] = 0
index = 1
for each in all_words:
    if each in word_index.keys():
        continue
    word_index[each] = index
    index += 1

vocab_words = list(word_index.keys())
vocab_len = len(vocab_words)
word_index, vocab_len

({'UNK': 0,
  'this': 1,
  'tutorial': 2,
  'wordvec': 3,
  'scratch': 4,
  'chapter': 5,
  'toxic': 6,
  'comment': 7,
  'classification': 8,
  'focuses': 9,
  'aditionally': 10},
 11)

In [27]:
window_size = 2

contexts = []
targets = []
for sentence in sentences:
    split_sent = sentence.split(' ')
    for index, word in enumerate(split_sent):
        
        center_word = [0 for v in range(vocab_len)]
        context_word = [0 for v in range(vocab_len)]
        center_word[word_index[word]] = 1
        for index2 in range(index - window_size, index + window_size):
            if index != index2 and index2 >= 0 and index2 <= len(split_sent) - 1:
                context_word[word_index[split_sent[index2]]] = 1
        targets.append(center_word)
        contexts.append(context_word)
        

In [28]:
np.array(targets).shape, np.array(contexts).shape


((17, 11), (17, 11))

In [29]:
def softmax(u):
    e_u = np.exp(u - np.max(u))
    return e_u / e_u.sum()

In [30]:
def initialize(vocab_len, vocab_words, embedding_dim):
    W = np.random.uniform(-0.8, 0.8, (vocab_len, embedding_dim))
    W1 = np.random.uniform(-0.8, 0.8, (embedding_dim, vocab_len))
    return W, W1
        

In [31]:
def feed_forward(data, W, W1, embedding_dim):
    h = np.dot(W.T, data).reshape(embedding_dim,1)
    u = np.dot(W1.T,h)
    y = softmax(u)
    return y, u, h

In [32]:
def backpropagation(xdata, ydata, y, vocab_len, alpha, h, W, W1):
    e = y - np.asarray(ydata).reshape(vocab_len, 1)
    dLdW1 = np.dot(h, e.T)
    x = np.array(xdata).reshape(vocab_len, 1)
    dLdW = np.dot(x, np.dot(W1, e).T)
    W1 = W1 - alpha * dLdW1
    W = W - alpha * dLdW
    return W, W1

In [33]:
def train(epochs, x_train, y_train, W, W1, embedding_dim, vocab_len, alpha):
    for epoch in range(1, epochs):
        loss = 0
        for index, data in enumerate(x_train):
            y, u, h = feed_forward(data, W, W1, embedding_dim)
            W, W1 = backpropagation(data, y_train[index], y, vocab_len, alpha, h, W, W1)
            
            c = 0
            for m in range(vocab_len):
                if y_train[index][m]:
                    loss += -1 * u[m][0]
                    c += 1
            loss += c * np.log(np.sum(np.exp(u)))
        print('epoch-> ', epoch, 'loss = ', loss)
        alpha *= 1/((1 + alpha * epoch))
    return W, W1
    
    

In [34]:
#targets, contexts
embedding_dim = 10
epochs = 1000
alpha = 0.001

W, W1 = initialize(vocab_len, vocab_words, embedding_dim)

W, W1 = train(epochs, contexts, targets, 
      W, W1, embedding_dim, vocab_len, alpha)


epoch->  1 loss =  46.69122747745587
epoch->  2 loss =  46.4781388025228
epoch->  3 loss =  46.267532453553976
epoch->  4 loss =  46.059560265282464
epoch->  5 loss =  45.85436472745877
epoch->  6 loss =  45.65207838162873
epoch->  7 loss =  45.452823321991495
epoch->  8 loss =  45.256710804336336
epoch->  9 loss =  45.06384096484762
epoch->  10 loss =  44.874302648401965
epoch->  11 loss =  44.688173343926465
epoch->  12 loss =  44.50551922250663
epoch->  13 loss =  44.32639527227079
epoch->  14 loss =  44.15084552267802
epoch->  15 loss =  43.978903349717456
epoch->  16 loss =  43.81059185270226
epoch->  17 loss =  43.64592429280856
epoch->  18 loss =  43.484904583256096
epoch->  19 loss =  43.32752782103146
epoch->  20 loss =  43.173780850288786
epoch->  21 loss =  43.02364284799213
epoch->  22 loss =  42.8770859229534
epoch->  23 loss =  42.734075720133134
epoch->  24 loss =  42.59457202287139
epoch->  25 loss =  42.45852934657106
epoch->  26 loss =  42.325897518234164
epoch->  27 

epoch->  704 loss =  35.838074976369704
epoch->  705 loss =  35.837575370034045
epoch->  706 loss =  35.83707718789459
epoch->  707 loss =  35.83658042387807
epoch->  708 loss =  35.83608507194566
epoch->  709 loss =  35.83559112609274
epoch->  710 loss =  35.83509858034866
epoch->  711 loss =  35.83460742877641
epoch->  712 loss =  35.834117665472554
epoch->  713 loss =  35.83362928456683
epoch->  714 loss =  35.83314228022202
epoch->  715 loss =  35.83265664663363
epoch->  716 loss =  35.83217237802979
epoch->  717 loss =  35.831689468670874
epoch->  718 loss =  35.83120791284942
epoch->  719 loss =  35.830727704889775
epoch->  720 loss =  35.83024883914798
epoch->  721 loss =  35.829771310011495
epoch->  722 loss =  35.82929511189893
epoch->  723 loss =  35.82882023925997
epoch->  724 loss =  35.828346686575046
epoch->  725 loss =  35.82787444835516
epoch->  726 loss =  35.82740351914167
epoch->  727 loss =  35.82693389350606
epoch->  728 loss =  35.82646556604976
epoch->  729 loss 

In [35]:
def predict(word, number_of_prediction, vocab_words, word_index, vocab_len,
           W, W1, embedding_dim):
    if word in vocab_words:
        index = word_index[word]
        x = [0 for i in range(vocab_len)]
        x[index] = 1
        prediction, _, _ = feed_forward(x, W, W1, embedding_dim)
        output = {}
        for i in range(vocab_len):
            output[prediction[i][0]] = i
        top_context_words = []
        for k in sorted(output, reverse = True):
            top_context_words.append(vocab_words[output[k]])
            if (len(top_context_words) >= number_of_prediction):
                break
        return top_context_words   
    else:
        print('word not found in dictionary so')
        index = word_index['UNK']
        x = [0 for i in range(vocab_len)]
        x[index] = 1
        prediction, _, _ = feed_forward(x, W, W1, embedding_dim)
        output = {}
        for i in range(vocab_len):
            output[prediction[i][0]] = i
        top_context_words = []
        for k in sorted(output, reverse = True):
            top_context_words.append(vocab_words[output[k]])
            if (len(top_context_words) >= number_of_prediction):
                break
        return top_context_words
        

In [36]:
preds = predict('tutorial', 3, vocab_words, word_index, vocab_len, W, W1, embedding_dim)
preds

['wordvec', 'scratch', 'UNK']