## Data Preparation

In [34]:
# Toy single-sentence corpus used throughout this notebook.
text = "natural language processing and machine learning is fun and exciting"

# Lower-case the text and split on whitespace; the outer list makes the
# corpus a list of sentences, each sentence being a list of tokens.
corpus = [text.lower().split()]

## Hyperparameters

In [35]:
# Hyperparameters for the word2vec model
settings = {
    # context window: number of words taken on each side of the target word
    'window_size': 2,
    # dimensions of the word embeddings (size of the hidden layer)
    'n': 10,
    # number of training epochs
    'epochs': 50,
    # step size applied to the loss gradient when updating the weights
    'learning_rate': 0.01,
}

## Generate Training Data

In [66]:
import numpy as np

class word2vec(object):
    """Minimal skip-gram word2vec model implemented with NumPy.

    Typical use: build the model, call ``generate_training_data`` to create
    the vocabulary and the one-hot training pairs, then call ``train``.
    After training, ``word_vec`` and ``vec_sim`` read the learned embedding
    matrix ``self.w1``.
    """

    def __init__(self, config=None):
        """Store the hyperparameters on the instance.

        Args:
            config: optional dict with the keys ``'n'``, ``'learning_rate'``,
                ``'epochs'`` and ``'window_size'``. When omitted, the
                module-level ``settings`` dict is used, so the existing
                zero-argument call ``word2vec()`` keeps working.
        """
        cfg = settings if config is None else config
        self.n = cfg['n']
        self.lr = cfg['learning_rate']
        self.epochs = cfg['epochs']
        self.window = cfg['window_size']

    def generate_training_data(self, settings, corpus):
        """Build the vocabulary and the (target, context) training pairs.

        Sets ``self.words_list``, ``self.v_count``, ``self.word_index`` and
        ``self.index_word`` as a side effect.

        Args:
            settings: accepted for backward compatibility; it is not read
                here (the window size comes from ``self.window``).
            corpus: iterable of sentences, each a sequence of word tokens.

        Returns:
            An object-dtype array of shape ``(num_words, 2)``. Column 0 holds
            the one-hot vector of the target word, column 1 a list of one-hot
            vectors for the context words inside the window.
        """
        # vocabulary in first-seen order (dict keys preserve insertion order)
        self.words_list = list(dict.fromkeys(
            word for sentence in corpus for word in sentence))

        # length of vocabulary (number of unique words in corpus)
        self.v_count = len(self.words_list)

        # word:index and index:word lookup tables
        self.word_index = {word: idx for idx, word in enumerate(self.words_list)}
        self.index_word = {idx: word for idx, word in enumerate(self.words_list)}

        samples = []
        for sentence in corpus:
            sent_len = len(sentence)
            for i, word in enumerate(sentence):
                w_target = self.word2onehot(word)

                # Context positions are clipped to the sentence bounds
                # (0 <= j <= sent_len - 1) and exclude the target itself.
                first = max(0, i - self.window)
                last = min(sent_len, i + self.window + 1)
                w_context = [self.word2onehot(sentence[j])
                             for j in range(first, last) if j != i]

                samples.append((w_target, w_context))

        # The pairs are ragged (context length varies near sentence edges),
        # which np.array() rejects on recent NumPy versions; fill an object
        # array explicitly instead.
        training_data = np.empty((len(samples), 2), dtype=object)
        for row, (w_target, w_context) in enumerate(samples):
            training_data[row, 0] = w_target
            training_data[row, 1] = w_context

        return training_data

    def word2onehot(self, word):
        """Return the one-hot integer vector of length ``self.v_count`` for *word*.

        Raises:
            KeyError: if *word* is not in ``self.word_index``.
        """
        word_vec = np.zeros(self.v_count, dtype=int)
        word_vec[self.word_index[word]] = 1
        return word_vec

    def train(self, training_data):
        """Randomly initialise ``w1``/``w2`` and train for ``self.epochs`` epochs.

        Weights are updated after every sample. The summed loss of each epoch
        is stored in ``self.loss`` and printed.

        Args:
            training_data: iterable of ``(target_onehot, context_onehots)``
                pairs as returned by ``generate_training_data``.
        """
        self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
        self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))
        self.loss = 0

        for i in range(self.epochs):
            # initialize loss to 0
            self.loss = 0

            for w_t, w_c in training_data:
                # A target without context words (single-word sentence)
                # carries no training signal; skip it.
                if len(w_c) == 0:
                    continue

                y_pred, h, u = self.forward_pass(w_t)

                # Prediction error summed over all context words
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

                self.backprop(EI, h, w_t)

                # Position of the 1 in each context one-hot vector
                context_ids = [int(np.argmax(word)) for word in w_c]

                # log(sum(exp(u))) computed with the max subtracted first so
                # that large scores cannot overflow np.exp
                u_max = np.max(u)
                log_norm = u_max + np.log(np.sum(np.exp(u - u_max)))

                self.loss += -np.sum(u[context_ids]) + len(w_c) * log_norm

            print('Epoch:', i, "Loss:", self.loss)

    def forward_pass(self, vec):
        """Run one forward pass for the one-hot target vector *vec*.

        Returns:
            Tuple ``(y_c, h, u)``: softmax output, hidden layer
            (``vec . w1``) and raw output scores (``h . w2``).
        """
        h = np.dot(vec, self.w1)
        u = np.dot(h, self.w2)
        y_c = self.softmax(u)
        return y_c, h, u

    def backprop(self, e, h, x):
        """Apply one gradient-descent step to ``w1`` and ``w2``.

        Args:
            e: summed prediction error over the context words.
            h: hidden layer returned by ``forward_pass``.
            x: one-hot vector of the target word.
        """
        dl_dw2 = np.outer(h, e)
        # Uses w2 from before this step's update
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))

        # Update weights
        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)

    def softmax(self, x):
        """Return the softmax of *x*; the max is subtracted before ``exp``."""
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def word_vec(self, word):
        """Return the row of ``self.w1`` for *word*.

        ``self.w1`` is created by ``train``, so ``train`` must have been
        called first.

        Raises:
            KeyError: if *word* is not in ``self.word_index``.
        """
        return self.w1[self.word_index[word]]

    def vec_sim(self, word, top_n):
        """Print the *top_n* vocabulary words most similar to *word*.

        Similarity is the cosine between rows of ``self.w1``. The query word
        itself is part of the ranking.
        """
        v_w1 = self.word_vec(word)
        word_sim = dict()

        for i in range(self.v_count):
            # find similarity score for each word in vocab
            v_w2 = self.w1[i]
            theta_sum = np.dot(v_w1, v_w2)
            theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
            word_sim[self.index_word[i]] = theta_sum / theta_den

        words_sorted = sorted(word_sim.items(), key=lambda x: x[1], reverse=True)

        for other, sim in words_sorted[:top_n]:
            print(other, sim)

In [65]:
# Instantiate the model; the constructor reads its hyperparameters from the
# module-level `settings` dict.
w2v = word2vec()
# Build the vocabulary and the [target one-hot, context one-hots] pairs
# from the corpus.
training_data = w2v.generate_training_data(settings, corpus)
#vec = w2v.word_vec("machine")

In [63]:
training_data[0][0]

array([1, 0, 0, 0, 0, 0, 0, 0, 0])

In [64]:
training_data[0][1]

[array([0, 1, 0, 0, 0, 0, 0, 0, 0]), array([0, 0, 1, 0, 0, 0, 0, 0, 0])]