In [1]:
 
import numpy as np
import string
from nltk.corpus import stopwords 
  
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()
  
class word2vec(object):
    def __init__(self):
        self.N = 10
        self.X_train = []
        self.y_train = []
        self.window_size = 2
        self.alpha = 0.001
        self.words = []
        self.word_index = {}
  
    def initialize(self,V,data):
        self.V = V
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N))
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V))
          
        self.words = data
        for i in range(len(data)):
            self.word_index[data[i]] = i
  
      
    def feed_forward(self,X):
        self.h = np.dot(self.W.T,X).reshape(self.N,1)
        self.u = np.dot(self.W1.T,self.h)
        #print(self.u)
        self.y = softmax(self.u)  
        return self.y
          
    def backpropagate(self,x,t):
        e = self.y - np.asarray(t).reshape(self.V,1)
        # e.shape is V x 1
        dLdW1 = np.dot(self.h,e.T)
        X = np.array(x).reshape(self.V,1)
        dLdW = np.dot(X, np.dot(self.W1,e).T)
        self.W1 = self.W1 - self.alpha*dLdW1
        self.W = self.W - self.alpha*dLdW
          
    def train(self,epochs):
        for x in range(1,epochs):        
            self.loss = 0
            for j in range(len(self.X_train)):
                self.feed_forward(self.X_train[j])
                self.backpropagate(self.X_train[j],self.y_train[j])
                C = 0
                for m in range(self.V):
                    if(self.y_train[j][m]):
                        self.loss += -1*self.u[m][0]
                        C += 1
                self.loss += C*np.log(np.sum(np.exp(self.u)))
            print("epoch ",x, " loss = ",self.loss)
            self.alpha *= 1/( (1+self.alpha*x) )
             
    def predict(self,word,number_of_predictions):
        if word in self.words:
            index = self.word_index[word]
            X = [0 for i in range(self.V)]
            X[index] = 1
            prediction = self.feed_forward(X)
            output = {}
            for i in range(self.V):
                output[prediction[i][0]] = i
              
            top_context_words = []
            for k in sorted(output,reverse=True):
                top_context_words.append(self.words[output[k]])
                if(len(top_context_words)>=number_of_predictions):
                    break
      
            return top_context_words
        else:
            print("Word not found in dictionary")

In [2]:

def preprocessing(corpus):
    stop_words = set(stopwords.words('english'))    
    training_data = []
    sentences = corpus.split(".")
    for i in range(len(sentences)):
        sentences[i] = sentences[i].strip()
        sentence = sentences[i].split()
        x = [word.strip(string.punctuation) for word in sentence
                                     if word not in stop_words]
        x = [word.lower() for word in x]
        training_data.append(x)
    return training_data
      
  
def prepare_data_for_training(sentences,w2v):
    data = {}
    for sentence in sentences:
        for word in sentence:
            if word not in data:
                data[word] = 1
            else:
                data[word] += 1
    V = len(data)
    data = sorted(list(data.keys()))
    vocab = {}
    for i in range(len(data)):
        vocab[data[i]] = i
      
    #for i in range(len(words)):
    for sentence in sentences:
        for i in range(len(sentence)):
            center_word = [0 for x in range(V)]
            center_word[vocab[sentence[i]]] = 1
            context = [0 for x in range(V)]
             
            for j in range(i-w2v.window_size,i+w2v.window_size):
                if i!=j and j>=0 and j<len(sentence):
                    context[vocab[sentence[j]]] += 1
            w2v.X_train.append(center_word)
            w2v.y_train.append(context)
    w2v.initialize(V,data)
  
    return w2v.X_train,w2v.y_train

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/priyansh/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
 
corpus = ""
corpus += "The earth revolves around the sun. The moon revolves around the earth"
epochs = 1000
 
training_data = preprocessing(corpus)
w2v = word2vec()
 
prepare_data_for_training(training_data,w2v)
w2v.train(epochs) 
 
print(w2v.predict("around",3)) 

epoch  1  loss =  43.46880559729603
epoch  2  loss =  43.37035007829897
epoch  3  loss =  43.272789130727304
epoch  4  loss =  43.176207340223414
epoch  5  loss =  43.080685262116745
epoch  6  loss =  42.986299026117315
epoch  7  loss =  42.893119992651066
epoch  8  loss =  42.80121446431154
epoch  9  loss =  42.71064345467749
epoch  10  loss =  42.621462515512256
epoch  11  loss =  42.53372162216374
epoch  12  loss =  42.44746511585658
epoch  13  loss =  42.36273170054809
epoch  14  loss =  42.27955449112943
epoch  15  loss =  42.197961109012766
epoch  16  loss =  42.11797382056328
epoch  17  loss =  42.03960971341697
epoch  18  loss =  41.962880905466065
epoch  19  loss =  41.887794781184304
epoch  20  loss =  41.81435424999287
epoch  21  loss =  41.74255802151387
epoch  22  loss =  41.67240089280628
epoch  23  loss =  41.60387404300578
epoch  24  loss =  41.536965331178706
epoch  25  loss =  41.47165959362768
epoch  26  loss =  41.40793893733989
epoch  27  loss =  41.34578302672807


In [13]:
 
corpus = ""
corpus += "This course is very well structured and easy to learn. Anyone with zero experience of data science, python or ML can learn from this. This course makes things so easy that anybody can learn on their own. It's helping me a lot. Thanks for creating such a great course.-  Ayushi Jain | Placed at Microsoft Now's your chance to unlock high-earning job opportunities as a Data Scientist! Join our Complete Machine Learning & Data Science Program and get a 360-degree learning experience mentored by industry experts."

epochs = 1000
 
training_data = preprocessing(corpus)
w2v = word2vec()
 
prepare_data_for_training(training_data,w2v)
w2v.train(epochs) 
 
print(w2v.predict("course",3)) 

epoch  1  loss =  594.497906631536
epoch  2  loss =  593.7737019081763
epoch  3  loss =  593.0527943426798
epoch  4  loss =  592.3358558339218
epoch  5  loss =  591.6235404068383
epoch  6  loss =  590.9164806657723
epoch  7  loss =  590.2152845186534
epoch  8  loss =  589.5205322126502
epoch  9  loss =  588.8327737144413
epoch  10  loss =  588.1525264601812
epoch  11  loss =  587.4802734918848
epoch  12  loss =  586.8164619885366
epoch  13  loss =  586.1615021922122
epoch  14  loss =  585.5157667218702
epoch  15  loss =  584.8795902606802
epoch  16  loss =  584.253269596794
epoch  17  loss =  583.6370639925163
epoch  18  loss =  583.0311958529527
epoch  19  loss =  582.4358516623385
epoch  20  loss =  581.8511831545113
epoch  21  loss =  581.2773086831068
epoch  22  loss =  580.7143147571289
epoch  23  loss =  580.1622577083313
epoch  24  loss =  579.6211654583262
epoch  25  loss =  579.091039355243
epoch  26  loss =  578.5718560521747
epoch  27  loss =  578.0635694022084
epoch  28  lo