In [25]:
import numpy as np
import string
from nltk.corpus import stopwords


def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()


class word2vec(object):
    def __init__(self):
        self.N = 10
        self.X_train = []
        self.y_train = []
        self.window_size = 3
        self.alpha = 0.001
        self.words = []
        self.word_index = {}

    def initialize(self, V, data):
        self.V = V
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N))  # vector
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V))  # weight
        self.words = data
        for i in range(len(data)):
            self.word_index[data[i]] = i

    def feed_forward(self, X):
        self.h = np.dot(self.W.T, X).reshape(self.N, 1)
        self.u = np.dot(self.W1.T, self.h)
        # print(self.u)
        self.y = softmax(self.u)
        return self.y

    def backpropagate(self, x, t):
        e = self.y - np.asarray(t).reshape(self.V, 1)
        # e.shape is V x 1
        dLdW1 = np.dot(self.h, e.T)
        X = np.array(x).reshape(self.V, 1)
        dLdW = np.dot(X, np.dot(self.W1, e).T)
        self.W1 = self.W1 - self.alpha * dLdW1  # adjusted weights and vectors
        self.W = self.W - self.alpha * dLdW

    def train(self, epochs):
        for x in range(1, epochs):
            self.loss = 0
            for j in range(len(self.X_train)):
                self.feed_forward(self.X_train[j])
                self.backpropagate(self.X_train[j], self.y_train[j])
                C = 0
                for m in range(self.V):
                    if (self.y_train[j][m]):
                        self.loss += -1 * self.u[m][0]
                        C += 1
                self.loss += C * np.log(np.sum(np.exp(self.u)))
            print(x, "epoch loss = ", self.loss)
            self.alpha *= 1 / ((1 + self.alpha * x))

    def predict(self, word, number_of_predictions):
        l = list(word.split(' '))
        if len(l) < 1:
            if word in self.words:
                index = self.word_index[word]
                X = [0 for i in range(self.V)]
                X[index] = 1
                prediction = self.feed_forward(X)
                output = {}
                for i in range(self.V):
                    output[prediction[i][0]] = i

                top_context_words = []
                for k in sorted(output, reverse=True):
                    top_context_words.append(self.words[output[k]])
                    if len(top_context_words) >= number_of_predictions:
                        break

                return top_context_words
            else:
                print("Word not found in dictionary")
        else:
            result = []
            for x in l:
                if x in self.words:
                    index = self.word_index[x]
                    X = [0 for i in range(self.V)]
                    X[index] = 1
                    prediction = self.feed_forward(X)
                    output = {}
                    for i in range(self.V):
                        output[prediction[i][0]] = i

                    top_context_words = []
                    for k in sorted(output, reverse=True):
                        top_context_words.append(self.words[output[k]])
                        if len(top_context_words) >= number_of_predictions:
                            break
                    result.append(top_context_words)
                else:
                    print("{} not found in dictionary".format(x))
            return result



def preprocessing(corpus):
    stop_words = set(stopwords.words('english'))
    training_data = []
    sentences = corpus.split(".")
    for i in range(len(sentences)):
        sentences[i] = sentences[i].strip()
        sentence = sentences[i].split()
        x = [word.strip(string.punctuation) for word in sentence
             if (word not in stop_words and word.isalpha() == True)]
        x = [word.lower() for word in x]
        training_data.append(x)
    return training_data


def prepare_data_for_training(sentences, w2v):
    data = {}
    for sentence in sentences:
        for word in sentence:
            if word not in data:
                data[word] = 1
            else:
                data[word] += 1
    V = len(data)

    data = sorted(list(data.keys()))
    vocab = {}
    for i in range(len(data)):
        vocab[data[i]] = i
    print(vocab)

    # for i in range(len(words)):
    for sentence in sentences:
        for i in range(len(sentence)):
            center_word = [0 for x in range(V)] #window chala rhe
            center_word[vocab[sentence[i]]] = 1
            context = [0 for x in range(V)]

            for j in range(i - w2v.window_size, i + w2v.window_size):
                if i != j and j >= 0 and j < len(sentence):
                    context[vocab[sentence[j]]] += 1
            w2v.X_train.append(center_word)
            w2v.y_train.append(context)
    w2v.initialize(V, data)

    return w2v.X_train, w2v.y_train, vocab


file = open("HarryPotter1.txt", "r", encoding="utf8")
corpus = file.read()
epochs = 15  # accuracy bad because data large so less number of epochs iterated
# data cleaning also requires more work punctuation marks and gaps and also numbers



In [28]:
training_data = preprocessing(corpus)


w2v = word2vec()

prepare_data_for_training(training_data, w2v)

w2v.train(epochs)

print()
print("Predictions : ")
print(w2v.predict("ron", 3))


{'a': 0, 'able': 1, 'about': 2, 'across': 3, 'act': 4, 'acting': 5, 'admiring': 6, 'affect': 7, 'after': 8, 'afternoon': 9, 'age': 10, 'agree': 11, 'air': 12, 'albus': 13, 'all': 14, 'allowed': 15, 'almost': 16, 'also': 17, 'although': 18, 'always': 19, 'amount': 20, 'amuse': 21, 'and': 22, 'angrily': 23, 'angry': 24, 'another': 25, 'answer': 26, 'anxious': 27, 'anyone': 28, 'anything': 29, 'anywhere': 30, 'apart': 31, 'appeared': 32, 'approve': 33, 'arm': 34, 'armchair': 35, 'arms': 36, 'around': 37, 'arrived': 38, 'as': 39, 'ask': 40, 'asked': 41, 'asleep': 42, 'astonishing': 43, 'astounding': 44, 'astride': 45, 'at': 46, 'aunt': 47, 'away': 48, 'baby': 49, 'back': 50, 'backed': 51, 'bakery': 52, 'balls': 53, 'bear': 54, 'beard': 55, 'bed': 56, 'bedroom': 57, 'beefy': 58, 'behaving': 59, 'behind': 60, 'believe': 61, 'belt': 62, 'beneath': 63, 'bent': 64, 'beside': 65, 'best': 66, 'bet': 67, 'better': 68, 'big': 69, 'bike': 70, 'birds': 71, 'bit': 72, 'black': 73, 'blame': 74, 'blanke

1 epoch loss =  57393.70048310324
2 epoch loss =  57318.80950635782
3 epoch loss =  57246.31363650881
4 epoch loss =  57175.90475678455
5 epoch loss =  57107.29793169755
6 epoch loss =  57040.228266839375
7 epoch loss =  56974.44800981127
8 epoch loss =  56909.72390595598
9 epoch loss =  56845.83481943902
10 epoch loss =  56782.56962804117
11 epoch loss =  56719.72539877922
12 epoch loss =  56657.10585133976
13 epoch loss =  56594.520117415836
14 epoch loss =  56531.7818066184

Predictions : 
ron not found in dictionary
[]
