### Implementing Word2Vec from Scratch
* Continuous Bag of Words(CBOW) : Take the neighboring words, take two words before and two words after, predict middle word through these context words.
* Skip Gram : Take middle word and present context word.

In [13]:
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ritikagupta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### 1. Loading the Dataset

In [14]:
fd = open('big.txt','r')
text_data = fd.read()
fd.close()

# From the whole corpus take only the 1st 100,000 words
text_data = text_data[:100000]

In [15]:
len(text_data) # No of characters

100000

#### 2. Data Preprocessing | Removing Special Characters

In [16]:
special_chars = '!@#$%^&*()_+=-|\()[]{}:;<>?\n'

for char in special_chars:
    text_data = text_data.replace(char, ' ')

text_data = text_data.replace("'", ' ')
text_data = text_data.replace("  ", ' ')
text_data = text_data.replace("  ", ' ')

In [17]:
text_data[:100]

'The Project Gutenberg EBook of The Adventures of Sherlock Holmes by Sir Arthur Conan Doyle 15 in our'

#### 3. Creating Word Index and Sentences

In [18]:
# Splitting words in a sentence
words = word_tokenize(text_data)
# Removing null
words = [word for word in words if len(word)!=0]
# Every unique word in the corpus is assigned with a number
word_index = {word:i for i, word in enumerate(set(words))}
# Get word from number assigned to it
index_word = {word_index[word] : word for word in word_index}

sents = [word_tokenize(sent) for sent in sent_tokenize(text_data)]

In [19]:
' '.join(sents[0])
# The Project Gutenberg EBook of 
# Gutenberg : Middle word
# Project Gutenberg EBook of The : Next pair

'The Project Gutenberg EBook of The Adventures of Sherlock Holmes by Sir Arthur Conan Doyle 15 in our series by Sir Arthur Conan Doyle Copyright laws are changing all over the world .'

#### 4. Creating Training Data

In [20]:
features = []
labels   = []
window_size = 2 # Gutenberg : Output || The, Project,Ebook, of : Input
for sent in sents:
    # Total no of pairs
    # 'The', 'Project', 'Gutenberg', 'EBook', 'of', 'The', 'Adventures', 'of', 'Sherlock' : Total Pairs: 7 for window size =1 , No of words : 9
    # for window_size = 2, Pairs = 5, n = 9 || 9 -(2*2) = 5
    # for window_size = 3, Pairs = 3, n (no of words in sentence) = 9 || 9-(3*2) = 3 {Length of sentence - window_size*2}
    for i in range(len(sent) - window_size*2):
        # Input pair
        features.append(sent[i:i+window_size] + sent[i+window_size+1 : i+window_size*2+1])
        # Output
        labels.append(sent[i+window_size])

In [21]:
for i in range(10):
    print(features[i],labels[i])
# Convert the features and labels into one-hot encoded

['The', 'Project', 'EBook', 'of'] Gutenberg
['Project', 'Gutenberg', 'of', 'The'] EBook
['Gutenberg', 'EBook', 'The', 'Adventures'] of
['EBook', 'of', 'Adventures', 'of'] The
['of', 'The', 'of', 'Sherlock'] Adventures
['The', 'Adventures', 'Sherlock', 'Holmes'] of
['Adventures', 'of', 'Holmes', 'by'] Sherlock
['of', 'Sherlock', 'by', 'Sir'] Holmes
['Sherlock', 'Holmes', 'Sir', 'Arthur'] by
['Holmes', 'by', 'Arthur', 'Conan'] Sir


In [22]:
X_train = []
y_train = []

for feature in features:
    # Creating a vector of all values as zero
    enc = np.zeros(len(word_index))
    for word in feature:
        # Find word index of context word and replace 0 with 1 at those index
        enc[word_index[word]] = 1
    X_train.append(enc)
    
for label in labels:
    enc = np.zeros(len(word_index))
    enc[word_index[label]] = 1
    y_train.append(enc)
    
X_train = np.array(X_train)
y_train = np.array(y_train)

In [23]:
X_train.shape # 3457 : no of words  # 17652 : no of pairs

(17652, 3457)

In [24]:
y_train.shape

(17652, 3457)

In [25]:
X_train[0].sum()

4.0

In [26]:
y_train[0].sum()

1.0

#### 5. Building neural Network

In [27]:
# No of neurons in input, hidden, output layer : 3425 , 100, 3425

from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(100,input_dim = len(word_index),activation = 'relu'))
model.add(Dense(len(word_index),activation= 'softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train,y_train,epochs=50,batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x290a7e390>

#### Extract word embedding

In [28]:
# For every word we are having a vector of size 100
word_embedding = model.get_weights()[0]

#### Finding similar words

In [29]:
word_embedding.shape

(3457, 100)

In [30]:
def most_similar(word):    
    target_word = word_embedding[word_index[word]]
    # Save distance of all words from the target word
    distances = np.dot(word_embedding, target_word)
    # Returns the index of 10 most similar words
    most_similar = np.argsort(distances)[::-1][:5]

    print("Most similar word of  ",word,"is :",[index_word[i] for i in most_similar])

In [31]:
most_similar('EBook')

Most similar word of   EBook is : ['EBook', 'breathing', 'wrist', 'Mortimer', 'fish']
