## Skipgram Word2Vec Model

In [24]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

### Data

In [25]:
text_data="There was once a hare who was friends with a tortoise. One day, he challenged the tortoise to a race. Seeing how slow the tortoise was going, the hare thought he’d win this easily. So, he took a nap while the tortoise kept on going. When the hare woke, he saw that the tortoise was already at the finish line. Much to his chagrin, the tortoise won the race while he was busy sleeping"

In [26]:
def remove_punctuations(text,list_of_punctuations=[",",".","'","!",";",":","?"]):
  for i in list_of_punctuations:
    text=text.replace(i,"")
  return text

def preprocess_text(text):
  text=re.findall("[A-Za-z]+",text)
  text=[a for a in text if a not in ['a','an','the','is','are','to','was']]
  text=[b for b in text if len(b)>1]
  text=[i.lower() for i in text]
  text=" ".join(text)
  return text

In [27]:
text_data=remove_punctuations(text_data)
text_data=preprocess_text(text_data)

In [28]:
print(text_data)

there once hare who friends with tortoise one day he challenged tortoise race seeing how slow tortoise going hare thought he win this easily so he took nap while tortoise kept on going when hare woke he saw that tortoise already at finish line much his chagrin tortoise won race while he busy sleeping


In [29]:
def generate_data_dictionary(text):
  corpus=text.split(" ")
  word_to_index={}
  index_to_word={}
  ind=0
  for c in corpus:
    if c not in word_to_index.keys():
      word_to_index[c]=ind
      index_to_word[ind]=c
      ind+=1
  vocab_size=len(word_to_index)
  return corpus, word_to_index,index_to_word,vocab_size

In [30]:
corpus, word_to_index,index_to_word,vocab_size=generate_data_dictionary(text_data)

In [31]:
def create_target_context_indices_pairs(corpus,context_window=2):
  target_context_map={}
  for i,word in enumerate(corpus):
    context_indices_list=[]
    if i==0:
      context_indices=list(range(1,context_window+1))
      context_indices_list.extend(context_indices)
      target_context_map[i]=context_indices_list
    elif i==len(corpus)-1:
      context_indices=list(range(len(corpus)-2,len(corpus)-2-context_window,-1))
      context_indices_list.extend(context_indices)
      target_context_map[i]=context_indices_list
    else:
      right_context_indices=list(range(i+1,i+context_window+1))
      right_context_indices=[i for i in right_context_indices if i<=len(corpus)-1]
      left_context_indices=list(range(i-1,i-1-context_window,-1))
      left_context_indices=[i for i in left_context_indices if i>=0]
      left_context_indices=[i for i in left_context_indices if i<=len(corpus)-1]
      context_indices_list.extend(right_context_indices+left_context_indices)
      target_context_map[i]=context_indices_list
  return target_context_map

In [32]:
def crete_target_context_pair(target_context_map,corpus):
  pair_list=[]
  for i in target_context_map.keys():
    for j in target_context_map[i]:
      pair_list.append((corpus[i],corpus[j]))
  return pair_list


In [33]:
map_=create_target_context_indices_pairs(corpus,context_window=2)
target_context_pair=crete_target_context_pair(map_,corpus)

In [34]:
target_context_pair[:10]

[('there', 'once'),
 ('there', 'hare'),
 ('once', 'hare'),
 ('once', 'who'),
 ('once', 'there'),
 ('hare', 'who'),
 ('hare', 'friends'),
 ('hare', 'once'),
 ('hare', 'there'),
 ('who', 'friends')]

In [35]:
def create_tarining_data(target_context_pair,word_to_index):
  x_feature=[]
  y_feature=[]
  for pair in target_context_pair:
    target=pair[0]
    context=pair[1]
    target_vec=[0]*len(word_to_index)
    context_vec=[0]*len(word_to_index)
    target_vec[word_to_index[target]]=1
    context_vec[word_to_index[context]]=1
    target_vec=torch.tensor(target_vec,dtype=torch.float32)
    context_vec=torch.tensor(context_vec,dtype=torch.float32)
    x_feature.append(target_vec)
    y_feature.append(context_vec)
  return x_feature,y_feature

In [36]:
x,y=create_tarining_data(target_context_pair,word_to_index)

In [37]:
class skipgram_dataset(Dataset):

  def __init__(self,x_feat,y_feat):
    super().__init__()
    self.x_feat=x_feat
    self.y_feat=y_feat

  def __getitem__(self,idx):
    x_value=self.x_feat[idx]
    y_value=self.y_feat[idx]
    return x_value,y_value

  def __len__(self):
    return len(self.y_feat)

In [38]:
training_data=skipgram_dataset(x,y)

In [39]:
train_data_loader=DataLoader(training_data,batch_size=10,shuffle=False)

In [40]:
data_inputs, data_labels = next(iter(train_data_loader))
print("Data inputs", data_inputs.squeeze())
print("Data labels", data_labels.squeeze())

Data inputs tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,

In [41]:
class skipgram_model(nn.Module):

  def __init__(self,vocab_size,embedding_dim):
    super().__init__()
    self.linear_layer1=nn.Linear(vocab_size,embedding_dim)
    self.linear_layer2=nn.Linear(embedding_dim,vocab_size)
    self.softmax_layer=nn.Softmax(dim=1)

  def forward(self,x):
    return self.softmax_layer(self.linear_layer2(self.linear_layer1(x)))

In [42]:
w2v_skipgram_model=skipgram_model(vocab_size,100)

In [43]:
loss=nn.CrossEntropyLoss()
optimizer=torch.optim.SGD(w2v_skipgram_model.parameters(),lr=0.01)

In [44]:
device= "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [45]:
w2v_skipgram_model.to(device)

skipgram_model(
  (linear_layer1): Linear(in_features=40, out_features=100, bias=True)
  (linear_layer2): Linear(in_features=100, out_features=40, bias=True)
  (softmax_layer): Softmax(dim=1)
)

In [46]:
def training_loop(model,loss,optimizer,data_loader,epochs):

  model.train()

  for epoch in range(1,epochs+1):
    loss_item=[]
    for x,y in data_loader:
      x=x.to(device)
      y=y.squeeze()
      y=y.to(device)

      pred=model(x)
      pred=pred.squeeze()
      loss_=loss(pred,y)
      loss_item.append(loss_.item())
      optimizer.zero_grad()
      loss_.backward()
      optimizer.step()

    print(f"epoch-{epoch} | loss-{np.sum(loss_item)}")

In [47]:
training_loop(w2v_skipgram_model,loss,optimizer,train_data_loader,100)

epoch-1 | loss-77.46647238731384
epoch-2 | loss-77.46637344360352
epoch-3 | loss-77.4662721157074
epoch-4 | loss-77.46616983413696
epoch-5 | loss-77.46606779098511
epoch-6 | loss-77.46596813201904
epoch-7 | loss-77.46586608886719
epoch-8 | loss-77.46576499938965
epoch-9 | loss-77.46566343307495
epoch-10 | loss-77.46556282043457
epoch-11 | loss-77.46545958518982
epoch-12 | loss-77.4653570652008
epoch-13 | loss-77.46525454521179
epoch-14 | loss-77.46515226364136
epoch-15 | loss-77.46504998207092
epoch-16 | loss-77.46494913101196
epoch-17 | loss-77.46484565734863
epoch-18 | loss-77.46474194526672
epoch-19 | loss-77.46463918685913
epoch-20 | loss-77.4645369052887
epoch-21 | loss-77.46443152427673
epoch-22 | loss-77.4643292427063
epoch-23 | loss-77.46422576904297
epoch-24 | loss-77.4641215801239
epoch-25 | loss-77.46401810646057
epoch-26 | loss-77.46391415596008
epoch-27 | loss-77.46380949020386
epoch-28 | loss-77.46370482444763
epoch-29 | loss-77.46360063552856
epoch-30 | loss-77.463494539

In [48]:
print(w2v_skipgram_model)

skipgram_model(
  (linear_layer1): Linear(in_features=40, out_features=100, bias=True)
  (linear_layer2): Linear(in_features=100, out_features=40, bias=True)
  (softmax_layer): Softmax(dim=1)
)


In [50]:
print(index_to_word[6], "embeddings","-->",torch.tensor(list(w2v_skipgram_model.parameters())[0]).t()[6])

  print(index_to_word[6], "embeddings","-->",torch.tensor(list(w2v_skipgram_model.parameters())[0]).t()[6])


tortoise embeddings --> tensor([-0.1158,  0.1237,  0.1015,  0.0521, -0.1310,  0.0940,  0.1091,  0.0940,
        -0.0267,  0.1248, -0.0875, -0.0898, -0.1469,  0.0975,  0.0377,  0.0764,
         0.0523,  0.0957, -0.1402,  0.1049,  0.1524, -0.0167,  0.1404, -0.0204,
        -0.1102,  0.0039,  0.0204, -0.0752,  0.0081, -0.0895, -0.0216, -0.0283,
         0.0907, -0.0492, -0.1136,  0.0066, -0.0815, -0.1505,  0.0039,  0.0951,
        -0.0345,  0.0713, -0.1242, -0.1322, -0.0165, -0.0737,  0.0794,  0.1476,
         0.1064, -0.0431,  0.1445,  0.1408, -0.1415,  0.1344, -0.0036, -0.1023,
         0.0272,  0.0918,  0.0821, -0.1136, -0.0437, -0.1256,  0.1211,  0.1200,
         0.0104,  0.1026,  0.1368,  0.0076, -0.1439, -0.0457,  0.0578, -0.1115,
        -0.0345, -0.1441,  0.1392,  0.0637,  0.0326, -0.0171, -0.1329,  0.1563,
         0.0529,  0.1378,  0.1007,  0.1172,  0.1577,  0.1429, -0.1398, -0.1053,
        -0.1538,  0.0481, -0.0720,  0.0282, -0.0041,  0.0258,  0.0241, -0.0384,
         0.1440,