# GloVE


In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [2]:
np.__version__, torch.__version__

('1.26.3', '2.1.2+cpu')

In [3]:
import matplotlib
matplotlib.__version__

'3.8.2'

## 1. Load data

In [4]:
with open ("./data.txt", "r") as doc:
    corpus = doc.read()

In [5]:
corpus = [corpus.split(" ") for sent in corpus]
corpus

[['Harry',
  'Potter',
  'is',
  'a',
  'popular',
  'fantasy',
  'book',
  'series',
  'written',
  'by',
  'J.K.',
  'Rowling.',
  'The',
  'series',
  'consists',
  'of',
  'seven',
  'books,',
  'following',
  'the',
  'life',
  'and',
  'adventures',
  'of',
  'a',
  'young',
  'wizard',
  'named',
  'Harry',
  'Potter.',
  'The',
  'story',
  'begins',
  'with',
  'Harry',
  'discovering',
  'on',
  'his',
  'eleventh',
  'birthday',
  'that',
  'he',
  'is',
  'a',
  'wizard',
  'and',
  'has',
  'been',
  'accepted',
  'to',
  'Hogwarts',
  'School',
  'of',
  'Witchcraft',
  'and',
  'Wizardry.',
  'At',
  'Hogwarts,',
  'Harry',
  'makes',
  'friends',
  'such',
  'as',
  'Hermione',
  'Granger',
  'and',
  'Ron',
  'Weasley,',
  'and',
  'together',
  'they',
  'uncover',
  'mysteries',
  'and',
  'face',
  'various',
  'magical',
  'challenges.\n\nThe',
  'overarching',
  'plot',
  'revolves',
  'around',
  "Harry's",
  'struggle',
  'against',
  'the',
  'dark',
  'wizard'

In [6]:
#get word sequences and unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))
vocab

['Hogwarts,',
 'Granger',
 'Wizardry.',
 'evil',
 'such',
 'Hermione',
 'Stone,"',
 'confronts',
 'Voldemort',
 'significance',
 'book',
 'following',
 'people.',
 'as',
 'Potter.',
 'connection',
 'Weasley,',
 'revolves',
 'faces',
 'themes',
 'dark',
 'accepted',
 'they',
 'prophecy',
 'birthday',
 'courage,',
 'are',
 'been',
 'seven',
 "Philosopher's",
 'overarching',
 'characters,',
 'Hallows,"',
 'Stone,',
 'all',
 'its',
 'seeks',
 'of',
 'histories',
 'book,',
 'learns',
 'The',
 'friends',
 'conquer',
 'non-magical',
 'series,',
 'world,',
 'revelations.',
 'popular',
 'Lord',
 'begins',
 'the',
 'to',
 'subjugate',
 'School',
 'battle',
 'wizard',
 'life',
 'between',
 'Hallows.\n\nThe',
 'It',
 'together',
 'brings',
 'prevalent.',
 'their',
 'plot',
 'film',
 'As',
 'also',
 'struggle',
 'a',
 'ages.',
 'final',
 'complexities',
 'uncover',
 'first',
 'consists',
 'magical',
 'rich',
 'matures,',
 'explores',
 'J.K.',
 'climactic',
 'families,',
 'fantasy',
 'foretells',
 '

In [7]:
#numericalization
word2index = {w: i for i, w in enumerate(vocab)}
print(word2index)

{'Hogwarts,': 0, 'Granger': 1, 'Wizardry.': 2, 'evil': 3, 'such': 4, 'Hermione': 5, 'Stone,"': 6, 'confronts': 7, 'Voldemort': 8, 'significance': 9, 'book': 10, 'following': 11, 'people.': 12, 'as': 13, 'Potter.': 14, 'connection': 15, 'Weasley,': 16, 'revolves': 17, 'faces': 18, 'themes': 19, 'dark': 20, 'accepted': 21, 'they': 22, 'prophecy': 23, 'birthday': 24, 'courage,': 25, 'are': 26, 'been': 27, 'seven': 28, "Philosopher's": 29, 'overarching': 30, 'characters,': 31, 'Hallows,"': 32, 'Stone,': 33, 'all': 34, 'its': 35, 'seeks': 36, 'of': 37, 'histories': 38, 'book,': 39, 'learns': 40, 'The': 41, 'friends': 42, 'conquer': 43, 'non-magical': 44, 'series,': 45, 'world,': 46, 'revelations.': 47, 'popular': 48, 'Lord': 49, 'begins': 50, 'the': 51, 'to': 52, 'subjugate': 53, 'School': 54, 'battle': 55, 'wizard': 56, 'life': 57, 'between': 58, 'Hallows.\n\nThe': 59, 'It': 60, 'together': 61, 'brings': 62, 'prevalent.': 63, 'their': 64, 'plot': 65, 'film': 66, 'As': 67, 'also': 68, 'stru

In [8]:
#vocab size
voc_size = len(vocab)
print(voc_size)

159


In [9]:
#append UNK
vocab.append('<UNK>')

In [10]:
vocab

['Hogwarts,',
 'Granger',
 'Wizardry.',
 'evil',
 'such',
 'Hermione',
 'Stone,"',
 'confronts',
 'Voldemort',
 'significance',
 'book',
 'following',
 'people.',
 'as',
 'Potter.',
 'connection',
 'Weasley,',
 'revolves',
 'faces',
 'themes',
 'dark',
 'accepted',
 'they',
 'prophecy',
 'birthday',
 'courage,',
 'are',
 'been',
 'seven',
 "Philosopher's",
 'overarching',
 'characters,',
 'Hallows,"',
 'Stone,',
 'all',
 'its',
 'seeks',
 'of',
 'histories',
 'book,',
 'learns',
 'The',
 'friends',
 'conquer',
 'non-magical',
 'series,',
 'world,',
 'revelations.',
 'popular',
 'Lord',
 'begins',
 'the',
 'to',
 'subjugate',
 'School',
 'battle',
 'wizard',
 'life',
 'between',
 'Hallows.\n\nThe',
 'It',
 'together',
 'brings',
 'prevalent.',
 'their',
 'plot',
 'film',
 'As',
 'also',
 'struggle',
 'a',
 'ages.',
 'final',
 'complexities',
 'uncover',
 'first',
 'consists',
 'magical',
 'rich',
 'matures,',
 'explores',
 'J.K.',
 'climactic',
 'families,',
 'fantasy',
 'foretells',
 '

In [12]:
word2index['<UNK>'] = 159

In [13]:
#just in case we need to use
index2word = {v:k for k, v in word2index.items()} 

## 2. Build Co-occurence Matrix X

Here, we need to count the co-occurence of two words given some window size.  We gonna use window size of 1.

In [14]:
from collections import Counter

X_i = Counter(flatten(corpus))
X_i

Counter({'the': 31521,
         'and': 28203,
         'of': 16590,
         'Harry': 11613,
         'a': 9954,
         'to': 9954,
         'series': 8295,
         'The': 6636,
         'has': 6636,
         'Potter': 4977,
         'wizard': 4977,
         'been': 4977,
         'as': 4977,
         'magical': 4977,
         'is': 3318,
         'story': 3318,
         'his': 3318,
         'that': 3318,
         'he': 3318,
         'such': 3318,
         'wizarding': 3318,
         'Voldemort': 3318,
         'final': 3318,
         'into': 3318,
         "Philosopher's": 3318,
         'Deathly': 3318,
         'book,': 3318,
         '"Harry': 3318,
         'in': 3318,
         'its': 3318,
         'popular': 1659,
         'fantasy': 1659,
         'book': 1659,
         'written': 1659,
         'by': 1659,
         'J.K.': 1659,
         'Rowling.': 1659,
         'consists': 1659,
         'seven': 1659,
         'books,': 1659,
         'following': 1659,
         'life

In [15]:
skip_grams = []

for doc in corpus:
    for i in range(1, len(doc)-1):
        center = doc[i]
        outside = [doc[i-1], doc[i+1]]
        for each_out in outside:
            skip_grams.append((center, each_out))
skip_grams

[('Potter', 'Harry'),
 ('Potter', 'is'),
 ('is', 'Potter'),
 ('is', 'a'),
 ('a', 'is'),
 ('a', 'popular'),
 ('popular', 'a'),
 ('popular', 'fantasy'),
 ('fantasy', 'popular'),
 ('fantasy', 'book'),
 ('book', 'fantasy'),
 ('book', 'series'),
 ('series', 'book'),
 ('series', 'written'),
 ('written', 'series'),
 ('written', 'by'),
 ('by', 'written'),
 ('by', 'J.K.'),
 ('J.K.', 'by'),
 ('J.K.', 'Rowling.'),
 ('Rowling.', 'J.K.'),
 ('Rowling.', 'The'),
 ('The', 'Rowling.'),
 ('The', 'series'),
 ('series', 'The'),
 ('series', 'consists'),
 ('consists', 'series'),
 ('consists', 'of'),
 ('of', 'consists'),
 ('of', 'seven'),
 ('seven', 'of'),
 ('seven', 'books,'),
 ('books,', 'seven'),
 ('books,', 'following'),
 ('following', 'books,'),
 ('following', 'the'),
 ('the', 'following'),
 ('the', 'life'),
 ('life', 'the'),
 ('life', 'and'),
 ('and', 'life'),
 ('and', 'adventures'),
 ('adventures', 'and'),
 ('adventures', 'of'),
 ('of', 'adventures'),
 ('of', 'a'),
 ('a', 'of'),
 ('a', 'young'),
 ('yo

In [16]:
X_ik_skipgrams = Counter(skip_grams)
X_ik_skipgrams

Counter({('and', 'the'): 8295,
         ('the', 'and'): 8295,
         ('is', 'a'): 3318,
         ('a', 'is'): 3318,
         ('The', 'series'): 3318,
         ('series', 'The'): 3318,
         ('The', 'story'): 3318,
         ('story', 'The'): 3318,
         ('and', 'has'): 3318,
         ('has', 'and'): 3318,
         ('has', 'been'): 3318,
         ('been', 'has'): 3318,
         ('such', 'as'): 3318,
         ('as', 'such'): 3318,
         ('of', 'the'): 3318,
         ('the', 'of'): 3318,
         ('the', "Philosopher's"): 3318,
         ("Philosopher's", 'the'): 3318,
         ('the', 'Deathly'): 3318,
         ('Deathly', 'the'): 3318,
         ('book,', '"Harry'): 3318,
         ('"Harry', 'book,'): 3318,
         ('"Harry', 'Potter'): 3318,
         ('Potter', '"Harry'): 3318,
         ('Potter', 'and'): 3318,
         ('and', 'Potter'): 3318,
         ('to', 'appeal'): 3318,
         ('appeal', 'to'): 3318,
         ('Potter', 'Harry'): 1659,
         ('Potter', 'is'): 1659,

### Weighting function

GloVe includes a weighting function to scale down too frequent words.

<img src = "../figures/glove_weighting_func.png" width=400>

In [17]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [18]:
from itertools import combinations_with_replacement

X_ik = {} #keeping the co-occurences
weighting_dic = {} #already scale the co-occurences using the weighting function

for bigram in combinations_with_replacement(vocab, 2):
    if X_ik_skipgrams.get(bigram):  #if the pair exists in our corpus
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 #for stability
        X_ik[(bigram[1], bigram[0])] = co + 1 #basically apple, banana = banana, apple
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

## 3. Prepare train data

In [19]:
import math

def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #convert our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

### Testing the method

In [20]:
batch_size = 2
x, y, cooc, weighting = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)

In [21]:
x

array([[115],
       [146]])

In [22]:
y

array([[ 14],
       [157]])

In [23]:
cooc

array([[7.41457288],
       [8.10741881]])

In [24]:
weighting

array([[1],
       [1]])

## 4. Model

<img src ="../figures/glove.png" width=400>

In [25]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.center_embedding(center) #(batch_size, 1, emb_size)
        outside_embeds = self.outside_embedding(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

In [26]:
#test our system
voc_size = len(vocab)
emb_size = 2
model = Glove(voc_size, emb_size)

In [27]:
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)
cooc_tensor = torch.FloatTensor(cooc)
weighting_tensor = torch.FloatTensor(weighting)

In [28]:
loss = model(x_tensor, y_tensor, cooc_tensor, weighting_tensor)

In [29]:
loss

tensor(123.1610, grad_fn=<SumBackward0>)

## 5. Training

In [30]:
batch_size     = 10 # mini-batch size
embedding_size = 2 #so we can later plot
model          = Glove(voc_size, embedding_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [31]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [33]:
import time

# Training
num_epochs = 3000
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch) #[batch_size, 1]
    
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)

    if (epoch + 1) % 1000 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")


Epoch: 1000 | cost: 324.335724 | time: 0m 0s
Epoch: 2000 | cost: 227.313766 | time: 0m 0s
Epoch: 3000 | cost: 301.072937 | time: 0m 0s
