# GloVE

Let's work on implementation of GloVE.

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

## 1. Load data

In [2]:
import nltk
from nltk.corpus import semcor

nltk.download('semcor')

[nltk_data] Downloading package semcor to
[nltk_data]     C:\Users\surya\AppData\Roaming\nltk_data...
[nltk_data]   Package semcor is already up-to-date!


True

In [3]:
#1. tokenization
corpus = semcor.sents()

In [6]:
#get word sequences and unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))
vocab

['kept',
 'gate',
 'featured',
 'servants',
 'roots',
 'clerk',
 'Black',
 'partial',
 "o'clock",
 'chairs',
 'Ca',
 'observers',
 'concerning',
 'proof',
 'disposed',
 'wages',
 'machine',
 'Masters',
 'alone',
 'output',
 'masses',
 'owner',
 'Day',
 'peak',
 'worth',
 'line',
 'female',
 'Trig',
 'consciously',
 'Quite',
 'walking',
 'technology',
 'connections',
 'permits',
 'Greg',
 'least',
 'shadows',
 'boots',
 'parents',
 'Country',
 'prevent',
 'per',
 'pilot',
 'rank',
 'preceded',
 'staring',
 'advocate',
 'territorial',
 'tune',
 '1958',
 'delay',
 'strand',
 'Beyond',
 'stem',
 'tube',
 'appearing',
 'stretch',
 'brains',
 'doctrine',
 'square',
 'proclaimed',
 'Congolese',
 'mere',
 'sources',
 'swiftly',
 'Russian',
 'combat',
 'technical',
 'official',
 'wealth',
 'fool',
 '50',
 'workshop',
 'mountains',
 'tailored',
 'Southeast',
 'caused',
 'energetic',
 'engines',
 'spend',
 'rocks',
 'tangent',
 'Go',
 'Never',
 'Force',
 'shooting',
 'events',
 'Certain',
 'accou

In [7]:
#numericalization
word2index = {w: i for i, w in enumerate(vocab)}
print(word2index)



In [8]:
#vocab size
voc_size = len(vocab)
print(voc_size)

8141


In [9]:
#append UNK
vocab.append('<UNK>')

In [10]:
vocab

['kept',
 'gate',
 'featured',
 'servants',
 'roots',
 'clerk',
 'Black',
 'partial',
 "o'clock",
 'chairs',
 'Ca',
 'observers',
 'concerning',
 'proof',
 'disposed',
 'wages',
 'machine',
 'Masters',
 'alone',
 'output',
 'masses',
 'owner',
 'Day',
 'peak',
 'worth',
 'line',
 'female',
 'Trig',
 'consciously',
 'Quite',
 'walking',
 'technology',
 'connections',
 'permits',
 'Greg',
 'least',
 'shadows',
 'boots',
 'parents',
 'Country',
 'prevent',
 'per',
 'pilot',
 'rank',
 'preceded',
 'staring',
 'advocate',
 'territorial',
 'tune',
 '1958',
 'delay',
 'strand',
 'Beyond',
 'stem',
 'tube',
 'appearing',
 'stretch',
 'brains',
 'doctrine',
 'square',
 'proclaimed',
 'Congolese',
 'mere',
 'sources',
 'swiftly',
 'Russian',
 'combat',
 'technical',
 'official',
 'wealth',
 'fool',
 '50',
 'workshop',
 'mountains',
 'tailored',
 'Southeast',
 'caused',
 'energetic',
 'engines',
 'spend',
 'rocks',
 'tangent',
 'Go',
 'Never',
 'Force',
 'shooting',
 'events',
 'Certain',
 'accou

In [11]:
word2index['<UNK>'] = 0

In [12]:
#just in case we need to use
index2word = {v:k for k, v in word2index.items()} 

## 2. Build Co-occurence Matrix X

Here, we need to count the co-occurence of two words given some window size.  We gonna use window size of 1.

In [13]:
from collections import Counter

X_i = Counter(flatten(corpus))
X_i

Counter({"''": 4086,
         '``': 4068,
         "'s": 2910,
         'one': 1564,
         'would': 1533,
         'said': 1157,
         "n't": 1069,
         'could': 948,
         'time': 834,
         'two': 739,
         'may': 709,
         'man': 681,
         'like': 673,
         'first': 667,
         'made': 586,
         'also': 571,
         'new': 569,
         'must': 556,
         'back': 536,
         'years': 521,
         'many': 495,
         'even': 495,
         'much': 475,
         'way': 475,
         'Mr.': 472,
         'good': 449,
         'f': 446,
         'people': 445,
         'make': 441,
         'little': 433,
         'year': 425,
         'get': 405,
         'work': 395,
         'long': 395,
         'see': 388,
         'men': 386,
         'well': 380,
         'still': 376,
         'world': 365,
         'us': 352,
         'might': 346,
         'last': 346,
         'life': 343,
         'day': 342,
         'take': 337,
         'know'

In [14]:
window_size = 5

def random_batch(corpus, window_size=2):
    skip_grams = []

    for doc in corpus:
        for i in range(window_size, len(doc)-window_size):
            center = doc[i]
            # endpoints of the window
            outside_start =  i - window_size
            outside_end =  i + window_size + 1

            for j in range(outside_start, outside_end):
                if i != j:  # Skip the center word
                    outside = doc[j]
                    skip_grams.append((center, outside))

            return skip_grams
        
skip_grams = random_batch(corpus, window_size)

In [15]:
X_ik_skipgrams = Counter(skip_grams)
X_ik_skipgrams

Counter({('investigation', 'Fulton'): 1,
         ('investigation', 'County'): 1,
         ('investigation', 'Grand'): 1,
         ('investigation', 'said'): 1,
         ('investigation', 'Friday'): 1,
         ('investigation', 'Atlanta'): 1,
         ('investigation', "'s"): 1,
         ('investigation', 'recent'): 1,
         ('investigation', 'primary'): 1,
         ('investigation', 'election'): 1})

### Weighting function

GloVe includes a weighting function to scale down too frequent words.

<img src = "../figures/glove_weighting_func.png" width=400>

In [16]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [17]:
from itertools import combinations_with_replacement

X_ik = {} #keeping the co-occurences
weighting_dic = {} #already scale the co-occurences using the weighting function

for bigram in combinations_with_replacement(vocab, 2):
    if X_ik_skipgrams.get(bigram):  #if the pair exists in our corpus
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 #for stability
        X_ik[(bigram[1], bigram[0])] = co + 1 #basically apple, banana = banana, apple
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

## 3. Prepare train data

In [18]:
import math

def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #convert our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

### Testing the method

In [19]:
batch_size = 2
x, y, cooc, weighting = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)

In [20]:
x

array([[4856],
       [4856]])

In [21]:
y

array([[2546],
       [4988]])

In [22]:
cooc

array([[0.        ],
       [0.69314718]])

In [23]:
weighting

array([[0.03162278],
       [0.05318296]])

## 4. Model

<img src ="../figures/glove.png" width=400>

In [24]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.center_embedding(center) #(batch_size, 1, emb_size)
        outside_embeds = self.outside_embedding(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

In [25]:
#test our system
voc_size = len(vocab)
emb_size = 2
model = Glove(voc_size, emb_size)

In [26]:
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)
cooc_tensor = torch.FloatTensor(cooc)
weighting_tensor = torch.FloatTensor(weighting)

In [27]:
loss = model(x_tensor, y_tensor, cooc_tensor, weighting_tensor)

In [28]:
loss

tensor(0.0630, grad_fn=<SumBackward0>)

## 5. Training

In [29]:
batch_size     = 10 # mini-batch size
embedding_size = 2 #so we can later plot
model          = Glove(voc_size, embedding_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [30]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [31]:
import time

# Training
num_epochs = 100
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch) #[batch_size, 1]
    
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")


Epoch: 10 | cost: 0.596864 | time: 0m 0s
Epoch: 20 | cost: 0.573665 | time: 0m 0s
Epoch: 30 | cost: 0.551771 | time: 0m 0s
Epoch: 40 | cost: 0.531193 | time: 0m 0s
Epoch: 50 | cost: 0.511891 | time: 0m 0s
Epoch: 60 | cost: 0.493793 | time: 0m 0s
Epoch: 70 | cost: 0.476816 | time: 0m 0s
Epoch: 80 | cost: 0.460870 | time: 0m 0s
Epoch: 90 | cost: 0.445870 | time: 0m 0s
Epoch: 100 | cost: 0.431736 | time: 0m 0s


## 6. Plotting the embeddings

In [32]:
#list of vocabs
vocab[:10]

['kept',
 'gate',
 'featured',
 'servants',
 'roots',
 'clerk',
 'Black',
 'partial',
 "o'clock",
 'chairs']

In [33]:
word = vocab[0]

In [34]:
#numericalization
id = word2index[word]
id

0

In [35]:
id_tensor = torch.LongTensor([id])
id_tensor

tensor([0])

In [36]:
#get the embedding by averaging
v_embed = model.center_embedding(id_tensor)
u_embed = model.outside_embedding(id_tensor)

v_embed, u_embed

(tensor([[ 0.0508, -1.6715]], grad_fn=<EmbeddingBackward0>),
 tensor([[0.3629, 0.5799]], grad_fn=<EmbeddingBackward0>))

In [37]:
#average to get the word embedding
word_embed = (v_embed + u_embed) / 2
word_embed

tensor([[ 0.2068, -0.5458]], grad_fn=<DivBackward0>)

In [38]:
#let's write a function to get embedding given a word
def get_embed(word):
    id_tensor = torch.LongTensor([word2index[word]])
    v_embed = model.center_embedding(id_tensor)
    u_embed = model.outside_embedding(id_tensor) 
    word_embed = (v_embed + u_embed) / 2 
    x, y = word_embed[0][0].item(), word_embed[0][1].item()

    return x, y

## 7. Cosine similarity

Formally the [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) $s$ between two vectors $p$ and $q$ is defined as:

$$s = \frac{p \cdot q}{||p|| ||q||}, \textrm{ where } s \in [-1, 1] $$ 

If $p$ and $q$ is super similar, the result is 1 otherwise 0.

In [39]:
vocab

['kept',
 'gate',
 'featured',
 'servants',
 'roots',
 'clerk',
 'Black',
 'partial',
 "o'clock",
 'chairs',
 'Ca',
 'observers',
 'concerning',
 'proof',
 'disposed',
 'wages',
 'machine',
 'Masters',
 'alone',
 'output',
 'masses',
 'owner',
 'Day',
 'peak',
 'worth',
 'line',
 'female',
 'Trig',
 'consciously',
 'Quite',
 'walking',
 'technology',
 'connections',
 'permits',
 'Greg',
 'least',
 'shadows',
 'boots',
 'parents',
 'Country',
 'prevent',
 'per',
 'pilot',
 'rank',
 'preceded',
 'staring',
 'advocate',
 'territorial',
 'tune',
 '1958',
 'delay',
 'strand',
 'Beyond',
 'stem',
 'tube',
 'appearing',
 'stretch',
 'brains',
 'doctrine',
 'square',
 'proclaimed',
 'Congolese',
 'mere',
 'sources',
 'swiftly',
 'Russian',
 'combat',
 'technical',
 'official',
 'wealth',
 'fool',
 '50',
 'workshop',
 'mountains',
 'tailored',
 'Southeast',
 'caused',
 'energetic',
 'engines',
 'spend',
 'rocks',
 'tangent',
 'Go',
 'Never',
 'Force',
 'shooting',
 'events',
 'Certain',
 'accou

In [40]:
#let's try similarity between first and second, and second and third
cat          = get_embed('cat')
fruit        = get_embed('fruit')
animal       = get_embed('animal')

In [41]:
#numpy version
from numpy import dot
from numpy.linalg import norm

def cos_sim(a, b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim
    
print(f"cat vs. fruit: ",        cos_sim(cat, fruit))
print(f"cat vs. animal: ",       cos_sim(cat, animal))
print(f"cat vs. cat: ",          cos_sim(cat, cat))

cat vs. fruit:  -0.42989645515882857
cat vs. animal:  0.9463488105041457
cat vs. cat:  1.0000000000000002


In [42]:
#scipy version
from scipy import spatial

def cos_sim(a, b):
    cos_sim = 1 - spatial.distance.cosine(a, b)  #distance = 1 - similarlity, because scipy only gives distance
    return cos_sim

print(f"cat vs. fruit: ",        cos_sim(cat, fruit))
print(f"cat vs. animal: ",       cos_sim(cat, animal))
print(f"cat vs. cat: ",          cos_sim(cat, cat))

cat vs. fruit:  -0.4298964551588287
cat vs. animal:  0.9463488105041455
cat vs. cat:  1.0


## Export Model

In [43]:
import pickle

In [44]:
filename = 'glove_model.pkl'
pickle.dump(model, open(filename, 'wb'))

## Export word2index and index2word

In [46]:
pickle.dump(word2index, open('glove_word2index.pkl', 'wb'))
pickle.dump(index2word, open('glove_index2word.pkl', 'wb'))

## Syntactic and Semantic accuracy