# Language Analogies Using a Pretrained GloVe Model (Word Distance)

<span style='color:red'>Note: torchtext is not currently supported with torch 2.0; importing it fails with the `OSError` shown below.</span>

In [1]:
import torch
import torchtext.vocab

# We are defaulting to CPU, but you may want to compare CPU vs GPU performance
# device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu" # Use cpu since this dataset is too small to take advantage of the GPU

OSError: /opt/conda/lib/python3.9/site-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN2at4_ops19empty_memory_format4callEN3c108ArrayRefINS2_6SymIntEEENS2_8optionalINS2_10ScalarTypeEEENS6_INS2_6LayoutEEENS6_INS2_6DeviceEEENS6_IbEENS6_INS2_12MemoryFormatEEE

In [None]:
glove = torchtext.vocab.GloVe(name='6B', dim=100) # Pre-trained word model trained using 6 billion words
print(f'There are {len(glove.itos)} words in the vocabulary')

In [None]:
glove.itos[:15] # First 15 words

In [None]:
glove.stoi['the'] # Find index

In [None]:
glove.stoi['dazzle'] # Find index

In [None]:
glove.stoi['shenanigans'] # Find index

In [None]:
# Find vector representation (meaning) of the word
def get_vector(embeddings, word):
    """Return the embedding vector (the "meaning") of ``word``.

    Args:
        embeddings: Object exposing ``stoi`` (word -> row index mapping) and
            ``vectors`` (tensor indexed by that row index), e.g. the ``glove``
            object loaded above.
        word: The word to look up.

    Returns:
        The row of ``embeddings.vectors`` for ``word``, moved to the
        module-level ``device``.

    Raises:
        KeyError: If ``word`` is not a key of ``embeddings.stoi``.
    """
    # Raise explicitly instead of using ``assert``: asserts are stripped under
    # ``python -O``, which would turn this check into a message-less KeyError
    # from the lookup below.
    if word not in embeddings.stoi:
        raise KeyError(f'*{word}* is not in the vocab!')
    return embeddings.vectors[embeddings.stoi[word]].to(device)
    
get_vector(glove, 'paper')

In [None]:
# Find other words that are closest vector distance (meaning) to this word
def closest(embeddings, vector, n_words_returned=6):
    """Return the vocabulary words whose embeddings are nearest to ``vector``.

    Args:
        embeddings: Object exposing ``itos`` (sequence of words) and
            ``vectors`` (2-D tensor of embeddings). Assumes row ``i`` of
            ``vectors`` is the embedding of ``itos[i]``, i.e. that
            ``stoi[itos[i]] == i`` -- TODO confirm for custom vocabularies.
        vector: 1-D query tensor with the same width as ``vectors``.
        n_words_returned: Number of nearest neighbours to keep.

    Returns:
        A list of ``(word, distance)`` tuples sorted by ascending Euclidean
        distance. Each distance is a 0-dim tensor.
    """
    # One broadcasted subtraction plus a row-wise norm replaces a Python loop
    # that made one torch.dist call per vocabulary word.
    all_vectors = embeddings.vectors.to(vector.device)
    distances = torch.linalg.norm(all_vectors - vector, dim=1)
    # stable=True keeps equally distant words in vocabulary order, matching
    # the behaviour of Python's stable sorted().
    ordered, indices = torch.sort(distances, stable=True)
    nearest_indices = indices[:n_words_returned].tolist()
    nearest_distances = ordered[:n_words_returned]
    return [
        (embeddings.itos[index], distance)
        for index, distance in zip(nearest_indices, nearest_distances)
    ]
    
closest(glove, get_vector(glove, 'paper'))

In [None]:
# Print both the neighboring word and the distance to that word
def print_tuples(tuples):
    """Print one line per entry, formatted as ``(distance) word``.

    Args:
        tuples: Iterable of sequences whose element 0 is the word and whose
            element 1 is the distance to that word.
    """
    for entry in tuples:
        word, distance = entry[0], entry[1]
        # Distance comes first, shown to four decimal places.
        print('(%.4f) %s' % (distance, word))
        
print_tuples(closest(glove, get_vector(glove, 'stupendous')))

In [None]:
# Find Analogy (w4) of relationship between w1 and w2 given just w3
def analogy(embeddings, w1, w2, w3, n = 6):
    """Answer the analogy "w1 is to w2 as w3 is to ?".

    Prints the analogy header, then looks up the words closest to
    ``vector(w2) - vector(w1) + vector(w3)``.

    Args:
        embeddings: Embedding object passed through to ``get_vector`` and
            ``closest``.
        w1: First word of the known pair.
        w2: Second word of the known pair.
        w3: Word whose counterpart is wanted.
        n: Maximum number of candidate answers to return.

    Returns:
        Up to ``n`` entries from ``closest`` (nearest first), excluding the
        three input words themselves.
    """
    print('\n[%s : %s :: %s : ?]' % (w1, w2, w3))
    # Apply the w1 -> w2 offset to w3 to get the point to search around.
    target = (
        get_vector(embeddings, w2)
        - get_vector(embeddings, w1)
        + get_vector(embeddings, w3)
    )
    # Fetch three extra neighbours so that dropping the (at most three) input
    # words can still leave n candidates.
    candidates = closest(embeddings, target, n + 3)
    excluded = (w1, w2, w3)
    answers = []
    for candidate in candidates:
        if candidate[0] in excluded:
            continue
        answers.append(candidate)
    return answers[:n]
    
print_tuples(analogy(glove, 'moon', 'night', 'sun')) # Not perfect: 'day' would have been the preferred answer, but 'morning' or 'afternoon' are not far off

In [None]:
print_tuples(analogy(glove, 'fly', 'bird', 'swim'))