<a href="https://colab.research.google.com/github/Niko1909/Contexto-Like/blob/main/contexto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import string
import numpy as np
from numpy.linalg import norm
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import nltk
from nltk.stem import WordNetLemmatizer as wnl
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
data = open('/content/glove.6B.50d.txt', 'r') # 50-dim GloVe word vectors

In [5]:
print(data.read(100))

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0


In [6]:
# build and clean embeddings (only take lemmatized words, no punctuation in word, no numbers in word)
embeddings = {}
for word in data:
  word_vec = word.split()
  if wnl().lemmatize(word_vec[0]) == word_vec[0] and all(char not in word_vec[0] for char in string.punctuation + '0123456789'):
    embeddings[word_vec[0]] = np.asarray(word_vec[1:], dtype=np.float32)

In [7]:
# identify bad data (wrong shape)
for key, val in embeddings.items():
  if val.shape != (50,):
    print(key)

In [8]:
len(embeddings)

307535

In [9]:
embeddings['a'].shape == (50,)

True

In [18]:
def contexto(word, similarity, num_words=15):
  """
  Prints the top num_words most similar words to the word inputted, using one of
  three vector similarity metrics.

  word: the term that the other words similarities are ranked on
  similarity: the similarity metric to use: one of dot product, euclidean
  distance, or cosine similarity
  num_words: the number of similar words to show in the ranking
  """
  lem_word = wnl().lemmatize(word)
  if lem_word not in embeddings.keys():
    print(f'{lem_word} does not have an embedding')
    return
  if num_words > len(embeddings):
    print(f'num_words is greater than the number of embeddings! choose a num_words <= {len(embeddings)}')
    return

  if similarity == 'dot':
    sorted_words = sorted(embeddings.keys(), reverse=True, key=lambda word1: np.dot(embeddings[word1], embeddings[word]))
  elif similarity == 'euclidean':
    sorted_words = sorted(embeddings.keys(), key=lambda word1: spatial.distance.euclidean(embeddings[word1], embeddings[word]))
  elif similarity == 'cosine':
    sorted_words = sorted(embeddings.keys(), reverse=True, key=lambda word1: np.dot(embeddings[word1], embeddings[word])/(norm(embeddings[word1])*norm(embeddings[word])))
  else:
    print(f'{similarity} is not one of dot, euclidean, or cosine')
    return

  print(f'The top {num_words} most similar words to {lem_word}:')
  for i in range(1, num_words+1):
    print(f'{i}. {sorted_words[i]}')
  return

In [11]:
contexto('the', 'euclidean')

the does not have an embedding


In [12]:
contexto('a', 'euclidean', 1000000)

num_words is greater than the number of embeddings! choose a num_words <= 307535


In [13]:
contexto('fish', 'euclidean', 15)

The top 15 most similar words to fish:
1. salmon
2. shrimp
3. meat
4. bird
5. wild
6. seafood
7. shellfish
8. crab
9. tuna
10. trout
11. animal
12. eat
13. eaten
14. chicken
15. herring


In [14]:
contexto('fish', 'dot', 15)

The top 15 most similar words to fish:
1. meat
2. shrimp
3. eat
4. freshwater
5. salmon
6. chicken
7. tuna
8. bacteria
9. water
10. seafood
11. fishing
12. cyprinid
13. trout
14. poultry
15. sea


In [15]:
contexto('fish', 'cosine', 15)

The top 15 most similar words to fish:
1. salmon
2. meat
3. shrimp
4. bird
5. wild
6. eat
7. seafood
8. tuna
9. shellfish
10. chicken
11. crab
12. trout
13. eaten
14. animal
15. whale


In [16]:
all(char not in ".'," for char in 'test')

True