In [123]:
import numpy as np
import pandas as pd
import pickle
import random
from scipy.spatial.distance import cosine
from google.colab import drive
from numpy.linalg import norm

In [124]:
# Download the Data Frame
drive.mount('/content/drive', force_remount=True)
file_path='/content/drive/My Drive/Numeric_Programming/word_embeddings_subset.p'

with open(file_path, 'rb') as f:
  word_embeddings = pickle.load(f)

# Extracting three-dimensional vectors
words = list(word_embeddings.keys())
print('The first 10 words from DataFrame')
display(words[:10])

embeddings = np.array([word_embeddings[word] for word in words]) # Take only first 3 measures

df = pd.DataFrame(embeddings, index=words, columns=[f'dim_{i+1}' for i in range(300)])
print(df.head(10))

df.describe()
df.info()

Mounted at /content/drive
The first 10 words from DataFrame


['country',
 'city',
 'China',
 'Iraq',
 'oil',
 'town',
 'Canada',
 'London',
 'England',
 'Australia']

              dim_1     dim_2     dim_3     dim_4     dim_5     dim_6  \
country   -0.080078  0.133789  0.143555  0.094727 -0.047363 -0.023560   
city      -0.010071  0.057373  0.183594 -0.040039 -0.029785 -0.079102   
China     -0.073242  0.135742  0.108887  0.083008 -0.127930 -0.227539   
Iraq       0.191406  0.125000 -0.065430  0.060059 -0.285156 -0.102539   
oil       -0.139648  0.062256 -0.279297  0.063965  0.044434 -0.154297   
town       0.123535  0.159180  0.030029 -0.161133  0.015625  0.111816   
Canada    -0.136719 -0.154297  0.269531  0.273438  0.086914 -0.076172   
London    -0.267578  0.092773 -0.238281  0.115234 -0.006836  0.221680   
England   -0.198242  0.115234  0.062500 -0.058350  0.226562  0.045898   
Australia  0.048828 -0.194336 -0.041504  0.084473 -0.114258 -0.208008   

              dim_7     dim_8     dim_9    dim_10  ...   dim_291   dim_292  \
country   -0.008545 -0.186523  0.045898 -0.081543  ... -0.145508  0.067383   
city       0.071777  0.013306 -0.143555 

In [125]:
# The function for searching nearest word
def find_closest_word(vector, word_embeddings):
  min_distance = float('inf')
  closest_word = None

  for word, emb in word_embeddings.items():
    emb_vector = emb # Extract only first three coordinates
    if np.all(emb_vector==0):
      continue
    distance = cosine(vector, emb_vector)
    if np.isnan(distance): # If distance = NaN - skip
      continue
    if distance < min_distance:
      min_distance = distance
      closest_word = word

  return closest_word

In [126]:
# Test 1 our function
sample_vector = df.iloc[0].values # Take the first vector in Data Frame
closest_word = find_closest_word(sample_vector, word_embeddings)
print(f'Nearest word to {df.index[0]}: {closest_word}')

Nearest word to country: country


In [127]:
# Test 2 our function
sample_vector = df.iloc[11].values # Take the first vector in Data Frame
closest_word = find_closest_word(sample_vector, word_embeddings)
print(f'Nearest word to {df.index[11]}: {closest_word}')

Nearest word to Pakistan: Pakistan


In [128]:
# Test 3 our function
sample_vector = df.iloc[16].values # Take the first vector in Data Frame
closest_word = find_closest_word(sample_vector, word_embeddings)
print(f'Nearest word to {df.index[16]}: {closest_word}')

Nearest word to Afghanistan: Afghanistan


In [129]:
# Test our function with different vectors

test_words = random.sample(words, 15)

for word in test_words:
  if word in df.index:
    sample_vector = df.loc[word].values
    closest_word = find_closest_word(sample_vector, word_embeddings)
    print(f'Nearest word to {word}: {closest_word}')
  else:
    print(f'Word "{word}" not found in embeddings.')

Nearest word to Valletta: Valletta
Nearest word to Nassau: Nassau
Nearest word to Bahrain: Bahrain
Nearest word to Belize: Belize
Nearest word to Sudan: Sudan
Nearest word to Belgium: Belgium
Nearest word to Algeria: Algeria
Nearest word to Hanoi: Hanoi
Nearest word to Stockholm: Stockholm
Nearest word to Russia: Russia
Nearest word to Luanda: Luanda
Nearest word to joyful: joyful
Nearest word to Turkmenistan: Turkmenistan
Nearest word to Honduras: Honduras
Nearest word to Lima: Lima


In [130]:
# Cross product calculation function
def find_orthogonal_word(word1, word2, word_embeddings):
    if word1 not in word_embeddings or word2 not in word_embeddings:
        return None

    vector1 = np.array(word_embeddings[word1])  # Use entire vector (300D)
    vector2 = np.array(word_embeddings[word2])  # Use entire vector (300D)

    # Calculate the cosine similarity between each word and the average vector
    average_vector = (vector1 + vector2) / 2

    # Find the word with the maximum cosine distance from the average vector
    max_distance = -1  # Start with very low distance
    orthogonal_word = None

    for word, emb in word_embeddings.items():
        emb_vector = np.array(emb)
        if np.all(emb_vector == 0):
            continue
        # Compute the cosine distance
        distance = cosine(average_vector, emb_vector)
        if distance > max_distance:
            max_distance = distance
            orthogonal_word = word

    return orthogonal_word

In [131]:
# Test orthogonal word with specific pairs

test_pairs = [('country', 'city'), ('happy', 'sad')]

for word1, word2 in test_pairs:
  if word1 in word_embeddings and word2 in word_embeddings:
    orthogonal_word = find_orthogonal_word(word1, word2, word_embeddings)
    print(f'Orthogonal word to "{word1}" and "{word2}": {orthogonal_word}')
  else:
    print(f'One of the words "{word1}" or "{word2}" not found.')

Orthogonal word to "country" and "city": Valletta
Orthogonal word to "happy" and "sad": Ankara


In [132]:
# The function for calculate angle between words
def angle_between_words(word1, word2, word_embeddings):
  if word1 not in word_embeddings or word2 not in word_embeddings:
    return None
  vector1 = np.array(word_embeddings[word1])
  vector2 = np.array(word_embeddings[word2])
  cos_theta = np.dot(vector1, vector2) / (norm(vector1) * norm(vector2))
  angle = np.arccos(np.clip(cos_theta, -1.0, 1.0))
  return np.degrees(angle)

In [133]:
# Test the function of angle between words
for word1, word2 in test_pairs:
  if word1 in word_embeddings and word2 in word_embeddings:
    angle = angle_between_words(word1, word2, word_embeddings)
    print(f'The angle between "{word1}" and "{word2}": {angle:.2f} degrees')
  else:
    print(f'One of the words "{word1}" or "{word2}" not found.')

The angle between "country" and "city": 71.16 degrees
The angle between "happy" and "sad": 57.62 degrees
