<a href="https://colab.research.google.com/github/Ubayed-Bin-Sufian/Gen-AI-and-LLMs/blob/main/Vector_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install gensim

# Import trained model

<b>Word2vec</b> is a technique in natural language processing (NLP) for obtaining vector representations of words.

In [None]:
import gensim.downloader as api

# Identifier of the pretrained embedding set fetched through gensim's downloader.
MODEL_NAME = "word2vec-google-news-300"

# Download the model (if needed) and get back an object that is ready for use.
# Every word in this model is represented by a 300-dimensional vector.
model = api.load(MODEL_NAME)

# Example of a word as a vector

In [None]:
# Alias for the loaded model; the lookup and similarity cells use this name.
word_vectors = model

# Let us look at what the vector embedding of a word looks like.
# Example: accessing the vector for the word 'computer'.
computer_vector = word_vectors['computer']
print(computer_vector)

In [None]:
# Each word is represented by 300 numbers, i.e. a 300-dimensional vector.
cat_vector = word_vectors['cat']
print(cat_vector.shape)

(300,)


**Word Embeddings Visualization**

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

# Words we query with, plus the ten nearest results of two analogy queries:
#   Paris + Italy - France   and   doctor + law - medicine
target_words = ["Paris", "Italy", "France", "doctor", "law", "medicine"]
similar_words = [word for word, _ in model.most_similar(positive=["Paris", "Italy"], negative=["France"], topn=10)] + \
                [word for word, _ in model.most_similar(positive=["doctor", "law"], negative=["medicine"], topn=10)]
words = target_words + similar_words

# Stack the embeddings of all selected words into one 2-D NumPy array.
# NOTE: this must not be stored under the name `word_vectors`; that name is
# the alias of the loaded model, and other cells call
# `word_vectors.most_similar(...)` / `word_vectors.similarity(...)` on it.
word_vectors_array = np.array([model[word] for word in words])

# Perform t-SNE dimensionality reduction down to 2 components for plotting.
# random_state is fixed so the layout is reproducible between runs.
tsne = TSNE(n_components=2, random_state=42, perplexity=5)
reduced_vectors = tsne.fit_transform(word_vectors_array)

# Plot the word embeddings: "o" marks the target words, "x" the similar words.
plt.figure(figsize=(12, 10))
for i, word in enumerate(words):
    plt.scatter(reduced_vectors[i, 0], reduced_vectors[i, 1], marker="o" if word in target_words else "x")
    plt.annotate(word, xy=(reduced_vectors[i, 0], reduced_vectors[i, 1]))

plt.title("Word Embeddings Visualization")
plt.show()

# Similar words

# King + Woman - Man = ?

In [None]:
# Example of using most_similar for the analogy "king + woman - man".
analogy_query = dict(positive=['king', 'woman'], negative=['man'])

# Best single match, then the ten best matches.
print(word_vectors.most_similar(**analogy_query, topn=1))
print(word_vectors.most_similar(**analogy_query, topn=10))

# Let us check the similarity between a few pairs of words

In [None]:
# Example of calculating similarity: print one score per word pair, in order.
pairs_to_score = [
    ('woman', 'man'),
    ('king', 'queen'),
    ('uncle', 'aunt'),
    ('boy', 'girl'),
    ('nephew', 'niece'),
    ('paper', 'water'),
]
for first_word, second_word in pairs_to_score:
    print(word_vectors.similarity(first_word, second_word))

0.76640123
0.6510957
0.7643474
0.8543272
0.7594367
0.11408084


### Word Pair Similarities

The code calculates similarity scores for various word pairs using a language model, storing these pairs and their scores in a list. It then creates a bar plot to visualize these similarity scores, labeling each bar with the corresponding word pair and its similarity score. The plot provides a clear comparison of the semantic similarities between the chosen word pairs.

In [None]:
# Word pairs to compare.
pairs_to_compare = [
    ("woman", "man"),
    ("king", "queen"),
    ("uncle", "aunt"),
    ("boy", "girl"),
    ("nephew", "niece"),
    ("paper", "water"),
]

# Calculate similarities as (word1, word2, score) triples.
# The score is always computed from the pair itself, so a label can never be
# shown next to the score of a different pair.
similarities = [
    (word1, word2, model.similarity(word1, word2))
    for word1, word2 in pairs_to_compare
]

# Extract word pairs and similarity scores
word_pairs = [pair[:2] for pair in similarities]
similarity_scores = [pair[2] for pair in similarities]

# Create a bar plot with one bar per word pair
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(range(len(word_pairs)), similarity_scores)
ax.set_xticks(range(len(word_pairs)))
# Label each bar "word1-word2" rather than with the raw tuple repr.
ax.set_xticklabels([f"{word1}-{word2}" for word1, word2 in word_pairs], rotation=45, ha="right")
ax.set_ylabel("Similarity Score")
ax.set_title("Word Pair Similarities")

# Add similarity scores as text labels above the bars
for i, score in enumerate(similarity_scores):
    ax.text(i, score + 0.01, f'{score:.2f}', ha="center")

plt.tight_layout()
plt.show()

# Most similar words

In [None]:
# The five entries the model ranks as most similar to "tower".
tower_neighbours = word_vectors.most_similar("tower", topn=5)
print(tower_neighbours)

[('towers', 0.8531750440597534), ('skyscraper', 0.6417425870895386), ('Tower', 0.639177143573761), ('spire', 0.594687819480896), ('responded_Understood_Atlasjet', 0.5931612253189087)]


# Now let us see the vector similarity

In [None]:
# # Words to compare
# word1 = 'man'
# word2 = 'woman'

# word3 = 'semiconductor'
# word4 = 'earthworm'

# word5 = 'nephew'
# word6 = 'niece'

# word7 = 'male'
# word8 = 'female'

# word9 = 'phone'
# word10 = 'tape'

# word11 = 'boy'
# word12 = 'girl'

# word13 = 'computer'
# word14 = 'potato'

# word15 = 'uncle'
# word16 = 'aunt'

# word17 = 'king'
# word18 = 'queen'

# word19 = 'bottle'
# word20 = 'earphone'


# # Calculate the vector difference
# vector_difference1 = model[word1] - model[word2]
# vector_difference2 = model[word3] - model[word4]
# vector_difference3 = model[word5] - model[word6]
# vector_difference4 = model[word7] - model[word8]
# vector_difference5 = model[word9] - model[word10]
# vector_difference6 = model[word11] - model[word12]
# vector_difference7 = model[word13] - model[word14]
# vector_difference8 = model[word15] - model[word16]
# vector_difference9 = model[word17] - model[word18]
# vector_difference10 = model[word19] - model[word20]


# # Calculate the magnitude of the vector difference
# magnitude_of_difference1 = np.linalg.norm(vector_difference1)
# magnitude_of_difference2 = np.linalg.norm(vector_difference2)
# magnitude_of_difference3 = np.linalg.norm(vector_difference3)
# magnitude_of_difference4 = np.linalg.norm(vector_difference4)
# magnitude_of_difference5 = np.linalg.norm(vector_difference5)
# magnitude_of_difference6 = np.linalg.norm(vector_difference6)
# magnitude_of_difference7 = np.linalg.norm(vector_difference7)
# magnitude_of_difference8 = np.linalg.norm(vector_difference8)
# magnitude_of_difference9 = np.linalg.norm(vector_difference9)
# magnitude_of_difference10 = np.linalg.norm(vector_difference10)


# # Print the magnitude of the difference
# print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(word1, word2, magnitude_of_difference1))
# print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(word3, word4, magnitude_of_difference2))
# print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(word5, word6, magnitude_of_difference3))
# print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(word7, word8, magnitude_of_difference4))
# print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(word9, word10, magnitude_of_difference5))
# print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(word11, word12, magnitude_of_difference6))
# print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(word13, word14, magnitude_of_difference7))
# print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(word15, word16, magnitude_of_difference8))
# print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(word17, word18, magnitude_of_difference9))
# print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(word19, word20, magnitude_of_difference10))

The above code could be optimised to the following:

**Vector Differences in 2D Space**

In [None]:
# Words to compare
word_pairs = [
    ("boy", "girl"),
    ("semiconductor", "earthworm"),
    ("computer", "potato"),
    ("king", "queen"),
    ("bottle", "earphone"),
]

# Difference vector for every pair, and the norm (length) of each difference
vector_differences = [model[first] - model[second] for first, second in word_pairs]
magnitudes = [np.linalg.norm(difference) for difference in vector_differences]

# Report the magnitude of each difference, one line per pair
for (word1, word2), magnitude in zip(word_pairs, magnitudes):
    print(f"The magnitude of the difference between '{word1}' and '{word2}' is {magnitude:.2f}")

# Bar plot of the magnitudes, one bar per pair
fig, ax = plt.subplots(figsize=(10, 6))
bar_positions = range(len(word_pairs))
pair_labels = [f"{word1}-{word2}" for word1, word2 in word_pairs]
ax.bar(bar_positions, magnitudes)
ax.set_xticks(bar_positions)
ax.set_xticklabels(pair_labels, rotation=45, ha="right")
ax.set_ylabel("Magnitude of Vector Differences")
ax.set_title("Magnitude of Vector Differences")

# Write each magnitude just above its bar
for position, magnitude in zip(bar_positions, magnitudes):
    ax.text(position, magnitude + 0.1, f'{magnitude:.2f}', ha="center")

plt.tight_layout()
plt.show()

# Project the difference vectors onto their first two principal components
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
reduced_vectors_differences = pca.fit_transform(vector_differences)

# Coordinates of every pair along the two components
component_1 = reduced_vectors_differences[:, 0]
component_2 = reduced_vectors_differences[:, 1]

# Scatter plot of the projected differences, each point labelled with its pair
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(component_1, component_2)

for (word1, word2), x_coord, y_coord in zip(word_pairs, component_1, component_2):
    ax.annotate(f"{word1}-{word2}", (x_coord, y_coord))

ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.set_title("Vector Differences in 2D Space")

plt.tight_layout()
plt.show()