**2. Word Embedding Explorer**

• Load pretrained GloVe embeddings.

• Perform word similarity & analogy tasks (king - man + woman ≈ queen).

• Perform t-SNE / PCA visualization of embeddings in 2D clusters.

• Compare Word2Vec vs FastText vs GloVe.

• Show how FastText handles out-of-vocabulary words better.

In [None]:
import sys
!{sys.executable} -m pip install gensim

import gensim.downloader as api
from gensim.models import KeyedVectors

In [None]:
import gensim.downloader as api

# Fetch the pretrained Google News Word2Vec vectors (this may take a while).
model = api.load("word2vec-google-news-300")

# Sanity check: indexing the model by a word returns its vector, and raises
# KeyError when the word is missing from the vocabulary.
try:
    vector = model["computer"]
except KeyError:
    print("Word not in vocabulary")
else:
    print("Vector found for 'computer'")

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# Load the three pretrained embedding sets to compare. They are fetched in a
# fixed order: GloVe first, then Word2Vec, then FastText.
glove_model, word2vec_model, fasttext_model = (
    api.load(dataset_name)
    for dataset_name in (
        'glove-wiki-gigaword-100',
        'word2vec-google-news-300',
        'fasttext-wiki-news-subwords-300',
    )
)

# Word similarity: print each model's five nearest neighbours of the query word.
word = 'computer'
for label, vectors in (
    ('GloVe', glove_model),
    ('Word2Vec', word2vec_model),
    ('FastText', fasttext_model),
):
    print(f"{label} similar to '{word}':", vectors.most_similar(word, topn=5))

# Analogy task: run the same "king - man + woman" query against every model.
# Each result is a list of (word, score) pairs; only the top word is printed.
analogy_query = {'positive': ['king', 'woman'], 'negative': ['man'], 'topn': 1}

analogy = glove_model.most_similar(**analogy_query)
print("GloVe: king - man + woman ≈", analogy[0][0])

analogy_w2v = word2vec_model.most_similar(**analogy_query)
print("Word2Vec: king - man + woman ≈", analogy_w2v[0][0])

analogy_ft = fasttext_model.most_similar(**analogy_query)
print("FastText: king - man + woman ≈", analogy_ft[0][0])

# OOV handling
oov_word = 'unbelievablenes'  # Made-up word, likely OOV for all models loaded as KeyedVectors

# Part 1: the pretrained vectors from gensim.downloader are plain KeyedVectors
# lookup tables, so most_similar raises KeyError for a word outside the stored
# vocabulary. This applies to the FastText vectors too: they are loaded
# without the subword information FastText normally relies on.
# The same topn is used for every model so any successful lookups are comparable.
oov_checks = (
    ("GloVe", glove_model, "GloVe: OOV error"),
    ("Word2Vec", word2vec_model, "Word2Vec: OOV error"),
    (
        "FastText",
        fasttext_model,
        "FastText: OOV error (KeyedVectors does not handle truly OOV words in most_similar)",
    ),
)
for label, vectors, failure_message in oov_checks:
    # Keep the try body to the one call that can raise.
    try:
        neighbours = vectors.most_similar(oov_word, topn=5)
    except KeyError:
        print(failure_message)
    else:
        print(f"{label} OOV similarity:", neighbours)

# Part 2: FastText's real OOV advantage comes from character n-grams, which a
# full FastText model keeps. Train a tiny one purely to demonstrate that it can
# build a vector for the unseen word from the n-grams it shares with known
# words. The neighbours are only as good as this toy corpus.
from gensim.models import FastText

toy_corpus = [
    ['an', 'unbelievable', 'story'],
    ['a', 'believable', 'excuse'],
    ['she', 'could', 'not', 'believe', 'it'],
    ['his', 'disbelief', 'was', 'obvious'],
    ['the', 'computer', 'runs', 'the', 'program'],
    ['the', 'phone', 'runs', 'an', 'app'],
]
toy_fasttext = FastText(
    sentences=toy_corpus,
    vector_size=32,
    window=3,
    min_count=1,      # keep every word; the corpus is tiny
    epochs=50,
    bucket=10_000,    # small n-gram hash table instead of gensim's default of 2,000,000
    seed=42,
)
print(
    f"'{oov_word}' in toy FastText vocabulary:",
    oov_word in toy_fasttext.wv.key_to_index,
)
print(
    "Toy FastText (subword) OOV similarity:",
    toy_fasttext.wv.most_similar(oov_word, topn=3),
)

# Visualization (t-SNE on sample words)
words = ['king', 'queen', 'man', 'woman', 'computer', 'phone', 'apple', 'banana']
embeddings = np.array([glove_model[w] for w in words])

# scikit-learn requires perplexity < n_samples. The default of 30 is rejected
# for this 8-word sample, so use a small value capped below the sample count.
tsne = TSNE(
    n_components=2,
    perplexity=min(5, len(words) - 1),
    random_state=42,
)
embeddings_2d = tsne.fit_transform(embeddings)

plt.figure(figsize=(8, 6))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1])
# Use a dedicated loop name so the earlier `word` query variable is not clobbered.
for label, (x_pos, y_pos) in zip(words, embeddings_2d):
    plt.annotate(label, (x_pos, y_pos))
plt.title('t-SNE Visualization of GloVe Embeddings')
plt.show()

# PCA alternative: a linear, deterministic 2D projection of the same embeddings.
pca = PCA(n_components=2)
embeddings_pca = pca.fit_transform(embeddings)

# Share of the total variance captured by each plotted component. It shows how
# much of the original structure the 2D picture retains.
pc1_share, pc2_share = pca.explained_variance_ratio_

plt.figure(figsize=(8, 6))
plt.scatter(embeddings_pca[:, 0], embeddings_pca[:, 1])
# Use a dedicated loop name so the earlier `word` query variable is not clobbered.
for label, (x_pos, y_pos) in zip(words, embeddings_pca):
    plt.annotate(label, (x_pos, y_pos))
plt.xlabel(f"PC1 ({pc1_share:.0%} of variance)")
plt.ylabel(f"PC2 ({pc2_share:.0%} of variance)")
plt.title('PCA Visualization of GloVe Embeddings')
plt.show()

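# Comparison: FastText handles OOV words better only when the full model is used,
# because it builds vectors from subword n-grams; the plain KeyedVectors loaded
# above raise KeyError for unseen words in all three cases. Word2Vec is often
# more accurate for analogies, and GloVe balances context.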