# Word Embeddings Exploration

This notebook demonstrates various word embedding techniques and their applications using popular libraries like Gensim and spaCy.

In [None]:
# Import required libraries
import numpy as np
import gensim.downloader as api
from gensim.models import Word2Vec
import spacy
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd

# Download required NLTK data
import nltk
for resource in ('punkt', 'stopwords'):
    # One nltk.download call per resource name, in the order listed.
    nltk.download(resource)

## 1. Loading Pre-trained Word Embeddings

In [None]:
# Identifiers of the pre-trained resources loaded below
GLOVE_MODEL_NAME = 'glove-wiki-gigaword-100'
SPACY_MODEL_NAME = 'en_core_web_md'

# GloVe vectors, loaded through the gensim downloader API
glove_model = api.load(GLOVE_MODEL_NAME)

# spaCy pipeline
nlp = spacy.load(SPACY_MODEL_NAME)

## 2. Basic Word Vector Operations

In [None]:
# Look up the embedding of one query word and report its shape
word = 'king'
vector = glove_model[word]
print(f"Vector shape for '{word}': {vector.shape}")

# Ask the model for the five nearest neighbours of the query word
similar_words = glove_model.most_similar(word, topn=5)
print(f"\nMost similar words to '{word}':")
for neighbour in similar_words:
    # Each entry unpacks into the neighbouring word and its similarity value
    similar_word, similarity = neighbour
    print(f"{similar_word}: {similarity:.4f}")

## 3. Word Analogies

In [None]:
# King - Man + Woman = ?
result = glove_model.most_similar(positive=['king', 'woman'], negative=['man'], topn=1)
print(f"King - Man + Woman = {result[0][0]}")

# More analogies: each entry is (positive words, negative words), passed
# straight through as the `positive` / `negative` arguments of most_similar.
analogies = [
    (['king', 'woman'], ['man']),
    (['paris', 'germany'], ['france']),
    (['walked', 'running'], ['walk'])
]

for pos, neg in analogies:
    result = glove_model.most_similar(positive=pos, negative=neg, topn=1)

    # Build the printed expression so that every negative word is shown as
    # subtracted. Joining negatives with ' + ' would print "a + b - c + d"
    # for negatives [c, d], and an empty negative list would leave a
    # dangling " - ".
    expression = ' + '.join(pos)
    if neg:
        expression += ' - ' + ' - '.join(neg)
    print(f"{expression} = {result[0][0]}")

## 4. Visualizing Word Embeddings

In [None]:
def plot_word_embeddings(words, model, title, perplexity=None):
    """Project word vectors to 2-D with t-SNE and draw a labelled scatter plot.

    Args:
        words: Words to plot. Each one is looked up as ``model[word]``; a
            failed lookup propagates to the caller.
        model: Object that returns a vector for ``model[word]``.
        title: Title placed on the figure.
        perplexity: t-SNE perplexity. When ``None`` (the default) it is
            ``min(30, len(words) - 1)``, so short word lists stay below
            scikit-learn's ``perplexity < n_samples`` requirement instead of
            failing with the library default of 30.

    Raises:
        ValueError: If fewer than two words are given.
    """
    words = list(words)
    if len(words) < 2:
        raise ValueError("plot_word_embeddings needs at least two words")

    # Stack the per-word vectors into a single 2-D array (one row per word)
    vectors = np.asarray([model[word] for word in words])

    if perplexity is None:
        perplexity = min(30.0, len(words) - 1)

    # Reduce dimensionality using t-SNE (fixed random_state for repeatable layouts)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    vectors_2d = tsne.fit_transform(vectors)

    # Create plot
    plt.figure(figsize=(10, 8))
    plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1], c='blue')

    # Add word labels at each projected point
    for i, word in enumerate(words):
        plt.annotate(word, xy=(vectors_2d[i, 0], vectors_2d[i, 1]))

    plt.title(title)
    plt.show()

# Words shown in the example plot
words = [
    'king', 'queen',
    'man', 'woman',
    'boy', 'girl',
    'prince', 'princess',
]
plot_word_embeddings(words, model=glove_model, title='Word Embeddings Visualization')

## 5. Training Custom Word2Vec Model

In [None]:
# Sample text for training: each sentence is a list of already-tokenised words
sentences = [
    ['the', 'king', 'ruled', 'the', 'kingdom', 'wisely'],
    ['the', 'queen', 'governed', 'her', 'subjects', 'fairly'],
    ['the', 'prince', 'learned', 'to', 'rule', 'the', 'land'],
    ['the', 'princess', 'studied', 'the', 'art', 'of', 'governance']
]

# Train Word2Vec model.
# The seed is set explicitly and training uses a single worker thread so that
# re-running the cell gives the same vectors; with several workers the order
# of updates depends on thread scheduling.
# NOTE(review): gensim's documentation also asks for a fixed PYTHONHASHSEED
# for reproducibility across interpreter launches — confirm if that matters here.
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=1, seed=42)

# Find similar words
print("Similar words to 'king':")
print(model.wv.most_similar('king'))

## 6. Comparing Different Embedding Models

In [None]:
def compare_embeddings(word, models, topn=5):
    """Collect the nearest-neighbour words of ``word`` from several models.

    Args:
        word: Query word.
        models: Dict mapping a display name to a model. A model exposing
            ``most_similar(word, topn=...)`` is queried through that method;
            any other model is treated as a spaCy pipeline: it is called on
            ``word`` and the resulting vector is searched in
            ``model.vocab.vectors``.
        topn: Maximum number of neighbours returned per model.

    Returns:
        Dict mapping each model name to a list of neighbour words. The spaCy
        branch may return fewer than ``topn`` words because case variants of
        the query word are filtered out.

    Raises:
        KeyError: If a spaCy-style model produces an all-zero vector for
            ``word`` (no usable embedding to search with).
    """
    results = {}
    for name, model in models.items():
        if hasattr(model, 'most_similar'):
            results[name] = [w for w, _ in model.most_similar(word, topn=topn)]
            continue

        # spaCy Doc objects have no similar_by_vector(); the neighbour search
        # lives on the vector table, so query that with the Doc's vector.
        query = np.asarray(model(word).vector)
        if not query.any():
            raise KeyError(f"'{word}' has no vector in model '{name}'")

        # Request one extra hit: the query word itself is expected among the
        # results and is removed below.
        keys, _, _ = model.vocab.vectors.most_similar(query.reshape(1, -1), n=topn + 1)

        # keys has one row per query vector; map each hash key back to text
        neighbours = [model.vocab.strings[int(key)] for key in keys[0]]
        results[name] = [w for w in neighbours if w.lower() != word.lower()][:topn]
    return results

# Models to compare, keyed by the label used in the printout
models = {'glove': glove_model, 'spacy': nlp}

word = 'king'
comparison = compare_embeddings(word, models)

print(f"Similar words to '{word}' in different models:")
for model_name, similar_words in comparison.items():
    # A single call emits the blank line, the upper-cased label, and the word
    # list on the following line.
    print(f"\n{model_name.upper()}:", similar_words, sep="\n")

## 7. Word Embeddings Applications

In [None]:
# Document similarity using word embeddings
def _document_vector(doc, model):
    """Return the mean vector of the tokens of ``doc`` found in ``model``, or None if none are found."""
    vectors = []
    for token in doc.split():
        if token in model:
            vectors.append(model[token])
        elif token.lower() in model:
            # Fall back to the lowercase form so capitalised words such as a
            # sentence-initial "The" are not dropped by a lowercase vocabulary.
            vectors.append(model[token.lower()])
    if not vectors:
        return None
    return np.mean(vectors, axis=0)


def document_similarity(doc1, doc2, model):
    """Cosine similarity between the averaged word vectors of two documents.

    Each document is split on whitespace; tokens missing from ``model`` (in
    both their original and lowercase form) are ignored.

    Args:
        doc1: First document as a string.
        doc2: Second document as a string.
        model: Object supporting ``word in model`` and ``model[word]``.

    Returns:
        The cosine similarity as a float. Returns 0.0 when either document
        has no token known to ``model`` or when either averaged vector has
        zero norm, instead of producing NaN.
    """
    # Get document vectors
    vec1 = _document_vector(doc1, model)
    vec2 = _document_vector(doc2, model)
    if vec1 is None or vec2 is None:
        return 0.0

    # Calculate cosine similarity, guarding against a zero denominator
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return float(np.dot(vec1, vec2) / denominator)

# Two short example documents to compare
doc1, doc2 = (
    "The king ruled the kingdom wisely and fairly",
    "The queen governed her subjects with wisdom and justice",
)

# Score the pair with the GloVe vectors and report to four decimals
similarity = document_similarity(doc1, doc2, model=glove_model)
print(f"Document similarity: {similarity:.4f}")