In [None]:
# Import all required dependencies
import numpy as np
import re
import logging
from collections import defaultdict, Counter
from typing import List, Tuple, Dict, Set, Optional
import pickle
import random
import json
from datetime import datetime
import matplotlib.pyplot as plt

# Configure logging: INFO level, timestamped "time - LEVEL - message" lines.
# NOTE(review): the cells visible here report progress with print(), not
# `logger`; the logger is presumably for later cells - confirm.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set random seeds for reproducibility.
# Both generators are seeded because the trainer draws from each of them:
# NumPy's global RNG (weight initialisation, negative sampling) and the
# stdlib `random` module (shuffling the training pairs every epoch).
np.random.seed(42)
random.seed(42)

# Sanity banner so the reader can see the cell ran and which NumPy is in use.
print("✓ All dependencies imported successfully")
print(f"NumPy version: {np.__version__}")

# Set up matplotlib for inline plotting
%matplotlib inline


In [None]:
# Complete WordEmbeddingTrainer implementation
class WordEmbeddingTrainer:
    """
    Skip-Gram word embedding trainer with negative sampling.

    Key insight: Each word gets its own embedding vector.

    Workflow: ``train(texts)`` builds the vocabulary, generates
    (target, context) index pairs, initialises two
    ``vocab_size x embedding_dim`` matrices and runs one SGD step per pair.
    Rows of ``W1`` are the vectors returned by ``get_word_vector``.
    """

    # Removes every character that is not a word character, whitespace or one
    # of . , ! ? ; :  -- the kept punctuation stays attached to its token.
    _CLEAN_PATTERN = re.compile(r'[^\w\s\.\,\!\?\;\:]')

    def __init__(self, embedding_dim=100, window_size=3, negative_samples=5,
                 learning_rate=0.01, min_count=2, epochs=20):
        """
        Store hyper-parameters; no training happens here.

        Args:
            embedding_dim: Length of every word vector.
            window_size: Number of neighbours taken on each side of a target.
            negative_samples: Negative words drawn per positive pair.
            learning_rate: Initial SGD step size (decayed by 0.95 per epoch
                while ``train`` runs, restored when it finishes).
            min_count: Minimum corpus frequency for a word to enter the
                vocabulary.
            epochs: Number of passes over the training pairs.
        """
        self.embedding_dim = embedding_dim
        self.window_size = window_size
        self.negative_samples = negative_samples
        self.learning_rate = learning_rate
        self.min_count = min_count
        self.epochs = epochs

        # Vocabulary
        self.word_to_idx = {}
        self.idx_to_word = {}
        self.word_counts = Counter()
        self.vocab_size = 0

        # Embedding matrices
        self.W1 = None  # Input embeddings
        self.W2 = None  # Output embeddings
        self.training_pairs = []

        # Cached negative-sampling distribution; rebuilt lazily whenever the
        # vocabulary changes.
        self._neg_probs = None

    def preprocess_text(self, text: str) -> List[str]:
        """
        Clean and tokenize text.

        Lower-cases and strips the text, removes characters other than word
        characters, whitespace and ``. , ! ? ; :``, splits on whitespace and
        discards tokens of length 1.
        """
        cleaned = self._CLEAN_PATTERN.sub('', text.lower().strip())
        return [token for token in cleaned.split() if len(token) > 1]

    def build_vocabulary(self, texts: List[str]) -> None:
        """
        Build vocabulary with frequency filtering.

        Token counts are added to ``self.word_counts`` (they accumulate over
        repeated calls); words with at least ``min_count`` occurrences get an
        index in first-seen order.
        """
        print("Building vocabulary...")
        for text in texts:
            self.word_counts.update(self.preprocess_text(text))

        kept_words = [word for word, count in self.word_counts.items()
                      if count >= self.min_count]

        self.word_to_idx = {word: idx for idx, word in enumerate(kept_words)}
        self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}
        self.vocab_size = len(self.word_to_idx)

        # Counts and indices changed, so any cached sampling distribution is
        # stale and must be recomputed on next use.
        self._neg_probs = None

        print(f"✓ Vocabulary: {self.vocab_size} words")

    def generate_training_data(self, texts: List[str]) -> None:
        """
        Generate Skip-Gram training pairs.

        Out-of-vocabulary tokens are removed before windowing, so the window
        spans the remaining in-vocabulary tokens. Every (target, context)
        index pair within ``window_size`` positions is stored in
        ``self.training_pairs``.
        """
        print("Generating training pairs...")
        self.training_pairs = []

        for text in texts:
            tokens = self.preprocess_text(text)
            indices = [self.word_to_idx[token] for token in tokens
                       if token in self.word_to_idx]

            for i, target_idx in enumerate(indices):
                start = max(0, i - self.window_size)
                end = min(len(indices), i + self.window_size + 1)

                for j in range(start, end):
                    if i != j:
                        self.training_pairs.append((target_idx, indices[j]))

        print(f"✓ Generated {len(self.training_pairs)} training pairs")

    def sigmoid(self, x):
        """
        Stable sigmoid function.

        Only ``exp(-|x|)`` is ever evaluated, which lies in (0, 1], so no
        overflow occurs for inputs of any magnitude. (``np.where`` evaluates
        both of its value arguments, so each must be safe on its own.)
        """
        x = np.asarray(x, dtype=float)
        decay = np.exp(-np.abs(x))
        return np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def negative_sampling(self, target_idx, positive_context):
        """
        Sample negative examples.

        Draws up to ``negative_samples`` word indices from the unigram
        distribution raised to the power 0.75, rejecting the target and the
        positive context. At most 50 draws are made, so fewer indices (or
        none) may be returned for very small vocabularies.
        """
        if self.negative_samples <= 0 or self.vocab_size == 0:
            return []

        # getattr: instances created without __init__ (e.g. restored from an
        # older pickle) may not have the attribute at all.
        if getattr(self, '_neg_probs', None) is None:
            word_freqs = np.array(
                [self.word_counts[self.idx_to_word[i]] for i in range(self.vocab_size)],
                dtype=float,
            )
            word_freqs = np.power(word_freqs, 0.75)
            self._neg_probs = word_freqs / np.sum(word_freqs)

        negative_samples = []
        attempts = 0
        while len(negative_samples) < self.negative_samples and attempts < 50:
            candidate = np.random.choice(self.vocab_size, p=self._neg_probs)
            if candidate != target_idx and candidate != positive_context:
                negative_samples.append(candidate)
            attempts += 1

        return negative_samples

    def train_step(self, target_idx, context_idx):
        """
        One training step with negative sampling.

        Minimises ``-log sigmoid(t.c) - sum_n log sigmoid(-t.n)`` for target
        vector ``t``, context vector ``c`` and negative vectors ``n`` with a
        single gradient-descent update, and returns that loss (evaluated
        before the update).
        """
        lr = self.learning_rate
        # Snapshot of the target vector: all gradients of this step are taken
        # at the same point.
        target_embedding = self.W1[target_idx].copy()
        context_embedding = self.W2[context_idx]

        # Positive sample: d(loss)/d(score) = sigmoid(score) - 1  (<= 0)
        positive_prob = self.sigmoid(np.dot(target_embedding, context_embedding))
        positive_loss = -np.log(positive_prob + 1e-10)

        positive_error = positive_prob - 1
        context_grad = positive_error * target_embedding
        target_grad = positive_error * context_embedding

        # Negative samples: d(loss)/d(score) = sigmoid(score)  (>= 0)
        negative_loss = 0
        for neg_idx in self.negative_sampling(target_idx, context_idx):
            negative_embedding = self.W2[neg_idx]
            negative_score = np.dot(target_embedding, negative_embedding)
            negative_prob = self.sigmoid(-negative_score)
            negative_loss += -np.log(negative_prob + 1e-10)

            negative_error = 1 - negative_prob
            # Accumulate the target gradient BEFORE touching the negative row
            # so it is based on the pre-update weights.
            target_grad += negative_error * negative_embedding
            self.W2[neg_idx] -= lr * negative_error * target_embedding

        # Gradient descent: step against the gradient.
        self.W2[context_idx] -= lr * context_grad
        self.W1[target_idx] -= lr * target_grad

        return positive_loss + negative_loss

    def train(self, texts: List[str]) -> Dict:
        """
        Train the model.

        Every call starts from scratch: word counts, vocabulary, training
        pairs and both embedding matrices are rebuilt. ``learning_rate`` is
        multiplied by 0.95 after each epoch and reset to its initial value
        when training ends, so repeated calls behave identically.

        Args:
            texts: Iterable of raw sentences/documents.

        Returns:
            ``{'epoch_losses': [mean loss per epoch], 'vocab_size': int}``

        Raises:
            ValueError: If no training pair can be generated from ``texts``.
        """
        # The corpus is traversed twice; materialise it so generators work.
        texts = list(texts)

        # Fresh counts, otherwise a second call would double every frequency.
        self.word_counts = Counter()
        self.build_vocabulary(texts)
        self.generate_training_data(texts)

        if not self.training_pairs:
            raise ValueError(
                "No training pairs were generated; check the corpus, "
                "min_count and window_size."
            )

        # Initialize embeddings (Xavier initialization)
        std = np.sqrt(2.0 / (self.vocab_size + self.embedding_dim))
        self.W1 = np.random.normal(0, std, (self.vocab_size, self.embedding_dim))
        self.W2 = np.random.normal(0, std, (self.vocab_size, self.embedding_dim))

        print(f"Training for {self.epochs} epochs...")
        epoch_losses = []
        initial_learning_rate = self.learning_rate

        try:
            for epoch in range(self.epochs):
                epoch_loss = 0
                random.shuffle(self.training_pairs)

                for i, (target_idx, context_idx) in enumerate(self.training_pairs):
                    epoch_loss += self.train_step(target_idx, context_idx)

                    if i % 2000 == 0 and i > 0:
                        print(f"Epoch {epoch+1}/{self.epochs}, Step {i}, Avg Loss: {epoch_loss/(i+1):.4f}")

                avg_loss = epoch_loss / len(self.training_pairs)
                epoch_losses.append(avg_loss)
                self.learning_rate *= 0.95  # per-epoch decay, read by train_step
                print(f"✓ Epoch {epoch+1} completed, Loss: {avg_loss:.4f}")
        finally:
            # Do not leave the decayed rate behind as the configured one.
            self.learning_rate = initial_learning_rate

        return {'epoch_losses': epoch_losses, 'vocab_size': self.vocab_size}

    def get_word_vector(self, word: str) -> Optional[np.ndarray]:
        """Get embedding for a word; None if it is unknown or the model is untrained."""
        idx = self.word_to_idx.get(word)
        if idx is None or self.W1 is None:
            return None
        return self.W1[idx]

    def cosine_similarity(self, word1: str, word2: str) -> float:
        """
        Calculate cosine similarity between two words.

        Returns 0.0 when either word has no vector or a vector has zero norm.
        """
        vec1, vec2 = self.get_word_vector(word1), self.get_word_vector(word2)
        if vec1 is None or vec2 is None:
            return 0.0

        norm1, norm2 = np.linalg.norm(vec1), np.linalg.norm(vec2)
        if norm1 > 0 and norm2 > 0:
            return np.dot(vec1, vec2) / (norm1 * norm2)
        return 0.0

    def find_similar_words(self, word: str, top_k: int = 3) -> List[Tuple[str, float]]:
        """
        Find most similar words.

        Returns up to ``top_k`` ``(word, cosine_similarity)`` tuples, best
        first, excluding ``word`` itself; an empty list if ``word`` has no
        vector.
        """
        if self.get_word_vector(word) is None:
            return []

        similarities = [(other, self.cosine_similarity(word, other))
                        for other in self.word_to_idx if other != word]
        return sorted(similarities, key=lambda pair: pair[1], reverse=True)[:top_k]

print("✓ WordEmbeddingTrainer class defined")


In [None]:
# Training corpus - a small, varied set of sentences for the model to learn from
training_corpus = [
    "The cat sat on the mat and looked around the room carefully",
    "A dog ran quickly through the park and played with children happily",
    "Machine learning algorithms process data efficiently and accurately every time",
    "Neural networks learn complex patterns from examples and training data sets",
    "Natural language processing enables computer understanding of human text documents",
    "Deep learning models require large amounts of training data to work well",
    "The quick brown fox jumps over the lazy dog in the field",
    "Artificial intelligence will transform many industries in the coming years",
    "Python programming language is popular for data science and machine learning",
    "Word embeddings capture semantic relationships between words and concepts effectively",
    "ChatGPT represents a breakthrough in conversational artificial intelligence systems",
    "Language models use attention mechanisms to understand context and meaning",
    "Transformers have revolutionized natural language processing applications worldwide",
    "Researchers continue advancing the field of artificial intelligence rapidly",
    "Data scientists use various tools and techniques for analysis and modeling",
    "Computer vision systems can recognize objects and faces in images",
    "Robotics combines mechanical engineering with artificial intelligence",
    "Cloud computing provides scalable infrastructure for machine learning",
    "Software engineers develop applications using modern programming languages",
    "Big data analytics helps companies make better business decisions"
]

print(f"📚 Training corpus: {len(training_corpus)} sentences")
print(f"📝 Sample: {training_corpus[0]}")

# Corpus statistics, gathered in one pass:
#   total_words  - whitespace-separated tokens, case left as written
#   all_words    - the same tokens lower-cased, in corpus order
#   unique_words - number of distinct lower-cased tokens
total_words = 0
all_words = []
for sentence in training_corpus:
    total_words += len(sentence.split())
    all_words += sentence.lower().split()
unique_words = len(set(all_words))

print(f"📊 Total words: {total_words}")
print(f"📊 Unique words: {unique_words}")


In [None]:
# Train the model
# Builds a trainer with explicit hyper-parameters and fits it on
# `training_corpus` (defined in the corpus cell above). `trainer` and
# `metrics` are left in the notebook namespace for the cells that follow.
trainer = WordEmbeddingTrainer(
    embedding_dim=100,  # 100-dimensional embeddings
    window_size=3,      # Context window of 3 words each side
    negative_samples=5, # 5 negative samples per positive
    learning_rate=0.01, # Initial SGD step size
    min_count=2,        # Words seen only once are left out of the vocabulary
    epochs=25           # 25 training epochs
)

print("🚀 Starting training...")
print("This will take a few moments...")

# Train the model
# train() returns a dict with 'epoch_losses' (mean loss per epoch) and
# 'vocab_size' (number of words that passed the min_count filter).
metrics = trainer.train(training_corpus)

# Summary banner: vocabulary size and the mean loss of the last epoch.
print("\n" + "="*50)
print("🎉 TRAINING COMPLETED!")
print("="*50)
print(f"✅ Final vocabulary size: {metrics['vocab_size']}")
print(f"✅ Final loss: {metrics['epoch_losses'][-1]:.4f}")


In [None]:
# THE CORE DEMONSTRATION: every in-vocabulary word maps to its own embedding
def demonstrate_individual_embeddings(trainer, sentence):
    """
    Print a per-word embedding report for ``sentence`` and return the vectors.

    The sentence is lower-cased and split on whitespace; each word is looked
    up with ``trainer.get_word_vector``. Words without a vector are reported
    as not in the vocabulary and skipped. If at least one vector was found,
    the mean of the found vectors is printed as a sentence embedding.

    Note: the split here is plain whitespace splitting, not the trainer's own
    ``preprocess_text``; tokens that the trainer would clean or drop are
    looked up verbatim.

    Args:
        trainer: Object exposing ``get_word_vector(word)`` that returns a
            1-D NumPy array or ``None``.
        sentence: Text to analyse.

    Returns:
        List of the vectors found, in sentence order (may be shorter than
        the number of words).
    """
    print("\n" + "="*70)
    print("🎯 CORE DEMONSTRATION: EACH WORD GETS ITS OWN EMBEDDING")
    print("="*70)

    words = sentence.lower().split()

    print(f"📝 Sentence: '{sentence}'")
    print(f"📊 Number of words: {len(words)}")
    print(f"📋 Words: {words}")
    print("\n" + "-"*70)

    embeddings = []
    for position, word in enumerate(words, start=1):
        embedding = trainer.get_word_vector(word)
        if embedding is None:
            print(f"Word {position:2d}: '{word:12s}' → [NOT IN VOCABULARY]")
            continue

        embeddings.append(embedding)
        # Slice instead of indexing [0], [1], [2]: vectors with fewer than
        # three dimensions are previewed in full instead of raising.
        preview = ", ".join(f"{value:.3f}" for value in embedding[:3])
        print(f"Word {position:2d}: '{word:12s}' → {embedding.shape} vector | First 3: [{preview}]")

    print("\n" + "-"*70)
    print(f"🏆 RESULT: {len(words)} words = {len(embeddings)} individual embeddings")
    print("✅ Each word maintains its unique vector representation!")

    # Show sentence-level combination
    if embeddings:
        sentence_embedding = np.mean(embeddings, axis=0)
        first_dims = ", ".join(f"{value:.3f}" for value in sentence_embedding[:5])
        print(f"\n📈 Sentence embedding (mean pooling): {sentence_embedding.shape}")
        print(f"📈 First 5 dims: [{first_dims}]")

        print("\n💡 Methods to combine word embeddings:")
        print("   1. ✅ Mean pooling (averaging) - what we just did")
        print("   2. Weighted averaging (TF-IDF, attention weights)")
        print("   3. Max pooling (element-wise maximum)")
        print("   4. LSTM/GRU encoders")
        print("   5. Transformer attention (ChatGPT approach)")

    return embeddings

# Test sentence: 19 whitespace-separated words. The count shown in the output
# is computed from the string below, never hard-coded.
test_sentence = "Machine learning algorithms process data efficiently and neural networks learn complex patterns from training examples in artificial intelligence systems"
word_count = len(test_sentence.split())
print(f"🎯 Testing with {word_count}-word sentence...")

# Returns one vector per word that has an embedding; words outside the
# trainer's vocabulary are skipped, so len(embeddings) can be < word_count.
embeddings = demonstrate_individual_embeddings(trainer, test_sentence)

print(f"\n🎉 CONCLUSION: {word_count} words = {len(embeddings)} individual embeddings")
# NOTE(review): the trainer in this notebook stores one static vector per
# whole word; nothing here demonstrates that ChatGPT works the same way -
# confirm or soften the claim printed below.
print("🔑 This is exactly how ChatGPT processes text!")
