# Exercise 4: Word Embeddings

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import os
import glob
from collections import Counter

In [2]:
from pathlib import Path

# Define base path to the dataset
BASE_DATA_PATH = Path("../exercise_01/aclImdb") 

TRAIN_PATH = BASE_DATA_PATH / "train"
TEST_PATH = BASE_DATA_PATH / "test"

TRAIN_POS_PATH = TRAIN_PATH / "pos"
TRAIN_NEG_PATH = TRAIN_PATH / "neg"
TEST_POS_PATH = TEST_PATH / "pos"
TEST_NEG_PATH = TEST_PATH / "neg"

assert TRAIN_POS_PATH.exists() and TRAIN_NEG_PATH.exists() and TEST_POS_PATH.exists() and TEST_NEG_PATH.exists()

In [3]:
import random

def load_imdb_data(
    data_path: Path, 
    subset_size: int | None = None, 
    random_seed: int = 42
) -> tuple[list[str], list[int]]:
    """
    Loads movie reviews and their sentiments from the specified path.
    If subset_size is provided, returns a random subset of that size (with fixed seed).
    """
    texts: list[str] = []
    labels: list[int] = [] 

    for sentiment, folder_path in [("pos", data_path / "pos"), ("neg", data_path / "neg")]:
        if not folder_path.exists():
            raise FileNotFoundError(f"Warning: Path {folder_path} does not exist.")

        binary_label = 1 if sentiment == "pos" else 0

        for file_path in folder_path.glob("*.txt"):
            texts.append(file_path.read_text(encoding='utf-8'))
            labels.append(binary_label)  # Use binary label, not score

    if subset_size is not None and subset_size < len(texts):
        random.seed(random_seed)
        indices = list(range(len(texts)))
        random.shuffle(indices)
        selected_indices = indices[:subset_size]
        texts = [texts[i] for i in selected_indices]
        labels = [labels[i] for i in selected_indices]

    return texts, labels

# Set the subset size for training data (e.g., 5000 for a smaller subset)
TRAIN_SUBSET_SIZE = 2000  # Change as needed

# Load training data (random subset)
train_texts_raw, train_labels = load_imdb_data(TRAIN_PATH, subset_size=TRAIN_SUBSET_SIZE, random_seed=42)
print(f"Loaded {len(train_texts_raw)} training reviews (subset).")
print(f"Training labels distribution: Positive (1): {sum(train_labels)}, Negative (0): {len(train_labels) - sum(train_labels)}")

# Load test data (full set)
test_texts_raw, test_labels = load_imdb_data(TEST_PATH)
print(f"Loaded {len(test_texts_raw)} test reviews.")
print(f"Test labels distribution: Positive (1): {sum(test_labels)}, Negative (0): {len(test_labels) - sum(test_labels)}")


Loaded 2000 training reviews (subset).
Training labels distribution: Positive (1): 1006, Negative (0): 994
Loaded 25000 test reviews.
Test labels distribution: Positive (1): 12500, Negative (0): 12500


In [4]:
train_texts_raw[:2], train_labels[:2]

(['This is one of those movies that you and a bunch of friends sit around drinking beers, eating pizza, and laugh at. Unfortunately for me I found myself watching this one alone. My friends and I rented a big block of movies and never got around to seeing this one. It was due back and I figured that it was a waste not to watch it. So I did, and I was impressed at how absolutely terrible this movie is.<br /><br />Now, I love bad movies quite a bit, and I probably would have liked this one if the "hero" wasn\'t so utterly loathsome. The entire movie I was hoping that he\'d put that stupid sword down and let someone kill him! He does very little heroic things in the movie. He\'s a beefy, disgusting, stupid thing. He has less redeeming qualities than the villains do. And what was it with all the naked chicks? I mean, I love naked chicks just as much as the next guy, but this movie went a tad overboard in that department.<br /><br />Well, anyway, if you love bad movies and can stand a disgu

In [5]:
import re
import string
from collections import Counter
from typing import List, Set


def pre_process(
    reviews: List[str],
    tokenize_punct: bool = False,
    lowercase: bool = True,
    remove_punct: bool = True,
    remove_high_freq_terms: bool = False,
    high_freq_threshold: float = 0.5,
    replace_numbers: bool = True
) -> List[List[str]]:
    """
    Pre-processes a list of text reviews by tokenizing, cleaning, and normalizing them.

    Args:
        reviews (List[str]): A list of raw review strings.
        tokenize_punct (bool): If True, punctuation marks are treated as separate tokens.
                               If False, punctuation is discarded during tokenization.
        lowercase (bool): If True, converts all text to lowercase.
        remove_punct (bool): If True, removes punctuation tokens.
                             Note: This is only effective if `tokenize_punct` is True.
        remove_high_freq_terms (bool): If True, removes words that appear in more than
                                       `high_freq_threshold` proportion of documents.
        high_freq_threshold (float): The document frequency threshold for removing terms.
        replace_numbers (bool): If True, replaces all sequences of digits with a 'num' token.

    Returns:
        List[List[str]]: A list of processed reviews, where each review is a list of tokens.
    """
    processed_reviews: List[List[str]] = []
    punct_set: Set[str] = set(string.punctuation)
    
    for review in reviews:
        text: str = review
        if lowercase:
            text = text.lower()
        if replace_numbers:
            # Use a simple token 'num' that is compatible with the tokenizer.
            text = re.sub(r'\d+', 'num', text)
        
        # Tokenize based on the specified flag
        if tokenize_punct:
            # Keeps words and punctuation as separate tokens
            tokens: List[str] = re.findall(r'\w+|[^\w\s]', text)
        else:
            # Keeps only word characters, effectively discarding punctuation
            tokens: List[str] = re.findall(r'\w+', text)
        
        # This step is only effective if tokenize_punct=True
        if remove_punct and tokenize_punct:
            tokens = [token for token in tokens if token not in punct_set]
        
        processed_reviews.append(tokens)
    
    if remove_high_freq_terms:
        doc_freq = Counter()
        # Calculate document frequency (word appears in how many docs)
        for tokens_list in processed_reviews:
            doc_freq.update(set(tokens_list))
            
        num_docs: int = len(processed_reviews)
        # Identify terms that are too frequent
        high_freq_terms = {term for term, freq in doc_freq.items() if (freq / num_docs) > high_freq_threshold}
        
        # Filter out the high-frequency terms from all reviews
        processed_reviews = [[token for token in tokens_list if token not in high_freq_terms] for tokens_list in processed_reviews]
    
    return processed_reviews

# ==============================================================================
# ==================== FINAL CORRECTED TEST SUITE ==============================
# ==============================================================================

# Define a controlled test corpus to check all behaviors
test_corpus = [
    "Hello common world! This is test 123.",
    "Another common sentence, with a unique word.",
    "Just a common sentence."
]

# --- Test 1: Default behavior ---
print("Testing default behavior...")
processed = pre_process(test_corpus)
expected_default = [
    ['hello', 'common', 'world', 'this', 'is', 'test', 'num'],
    ['another', 'common', 'sentence', 'with', 'a', 'unique', 'word'],
    ['just', 'a', 'common', 'sentence']
]
assert processed == expected_default, f"Default test failed. Got: {processed}"

# --- Test 2: `lowercase=False` ---
print("Testing lowercase=False...")
processed = pre_process(test_corpus, lowercase=False)
expected_no_lower = [
    ['Hello', 'common', 'world', 'This', 'is', 'test', 'num'],
    ['Another', 'common', 'sentence', 'with', 'a', 'unique', 'word'],
    ['Just', 'a', 'common', 'sentence']
]
assert processed == expected_no_lower, f"lowercase=False test failed. Got: {processed}"

# --- Test 3: `replace_numbers=False` ---
print("Testing replace_numbers=False...")
processed = pre_process(test_corpus, replace_numbers=False)
expected_no_num_replace = [
    ['hello', 'common', 'world', 'this', 'is', 'test', '123'],
    ['another', 'common', 'sentence', 'with', 'a', 'unique', 'word'],
    ['just', 'a', 'common', 'sentence']
]
assert processed == expected_no_num_replace, f"replace_numbers=False test failed. Got: {processed}"

# --- Test 4: Punctuation Handling ---
print("Testing tokenize_punct=True, remove_punct=False...")
processed = pre_process(test_corpus, tokenize_punct=True, remove_punct=False)
expected_keep_punct = [
    ['hello', 'common', 'world', '!', 'this', 'is', 'test', 'num', '.'],
    ['another', 'common', 'sentence', ',', 'with', 'a', 'unique', 'word', '.'],
    ['just', 'a', 'common', 'sentence', '.']
]
assert processed == expected_keep_punct, f"Keep punctuation test failed. Got: {processed}"

print("Testing tokenize_punct=True, remove_punct=True...")
processed = pre_process(test_corpus, tokenize_punct=True, remove_punct=True)
assert processed == expected_default, f"Tokenize then remove punctuation test failed. Got: {processed}"

print("Testing tokenize_punct=False, remove_punct=False...")
processed = pre_process(test_corpus, tokenize_punct=False, remove_punct=False)
assert processed == expected_default, f"Discard punctuation test failed. Got: {processed}"

# --- Test 5: High-Frequency Term Removal ---
print("Testing high-frequency removal (threshold=0.7)...")
processed = pre_process(test_corpus, remove_high_freq_terms=True, high_freq_threshold=0.7)
expected_remove_common = [
    ['hello', 'world', 'this', 'is', 'test', 'num'],
    ['another', 'sentence', 'with', 'a', 'unique', 'word'],
    ['just', 'a', 'sentence']
]
assert processed == expected_remove_common, f"High-freq removal (0.7) test failed. Got: {processed}"

print("Testing high-frequency removal (threshold=0.6)...")
processed = pre_process(test_corpus, remove_high_freq_terms=True, high_freq_threshold=0.6)
# CORRECTED: The word 'a' should also be removed as its doc freq (0.667) > 0.6
expected_remove_common_and_sentence = [
    ['hello', 'world', 'this', 'is', 'test', 'num'],
    ['another', 'with', 'unique', 'word'],
    ['just']
]
assert processed == expected_remove_common_and_sentence, f"High-freq removal (0.6) test failed. Got: {processed}"

# --- Test 6: Edge Cases ---
print("Testing edge cases...")
assert pre_process([]) == [], "Edge case: Empty list failed."
assert pre_process([""]) == [[]], "Edge case: List with empty string failed."
assert pre_process(["!@#$"]) == [[]], "Edge case: Punctuation-only string (default) failed."
assert pre_process(["!@#$"], tokenize_punct=True, remove_punct=False) == [['!', '@', '#', '$']], "Edge case: Punctuation-only string (keep) failed."

print("\n✅ All tests for the recommended pre_process function passed successfully!")

Testing default behavior...
Testing lowercase=False...
Testing replace_numbers=False...
Testing tokenize_punct=True, remove_punct=False...
Testing tokenize_punct=True, remove_punct=True...
Testing tokenize_punct=False, remove_punct=False...
Testing high-frequency removal (threshold=0.7)...
Testing high-frequency removal (threshold=0.6)...
Testing edge cases...

✅ All tests for the recommended pre_process function passed successfully!


In [6]:
# reduce the corpus if you are facing performance issues
tokenized_corpus = pre_process(train_texts_raw)

In [7]:
tokenized_corpus[1]

['this',
 'documentary',
 'on',
 'schlockmeister',
 'william',
 'castle',
 'takes',
 'a',
 'few',
 'cheap',
 'shots',
 'at',
 'the',
 'naive',
 'nums',
 'nums',
 'environment',
 'in',
 'which',
 'he',
 'did',
 'his',
 'most',
 'characteristic',
 'work',
 'look',
 'at',
 'the',
 'funny',
 'silly',
 'people',
 'with',
 'the',
 'ghost',
 'glasses',
 'but',
 'it',
 's',
 'also',
 'affectionate',
 'and',
 'lively',
 'with',
 'particularly',
 'bright',
 'commentary',
 'from',
 'john',
 'waters',
 'who',
 'was',
 'absolutely',
 'the',
 'target',
 'audience',
 'for',
 'such',
 'things',
 'at',
 'the',
 'time',
 'and',
 'from',
 'castle',
 's',
 'daughter',
 'who',
 'adored',
 'her',
 'dad',
 'and',
 'also',
 'is',
 'pretty',
 'perceptive',
 'about',
 'how',
 'he',
 'plied',
 'his',
 'craft',
 'we',
 'never',
 'find',
 'out',
 'what',
 'became',
 'of',
 'the',
 'other',
 'castle',
 'offspring',
 'the',
 'movies',
 'were',
 'not',
 'very',
 'good',
 'it',
 'makes',
 'clear',
 'but',
 'his',
 'ma

## Task 1: CBOW

In [8]:
# Parameters (change these as wanted)
CONTEXT_SIZE = 2  # Window size on each side
EMBEDDING_DIM = 100
PAD_TOKEN = '<PAD>'

# Vocabulary
vocab = list(set(word for sentence in tokenized_corpus for word in sentence))

if PAD_TOKEN not in vocab:
    vocab.append(PAD_TOKEN)

word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for word, i in word_to_idx.items()}
vocab_size = len(vocab)


print('Vocab Size', vocab_size)
print('Context Size', CONTEXT_SIZE)
print('Embedding Dimension', EMBEDDING_DIM)

Vocab Size 24877
Context Size 2
Embedding Dimension 100


In [9]:
first_word = vocab[0]

assert all(idx_to_word[i] == word and word_to_idx[word] == i for i, word in enumerate(vocab))

idx_to_word[0], word_to_idx[first_word]

('titles', 0)

In [10]:
# Assert that PAD_TOKEN is only present once
assert len([x for x in vocab if x == PAD_TOKEN]) == 1

print(["..."] + vocab[-3:])
pad_token_idx = word_to_idx[PAD_TOKEN]
print(f"PAD_TOKEN index: {pad_token_idx}")

['...', 'gilbert', 'yurek', '<PAD>']
PAD_TOKEN index: 24876


In [11]:
# Create the training data
cbow_data = []
for sentence in tokenized_corpus:
    # Skip very short sentences that don't have enough words for a context and a target
    if len(sentence) < 2 * CONTEXT_SIZE + 1:
        continue
    
    # Pad the sentence to handle words at the beginning and end
    padded_sentence = [PAD_TOKEN] * CONTEXT_SIZE + sentence + [PAD_TOKEN] * CONTEXT_SIZE
    
    # Slide a window across the padded sentence
    for i in range(CONTEXT_SIZE, len(padded_sentence) - CONTEXT_SIZE):
        # The word at the center of the window is the target
        target_word = padded_sentence[i]
        
        # The words around the center are the context
        # Get the words to the left
        context_left = padded_sentence[i - CONTEXT_SIZE : i]
        # Get the words to the right
        context_right = padded_sentence[i + 1 : i + CONTEXT_SIZE + 1]
        
        context_words = context_left + context_right
        
        # Append the (context, target) pair to our data
        cbow_data.append((context_words, target_word))

In [141]:
# Let's see what our data looks like
print(f"Created {len(cbow_data)} context-target pairs.")
print("First 5 training samples:")
for context, target in cbow_data[:5]:
    print(f"  Context: {context}, Target: '{target}'")

print()

print("Last 5 training samples:")
for context, target in cbow_data[-5:]:
    print(f"  Context: {context}, Target: '{target}'")

Created 466988 context-target pairs.
First 5 training samples:
  Context: ['<PAD>', '<PAD>', 'is', 'one'], Target: 'this'
  Context: ['<PAD>', 'this', 'one', 'of'], Target: 'is'
  Context: ['this', 'is', 'of', 'those'], Target: 'one'
  Context: ['is', 'one', 'those', 'movies'], Target: 'of'
  Context: ['one', 'of', 'movies', 'that'], Target: 'those'

Last 5 training samples:
  Context: ['ever', 'to', 'in', 'to'], Target: 'give'
  Context: ['to', 'give', 'to', 'being'], Target: 'in'
  Context: ['give', 'in', 'being', 'dead'], Target: 'to'
  Context: ['in', 'to', 'dead', '<PAD>'], Target: 'being'
  Context: ['to', 'being', '<PAD>', '<PAD>'], Target: 'dead'


In [142]:
# Convert the string-based data into integer-based tensors
cbow_train_data_tensors = []
for context, target in cbow_data:
    context_idxs = torch.tensor([word_to_idx[w] for w in context], dtype=torch.long)
    target_idx = torch.tensor([word_to_idx[target]], dtype=torch.long)
    cbow_train_data_tensors.append((context_idxs, target_idx))


In [143]:
# Check the first sample as a tensor
print("First training sample as tensors:")
context_tensor, target_tensor = cbow_train_data_tensors[0]
print(f"  Context Tensor: {context_tensor} (Shape: {context_tensor.shape})")
print(f"  Context Tensor: {[idx_to_word[i.item()] for i in context_tensor]}")
print(f"  Target Tensor: {target_tensor} (Shape: {target_tensor.shape})")
print(f"  Target Tensor: {idx_to_word[target_tensor[0].item()]}")

First training sample as tensors:
  Context Tensor: tensor([24876, 24876,   855, 20975]) (Shape: torch.Size([4]))
  Context Tensor: ['<PAD>', '<PAD>', 'is', 'one']
  Target Tensor: tensor([6361]) (Shape: torch.Size([1]))
  Target Tensor: this


In [144]:
# Define CBOW model
class CBOW(nn.Module):
    def __init__(self, vocab_size:int, embedding_dim:int):
        super(CBOW, self).__init__()

        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

        
    def forward(self, context_idxs):
        embeddings = self.embedding(context_idxs)
        summed_embeddings = torch.sum(embeddings, dim=1)
        predictions = self.linear(summed_embeddings)
        # should we add a Relu ? 
        return predictions

In [145]:
from torch.utils.data import Dataset, DataLoader

class CBOWDataset(Dataset):
    """Custom Dataset for CBOW training data."""
    def __init__(self, data_tensors):
        """
        Args:
            data_tensors (list of tuples): A list where each element is a 
                                           tuple (context_tensor, target_tensor).
        """
        self.data = data_tensors

    def __len__(self):
        """Returns the total number of samples in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Returns one sample from the dataset at the given index.
        
        Args:
            idx (int): The index of the sample to retrieve.
        """
        # The DataLoader will call this method to get each sample.
        return self.data[idx]

In [146]:
BATCH_SIZE = 128 

cbow_dataset = CBOWDataset(cbow_train_data_tensors)

cbow_dataloader = DataLoader(
    dataset=cbow_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

context_batch, target_batch = next(iter(cbow_dataloader))

print("--- Sample Batch ---")
print(f"Context Batch Shape: {context_batch.shape} ({[idx_to_word[i.item()] for i in context_batch[0]]})")
print(f"Target Batch Shape: {target_batch.shape} ({idx_to_word[target_batch[0].item()]})")




--- Sample Batch ---
Context Batch Shape: torch.Size([128, 4]) (['of', 'goofy', 'scenes', 'sounds'])
Target Batch Shape: torch.Size([128, 1]) (death)


In [147]:
from torch.optim import Adam

model = CBOW(vocab_size, EMBEDDING_DIM)
optimizer = Adam(model.parameters(), lr=0.001)
loss_function = nn.CrossEntropyLoss()

# Training loop
epochs = 10  # Use a small number for a quick test, increase for better results
model.train()  # Set the model to training mode

cbow_dataloader = DataLoader(
    dataset=cbow_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

import time

PRINT_MODULO = 50

for epoch in range(epochs):
    start_time = time.time()
    total_loss = 0
    batch_losses = []
    correct = 0
    total = 0
    # The DataLoader provides batches of data automatically
    for batch_idx, (context_batch, target_batch) in enumerate(cbow_dataloader):
        # The context_batch is already the correct shape [BATCH_SIZE, 2 * CONTEXT_SIZE]

        model.zero_grad()

        log_probs = model(context_batch)

        # CrossEntropyLoss expects the target to be a 1D tensor of shape [BATCH_SIZE].
        # Our target_batch is [BATCH_SIZE, 1], so we use .squeeze() to remove the extra dimension.
        targets = target_batch.squeeze()
        loss = loss_function(log_probs, targets)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        batch_losses.append(loss.item())

        # Compute accuracy
        predicted = log_probs.argmax(dim=1)
        correct += (predicted == targets).sum().item()
        total += targets.size(0)

        if (batch_idx + 1) % PRINT_MODULO == 0 or (batch_idx + 1) == len(cbow_dataloader):
            batch_acc = (predicted == targets).float().mean().item()
            print(
                f"Epoch {epoch+1} | Batch {batch_idx+1}/{len(cbow_dataloader)} | "
                f"Batch Loss: {loss.item():.4f} | Batch Acc: {batch_acc:.4f}"
            )

    # Calculate average loss and accuracy over all batches
    avg_loss = total_loss / len(cbow_dataloader)
    epoch_acc = correct / total if total > 0 else 0.0
    epoch_time = time.time() - start_time
    print(f"Epoch {epoch+1} completed in {epoch_time:.2f}s | Average Loss: {avg_loss:.4f} | Accuracy: {epoch_acc:.4f}")
    print(f"  Min batch loss: {min(batch_losses):.4f} | Max batch loss: {max(batch_losses):.4f}\n")

Epoch 1 | Batch 50/3649 | Batch Loss: 9.7170 | Batch Acc: 0.0078
Epoch 1 | Batch 100/3649 | Batch Loss: 8.8499 | Batch Acc: 0.0312
Epoch 1 | Batch 150/3649 | Batch Loss: 8.3469 | Batch Acc: 0.0781
Epoch 1 | Batch 200/3649 | Batch Loss: 8.1420 | Batch Acc: 0.0469
Epoch 1 | Batch 250/3649 | Batch Loss: 8.6080 | Batch Acc: 0.0156
Epoch 1 | Batch 300/3649 | Batch Loss: 7.5379 | Batch Acc: 0.0547
Epoch 1 | Batch 350/3649 | Batch Loss: 8.0225 | Batch Acc: 0.0391
Epoch 1 | Batch 400/3649 | Batch Loss: 7.7044 | Batch Acc: 0.0312
Epoch 1 | Batch 450/3649 | Batch Loss: 7.2661 | Batch Acc: 0.0781
Epoch 1 | Batch 500/3649 | Batch Loss: 7.5384 | Batch Acc: 0.0781
Epoch 1 | Batch 550/3649 | Batch Loss: 7.9299 | Batch Acc: 0.0547
Epoch 1 | Batch 600/3649 | Batch Loss: 7.1670 | Batch Acc: 0.1016
Epoch 1 | Batch 650/3649 | Batch Loss: 7.0220 | Batch Acc: 0.0938
Epoch 1 | Batch 700/3649 | Batch Loss: 7.3752 | Batch Acc: 0.1250
Epoch 1 | Batch 750/3649 | Batch Loss: 6.6843 | Batch Acc: 0.1094
Epoch 1 | B

In [150]:
def evaluate_cbow(model, context_words):
    # Set the model to evaluation mode
    model.eval()
    
    with torch.no_grad():
        # Convert context words to a tensor of indices
        try:
            context_idxs = torch.tensor([word_to_idx[w] for w in context_words], dtype=torch.long)
        except KeyError as e:
            print(f"Error: One of the context words is not in the vocabulary: {e}")
            return

        # =================================================================
        # THE FIX: Add a batch dimension before passing to the model
        # =================================================================
        context_idxs = context_idxs.unsqueeze(0)  # Shape changes from [4] to [1, 4]
        # =================================================================
        
        # Now the model receives the correct input shape
        output = model(context_idxs)
        
        # The rest of the function is correct
        probs = torch.softmax(output, dim=1)
        top_prob, top_idx = torch.topk(probs, 5)  # top 5 predictions

        print(f"Context: {context_words}")
        print("Top predictions for the center word:")
        for prob, idx in zip(top_prob[0], top_idx[0]):
            print(f"  {idx_to_word[idx.item()]}: {prob.item():.4f}")

# Now, when you call it, it will work
context_example = ['i', 'didn', 'know', 'this']
evaluate_cbow(model, context_example)

print("-" * 30)

context_example_2 = ['this', 'movie', 'is', 'good']
evaluate_cbow(model, context_example_2)

Context: ['i', 'didn', 'know', 'this']
Top predictions for the center word:
  t: 0.9621
  why: 0.0035
  movie: 0.0031
  but: 0.0025
  that: 0.0023
------------------------------
Context: ['this', 'movie', 'is', 'good']
Top predictions for the center word:
  a: 0.2687
  but: 0.0553
  very: 0.0316
  one: 0.0274
  not: 0.0270


In [151]:
movie_embedding = model.embedding.weight[word_to_idx["movie"]]
film_embedding = model.embedding.weight[word_to_idx["film"]]

cosine_similarity = torch.nn.functional.cosine_similarity(movie_embedding, film_embedding, dim=0)
print(f"Cosine similarity between 'movie' and 'film': {cosine_similarity.item():.4f}")

Cosine similarity between 'movie' and 'film': 0.5160


In [152]:
dog_embedding = model.embedding.weight[word_to_idx["dog"]]
chair_embedding = model.embedding.weight[word_to_idx["chair"]]

cosine_similarity = torch.nn.functional.cosine_similarity(dog_embedding, chair_embedding, dim=0)
print(f"Cosine similarity between 'dog' and 'chair': {cosine_similarity.item():.4f}")

Cosine similarity between 'dog' and 'chair': 0.0956


In [153]:
dog_embedding = model.embedding.weight[word_to_idx["dog"]]
cat_embedding = model.embedding.weight[word_to_idx["cat"]]

cosine_similarity = torch.nn.functional.cosine_similarity(dog_embedding, cat_embedding, dim=0)
print(f"Cosine similarity between 'dog' and 'cat': {cosine_similarity.item():.4f}")

Cosine similarity between 'dog' and 'cat': -0.0769


In [154]:
def find_most_similar(word, model, word_to_idx, idx_to_word, top_k=10):
    """
    Finds the top_k most similar words to a given word based on cosine similarity.
    
    Args:
        word (str): The word to find similar words for.
        model (nn.Module): The trained CBOW or Skip-gram model.
        word_to_idx (dict): Mapping from word to its index.
        idx_to_word (dict): Mapping from index to its word.
        top_k (int): The number of similar words to return.
    """
    if word not in word_to_idx:
        print(f"Error: '{word}' not in vocabulary.")
        return

    # Set the model to evaluation mode
    model.eval()

    with torch.no_grad():
        # 1. Get the full embedding matrix
        all_embeddings = model.embedding.weight

        # 2. Get the query vector for the input word
        query_idx = torch.tensor([word_to_idx[word]], dtype=torch.long)
        query_embedding = model.embedding(query_idx) # Shape: [1, EMBEDDING_DIM]

        # 3. Compute cosine similarity between the query vector and all other vectors
        # The query_embedding is [1, D] and all_embeddings is [V, D].
        # F.cosine_similarity will broadcast the query vector to compare against all V vectors.
        # dim=1 specifies that the similarity should be computed along the embedding dimension.
        cos_similarities = torch.nn.functional.cosine_similarity(
            query_embedding, 
            all_embeddings, 
            dim=1
        )
        # The result is a tensor of shape [VOCAB_SIZE]

        # 4. Find the top K most similar words
        # We ask for top_k + 1 because the most similar word will always be the word itself.
        top_results = torch.topk(cos_similarities, k=top_k + 1)
        
        top_indices = top_results.indices
        top_scores = top_results.values

        print(f"Most similar words to '{word}':")
        # 5. Print the results, skipping the first one (which is the word itself)
        for i in range(1, top_k + 1):
            idx = top_indices[i].item()
            score = top_scores[i].item()
            similar_word = idx_to_word[idx]
            print(f"  - {similar_word:<15} (Similarity: {score:.4f})")

# Find words similar to 'movie'
find_most_similar('movie', model, word_to_idx, idx_to_word, top_k=5)

print("\n" + "="*40 + "\n")

# Find words similar to 'dog'
find_most_similar('dog', model, word_to_idx, idx_to_word, top_k=5)

print("\n" + "="*40 + "\n")

# Find words similar to 'good'
find_most_similar('good', model, word_to_idx, idx_to_word, top_k=5)

Most similar words to 'movie':
  - film            (Similarity: 0.5160)
  - liver           (Similarity: 0.3955)
  - misfire         (Similarity: 0.3730)
  - cinematographic (Similarity: 0.3606)
  - adaption        (Similarity: 0.3528)


Most similar words to 'dog':
  - doulittle       (Similarity: 0.3887)
  - inked           (Similarity: 0.3841)
  - kelley          (Similarity: 0.3630)
  - histories       (Similarity: 0.3586)
  - stanza          (Similarity: 0.3533)


Most similar words to 'good':
  - decent          (Similarity: 0.4362)
  - great           (Similarity: 0.4184)
  - prepubescent    (Similarity: 0.4182)
  - earnestness     (Similarity: 0.4085)
  - novelty         (Similarity: 0.4051)


## Task 2: Skip-Gram

In [174]:
skip_gram_training : list[tuple[str, str]]= [] # list of (center_word -> word in context)

for sentence in tokenized_corpus:
    padded_sentence = [PAD_TOKEN] * CONTEXT_SIZE + sentence + [PAD_TOKEN] * CONTEXT_SIZE
    for i in range(len(padded_sentence)):
        if i < CONTEXT_SIZE or i >= len(sentence)+CONTEXT_SIZE:
            continue
       
        center_word = padded_sentence[i]

        for k in range(CONTEXT_SIZE):
            skip_gram_training.append((center_word, padded_sentence[i-(k+1)]))
        for k in range(CONTEXT_SIZE):
            skip_gram_training.append((center_word, padded_sentence[i+(k+1)]))



# show first 5 samples

print(f"first sentence: {tokenized_corpus[0][:10]}")
print(f"First 4 samples: {skip_gram_training[:4]}")
print()
print(f"Last sentence: {tokenized_corpus[-1][-10:]}")
print(f"Last 4 samples: {skip_gram_training[-4:]}")

first sentence: ['this', 'is', 'one', 'of', 'those', 'movies', 'that', 'you', 'and', 'a']
First 4 samples: [('this', '<PAD>'), ('this', '<PAD>'), ('this', 'is'), ('this', 'one')]

Last sentence: ['or', 'native', 'inability', 'ever', 'to', 'give', 'in', 'to', 'being', 'dead']
Last 4 samples: [('dead', 'being'), ('dead', 'to'), ('dead', '<PAD>'), ('dead', '<PAD>')]


In [203]:
from torch.utils.data import Dataset, DataLoader

class SkipGramDataset(Dataset):
    """Custom Dataset for Skip-Gram training data."""
    def __init__(self, data_tensors: list[tuple[str, str]], word_to_idx: dict[str, int]):
        """
        Args:
            data_tensors (list of tuples): A list where each element is a tuple (center_word, word in context).
        """
        self.data = data_tensors
        self.word_to_idx = word_to_idx

    def __len__(self):
        """Returns the total number of samples in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Returns one sample from the dataset at the given index.
        
        Args:
            idx (int): The index of the sample to retrieve.
        """
        center_word, context_word = self.data[idx]

        return torch.tensor([word_to_idx[center_word]]), torch.tensor([word_to_idx[context_word]])


skip_gram_dataset = SkipGramDataset(skip_gram_training, word_to_idx=word_to_idx)

print(f"First sample: {skip_gram_dataset[0]}")
print(f"Last sample: {skip_gram_dataset[-1]}")

First sample: (tensor([6361]), tensor([24876]))
Last sample: (tensor([15946]), tensor([24876]))


In [226]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size:int, embedding_dim:int):
        super().__init__()

        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, center_word_idx:torch.Tensor):
        center_word_emb = self.embeddings(center_word_idx)
        return self.linear(center_word_emb)


In [227]:
!uv pip install -q tensorboard

In [228]:
from torch.utils.tensorboard import SummaryWriter
import time

# Create a unique directory for this training run
timestamp = time.strftime("%Y%m%d-%H%M%S")
log_dir = f"runs/skipgram_{timestamp}"

# Instantiate the writer
writer = SummaryWriter(log_dir)
print(f"TensorBoard logs will be saved to: {log_dir}")

TensorBoard logs will be saved to: runs/skipgram_20250715-170035


In [229]:
from torch.optim import Adam


skip_gram_dataset = SkipGramDataset(skip_gram_training, word_to_idx=word_to_idx)
skip_gram_dataloader = DataLoader(
    dataset=skip_gram_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)


model = model = SkipGram(vocab_size, EMBEDDING_DIM)
optimizer = Adam(model.parameters(), lr=0.001)
loss_function = nn.CrossEntropyLoss()

# Training loop
epochs = 10  # Use a small number for a quick test, increase for better results
model.train()  # Set the model to training mode


global_step = 0
for epoch in range(epochs):
    start_time = time.time()
    total_loss = 0
    correct = 0
    total = 0
    
    for batch_idx, (center, context) in enumerate(skip_gram_dataloader):
        # Squeeze the tensors right after getting them from the dataloader
        center = center.squeeze(1)   # Shape: [BATCH_SIZE]
        context = context.squeeze(1) # Shape: [BATCH_SIZE]

        model.zero_grad()
        log_probs = model(center) # Model now takes [BATCH_SIZE] and returns [BATCH_SIZE, VOCAB_SIZE]
        loss = loss_function(log_probs, context)
        loss.backward()
        optimizer.step()

        # --- TensorBoard Logging ---
        # Log the loss for this specific batch
        writer.add_scalar('Loss/batch', loss.item(), global_step)
        
        # Log accuracy for this batch
        with torch.no_grad():
            predicted = log_probs.argmax(dim=1)
            batch_acc = (predicted == context).float().mean().item()
            writer.add_scalar('Accuracy/batch', batch_acc, global_step)
        
        global_step += 1
        # -------------------------

        # Accumulate for epoch-level stats
        total_loss += loss.item()
        correct += (predicted == context).sum().item()
        total += context.size(0)

    # --- Log epoch-level metrics ---
    avg_loss = total_loss / len(skip_gram_dataloader)
    epoch_acc = correct / total
    writer.add_scalar('Loss/epoch', avg_loss, epoch)
    writer.add_scalar('Accuracy/epoch', epoch_acc, epoch)
    # -----------------------------

    epoch_time = time.time() - start_time
    print(f"Epoch {epoch+1} completed in {epoch_time:.2f}s | Avg Loss: {avg_loss:.4f} | Accuracy: {epoch_acc:.4f}")

writer.close()

Epoch 1 completed in 267.32s | Avg Loss: 7.0563 | Accuracy: 0.0638
Epoch 2 completed in 317.91s | Avg Loss: 6.6207 | Accuracy: 0.0729
Epoch 3 completed in 304.20s | Avg Loss: 6.5146 | Accuracy: 0.0750


KeyboardInterrupt: 

In [230]:
print("Preparing embeddings for TensorBoard visualization...")

# 1. Get the trained embedding matrix from the model
final_embeddings = model.embeddings.weight.data

# 2. Get the corresponding labels for each vector (our vocabulary)
# We need a list of strings, so we'll create it from our idx_to_word mapping.
# Make sure the order is correct!
metadata = [idx_to_word[i] for i in range(vocab_size)]

# 3. Add the embedding data to the writer
# This function will save the data needed for the visualization.
writer.add_embedding(
    mat=final_embeddings,
    metadata=metadata,
    global_step=epochs  # Log it at the final epoch
)

# 4. Close the writer to make sure everything is saved
writer.close()

print("Embeddings saved. Launch TensorBoard to view.")
print(f"Command: tensorboard --logdir {writer.log_dir}")

Preparing embeddings for TensorBoard visualization...
Embeddings saved. Launch TensorBoard to view.
Command: tensorboard --logdir runs/skipgram_20250715-170035


In [231]:
def evaluate_skipgram(model, center_word):
    model.eval()
    with torch.no_grad():
        input_idx = torch.tensor([word_to_idx[center_word]], dtype=torch.long)  # (1,)
        output = model(input_idx)  # (1, context_size*2, vocab_size)
        
        # For each context position, get top predictions
        context_preds = output.squeeze(0)  # (context_size*2, vocab_size)
        
        print(f"Center word: '{center_word}'")
        print("Top predicted context words per context position:")
        
        for pos, preds in enumerate(context_preds):
            probs = torch.softmax(preds, dim=0)  # softmax over vocab dimension
            top_prob, top_idx = torch.topk(probs, 5)
            print(f" Context position {pos+1}:")
            for prob, idx in zip(top_prob, top_idx):
                print(f"   {idx_to_word[int(idx.item())]}: {prob.item():.4f}")
            print()

# Example usage
center_word_example = 'can'
evaluate_skipgram(model, center_word_example)

Center word: 'can'
Top predicted context words per context position:


RuntimeError: selected index k out of range

In [236]:
import torch
import torch.nn.functional as F

def get_embedding(word, model, word_to_idx):
    """Helper function to get the embedding vector for a single word."""
    if word not in word_to_idx:
        print(f"Error: '{word}' not in vocabulary.")
        return None
    return model.embeddings.weight[word_to_idx[word]]

def find_most_similar(query, model, word_to_idx, idx_to_word, top_k=10):
    """
    Finds the top_k most similar words to a given query.
    The query can be a word (str) or a vector (torch.Tensor).
    """
    model.eval()
    with torch.no_grad():
        # 1. Get the query embedding
        if isinstance(query, str):
            if query not in word_to_idx:
                print(f"Error: '{query}' not in vocabulary.")
                return
            query_embedding = get_embedding(query, model, word_to_idx).unsqueeze(0)
        elif isinstance(query, torch.Tensor):
            query_embedding = query.unsqueeze(0)
        else:
            print("Error: Query must be a string or a tensor.")
            return

        # 2. Get the full embedding matrix
        all_embeddings = model.embeddings.weight

        # 3. Compute cosine similarities
        cos_similarities = F.cosine_similarity(query_embedding, all_embeddings, dim=1)

        # 4. Find the top K results
        top_results = torch.topk(cos_similarities, k=top_k + 1)
        
        # 5. Print the results
        # If the query was a word, we skip the first result (the word itself)
        start_index = 1 if isinstance(query, str) else 0
        for i in range(start_index, top_k + start_index):
            idx = top_results.indices[i].item()
            score = top_results.values[i].item()
            print(f"  - {idx_to_word[idx]:<15} (Similarity: {score:.4f})")

def find_analogy(pos, neg, model, word_to_idx, idx_to_word, top_k=5):
    """
    Finds analogies like "king is to man as [?] is to woman".
    Usage: find_analogy(pos=['king', 'woman'], neg=['man'], ...)
    """
    model.eval()
    with torch.no_grad():
        pos_vecs = [get_embedding(word, model, word_to_idx) for word in pos]
        neg_vecs = [get_embedding(word, model, word_to_idx) for word in neg]

        if any(v is None for v in pos_vecs) or any(v is None for v in neg_vecs):
            return

        # Perform the vector arithmetic
        result_vec = sum(pos_vecs) - sum(neg_vecs)
        
        print(f"Finding analogy for: {' + '.join(pos)} - {' - '.join(neg)}")
        find_most_similar(result_vec, model, word_to_idx, idx_to_word, top_k=top_k)



In [237]:
# ======================================================
# == Section 1: Sanity Checks (Basic Similarities)    ==
# ======================================================
print("--- Section 1: Sanity Checks ---\n")

# Test 1: Synonyms. 'movie' and 'film' should be very similar.
vec1 = get_embedding('movie', model, word_to_idx)
vec2 = get_embedding('film', model, word_to_idx)
if vec1 is not None and vec2 is not None:
    sim = F.cosine_similarity(vec1, vec2, dim=0).item()
    print(f"Similarity between 'movie' and 'film': {sim:.4f}")

# Test 2: Related concepts. 'actor' and 'actress' should be similar.
vec1 = get_embedding('actor', model, word_to_idx)
vec2 = get_embedding('actress', model, word_to_idx)
if vec1 is not None and vec2 is not None:
    sim = F.cosine_similarity(vec1, vec2, dim=0).item()
    print(f"Similarity between 'actor' and 'actress': {sim:.4f}")

# Test 3: Unrelated concepts. 'movie' and 'chair' should have low similarity.
vec1 = get_embedding('movie', model, word_to_idx)
vec2 = get_embedding('chair', model, word_to_idx)
if vec1 is not None and vec2 is not None:
    sim = F.cosine_similarity(vec1, vec2, dim=0).item()
    print(f"Similarity between 'movie' and 'chair': {sim:.4f}\n")


# ======================================================
# == Section 2: Finding Nearest Neighbors             ==
# ======================================================
print("--- Section 2: Finding Nearest Neighbors ---\n")

print("Most similar to 'good':")
find_most_similar('good', model, word_to_idx, idx_to_word, top_k=5)
print("-" * 20)

print("Most similar to 'bad':")
find_most_similar('bad', model, word_to_idx, idx_to_word, top_k=5)
print("-" * 20)

print("Most similar to 'woman':")
find_most_similar('woman', model, word_to_idx, idx_to_word, top_k=5)
print("-" * 20)


# ======================================================
# == Section 3: Word Analogies                        ==
# ======================================================
# This is a very hard task and may not work perfectly with our small model,
# but it's fun to try!

print("\n--- Section 3: Word Analogies ---\n")

# Analogy 1: man -> woman :: actor -> ? (expecting 'actress')
find_analogy(pos=['woman', 'actor'], neg=['man'], model=model, word_to_idx=word_to_idx, idx_to_word=idx_to_word)
print("-" * 20)

# Analogy 2: movie -> director :: book -> ? (expecting 'author' or 'writer')
find_analogy(pos=['director', 'book'], neg=['movie'], model=model, word_to_idx=word_to_idx, idx_to_word=idx_to_word)
print("-" * 20)

# Analogy 3: good -> best :: bad -> ? (expecting 'worst')
find_analogy(pos=['best', 'bad'], neg=['good'], model=model, word_to_idx=word_to_idx, idx_to_word=idx_to_word)

--- Section 1: Sanity Checks ---

Similarity between 'movie' and 'film': 0.3148
Similarity between 'actor' and 'actress': 0.0758
Similarity between 'movie' and 'chair': 0.0026

--- Section 2: Finding Nearest Neighbors ---

Most similar to 'good':
  - thin            (Similarity: 0.3959)
  - pitiful         (Similarity: 0.3845)
  - slightly        (Similarity: 0.3587)
  - bathing         (Similarity: 0.3364)
  - torturers       (Similarity: 0.3363)
--------------------
Most similar to 'bad':
  - amadeus         (Similarity: 0.3828)
  - delve           (Similarity: 0.3614)
  - retirony        (Similarity: 0.3480)
  - rosenstrasse    (Similarity: 0.3459)
  - purse           (Similarity: 0.3434)
--------------------
Most similar to 'woman':
  - analysing       (Similarity: 0.4166)
  - characteratures (Similarity: 0.3989)
  - fornication     (Similarity: 0.3931)
  - elite           (Similarity: 0.3898)
  - blacks          (Similarity: 0.3749)
--------------------

--- Section 3: Word Analog

## Task 3: Cosine Similarity
Make sure that you have installed the package gensim.

In [239]:
!uv pip install -q gensim

In [243]:
# Try to fix numpy if needed, then install gensim
!uv pip install --upgrade numpy
!uv pip install --force-reinstall --no-cache-dir numpy
!uv pip install --upgrade --force-reinstall --no-cache-dir gensim


[2mUsing Python 3.12.3 environment at: /Users/s.mallet/passau/dlnlp/.venv[0m
[2K[37m⠙[0m [2m                                                                              [0m[2mResolved [1m1 package[0m [2min 91ms[0m[0m
[2K[2mPrepared [1m1 package[0m [2min 0.71ms[0m[0m                                             
[2mUninstalled [1m1 package[0m [2min 96ms[0m[0m
[2K[2mInstalled [1m1 package[0m [2min 70ms[0m[0m                                 [0m
 [31m-[39m [1mnumpy[0m[2m==1.26.4[0m
 [32m+[39m [1mnumpy[0m[2m==2.3.1[0m
[2mUsing Python 3.12.3 environment at: /Users/s.mallet/passau/dlnlp/.venv[0m
[2K[2mResolved [1m1 package[0m [2min 232ms[0m[0m                                          [0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/1)                                                   
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m     0 B/4.88 MiB            [1A
[2K[1A[37m⠙[0m [2mPreparing packages.

In [1]:
import gensim
from gensim.models import KeyedVectors
import gensim.downloader
from gensim.models import Word2Vec
from gensim.test.utils import datapath, get_tmpfile
import numpy as np
from numpy.linalg import norm
from numpy import dot

### Task 3 (a): Cosine Similarity

In [2]:
import numpy as np
from numpy.linalg import norm
from numpy import dot

def cosine_similarity(x, y):
    """
    Computes the cosine similarity between two vectors.
    
    Formula: cos(theta) = (x . y) / (||x|| * ||y||)
    
    Args:
        x (np.ndarray): The first vector.
        y (np.ndarray): The second vector.
        
    Returns:
        float: The cosine similarity, a value between -1 and 1.
    """
    # Ensure the vectors are numpy arrays
    x = np.asarray(x)
    y = np.asarray(y)
    
    # Compute the dot product
    dot_product = dot(x, y)
    
    # Compute the L2 norms (magnitudes)
    norm_x = norm(x)
    norm_y = norm(y)
    
    # Handle the case of zero-vectors to avoid division by zero
    if norm_x == 0 or norm_y == 0:
        return 0.0

    return dot_product / (norm_x * norm_y)


### Task 3 (b)

#### Model 1

In [3]:
model1 = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False) 

In [4]:
king_vector_m1 = model1.get_vector('king')
queen_vector_m1 = model1.get_vector('queen')
man_vector_m1 = model1.get_vector('man')
woman_vector_m1 = model1.get_vector('woman')

KeyError: "Key 'queen' not present"

In [6]:
model1.key_to_index

{'the': 0,
 'to': 1,
 'of': 2,
 'in': 3,
 'and': 4,
 'he': 5,
 'is': 6,
 'for': 7,
 'on': 8,
 'said': 9,
 'that': 10,
 'has': 11,
 'says': 12,
 'was': 13,
 'have': 14,
 'it': 15,
 'be': 16,
 'are': 17,
 'with': 18,
 'will': 19,
 'at': 20,
 'mr': 21,
 'from': 22,
 'by': 23,
 'we': 24,
 'been': 25,
 'as': 26,
 'an': 27,
 'not': 28,
 'his': 29,
 'but': 30,
 'they': 31,
 'after': 32,
 'were': 33,
 'had': 34,
 'there': 35,
 'new': 36,
 'this': 37,
 'australia': 38,
 'australian': 39,
 'who': 40,
 'people': 41,
 'palestinian': 42,
 'their': 43,
 'two': 44,
 'government': 45,
 'up': 46,
 'south': 47,
 'us': 48,
 'which': 49,
 'year': 50,
 'one': 51,
 'about': 52,
 'out': 53,
 'if': 54,
 'also': 55,
 'more': 56,
 'when': 57,
 'its': 58,
 'would': 59,
 'into': 60,
 'first': 61,
 'last': 62,
 'against': 63,
 'israeli': 64,
 'minister': 65,
 'arafat': 66,
 'all': 67,
 'over': 68,
 'afghanistan': 69,
 'three': 70,
 'united': 71,
 'no': 72,
 'world': 73,
 'or': 74,
 'police': 75,
 'than': 76,
 'bef

#### Model 2

In [7]:
model2 = KeyedVectors.load_word2vec_format(datapath('high_precision.kv.bin'), binary=True) 

In [8]:
king_vector_m2 = model2.get_vector('king')
queen_vector_m2 = model2.get_vector('queen')
man_vector_m2 = model2.get_vector('man')
woman_vector_m2 = model2.get_vector('woman')

KeyError: "Key 'king' not present"

In [9]:
model2.key_to_index

{'kangaroo.n.01': 0, 'horse.n.01': 1}

#### Model 3

In [10]:
model3 = KeyedVectors.load_word2vec_format(datapath('euclidean_vectors.bin'), binary=True) 

In [11]:
king_vector_m3 = model3.get_vector('king')
queen_vector_m3 = model3.get_vector('queen')
man_vector_m3 = model3.get_vector('man')
woman_vector_m3 = model3.get_vector('woman')

In [12]:
model3.key_to_index

{'the': 0,
 'to': 1,
 'of': 2,
 'in': 3,
 'and': 4,
 'he': 5,
 'is': 6,
 'for': 7,
 'on': 8,
 'said': 9,
 'that': 10,
 'has': 11,
 'says': 12,
 'was': 13,
 'have': 14,
 'it': 15,
 'be': 16,
 'are': 17,
 'with': 18,
 'will': 19,
 'at': 20,
 'mr': 21,
 'from': 22,
 'by': 23,
 'we': 24,
 'been': 25,
 'as': 26,
 'an': 27,
 'not': 28,
 'his': 29,
 'but': 30,
 'they': 31,
 'after': 32,
 'were': 33,
 'had': 34,
 'there': 35,
 'new': 36,
 'this': 37,
 'australian': 38,
 'australia': 39,
 'who': 40,
 'people': 41,
 'palestinian': 42,
 'their': 43,
 'government': 44,
 'two': 45,
 'up': 46,
 'south': 47,
 'us': 48,
 'which': 49,
 'year': 50,
 'one': 51,
 'about': 52,
 'out': 53,
 'if': 54,
 'also': 55,
 'more': 56,
 'when': 57,
 'its': 58,
 'into': 59,
 'would': 60,
 'first': 61,
 'against': 62,
 'last': 63,
 'israeli': 64,
 'minister': 65,
 'arafat': 66,
 'all': 67,
 'over': 68,
 'afghanistan': 69,
 'three': 70,
 'united': 71,
 'no': 72,
 'world': 73,
 'police': 74,
 'or': 75,
 'than': 76,
 'att

#### Analogy Example 1

In [13]:
king_mins_man_plus_woman_m3 = (king_vector_m3 - man_vector_m3) + woman_vector_m3

# Make sure you have implemented cosine similarity. 
cosine_similarity(king_mins_man_plus_woman_m3, queen_vector_m3)

-0.22200273

#### Model 4

In [14]:
word2vec_google = gensim.downloader.load('word2vec-google-news-300');

In [15]:
len(word2vec_google.get_vector('king'))

300

In [16]:
# you can also try the GLOVE model
glove_google = gensim.downloader.load('glove-wiki-gigaword-100');

In [17]:
len(glove_google.get_vector('king'))

100

In [18]:
model4 = word2vec_google

In [19]:
king_vector_m4 = model4.get_vector('king')
queen_vector_m4 = model4.get_vector('queen')
man_vector_m4 = model4.get_vector('man')
woman_vector_m4 = model4.get_vector('woman')

#### Analogy Example 2

In [20]:
king_mins_man_plus_woman_m4 = (king_vector_m4 - man_vector_m4) + woman_vector_m4

# Make sure you have implemented cosine similarity. 
cosine_similarity(king_mins_man_plus_woman_m4, queen_vector_m4)

0.7300517

In [21]:
# Find a method to search for similar words given a word
# Hint: you can use a method of the word2vec_google object

similar_words = model4.IDENTIFIED_METHOD('phone', topn=10)

for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

AttributeError: 'KeyedVectors' object has no attribute 'IDENTIFIED_METHOD'

In [None]:
similar_words = model4.IDENTIFIED_METHOD('king', topn=10)

for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

In [None]:
# try to find at least five analogies using the method you found above


## Theoretical Question #8

In [None]:
word2vec_google.IDENTIFIED_METHOD(king_mins_man_plus_woman_m4) # First answer will be King