# Exercise 4: Word Embeddings

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import os
import glob
from collections import Counter
import sys
sys.path.append(os.path.abspath('..'))

DATASET_PATH = os.path.join('..', 'datasets', 'aclImdb')

In [2]:
def sorter(item):
    """ Function tha gets only the first number of the name of the file and organizes the files base on that"""
    
    return int(os.path.basename(item).split('_')[0])

def read_raw_text(path_data):
    """ Function for reading the raw data in the .txt files. 
    
    Parameters
    ----------
    path_data: str
        path of the folder that contains the data that is going to be used. (should be test or train)
    path_vocab_pos: str, optional
        Glob pattern for the data files. If None, defaults to standard IMDB structure.
        
    Returns
    ---------
    data,scores: array_like
        Data arrays, X is an array of shape [#documents of the dataset, #words in the vocabulary], y is an array of shape [#documents,] 
    """
    
    data = []
    scores = []
    
    sentiments = ['pos', 'neg']
    for sentiment in sentiments:
        path_vocab_pos = os.path.join(DATASET_PATH, path_data, sentiment, "*.txt")
        for filename in sorted(glob.glob(path_vocab_pos), key=sorter):
            with open(filename, encoding='utf8') as f:
                lines = f.read()
                data.append(lines)
                scores.append(int(os.path.basename(filename).split('_')[1].strip('.txt')))
    return data, scores

def read_vocab(path_vocab):
    """ Function for reading the vocabulary file. 
    
    Parameters
    ----------
    path_vocab: str
        Path to the vocabulary file.
    Returns
    ---------
    initial_vocab: list
        list with the values different tokens that compose the vocabulary ...... 
    """
    with open(path_vocab, encoding='utf-8') as f:
        lines = f.read()
    lines = lines.split('\n')
    vocab = []
    for line in lines:
        vocab.append(line)
    return vocab

In [3]:
# import the data
from utils.preprocess import load_data_to_df


corpus, scores = read_raw_text('train')
corpus_df = load_data_to_df(corpus, scores)

In [4]:
import re
import string
from collections import Counter
from utils.preprocess import pre_process

tokenized_corpus = pre_process(corpus_df, tokenize_punct=True, lowercase=True, remove_punct=True)

In [5]:
# reduce the corpus if you are facing performance issues
tokenized_corpus = tokenized_corpus[:1000]  # reduce to first 1000 documents for testing
print(tokenized_corpus)

                                                  text  score  \
0    bromwell high is a cartoon comedy  it ran at t...      9   
1    if you like adult comedy cartoons  like south ...      7   
2    bromwell high is nothing short of brilliant  e...      9   
3     all the world s a stage and its people actors...     10   
4    futz is the only show preserved from the exper...      8   
..                                                 ...    ...   
995  this movie surprised me  some things were  cli...      9   
996  this movie surprised me  some things were  cli...      9   
997  i have to agree with most of the other posts  ...      7   
998  this is a really interesting movie  it is an a...      7   
999  i am amazed at how this movie and most others ...     10   

                                                tokens  
0    [bromwell, high, is, a, cartoon, comedy, it, r...  
1    [if, you, like, adult, comedy, cartoons, like,...  
2    [bromwell, high, is, nothing, short, of, br

## Task 1: CBOW

In [6]:
# Parameters (change these as wanted)
CONTEXT_SIZE = 2  # Window size on each side
EMBEDDING_DIM = 10
PAD_TOKEN = '<PAD>'

# Vocabulary
# Build vocabulary from all tokens in the tokenized_corpus DataFrame
vocab = sorted(set(token for tokens in tokenized_corpus['tokens'] for token in tokens))
if PAD_TOKEN not in vocab:
    vocab.append(PAD_TOKEN)
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for word, i in word_to_idx.items()}
vocab_size = len(vocab)

print('Vocab Size', vocab_size)
print('Context Size', CONTEXT_SIZE)
print('Embedding Dimension', EMBEDDING_DIM)

Vocab Size 16016
Context Size 2
Embedding Dimension 10


In [7]:
idx_to_word

{0: '0',
 1: '00',
 2: '000',
 3: '000s',
 4: '007',
 5: '0079',
 6: '0080',
 7: '0083',
 8: '00pm',
 9: '01',
 10: '02',
 11: '06',
 12: '1',
 13: '10',
 14: '100',
 15: '10000000000000',
 16: '101',
 17: '10th',
 18: '11',
 19: '1100',
 20: '12',
 21: '128',
 22: '12th',
 23: '13',
 24: '135',
 25: '13th',
 26: '14',
 27: '14th',
 28: '15',
 29: '150',
 30: '16',
 31: '17',
 32: '18',
 33: '1800',
 34: '1830',
 35: '1836',
 36: '1838',
 37: '1846',
 38: '1850s',
 39: '1853',
 40: '1854',
 41: '1855',
 42: '1860',
 43: '1862',
 44: '1896',
 45: '18th',
 46: '19',
 47: '1909',
 48: '1910',
 49: '1920',
 50: '1920s',
 51: '1924',
 52: '1928',
 53: '1929',
 54: '1930',
 55: '1930s',
 56: '1931',
 57: '1933',
 58: '1935',
 59: '1936',
 60: '1938',
 61: '1939',
 62: '1940',
 63: '1940s',
 64: '1944',
 65: '1945',
 66: '1946',
 67: '1947',
 68: '1948',
 69: '1950',
 70: '1950s',
 71: '1951',
 72: '1952',
 73: '1953',
 74: '1954',
 75: '1955',
 76: '1956',
 77: '1957',
 78: '1958',
 79: '195

Padding the sentences

In [8]:
tokenized_corpus['tokens'] = tokenized_corpus['tokens'].apply(lambda x: [PAD_TOKEN] * CONTEXT_SIZE + x + [PAD_TOKEN] * CONTEXT_SIZE)

In [9]:
# Create CBOW dataset from padded sentences
cbow_data = []
for tokens in tokenized_corpus['tokens']:
    for i in range(CONTEXT_SIZE, len(tokens) - CONTEXT_SIZE):
        context = tokens[i - CONTEXT_SIZE:i] + tokens[i + 1:i + CONTEXT_SIZE + 1]
        target = tokens[i]
        cbow_data.append((context, target))

# Show a few examples
for context, target in cbow_data[:5]:
    print(f"Context: {context} -> Target: {target}")

Context: ['<PAD>', '<PAD>', 'high', 'is'] -> Target: bromwell
Context: ['<PAD>', 'bromwell', 'is', 'a'] -> Target: high
Context: ['bromwell', 'high', 'a', 'cartoon'] -> Target: is
Context: ['high', 'is', 'cartoon', 'comedy'] -> Target: a
Context: ['is', 'a', 'comedy', 'it'] -> Target: cartoon


In [10]:
from torch.utils.data import Dataset

class CBOWDataset(Dataset):
    def __init__(self, data, word_to_idx):
        self.data = data
        self.word_to_idx = word_to_idx

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        context_idxs = torch.tensor([self.word_to_idx[w] for w in context], dtype=torch.long)
        target_idx = torch.tensor(self.word_to_idx[target], dtype=torch.long)
        return context_idxs, target_idx

In [11]:
from sklearn.model_selection import train_test_split

# Create the CBOW dataset object for model training
# Split cbow_data into train and validation sets

cbow_train_data, cbow_val_data = train_test_split(cbow_data, test_size=0.2, random_state=42)

cbow_train_dataset = CBOWDataset(cbow_train_data, word_to_idx)
cbow_val_dataset = CBOWDataset(cbow_val_data, word_to_idx)

# Use train dataset for the rest of this cell
cbow_dataset = cbow_train_dataset
print(f"CBOWDataset size: {len(cbow_dataset)}")
# Example: get the first item (context indices, target index)
context_idxs, target_idx = cbow_dataset[0]
print(f"Context indices: {context_idxs}")
print(f"Target index: {target_idx}")

CBOWDataset size: 193551
Context indices: tensor([10603, 14278, 14030,  1406])
Target index: 2190


In [12]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=word_to_idx[PAD_TOKEN])
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context_idxs):
        # context_idxs: (batch_size, 2*CONTEXT_SIZE)
        embeds = self.embeddings(context_idxs)  # (batch_size, 2*CONTEXT_SIZE, embedding_dim)
        avg_embeds = embeds.mean(dim=1)        # (batch_size, embedding_dim)
        out = self.linear(avg_embeds)          # (batch_size, vocab_size)
        return out

In [13]:
from torch.utils.data import DataLoader

# Create a DataLoader for the CBOW dataset
cbow_train_dataloader = DataLoader(
    cbow_dataset,
    batch_size=32,
    shuffle=True
)

cbow_val_dataloader = DataLoader(
    cbow_val_dataset,
    batch_size=32,
    shuffle=False
)

In [14]:
from sklearn.metrics import accuracy_score, f1_score

# Create DataLoaders for train and validation datasets
model = CBOW(vocab_size, EMBEDDING_DIM)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Training loop
num_epochs = 20
batch_size = 32
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_context_idxs, batch_target_idxs in cbow_train_dataloader:
        optimizer.zero_grad()
        outputs = model(batch_context_idxs)
        loss = criterion(outputs, batch_target_idxs)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(cbow_train_dataloader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")
    # Validation
    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for batch_context_idxs, batch_target_idxs in cbow_val_dataloader:
            outputs = model(batch_context_idxs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(batch_target_idxs.cpu().numpy())
    accuracy = accuracy_score(all_targets, all_preds)
    f1 = f1_score(all_targets, all_preds, average='weighted')
    print(f"Validation Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")


Epoch [1/20], Loss: 7.5036
Validation Accuracy: 0.0700, F1 Score: 0.0184
Validation Accuracy: 0.0700, F1 Score: 0.0184
Epoch [2/20], Loss: 6.6807
Epoch [2/20], Loss: 6.6807
Validation Accuracy: 0.0788, F1 Score: 0.0253
Validation Accuracy: 0.0788, F1 Score: 0.0253
Epoch [3/20], Loss: 6.4803
Epoch [3/20], Loss: 6.4803
Validation Accuracy: 0.0838, F1 Score: 0.0300
Validation Accuracy: 0.0838, F1 Score: 0.0300
Epoch [4/20], Loss: 6.3613
Epoch [4/20], Loss: 6.3613
Validation Accuracy: 0.0895, F1 Score: 0.0350
Validation Accuracy: 0.0895, F1 Score: 0.0350
Epoch [5/20], Loss: 6.2764
Epoch [5/20], Loss: 6.2764
Validation Accuracy: 0.0933, F1 Score: 0.0378
Validation Accuracy: 0.0933, F1 Score: 0.0378
Epoch [6/20], Loss: 6.2108
Epoch [6/20], Loss: 6.2108
Validation Accuracy: 0.0976, F1 Score: 0.0405
Validation Accuracy: 0.0976, F1 Score: 0.0405
Epoch [7/20], Loss: 6.1574
Epoch [7/20], Loss: 6.1574
Validation Accuracy: 0.1003, F1 Score: 0.0429
Validation Accuracy: 0.1003, F1 Score: 0.0429
Epoch

In [18]:
def evaluate_cbow(model, context_words):
    model.eval()
    with torch.no_grad():
        context_idxs = torch.tensor([word_to_idx[w] for w in context_words], dtype=torch.long).unsqueeze(0)
        output = model(context_idxs)
        probs = torch.softmax(output, dim=1)
        top_prob, top_idx = torch.topk(probs, 5)  # top 5 predictions

        print(f"Context: {context_words}")
        print("Top predictions for center word:")
        for prob, idx in zip(top_prob[0], top_idx[0]):
            print(f"  {idx_to_word[idx.item()]}: {prob.item():.4f}")

# Example: I didn't know this -> [i], [didn], [t], [know], [this]
context_example = ['i', 'didn', 'know', 'this']
evaluate_cbow(model, context_example)

Context: ['i', 'didn', 'know', 'this']
Top predictions for center word:
  t: 0.8064
  movie: 0.0126
  was: 0.0079
  say: 0.0069
  would: 0.0062


## Task 2: Skip-Gram

In [44]:
import pandas as pd

# Prepare training data for SkipGram, i.e. (center_word, context_words), e.g., ('is', ['bromwell', 'high', 'a', 'cartoon'])

# Re-generate skipgram_data for (center_word, target_context_word) pairs
skipgram_data_pairs = []


for context_words_list, center_word_for_cbow in cbow_data:
    # For Skip-Gram, center_word_for_cbow is the 'input' word
    # Each word in context_words_list is a 'target' context word
    for target_context_word in context_words_list:
        skipgram_data_pairs.append((center_word_for_cbow, target_context_word))
        
print("Generated Skip-Gram Pairs:")
for center, context_target in skipgram_data_pairs[:5]:
    print(f"Center: '{center}' -> Target Context: '{context_target}'")

Generated Skip-Gram Pairs:
Center: 'bromwell' -> Target Context: '<PAD>'
Center: 'bromwell' -> Target Context: '<PAD>'
Center: 'bromwell' -> Target Context: 'high'
Center: 'bromwell' -> Target Context: 'is'
Center: 'high' -> Target Context: '<PAD>'


In [45]:
class SkipGramDataset(Dataset):
    def __init__(self, data_pairs, word_to_idx):
        # data_pairs is now a list of (center_word_str, target_context_word_str) tuples
        self.data = data_pairs
        self.word_to_idx = word_to_idx

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        center_word_str, target_context_word_str = self.data[idx]
        center_idx = torch.tensor(self.word_to_idx[center_word_str], dtype=torch.long)
        target_context_idx = torch.tensor(self.word_to_idx[target_context_word_str], dtype=torch.long)
        return center_idx, target_context_idx

In [46]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=word_to_idx[PAD_TOKEN])
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, center_word_idx):
        center_embedding = self.embeddings(center_word_idx)
        context_scores = self.linear(center_embedding)
        return context_scores

In [47]:
model = SkipGram(vocab_size, EMBEDDING_DIM, CONTEXT_SIZE)

skipgram_train_data, skipgram_val_data = train_test_split(skipgram_data_pairs, test_size=0.2, random_state=42)

skipgram_train_dataset = SkipGramDataset(skipgram_train_data, word_to_idx)
skipgram_val_dataset = SkipGramDataset(skipgram_val_data, word_to_idx)


In [48]:
skipgram_train_dataloader = DataLoader(
    skipgram_train_dataset,
    batch_size=32,
    shuffle=True
)

skipgram_val_dataloader = DataLoader(
    skipgram_val_dataset,
    batch_size=32,
    shuffle=False
)

In [49]:
num_epochs = 20
batch_size = 32
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    # batch_center_idxs: (batch_size,)
    # batch_target_context_idxs: (batch_size,) - no longer a list of contexts
    for batch_center_idxs, batch_target_context_idxs in skipgram_train_dataloader:
        optimizer.zero_grad()
        outputs = model(batch_center_idxs) # outputs shape: (batch_size, vocab_size)

        # The target for CrossEntropyLoss should be a 1D tensor of class indices (the target words)
        # Its batch dimension must match the input (outputs)
        loss = criterion(outputs, batch_target_context_idxs) # Corrected! No .view(-1) needed.

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(skipgram_train_dataloader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for batch_center_idxs, batch_target_context_idxs in skipgram_val_dataloader:
            outputs = model(batch_center_idxs)
            _, preds = torch.max(outputs, 1) # preds will be (batch_size,)
            all_preds.extend(preds.cpu().numpy())
            # batch_target_context_idxs is already 1D
            all_targets.extend(batch_target_context_idxs.cpu().numpy())
    accuracy = accuracy_score(all_targets, all_preds)
    f1 = f1_score(all_targets, all_preds, average='weighted')
    print(f"Validation Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

Epoch [1/20], Loss: 7.1574
Validation Accuracy: 0.0646, F1 Score: 0.0173
Epoch [2/20], Loss: 6.7971
Validation Accuracy: 0.0657, F1 Score: 0.0189
Epoch [3/20], Loss: 6.7691
Validation Accuracy: 0.0672, F1 Score: 0.0213
Epoch [4/20], Loss: 6.7521
Validation Accuracy: 0.0685, F1 Score: 0.0236
Epoch [5/20], Loss: 6.7399
Validation Accuracy: 0.0693, F1 Score: 0.0244
Epoch [6/20], Loss: 6.7300
Validation Accuracy: 0.0690, F1 Score: 0.0239
Epoch [7/20], Loss: 6.7183
Validation Accuracy: 0.0705, F1 Score: 0.0261
Epoch [8/20], Loss: 6.7056
Validation Accuracy: 0.0705, F1 Score: 0.0256
Epoch [9/20], Loss: 6.6949
Validation Accuracy: 0.0705, F1 Score: 0.0257
Epoch [10/20], Loss: 6.6850
Validation Accuracy: 0.0716, F1 Score: 0.0269
Epoch [11/20], Loss: 6.6777
Validation Accuracy: 0.0717, F1 Score: 0.0270
Epoch [12/20], Loss: 6.6707
Validation Accuracy: 0.0719, F1 Score: 0.0268
Epoch [13/20], Loss: 6.6644
Validation Accuracy: 0.0717, F1 Score: 0.0271
Epoch [14/20], Loss: 6.6589
Validation Accuracy

In [54]:
def evaluate_skipgram(model, center_word):
    model.eval()
    with torch.no_grad():
        input_idx = torch.tensor([word_to_idx[center_word]], dtype=torch.long)  # (1,)
        output = model(input_idx)  # (1, vocab_size)
        
        # For each context position, get top predictions
        context_preds = output.squeeze(0)  # (vocab_size,)
        probs = torch.softmax(context_preds, dim=0)
        top_prob, top_idx = torch.topk(probs, 5)  # top
        
        print(f"Center word: '{center_word}'")
        print("Top predictions for context words:")
        for prob, idx in zip(top_prob, top_idx):
            print(f"  {idx_to_word[idx.item()]}: {prob.item():.4f}")

# Example usage
center_word_example = 'can'
evaluate_skipgram(model, center_word_example)

Center word: 'can'
Top predictions for context words:
  you: 0.0532
  i: 0.0402
  t: 0.0385
  it: 0.0326
  to: 0.0320


## Task 3: Cosine Similarity
Make sure that you have installed the package gensim.

In [None]:
#conda install -c conda-forge gensim -y

In [56]:
import gensim
from gensim.models import KeyedVectors
import gensim.downloader
from gensim.models import Word2Vec
from gensim.test.utils import datapath, get_tmpfile
import numpy as np
from numpy.linalg import norm
from numpy import dot

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

### Task 3 (a): Cosine Similarity

In [None]:
def cosine_similarity(x, y):
    return dot(x, y) / (norm(x) * norm(y))

### Task 3 (b)

#### Model 1

In [None]:
model1 = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False) 

In [None]:
king_vector_m1 = model1.get_vector('king')
queen_vector_m1 = model1.get_vector('queen')
man_vector_m1 = model1.get_vector('man')
woman_vector_m1 = model1.get_vector('woman')

In [None]:
model1.key_to_index

#### Model 2

In [None]:
model2 = KeyedVectors.load_word2vec_format(datapath('high_precision.kv.bin'), binary=True) 

In [None]:
king_vector_m2 = model2.get_vector('king')
queen_vector_m2 = model2.get_vector('queen')
man_vector_m2 = model2.get_vector('man')
woman_vector_m2 = model2.get_vector('woman')

In [None]:
model2.key_to_index

#### Model 3

In [None]:
model3 = KeyedVectors.load_word2vec_format(datapath('euclidean_vectors.bin'), binary=True) 

In [None]:
king_vector_m3 = model3.get_vector('king')
queen_vector_m3 = model3.get_vector('queen')
man_vector_m3 = model3.get_vector('man')
woman_vector_m3 = model3.get_vector('woman')

In [None]:
model3.key_to_index

#### Analogy Example 1

In [None]:
king_mins_man_plus_woman_m3 = (king_vector_m3 - man_vector_m3) + woman_vector_m3

# Make sure you have implemented cosine similarity. 
cosine_similarity(king_mins_man_plus_woman_m3, queen_vector_m3)

#### Model 4

In [None]:
word2vec_google = gensim.downloader.load('word2vec-google-news-300');

In [None]:
len(word2vec_google.get_vector('king'))

In [None]:
# you can also try the GLOVE model
glove_google = gensim.downloader.load('glove-wiki-gigaword-100');

In [None]:
len(glove_google.get_vector('king'))

In [None]:
model4 = word2vec_google

In [None]:
king_vector_m4 = model4.get_vector('king')
queen_vector_m4 = model4.get_vector('queen')
man_vector_m4 = model4.get_vector('man')
woman_vector_m4 = model4.get_vector('woman')

#### Analogy Example 2

In [None]:
king_mins_man_plus_woman_m4 = (king_vector_m4 - man_vector_m4) + woman_vector_m4

# Make sure you have implemented cosine similarity. 
cosine_similarity(king_mins_man_plus_woman_m4, queen_vector_m4)

In [None]:
# Find a method to search for similar words given a word
# Hint: you can use a method of the word2vec_google object

similar_words = model4.most_similar('phone', topn=10)

for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

In [None]:
similar_words = model4.most_similar('king', topn=10)

for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

In [None]:
# try to find at least five analogies using the method you found above


## Theoretical Question #8

In [None]:
word2vec_google.IDENTIFIED_METHOD(king_mins_man_plus_woman_m4) # First answer will be King