#Assignment 3

1.   Data Preprocessing.
2.   Create a Skip-gram model using PyTorch/TensorFlow or NumPy.
3.   Modify the Skip-gram model to CBOW.
4.   Implement GloVe.

In [None]:
import nltk
nltk.download('punkt')
nltk.download('gutenberg')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [None]:
import nltk
nltk.download('punkt')
nltk.download('gutenberg')
# Download the missing 'punkt_tab' data package
nltk.download('punkt_tab') # This line downloads the necessary data

from nltk.corpus import gutenberg
import re

# Load a sample corpus (Jane Austen's book)
text = gutenberg.raw('austen-emma.txt')[:50000]

# Tokenize and clean
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = nltk.word_tokenize(text)
    return tokens

tokens = preprocess(text)
print(tokens[:20])

['emma', 'by', 'jane', 'austen', 'volume', 'i', 'chapter', 'i', 'emma', 'woodhouse', 'handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home', 'and', 'happy']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
from collections import Counter

vocab = sorted(set(tokens))
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for word, i in word_to_idx.items()}
vocab_size = len(vocab)
print("Vocabulary Size:", vocab_size)


Vocabulary Size: 1761


In [None]:
window_size = 2

def generate_skipgram_data(tokens, window_size):
    skip_grams = []
    for i, target in enumerate(tokens):
        context_window = tokens[max(i - window_size, 0): i] + tokens[i + 1: i + window_size + 1]
        for context in context_window:
            skip_grams.append((word_to_idx[target], word_to_idx[context]))
    return skip_grams

def generate_cbow_data(tokens, window_size):
    cbow_data = []
    for i in range(window_size, len(tokens) - window_size):
        context = tokens[i - window_size:i] + tokens[i + 1:i + window_size + 1]
        target = tokens[i]
        cbow_data.append(([word_to_idx[word] for word in context], word_to_idx[target]))
    return cbow_data

skipgram_data = generate_skipgram_data(tokens, window_size)
cbow_data = generate_cbow_data(tokens, window_size)

print("Skip-gram Example:", skipgram_data[:3])
print("CBOW Example:", cbow_data[:3])


Skip-gram Example: [(471, 207), (471, 836), (207, 471)]
CBOW Example: [([471, 207, 122, 1657], 836), ([207, 836, 1657, 764], 122), ([836, 122, 764, 237], 1657)]


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

embedding_dim = 100

class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, center_word):
        embed = self.embeddings(center_word)
        out = self.linear(embed)
        return out

model = SkipGram(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training
for epoch in range(1):
    total_loss = 0
    for target, context in random.sample(skipgram_data, 1000):
        input_tensor = torch.tensor([target], dtype=torch.long)
        context_tensor = torch.tensor([context], dtype=torch.long)

        optimizer.zero_grad()
        output = model(input_tensor)
        loss = criterion(output, context_tensor)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print("Skip-gram Epoch:", epoch, "Loss:", total_loss)


Skip-gram Epoch: 0 Loss: 7288.345961332321


In [None]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context_words):
        embeds = self.embeddings(context_words)
        avg_embed = embeds.mean(dim=0).view(1, -1)
        out = self.linear(avg_embed)
        return out

cbow_model = CBOW(vocab_size, embedding_dim)
cbow_optimizer = optim.Adam(cbow_model.parameters(), lr=0.001)

# Training
for epoch in range(1):
    total_loss = 0
    for context, target in random.sample(cbow_data, 1000):
        context_tensor = torch.tensor(context, dtype=torch.long)
        target_tensor = torch.tensor([target], dtype=torch.long)

        cbow_optimizer.zero_grad()
        output = cbow_model(context_tensor)
        loss = criterion(output, target_tensor)
        loss.backward()
        cbow_optimizer.step()
        total_loss += loss.item()
    print("CBOW Epoch:", epoch, "Loss:", total_loss)


CBOW Epoch: 0 Loss: 7167.2594165802


In [None]:
import numpy as np
from collections import defaultdict

def build_cooccurrence_matrix(tokens, window_size):
    matrix = defaultdict(lambda: defaultdict(int))
    for idx, word in enumerate(tokens):
        for j in range(max(idx - window_size, 0), min(idx + window_size + 1, len(tokens))):
            if idx == j:
                continue
            matrix[word][tokens[j]] += 1
    return matrix

co_matrix = build_cooccurrence_matrix(tokens, window_size)

X = np.zeros((vocab_size, vocab_size))
for w1 in co_matrix:
    for w2 in co_matrix[w1]:
        i = word_to_idx[w1]
        j = word_to_idx[w2]
        X[i][j] = co_matrix[w1][w2]

embedding_dim = 50
W = np.random.rand(vocab_size, embedding_dim)
W_context = np.random.rand(vocab_size, embedding_dim)
bias = np.random.rand(vocab_size)
bias_context = np.random.rand(vocab_size)

def glove_loss(i, j, xij):
    weight = (xij / 100) ** 0.75 if xij < 100 else 1
    dot = np.dot(W[i], W_context[j]) + bias[i] + bias_context[j]
    return weight * ((dot - np.log(xij)) ** 2)

for epoch in range(1):
    total_loss = 0
    for i in range(vocab_size):
        for j in range(vocab_size):
            if X[i][j] > 0:
                total_loss += glove_loss(i, j, X[i][j])
    print("GloVe Epoch:", epoch, "Loss:", total_loss)


GloVe Epoch: 0 Loss: 166903.54231195807


In [None]:
def get_similar(word, embeddings, top_n=5):
    idx = word_to_idx[word]
    vec = embeddings[idx]
    sims = np.dot(embeddings, vec)
    sorted_idx = np.argsort(-sims)
    return [idx_to_word[i] for i in sorted_idx[1:top_n+1]]

print("Similar to 'emma' in Skip-gram:", get_similar('emma', model.embeddings.weight.detach().numpy()))
print("Similar to 'emma' in CBOW:", get_similar('emma', cbow_model.embeddings.weight.detach().numpy()))
print("Similar to 'emma' in GloVe:", get_similar('emma', W))


Similar to 'emma' in Skip-gram: ['dissuade', 'included', 'widowerfather', 'ill', 'quite']
Similar to 'emma' in CBOW: ['sit', 'congratulation', 'an', 'mr', 'over']
Similar to 'emma' in GloVe: ['soft', 'widowerfather', 'emma', 'afford', 'prodigies']
