In [None]:
# Toy training corpus: four short English sentences, one string per sentence.
corpus = [
    "I love machine learning",
    "I love deep learning",
    "machine learning is fun",
    "deep learning is a branch of machine learning",
]


In [None]:
import re

def tokenize(sentence):
    """Split a sentence into lowercase, letters-only tokens.

    The sentence is lowercased, every character that is neither an ASCII
    letter ``a``-``z`` nor whitespace is deleted, and the remainder is split
    on whitespace.

    Note that unwanted characters are *removed*, not replaced by a space, so
    ``"machine-learning"`` becomes the single token ``"machinelearning"``.

    Args:
        sentence: The raw text to tokenize.

    Returns:
        A list of token strings (empty if nothing survives the cleaning).
    """
    letters_only = re.sub(r'[^a-z\s]', '', sentence.lower())
    return letters_only.split()

# Tokenize each sentence separately so sentence boundaries are preserved
# (the result is a list of token lists, one per corpus entry).
tokenized_corpus = list(map(tokenize, corpus))
print(tokenized_corpus)


In [None]:
from collections import defaultdict

# Unique tokens across the whole corpus.
vocab = set(word for sentence in tokenized_corpus for word in sentence)

# Number the words in sorted order rather than in set-iteration order: the
# iteration order of a set of strings can change between interpreter runs
# (string hashing is randomized), which would make the word -> index mapping,
# and everything derived from it, differ on every kernel restart.
word2idx = {word: idx for idx, word in enumerate(sorted(vocab))}

# Inverse lookup: index -> word.
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(vocab)
print(word2idx)


In [None]:
def generate_skipgram_pairs(tokenized_corpus, window_size=2):
    """Build (target, context) word pairs for skip-gram training.

    For every position in every sentence, the word at that position is paired
    with each other word located at most ``window_size`` positions to its left
    or right within the same sentence. Windows are clipped at the sentence
    boundaries and never cross into a neighbouring sentence.

    Args:
        tokenized_corpus: Iterable of sentences, each a sequence of tokens.
        window_size: Maximum distance between a target and a context word.

    Returns:
        A list of ``(target_word, context_word)`` tuples, ordered by sentence,
        then by target position, then by context position.
    """
    pairs = []
    for tokens in tokenized_corpus:
        sentence_len = len(tokens)
        for center, target_word in enumerate(tokens):
            # Clip the context window to the sentence boundaries.
            first = max(0, center - window_size)
            stop = min(sentence_len, center + window_size + 1)
            pairs.extend(
                (target_word, tokens[position])
                for position in range(first, stop)
                if position != center  # a word is not its own context
            )
    return pairs

# Generate every (target, context) training pair using a window of 2 words
# on each side, then show the first ten as a sanity check.
pairs = generate_skipgram_pairs(tokenized_corpus, window_size=2)
print(pairs[:10])  # sample output


In [None]:
import numpy as np

def one_hot_encode(word, word2idx):
    """Return the one-hot vector of ``word`` under the given vocabulary.

    Args:
        word: The word to encode; must be a key of ``word2idx``.
        word2idx: Mapping from each vocabulary word to its integer index.

    Returns:
        A 1-D NumPy float array of length ``len(word2idx)`` that is 0
        everywhere except for a 1 at position ``word2idx[word]``.

    Raises:
        KeyError: If ``word`` is not in ``word2idx``.
    """
    encoded = np.zeros(len(word2idx))
    hot_position = word2idx[word]
    encoded[hot_position] = 1
    return encoded

# One-hot encode the training pairs: row i of X encodes the target word of
# pair i, and row i of Y encodes the matching context word.
X = np.array([one_hot_encode(target, word2idx) for target, _ in pairs])
Y = np.array([one_hot_encode(context, word2idx) for _, context in pairs])
# print(X)


In [None]:
# !pip list
# # !python --version

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn as nn

class Word2Vec(nn.Module):
    """Two-layer linear network used as a skip-gram style model.

    The input is projected from ``vocab_size`` features down to
    ``embedding_dim`` by ``self.embedding`` and then back up to ``vocab_size``
    scores by ``self.output``. Both layers are plain ``nn.Linear`` layers and
    no activation is applied between or after them.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the two linear layers.

        Args:
            vocab_size: Size of the input and of the output score vector.
            embedding_dim: Size of the hidden (embedding) representation.
        """
        super().__init__()
        # Input -> hidden projection.
        self.embedding = nn.Linear(vocab_size, embedding_dim)
        # Hidden -> per-word score projection.
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        """Return the raw output scores for input ``x``.

        Args:
            x: Tensor whose last dimension has size ``vocab_size``.

        Returns:
            Tensor of unnormalised scores whose last dimension has size
            ``vocab_size``.
        """
        hidden = self.embedding(x)
        return self.output(hidden)

# Seed PyTorch's random number generator before the model is built so that
# the randomly initialised layer weights, and therefore the trained
# embeddings, are the same on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Size of the learned word vectors.
embedding_dim = 10
model = Word2Vec(vocab_size, embedding_dim)


In [None]:
# Step 7: Model Training (REVISED using Softmax and proper targets)
# Full-batch training: every epoch runs all pairs through the model at once.

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Inputs stay one-hot float rows; each target is reduced to the index of the
# hot entry of its one-hot row, i.e. the class index of the context word.
X_tensor = torch.FloatTensor(X)
Y_tensor = torch.LongTensor([np.argmax(row) for row in Y])

num_epochs = 300
model.train()  # training mode is set once; nothing below switches it off
for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Raw scores (logits), one row per training pair.
    outputs = model(X_tensor)
    loss = loss_fn(outputs, Y_tensor)

    loss.backward()
    optimizer.step()

    # Report the loss only on every 50th epoch.
    if epoch % 50 != 0:
        continue
    print(f"Epoch {epoch}/{num_epochs}, Loss: {loss.item():.4f}")


In [None]:
# Extract one embedding vector per vocabulary word.
#
# `model.embedding` is nn.Linear(vocab_size, embedding_dim), whose weight is
# stored as (out_features, in_features) = (embedding_dim, vocab_size). The
# vector of word i is therefore COLUMN i of the weight, so the matrix is
# transposed to get the usual (vocab_size, embedding_dim) layout in which
# `embeddings[idx]` is the vector of word `idx`. Indexing rows without the
# transpose would silently return the wrong values (and only avoids an
# IndexError when embedding_dim happens to equal vocab_size).
#
# `.detach()` yields a tensor without gradient tracking for the NumPy export.
embeddings = model.embedding.weight.detach().numpy().T
for word, idx in word2idx.items():
    print(f"{word}: {embeddings[idx]}")


In [None]:
from numpy.linalg import norm

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Computed as ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``.

    Args:
        vec1: First vector (1-D array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The cosine similarity as a float. If either vector has zero length
        (norm 0) the similarity is undefined; ``0.0`` is returned in that case
        instead of dividing by zero and producing ``nan`` with a warning.
    """
    denominator = norm(vec1) * norm(vec2)
    if denominator == 0:
        # At least one zero vector: avoid the 0/0 division.
        return 0.0
    return np.dot(vec1, vec2) / denominator


In [None]:
# Compare the learned vectors of two vocabulary words.
word1, word2 = "machine", "learning"

# Look up the entry of `embeddings` stored at each word's vocabulary index.
vec1, vec2 = (embeddings[word2idx[w]] for w in (word1, word2))

similarity = cosine_similarity(vec1, vec2)
print(f"Cosine similarity between '{word1}' and '{word2}': {similarity:.4f}")


In [2]:
import numpy as np
# Matrix-product sanity check: `a` is the 2x2 identity, so the product of
# `a` and `b` is `b` itself.
a = [
    [1, 0],
    [0, 1],
]
b = [
    [4, 1],
    [2, 2],
]
np.dot(a, b)

array([[4, 1],
       [2, 2]])