In [2]:
!pip install gensim tqdm scipy scikit-learn




In [3]:
!wget http://mattmahoney.net/dc/enwik8.zip
!unzip enwik8.zip


--2026-01-12 10:57:04--  http://mattmahoney.net/dc/enwik8.zip
Resolving mattmahoney.net (mattmahoney.net)... 20.119.76.151
Connecting to mattmahoney.net (mattmahoney.net)|20.119.76.151|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36445475 (35M) [application/zip]
Saving to: ‘enwik8.zip’


2026-01-12 10:57:08 (9.29 MB/s) - ‘enwik8.zip’ saved [36445475/36445475]

Archive:  enwik8.zip
  inflating: enwik8                  


In [4]:
with open("enwik8", "r", encoding="utf-8") as f:
    text = f.read()

print("Total characters:", len(text))


Total characters: 99621832


In [5]:
text = text.lower()
tokens = text.split()

print("Total tokens:", len(tokens))
print(tokens[:20])


Total tokens: 13303079
['<mediawiki', 'xmlns="http://www.mediawiki.org/xml/export-0.3/"', 'xmlns:xsi="http://www.w3.org/2001/xmlschema-instance"', 'xsi:schemalocation="http://www.mediawiki.org/xml/export-0.3/', 'http://www.mediawiki.org/xml/export-0.3.xsd"', 'version="0.3"', 'xml:lang="en">', '<siteinfo>', '<sitename>wikipedia</sitename>', '<base>http://en.wikipedia.org/wiki/main_page</base>', '<generator>mediawiki', '1.6alpha</generator>', '<case>first-letter</case>', '<namespaces>', '<namespace', 'key="-2">media</namespace>', '<namespace', 'key="-1">special</namespace>', '<namespace', 'key="0"']


In [6]:
from collections import Counter

VOCAB_SIZE = 30000

word_counts = Counter(tokens)
most_common = word_counts.most_common(VOCAB_SIZE)

word2id = {word: i for i, (word, _) in enumerate(most_common)}
id2word = {i: word for word, i in word2id.items()}

filtered_tokens = [word2id[w] for w in tokens if w in word2id]

print("Vocabulary size:", len(word2id))


Vocabulary size: 30000


In [7]:
import numpy as np

word_freqs = np.array([count for _, count in most_common], dtype=np.float32)
word_freqs = word_freqs ** 0.75
word_freqs = word_freqs / word_freqs.sum()


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim


In [9]:
class SkipGramNS(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.input_embed = nn.Embedding(vocab_size, embed_dim)
        self.output_embed = nn.Embedding(vocab_size, embed_dim)

    def forward(self, center, context, negatives):
        center_vec = self.input_embed(center)
        context_vec = self.output_embed(context)
        neg_vec = self.output_embed(negatives)

        pos_score = torch.sum(center_vec * context_vec, dim=1)
        pos_loss = -torch.log(torch.sigmoid(pos_score))

        neg_score = torch.bmm(neg_vec, center_vec.unsqueeze(2)).squeeze()
        neg_loss = -torch.log(torch.sigmoid(-neg_score)).sum(1)

        return (pos_loss + neg_loss).mean()


In [10]:
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

EMBED_DIM = 100
WINDOW_SIZE = 2
NEG_SAMPLES = 5
BATCH_SIZE = 512
EPOCHS = 2

model = SkipGramNS(len(word2id), EMBED_DIM).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.003)


In [11]:
def train():
    for epoch in range(EPOCHS):
        total_loss = 0

        for i in tqdm(range(0, len(filtered_tokens), BATCH_SIZE)):
            batch = filtered_tokens[i:i+BATCH_SIZE]

            centers, contexts, negatives = [], [], []

            for idx, center in enumerate(batch):
                for j in range(max(0, idx-WINDOW_SIZE), min(len(batch), idx+WINDOW_SIZE+1)):
                    if idx != j:
                        centers.append(center)
                        contexts.append(batch[j])
                        negs = np.random.choice(len(word2id), NEG_SAMPLES, p=word_freqs)
                        negatives.append(negs)

            if not centers:
                continue

            centers = torch.tensor(centers).to(device)
            contexts = torch.tensor(contexts).to(device)
            negatives = torch.tensor(negatives).to(device)

            optimizer.zero_grad()
            loss = model(centers, contexts, negatives)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


In [None]:
train()

  negatives = torch.tensor(negatives).to(device)
 85%|████████▍ | 17603/20733 [3:26:59<35:03,  1.49it/s]

In [None]:
embeddings = model.input_embed.weight.detach().cpu().numpy()


In [None]:
from gensim.models import KeyedVectors

gensim_model = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin",
    binary=True
)


In [None]:
from scipy.spatial.distance import cosine

def cosine_sim(v1, v2):
    return 1 - cosine(v1, v2)

word = "king"

our_vec = embeddings[word2id[word]]
gensim_vec = gensim_model[word][:EMBED_DIM]

print("Cosine similarity:", cosine_sim(our_vec, gensim_vec))


In [None]:
def analogy(a, b, c, topn=5):
    vec = embeddings[word2id[b]] - embeddings[word2id[a]] + embeddings[word2id[c]]

    scores = []
    for i in range(len(embeddings)):
        sim = cosine_sim(vec, embeddings[i])
        scores.append((id2word[i], sim))

    return sorted(scores, key=lambda x: x[1], reverse=True)[:topn]


In [None]:
analogy("man", "king", "woman")


In [None]:
gender_direction = embeddings[word2id["he"]] - embeddings[word2id["she"]]


In [None]:
professions = ["doctor", "nurse", "engineer", "teacher"]

for p in professions:
    score = cosine_sim(embeddings[word2id[p]], gender_direction)
    print(p, score)


