<a href="https://colab.research.google.com/github/SanjayBista1010/DeepLearning/blob/main/CBOW_Top5_Mountains.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import string

pages = ["Mount_Everest", "K2", "Kangchenjunga", "Lhotse", "Makalu"]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

def extract_paragraphs(soup):
    """
    Extract text from <p> tags only to avoid empty divs and tables
    """
    paragraphs = soup.find_all("p")
    text_list = []
    for p in paragraphs:
        t = p.get_text().strip()
        if t:
            text_list.append(t)
    return "\n".join(text_list)

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

corpus = []

for page in pages:
    url = f"https://en.wikipedia.org/wiki/{page}"
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        text = extract_paragraphs(soup)
        processed_text = preprocess_text(text)
        corpus.append(processed_text)
        print(f"Fetched and processed text for {page} (length: {len(processed_text)} chars)")
    else:
        print(f"Failed to fetch {page}, status code: {response.status_code}")

print("Number of documents in corpus:", len(corpus))
print("First 500 characters of first document:\n", corpus[0][:100])


Fetched and processed text for Mount_Everest (length: 80998 chars)
Fetched and processed text for K2 (length: 20121 chars)
Fetched and processed text for Kangchenjunga (length: 9530 chars)
Fetched and processed text for Lhotse (length: 4465 chars)
Fetched and processed text for Makalu (length: 2544 chars)
Number of documents in corpus: 5
First 500 characters of first document:
 mount everest known locally as sagarmāthāa in nepal and qomolangmab in tibet is earths highest mount


In [2]:
words = " ".join(corpus).split()
print(words[:10])
vocab = sorted(set(words))
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w,i in word2idx.items()}
vocab_size = len(vocab)
print(vocab_size)

['mount', 'everest', 'known', 'locally', 'as', 'sagarmāthāa', 'in', 'nepal', 'and', 'qomolangmab']
4496


In [3]:
print("vocab:", vocab[:10])
print("word2idx first 10 items:", list(word2idx.items())[:10])

vocab: ['012', '016', '024', '0333', '04', '1', '10', '100', '1000', '10000']
word2idx first 10 items: [('012', 0), ('016', 1), ('024', 2), ('0333', 3), ('04', 4), ('1', 5), ('10', 6), ('100', 7), ('1000', 8), ('10000', 9)]


In [4]:
sequences = [[word2idx[w] for w in s.split()] for s in corpus]
sequences[0][:10]

[2814, 1665, 2460, 2595, 728, 3562, 2249, 2893, 661, 3311]

In [5]:
contexts = []
targets = []
window = 2

In [6]:
#only considering positions that have both left and right words

for seq in sequences:
  for i in range(window, len(seq)-window):
    context = [seq[i-1], seq[i+1]] #left and right
    target = seq[i]
    contexts.append(context)
    targets.append(target)

In [7]:
contexts[:10]

[[1665, 2595],
 [2460, 728],
 [2595, 3562],
 [728, 2249],
 [3562, 2893],
 [2249, 661],
 [2893, 3311],
 [661, 2249],
 [3311, 4087],
 [2249, 2336]]

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [9]:
contexts = torch.tensor(contexts, dtype=torch.long)
print(contexts)
targets = torch.tensor(targets, dtype=torch.long)
print(targets)

tensor([[1665, 2595],
        [2460,  728],
        [2595, 3562],
        ...,
        [ 864, 1846],
        [ 508, 1860],
        [1846, 4438]])
tensor([2460, 2595,  728,  ...,  508, 1846, 1860])


In [10]:
class CBOWDataset(Dataset):
    def __init__(self, contexts, targets):
        self.contexts = contexts
        self.targets = targets
    def __len__(self):
        return len(self.targets)
    def __getitem__(self, idx):
        return self.contexts[idx], self.targets[idx]

In [11]:
dataset = CBOWDataset(contexts, targets)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [12]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.fc = nn.Linear(emb_dim, vocab_size)

    def forward(self, x):
        embeds = self.emb(x)
        mean = embeds.mean(dim=1)
        out = self.fc(mean)
        return out

In [13]:
emb_dim = 10
model = CBOW(vocab_size=vocab_size, emb_dim=emb_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [14]:
epochs = 150
for epoch in range(1, epochs + 1):
    total_loss = 0.0
    for ctx, tgt in loader:
        optimizer.zero_grad()
        logits = model(ctx)
        loss = criterion(logits, tgt)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 30 == 0 or epoch == 1:
        print(f"Epoch {epoch:3d} - avg loss: {total_loss / len(loader):.4f}")

Epoch   1 - avg loss: 7.4464
Epoch  30 - avg loss: 5.1171
Epoch  60 - avg loss: 5.0252
Epoch  90 - avg loss: 4.9911
Epoch 120 - avg loss: 4.9761
Epoch 150 - avg loss: 4.9670


In [18]:
embeddings = model.emb.weight.detach().numpy()
for i, (word, idx) in enumerate(word2idx.items()):
    print(word, embeddings[idx][:5])
    if i == 9:
        break

012 [ 2.8199644 -7.945237   7.2808814 -0.065623  -6.2410045]
016 [ 0.6251435 -2.4418862 -1.9786686  1.2966774  6.1659393]
024 [-7.959104   -7.7805476  -9.930698    0.20892972 20.084316  ]
0333 [-2.2756548  6.964826   2.892752  -1.8399916  9.8171215]
04 [-0.13834625 -6.3407974   5.1883273   6.5660815  19.906788  ]
1 [ 1.9471714 -4.6404133  1.6328532 -2.6106403 -2.0116222]
10 [ 3.0057287   0.49790943 -5.2661695  -3.9126866  -1.0671101 ]
100 [-1.4478639  -1.9239297   1.9309886   0.63080233  4.4433093 ]
1000 [ 1.105321  -3.4645805 -1.1251391  3.2168832 -3.0299826]
10000 [-2.5030866  2.9571662  4.394929   5.1762705 -0.8749694]


In [16]:
def predict(context_words):
    idxs = torch.tensor([[word2idx[w] for w in context_words]], dtype=torch.long)  # [1,2]
    with torch.no_grad():
        logits = model(idxs)             # [1, vocab_size]
        pred_idx = logits.argmax(dim=1).item()
    return idx2word[pred_idx]

In [19]:
print("['mount', 'everest', 'is', 'the'] ->", predict(['mount', 'everest', 'is', 'the']))
print("['the', 'highest', 'mountain', 'in'] ->", predict(['the', 'highest', 'mountain', 'in']))

['mount', 'everest', 'is', 'the'] -> everest
['the', 'highest', 'mountain', 'in'] -> the
