<a href="https://colab.research.google.com/github/SanjayBista1010/DeepLearning/blob/main/CBOW_Top5_Mountains.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import requests
from bs4 import BeautifulSoup
import re
import string

pages = ["Mount_Everest", "K2", "Kangchenjunga", "Lhotse", "Makalu"]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

def extract_paragraphs(soup):
    """
    Extract text from <p> tags only to avoid empty divs and tables
    """
    paragraphs = soup.find_all("p")
    text_list = []
    for p in paragraphs:
        t = p.get_text().strip()
        if t:
            text_list.append(t)
    return "\n".join(text_list)

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

corpus = []

for page in pages:
    url = f"https://en.wikipedia.org/wiki/{page}"
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        text = extract_paragraphs(soup)
        processed_text = preprocess_text(text)
        corpus.append(processed_text)
        print(f"Fetched and processed text for {page} (length: {len(processed_text)} chars)")
    else:
        print(f"Failed to fetch {page}, status code: {response.status_code}")

print("Number of documents in corpus:", len(corpus))
print("First 500 characters of first document:\n", corpus[0][:500])


Fetched and processed text for Mount_Everest (length: 80998 chars)
Fetched and processed text for K2 (length: 20121 chars)
Fetched and processed text for Kangchenjunga (length: 9530 chars)
Fetched and processed text for Lhotse (length: 4465 chars)
Fetched and processed text for Makalu (length: 2544 chars)
Number of documents in corpus: 5
First 500 characters of first document:
 mount everest known locally as sagarmāthāa in nepal and qomolangmab in tibet is earths highest mountain above sea level it lies in the mahalangur himal subrange of the himalayas and marks part of the china–nepal border at its summit4 its height was most recently measured in 2020 by chinese and nepali authorities as 884886 m 29031 ft 81⁄2 in56 mount everest attracts many climbers including highly experienced mountaineers there are two main climbing routes one approaching the summit from the south


In [4]:
print(corpus)

['mount everest known locally as sagarmāthāa in nepal and qomolangmab in tibet is earths highest mountain above sea level it lies in the mahalangur himal subrange of the himalayas and marks part of the china–nepal border at its summit4 its height was most recently measured in 2020 by chinese and nepali authorities as 884886 m 29031 ft 81⁄2 in56 mount everest attracts many climbers including highly experienced mountaineers there are two main climbing routes one approaching the summit from the southeast in nepal known as the standard route and the other from the north in tibet while not posing substantial technical climbing challenges on the standard route everest presents dangers such as altitude sickness weather and wind as well as hazards from avalanches and the khumbu icefall as of may 2024 340 people have died on everest over 200 bodies remain on the mountain and have not been removed due to the dangerous conditions78 climbers typically ascend only part of mount everests elevation a

In [5]:
words = " ".join(corpus).split()
print(words)
vocab = sorted(set(words))
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w,i in word2idx.items()}
vocab_size = len(vocab)
print(vocab_size)

['mount', 'everest', 'known', 'locally', 'as', 'sagarmāthāa', 'in', 'nepal', 'and', 'qomolangmab', 'in', 'tibet', 'is', 'earths', 'highest', 'mountain', 'above', 'sea', 'level', 'it', 'lies', 'in', 'the', 'mahalangur', 'himal', 'subrange', 'of', 'the', 'himalayas', 'and', 'marks', 'part', 'of', 'the', 'china–nepal', 'border', 'at', 'its', 'summit4', 'its', 'height', 'was', 'most', 'recently', 'measured', 'in', '2020', 'by', 'chinese', 'and', 'nepali', 'authorities', 'as', '884886', 'm', '29031', 'ft', '81⁄2', 'in56', 'mount', 'everest', 'attracts', 'many', 'climbers', 'including', 'highly', 'experienced', 'mountaineers', 'there', 'are', 'two', 'main', 'climbing', 'routes', 'one', 'approaching', 'the', 'summit', 'from', 'the', 'southeast', 'in', 'nepal', 'known', 'as', 'the', 'standard', 'route', 'and', 'the', 'other', 'from', 'the', 'north', 'in', 'tibet', 'while', 'not', 'posing', 'substantial', 'technical', 'climbing', 'challenges', 'on', 'the', 'standard', 'route', 'everest', 'prese

In [6]:
print("vocab:", vocab)
print("word2inx:", word2idx)

vocab: ['012', '016', '024', '0333', '04', '1', '10', '100', '1000', '10000', '10010', '10200', '10500', '108', '10day', '11', '1100', '11000', '1125', '1130', '115', '11980', '12', '120', '12000', '121', '124', '125', '12th', '130', '1300', '13000', '13796', '13800', '14', '140', '1400', '14000', '1480', '15', '150', '1500', '150000', '15072', '15260', '15302', '154', '16', '160', '16000', '16400', '16404', '16500', '169', '16990', '17', '170', '17100', '1721', '1733', '174', '17400', '175', '17700', '17th', '18', '18000', '1802', '1830s', '18400', '1847', '1849', '1852', '1854', '1856', '1857', '18652124', '1885', '1892', '19', '190', '1900s', '1902', '1905', '1909', '190litre', '1920s', '1921', '1921–2006', '1922', '1924', '1924276', '1925', '1933', '1936—tried', '1938', '1939', '19400', '1950', '1950s', '1952', '195220', '1953', '1953110', '195395', '1954', '1955', '1956', '1958', '19609', '1960–61', '1962', '1963', '19685', '1970', '1970s', '1973283', '1974', '1975', '1976', '1977

In [7]:
sequences = [[word2idx[w] for w in s.split()] for s in corpus]
sequences

[[2814,
  1665,
  2460,
  2595,
  728,
  3562,
  2249,
  2893,
  661,
  3311,
  2249,
  4087,
  2336,
  1562,
  2133,
  2815,
  518,
  3604,
  2551,
  2344,
  2561,
  2249,
  4043,
  2646,
  2143,
  3902,
  2975,
  4043,
  2147,
  661,
  2682,
  3083,
  2975,
  4043,
  1095,
  941,
  755,
  2349,
  3927,
  2349,
  2111,
  4314,
  2812,
  3365,
  2721,
  2249,
  194,
  1005,
  1096,
  661,
  2898,
  782,
  728,
  490,
  2635,
  302,
  1909,
  450,
  2251,
  2814,
  1665,
  776,
  2666,
  1134,
  2260,
  2136,
  1719,
  2834,
  4055,
  704,
  4183,
  2647,
  1139,
  3536,
  2994,
  694,
  4043,
  3921,
  1906,
  4043,
  3797,
  2249,
  2893,
  2460,
  728,
  4043,
  3838,
  3531,
  661,
  4043,
  3027,
  1906,
  4043,
  2934,
  2249,
  4087,
  4360,
  2946,
  3214,
  3906,
  4014,
  1139,
  1068,
  2992,
  4043,
  3838,
  3531,
  1665,
  3236,
  1338,
  3912,
  728,
  631,
  3704,
  4328,
  661,
  4378,
  728,
  4340,
  728,
  2097,
  1906,
  790,
  661,
  4043,
  2440,
  2209,
  728,
  

In [8]:
contexts = []
targets = []
window = 1

In [9]:
#only considering positions that have both left and right words

for seq in sequences:
  for i in range(window, len(seq)-window):
    context = [seq[i-1], seq[i+1]] #left and right
    target = seq[i]
    contexts.append(context)
    targets.append(target)

In [10]:
contexts

[[2814, 2460],
 [1665, 2595],
 [2460, 728],
 [2595, 3562],
 [728, 2249],
 [3562, 2893],
 [2249, 661],
 [2893, 3311],
 [661, 2249],
 [3311, 4087],
 [2249, 2336],
 [4087, 1562],
 [2336, 2133],
 [1562, 2815],
 [2133, 518],
 [2815, 3604],
 [518, 2551],
 [3604, 2344],
 [2551, 2561],
 [2344, 2249],
 [2561, 4043],
 [2249, 2646],
 [4043, 2143],
 [2646, 3902],
 [2143, 2975],
 [3902, 4043],
 [2975, 2147],
 [4043, 661],
 [2147, 2682],
 [661, 3083],
 [2682, 2975],
 [3083, 4043],
 [2975, 1095],
 [4043, 941],
 [1095, 755],
 [941, 2349],
 [755, 3927],
 [2349, 2349],
 [3927, 2111],
 [2349, 4314],
 [2111, 2812],
 [4314, 3365],
 [2812, 2721],
 [3365, 2249],
 [2721, 194],
 [2249, 1005],
 [194, 1096],
 [1005, 661],
 [1096, 2898],
 [661, 782],
 [2898, 728],
 [782, 490],
 [728, 2635],
 [490, 302],
 [2635, 1909],
 [302, 450],
 [1909, 2251],
 [450, 2814],
 [2251, 1665],
 [2814, 776],
 [1665, 2666],
 [776, 1134],
 [2666, 2260],
 [1134, 2136],
 [2260, 1719],
 [2136, 2834],
 [1719, 4055],
 [2834, 704],
 [4055, 4

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [13]:
contexts = torch.tensor(contexts, dtype=torch.long)
print(contexts)
targets = torch.tensor(targets, dtype=torch.long)
print(targets)

tensor([[2814, 2460],
        [1665, 2595],
        [2460,  728],
        ...,
        [ 508, 1860],
        [1846, 4438],
        [1860, 1712]])
tensor([1665, 2460, 2595,  ..., 1846, 1860, 4438])


In [14]:
class CBOWDataset(Dataset):
    def __init__(self, contexts, targets):
        self.contexts = contexts
        self.targets = targets
    def __len__(self):
        return len(self.targets)
    def __getitem__(self, idx):
        return self.contexts[idx], self.targets[idx]

In [15]:
dataset = CBOWDataset(contexts, targets)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [16]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.fc = nn.Linear(emb_dim, vocab_size)

    def forward(self, x):
        embeds = self.emb(x)
        mean = embeds.mean(dim=1)
        out = self.fc(mean)
        return out

In [17]:
emb_dim = 10
model = CBOW(vocab_size=vocab_size, emb_dim=emb_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [18]:
epochs = 150
for epoch in range(1, epochs + 1):
    total_loss = 0.0
    for ctx, tgt in loader:
        optimizer.zero_grad()
        logits = model(ctx)
        loss = criterion(logits, tgt)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 30 == 0 or epoch == 1:
        print(f"Epoch {epoch:3d} - avg loss: {total_loss / len(loader):.4f}")

Epoch   1 - avg loss: 7.4443
Epoch  30 - avg loss: 5.1006
Epoch  60 - avg loss: 5.0059
Epoch  90 - avg loss: 4.9779
Epoch 120 - avg loss: 4.9698
Epoch 150 - avg loss: 4.9589


In [19]:
embeddings = model.emb.weight.detach().numpy()
for word, idx in word2idx.items():
    print(word, embeddings[idx][:5])

012 [-7.6138287  -0.27113342 -2.9812272   6.3932      3.0532825 ]
016 [-4.8519907  6.6415625  6.2028866 -5.8396354  2.2239327]
024 [  2.660937    2.0845575  14.754251  -16.143867   14.002586 ]
0333 [-5.2738233 -5.9074507 -1.73651   -8.096121  14.505024 ]
04 [-15.643725   23.872934   -1.7131246 -11.455491    1.305082 ]
1 [-7.6129045  1.2436904  1.2686841  0.6445363  5.1196475]
10 [-1.9441477   0.06356593  0.13183631 -1.0983216   3.1355782 ]
100 [-6.3824515  1.6965698 -2.1596494 -1.4681567 -1.3712581]
1000 [-3.4085767   0.81032854  3.313774   -8.406659   -1.6686604 ]
10000 [-10.097041    -3.6316886   -0.81674767 -14.064377     2.4939635 ]
10010 [-19.521275   19.448265   -2.4739065 -16.145372    6.964392 ]
10200 [-11.961872  -14.687226   -3.4455082  17.737467   -1.9390184]
10500 [-20.468666   18.866333   15.198184  -19.275932    6.6723485]
108 [-26.427067   -7.4525595  -2.8760343   1.8806543  -0.7460125]
10day [-0.00774605 -4.335643    0.39742833  1.0698521   3.0690372 ]
11 [-5.581821   1

In [20]:
def predict(context_words):
    idxs = torch.tensor([[word2idx[w] for w in context_words]], dtype=torch.long)  # [1,2]
    with torch.no_grad():
        logits = model(idxs)             # [1, vocab_size]
        pred_idx = logits.argmax(dim=1).item()
    return idx2word[pred_idx]

In [22]:
print("Example prediction for context ['i', 'cats'] ->", predict(['mount', 'is']))
print("Example prediction for context ['cats', 'are'] ->", predict(['highest', 'mountain']))

Example prediction for context ['i', 'cats'] -> everest
Example prediction for context ['cats', 'are'] -> the
