In [8]:
! pip install requests bs4



In [9]:
# Getting data based on web scrapping

import requests
from bs4 import BeautifulSoup

# URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/My_Hero_Academia"

response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

paragraphs = soup.find_all('p')

all_corpus = "\n".join([para.get_text() for para in paragraphs])

print(type(all_corpus) , len(all_corpus))

<class 'str'> 68472


In [10]:
import numpy as np
from collections import Counter , defaultdict

In [11]:
# Preprocess corpus

def preprocess_corpus(corpus, min_count=5):
    tokens = corpus.split()
    word_counts = Counter(tokens)
    word_counts = {word: count for word, count in word_counts.items() if count >= min_count}
    word2idx = {word: idx for idx, (word, _) in enumerate(word_counts.items())}
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

word2idx, idx2word = preprocess_corpus(all_corpus)

In [12]:
# Co Occurance matrix
def create_cooccurrence_matrix(corpus, word2idx, window_size=2):
    tokens = corpus.split()
    cooccurrence_matrix = defaultdict(lambda: defaultdict(int))
    
    for i, word in enumerate(tokens):
        if word in word2idx:
            for j in range(max(0, i - window_size), min(len(tokens), i + window_size + 1)):
                if i != j and tokens[j] in word2idx:
                    cooccurrence_matrix[word2idx[word]][word2idx[tokens[j]]] += 1
    return cooccurrence_matrix

cooccurrence_matrix = create_cooccurrence_matrix(all_corpus, word2idx) 


In [13]:
def train_glove(cooccurrence_matrix, vector_size=50, iterations=100, learning_rate=0.05):
    vocab_size = len(cooccurrence_matrix)
    W = np.random.rand(vocab_size, vector_size)
    biases = np.random.rand(vocab_size)
    
    for _ in range(iterations):
        for i, j_dict in cooccurrence_matrix.items():
            for j, count in j_dict.items():
                weight = (count / 100.0) ** 0.75
                diff = np.dot(W[i], W[j]) + biases[i] + biases[j] - np.log(count)
                W[i] -= learning_rate * weight * diff * W[j]
                W[j] -= learning_rate * weight * diff * W[i]
                biases[i] -= learning_rate * weight * diff
                biases[j] -= learning_rate * weight * diff
    return W

word_vectors = train_glove(cooccurrence_matrix)

In [14]:
# Get word embedding

def get_word_vector(word, word2idx, word_vectors):
    idx = word2idx.get(word)
    if idx is not None:
        return word_vectors[idx]
    else:
        return None
    
word_to_get_vector_for = "Academia"
if word_to_get_vector_for in word2idx:
    print(f"word to get vector for {word_to_get_vector_for} is in the voacbularty")
else:
    print(f"{word_to_get_vector_for} is not in the vocabulary")


vector = get_word_vector(word_to_get_vector_for, word2idx, word_vectors)
print(vector)

word to get vector for Academia is in the voacbularty
[-0.09372459 -0.0020299  -0.63604222  0.01376847  0.47278176 -0.30102091
  0.06926416 -1.00278841  0.45463025  0.17976685 -0.15200502 -0.01123583
  0.49683294 -0.08055328  0.35353427  0.23605113 -0.03273169  0.30453993
 -0.42990099 -0.53206754  0.18329484  0.26215612 -0.07335393  0.26247792
 -0.21117039  0.13433911 -0.04140806 -0.42416936  0.3312708  -0.39452095
  0.80411539  0.00320571  0.29179658 -0.06893394  0.15645514  0.35270605
 -0.04352232 -0.10262123  0.32774533 -0.11925565  0.7471698   0.01437103
 -0.20195146 -0.25817752  0.09304465  0.10679811  0.37994778 -0.01433488
 -0.09143073 -0.03408371]
