In [27]:
! pip install requests bs4



In [28]:
# Getting data based on web scrapping

import requests
from bs4 import BeautifulSoup

# URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/My_Hero_Academia"

response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

paragraphs = soup.find_all('p')

all_corpus = "\n".join([para.get_text() for para in paragraphs])

print(type(all_corpus) , len(all_corpus))

<class 'str'> 68472


In [29]:
import numpy as np
from collections import Counter , defaultdict

In [30]:
# Preprocess corpus

def preprocess_corpus(corpus, min_count=5):
    tokens = corpus.split()
    word_counts = Counter(tokens)
    word_counts = {word: count for word, count in word_counts.items() if count >= min_count}
    word2idx = {word: idx for idx, (word, _) in enumerate(word_counts.items())}
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

word2idx, idx2word = preprocess_corpus(all_corpus)

In [31]:
# Co Occurance matrix
def create_cooccurrence_matrix(corpus, word2idx, window_size=2):
    tokens = corpus.split()
    cooccurrence_matrix = defaultdict(lambda: defaultdict(int))
    
    for i, word in enumerate(tokens):
        if word in word2idx:
            for j in range(max(0, i - window_size), min(len(tokens), i + window_size + 1)):
                if i != j and tokens[j] in word2idx:
                    cooccurrence_matrix[word2idx[word]][word2idx[tokens[j]]] += 1
    return cooccurrence_matrix

cooccurrence_matrix = create_cooccurrence_matrix(all_corpus, word2idx) 


In [32]:
def train_glove(cooccurrence_matrix, vector_size=50, iterations=100, learning_rate=0.05):
    vocab_size = len(cooccurrence_matrix)
    W = np.random.rand(vocab_size, vector_size)
    biases = np.random.rand(vocab_size)
    
    for _ in range(iterations):
        for i, j_dict in cooccurrence_matrix.items():
            for j, count in j_dict.items():
                weight = (count / 100.0) ** 0.75
                diff = np.dot(W[i], W[j]) + biases[i] + biases[j] - np.log(count)
                W[i] -= learning_rate * weight * diff * W[j]
                W[j] -= learning_rate * weight * diff * W[i]
                biases[i] -= learning_rate * weight * diff
                biases[j] -= learning_rate * weight * diff
    return W

word_vectors = train_glove(cooccurrence_matrix)

In [33]:
# Get word embedding

def get_word_vector(word, word2idx, word_vectors):
    idx = word2idx.get(word)
    if idx is not None:
        return word_vectors[idx]
    else:
        return None
    
word_to_get_vector_for = "Academia"
if word_to_get_vector_for in word2idx:
    print("word to get vector for is in the voacbularty")
else:
    print(f"{word_to_get_vector_for} is not in the vocabulary")


vector = get_word_vector(word_to_get_vector_for, word2idx, word_vectors)
print(vector)

word to get vector for is in the voacbularty
[-0.29164513 -0.05472709 -0.09473932  0.05362779  0.55323347  0.15294911
 -0.28119575 -0.34410333 -0.21068753  0.17891824  0.23723767  0.33059958
 -0.20836728  0.11458079 -0.08017352 -0.48794548  0.11523654 -0.45072525
 -0.55774467  0.26317767 -0.04299657  0.03259341  0.147565    0.64404846
 -0.33979364 -0.21353863  0.39576707 -0.17469242  0.06442587 -0.37370159
  0.18750833  0.89961775 -0.41840317  0.50746513 -0.15189318 -0.05070499
  0.26657242  0.38083355 -0.25773601  0.15303332  0.80749579  0.23801125
 -0.31436828  0.13813286  0.09107391 -0.40692326 -0.00196485  0.2496837
  0.40912805  0.13470439]
