<strong><h3>Design of word embeddings using co-occurrence matrix.
</h3></strong>

In [2]:
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import normalize


In [3]:
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]


In [4]:
window_size = 2  
co_occurrence_matrix = defaultdict(int)

for text in corpus:
    tokens = text.lower().split()
    for i, word in enumerate(tokens):
        for j in range(max(i - window_size, 0), min(i + window_size + 1, len(tokens))):
            if i != j:
                co_occurrence_matrix[(word, tokens[j])] += 1

# Create vocabulary
vocab = sorted(set(word for context_pair in co_occurrence_matrix.keys() for word in context_pair))



In [8]:
vocab

['and',
 'document',
 'document.',
 'document?',
 'first',
 'is',
 'one.',
 'second',
 'the',
 'third',
 'this']

In [5]:

# Construct co-occurrence matrix
num_words = len(vocab)
co_occurrence_matrix_np = np.zeros((num_words, num_words))

for i, word1 in enumerate(vocab):
    for j, word2 in enumerate(vocab):
        co_occurrence_matrix_np[i, j] = co_occurrence_matrix[(word1, word2)]


In [6]:
# SVD for dimensionality reduction
U, _, _ = np.linalg.svd(co_occurrence_matrix_np)

# Extract word embeddings
word_embeddings = U[:, :100]  # Assuming 100 dimensions

# Normalize embeddings
word_embeddings = normalize(word_embeddings, axis=1, norm='l2')


In [7]:
words_of_interest = ['document', 'this', 'first']
for word in words_of_interest:
    index = vocab.index(word)
    embedding = word_embeddings[index]
    print(f"Word: {word}, Embedding: {embedding}")


Word: document, Embedding: [-0.17107766  0.0244336   0.02566876  0.21928474  0.22681178  0.04574579
 -0.25154451 -0.03144657  0.57120788  0.1301905   0.67850052]
Word: this, Embedding: [-0.47517019 -0.17300101  0.72385866  0.33809759 -0.17658999  0.12582661
  0.17666056  0.04350924 -0.12145046 -0.10506454 -0.00977071]
Word: first, Embedding: [-0.25890911  0.0774463   0.074289   -0.36249141  0.70593064  0.22354213
 -0.00367718  0.37374086 -0.06116064 -0.30338755 -0.07911846]
