<a href="https://colab.research.google.com/github/SiracencoSerghei/DataScienceHW/blob/main/example_kaggle/les_12/Module_12_1_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to NLP

Main building blocks of an NLP application:
- tokenization
- word embeddings
- sequence modeling
- common applications

### Lemmatization
 Lemmatization reduces words to their base or dictionary form (i.e., lemma).

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')  # https://www.nltk.org/api/nltk.corpus.reader.wordnet.html
# word_tokenize() needs the Punkt tokenizer data. Fetch it here so this cell
# runs on a fresh kernel instead of relying on a download in a later cell.
nltk.download('punkt')  # https://www.nltk.org/_modules/nltk/tokenize/punkt.html
# NOTE(review): recent NLTK releases may ask for 'punkt_tab' instead — confirm
# against the installed version.

# Sample text
text = "Running wild cats run in the forest. They ran away when they saw a pack of wolves."

# Tokenize the text into words
words = nltk.word_tokenize(text)

# Initialize WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize each word. No part-of-speech hint is passed, so per the NLTK docs
# every token is looked up as a noun; verb forms such as "ran" are therefore
# expected to come back unchanged.
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

print("Original words:", words)
print("Lemmatized words:", lemmatized_words)


### Stemming
Stemming is a text processing technique that reduces words to their base or root form, called a stem.
It works by stripping affixes (mostly suffixes) with heuristic rules, so the resulting stem is not always a valid dictionary word.

In [None]:
import nltk
from nltk.stem import SnowballStemmer

# Sample text
text = "Running wild cats run in the forest. They ran away when they saw a pack of wolves."

# Split the text into word and punctuation tokens
words = nltk.word_tokenize(text)

# Build the English Snowball stemmer
stemmer = SnowballStemmer('english')

# Run the stemmer over every token
stemmed_words = list(map(stemmer.stem, words))

print("Original words:", words)
print("Stemmed words:", stemmed_words)

## Tokenization

Splitting text into individual units (tokens), such as characters, words, sub-words, or n-grams.

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.util import ngrams
nltk.download('punkt')  # https://www.nltk.org/_modules/nltk/tokenize/punkt.html

# Sample text
text = "Tokenization is an important step in natural language processing."

# Character-level tokenization: every character (spaces and punctuation included) is a token
char_tokens = [character for character in text]
print("Character-level tokenization:", char_tokens)

# Word level tokenization
word_tokens = word_tokenize(text)
print("Word level tokenization:", word_tokens)

# N-gram level tokenization: windows of n consecutive word tokens
n = 2
n_grams = list(ngrams(word_tokens, n))
print("N-gram level tokenization (2-grams):", n_grams)


ByteLevelBPE as a Trained Tokenizer:
https://towardsdatascience.com/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0

In [None]:
# Trained tokenizer
# BPE ensures that the most common words are represented in the vocabulary as a single token while the rare words are broken down
# into two or more subword tokens and this is in agreement with what a subword-based tokenization algorithm does.

from tokenizers import ByteLevelBPETokenizer

# Sample text
text = """Tokenization is an important step in natural language processing.
Tokenization is an important step in NLP.
Tokenization is an important process.
Process of tokenizing text sequences is important."""

# Byte-level BPE tokenizer (the scheme used in GPT-2), created untrained
# https://towardsdatascience.com/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0
tokenizer = ByteLevelBPETokenizer()

# Learn the merges from the sample text, passed as a one-document corpus
tokenizer.train_from_iterator([text])

# Encode the same text and show the resulting sub-word tokens
encoded_text = tokenizer.encode(text)
subword_tokens = encoded_text.tokens
print("Sub-word level tokenization (Byte-pair encoding):", subword_tokens)


In [None]:
# Number of entries in the vocabulary of the tokenizer trained above.
len(tokenizer.get_vocab())

In [None]:
# Display the full vocabulary of the trained tokenizer.
tokenizer.get_vocab()

In [None]:
# Look up the token stored under id 260.
# NOTE(review): presumably chosen as one of the learned merges (byte-level BPE
# usually reserves the first 256 ids for single bytes) — verify against the
# vocabulary of the trained tokenizer.
tokenizer.id_to_token(260)

Training your own tokenizer: https://huggingface.co/learn/nlp-course/en/chapter6/8

In [None]:
# Training from scratch
## WordPiece Tokenizer: https://huggingface.co/learn/nlp-course/en/chapter6/6

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from tokenizers.pre_tokenizers import Whitespace


# Start from an untrained WordPiece model; "[UNK]" is its unknown-token marker
wordpiece_model = models.WordPiece(unk_token="[UNK]")
tokenizer = Tokenizer(wordpiece_model)

# Normalization pipeline applied in order: NFD, lowercase, strip accents
normalization_steps = [
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.StripAccents(),
]
tokenizer.normalizer = normalizers.Sequence(normalization_steps)

# Try the normalizer on a string that contains accents and capitals
print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))


In [None]:
# Attach the Whitespace pre-tokenizer to the tokenizer.
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
# Preview how the pre-tokenizer splits a sample sentence.
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

In [None]:
# Small training corpus for the WordPiece tokenizer
corpus = [
    "The quick brown fox jumps over the lazy dog",
    "The cat sits on the window sill",
    "The dog barks loudly in the night",
    "Birds of a feather flock together",
    "Tokenization is an important step in natural language processing.",
    "Tokenization is an important step in NLP.",
    "Tokenization is an important process.",
    "Process of tokenizing text sequences is important.",
]

# BERT-style special tokens handed to the trainer
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

# Train with a vocabulary budget of 1000 entries
trainer = trainers.WordPieceTrainer(
    vocab_size=1000,
    special_tokens=special_tokens,
)
tokenizer.train_from_iterator(corpus, trainer=trainer)

# Decoder that merges "##"-prefixed continuation pieces back into words
tokenizer.decoder = decoders.WordPiece(prefix="##")

# Persist the trained tokenizer.
# It can be reloaded with Tokenizer.from_file("./tokenizer-trained.json").
tokenizer.save("./tokenizer-trained.json")

In [None]:
# Encode a sample sentence with the tokenizer and show the token strings.
encoding = tokenizer.encode("The quick brown fox")
print(encoding.tokens)

In [None]:
# Integer ids of the encoded sentence.
encoding.ids

In [None]:
# Turn the ids back into text using the tokenizer's decoder.
tokenizer.decode(encoding.ids)

In [None]:
# Display the tokenizer's full vocabulary.
tokenizer.get_vocab()

In [None]:
# Size of the tokenizer's vocabulary.
tokenizer.get_vocab_size()

# Text Embeddings

https://www.deepset.ai/blog/the-beginners-guide-to-text-embeddings

## Sparse (розріджені) vectors: One hot encoding, Bag of words, Tf-idf

In [None]:
from nltk.tokenize import word_tokenize
import numpy as np
import nltk

# Sample text
text = "The quick brown fox jumps over the lazy dog"

# Lower-case the text, then split it into word tokens
tokens = word_tokenize(text.lower())
print("tokens", tokens)

# Vocabulary: the distinct tokens in alphabetical order
vocab = list(set(tokens))
vocab.sort()
print("vocab", vocab)

In [None]:
# Create one-hot encoding
# Build the token -> position lookup once instead of rescanning the list with
# vocab.index() for every token (vocab holds distinct tokens, so the positions
# are the same).
token_to_index = {entry: position for position, entry in enumerate(vocab)}

one_hot_encoded = []
for token in tokens:
    # One row per token: all zeros except a 1 at the token's vocabulary position
    one_hot_vector = [0] * len(vocab)
    one_hot_vector[token_to_index[token]] = 1
    one_hot_encoded.append(one_hot_vector)
one_hot_encoded

In [None]:
# Turn the one-hot rows into a 2-D numpy array (one row per token)
one_hot_encoded = np.array(one_hot_encoded)

# Report the inputs next to the encoded matrix
for label, value in (
    ("Original text:", text),
    ("Tokens:", tokens),
    ("Vocabulary:", vocab),
    ("One-hot encoded text:\n", one_hot_encoded),
):
    print(label, value)

In [None]:
# Bag of words

# Start with a zero count for every vocabulary entry
bow = np.zeros(len(vocab))

# Add one at the vocabulary position of each token occurrence
for position in map(vocab.index, tokens):
    bow[position] += 1

# Report the inputs next to the resulting count vector
for label, value in (
    ("Original text:", text),
    ("Tokens:", tokens),
    ("Vocabulary:", vocab),
    ("Bag of Words:\n", bow),
):
    print(label, value)

# The text is now encoded as a single count vector. This is known as a
# "bag of words" (BoW) embedding because the order of the words is ignored.

In [None]:
# TF-idf
# see module 7 lesson 1
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample corpus
corpus = [
    "The quick brown fox jumps over the lazy dog",
    "The cat sits on the window sill",
    "The dog barks loudly in the night",
    "Birds of a feather flock together",
]

# TF-IDF vectorizer with default settings
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and weights, and vectorize the corpus in one step
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Feature names, i.e. the learned vocabulary
vocab = tfidf_vectorizer.get_feature_names_out()

# Dense copy of the matrix, easier to print and inspect
tfidf_matrix_dense = tfidf_matrix.toarray()

# Report the corpus, the vocabulary and the matrix
for label, value in (
    ("Sample corpus:", corpus),
    ("Vocabulary:", vocab),
    ("TF-IDF matrix:\n", tfidf_matrix_dense),
):
    print(label, value)


### Cosine similarity


![cosine](https://storage.googleapis.com/lds-media/images/cosine-similarity-vectors.original.jpg)
Source: https://www.learndatasci.com/glossary/cosine-similarity/

In [None]:
# show vector similarity
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Sample vectors, reshaped straight away into 1 x n row matrices because the
# cosine_similarity call below is given 2-D inputs
vector1 = np.array([1, 2, 3]).reshape(1, -1)
vector2 = np.array([4, 5, 6]).reshape(1, -1)

# Pairwise cosine similarity; with one row each the result has a single entry
similarity = cosine_similarity(vector1, vector2)

print("Vector 1:", vector1)
print("Vector 2:", vector2)
print("Cosine similarity:", similarity[0][0])


![formula](https://miro.medium.com/v2/resize:fit:1400/format:webp/1*LfW66-WsYkFqWc4XYJbEJg.png)

https://github.com/greyhatguy007/deep-learning-specialization/blob/main/C5-sequence-models/week2/C5W2A1/Operations_on_word_vectors_v2a.ipynb

In [None]:
def cosine_similarity(u, v):
    """
    Cosine similarity reflects the degree of similarity between u and v.

    Note: this function has the same name as
    sklearn.metrics.pairwise.cosine_similarity, which other cells of this
    notebook import; whichever definition ran last is the one in effect.

    Arguments:
        u -- a word vector of shape (n,)
        v -- a word vector of shape (n,)

    Returns:
        float -- the cosine similarity between u and v. Identical vectors
        (including two all-zero vectors) give 1.0; if the product of the two
        norms is numerically zero the result is 0.0.
    """
    # Work in floating point so list inputs are accepted and integer dot
    # products cannot overflow.
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)

    # Special case. Consider the case u = [0, 0], v = [0, 0].
    # array_equal compares shape as well as values, so vectors of different
    # lengths are never treated as identical through broadcasting.
    if np.array_equal(u, v):
        return 1.0

    # Dot product and L2 norms
    dot = np.dot(u, v)
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)

    # Avoid division by 0
    if np.isclose(norm_product, 0, atol=1e-32):
        return 0.0

    return float(dot / norm_product)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

# Sample text vectors, stored as 1 x 2 row matrices for cosine_similarity
vector1 = np.array([1, 2]).reshape(1, -1)
vector2 = np.array([4, 5]).reshape(1, -1)

# Cosine similarity of the two vectors (single entry of the 1 x 1 result)
similarity = cosine_similarity(vector1, vector2)[0][0]

# Draw each vector as a line from the origin to its tip
fig, ax = plt.subplots(figsize=(8, 6))
for row, label in ((vector1[0], 'Vector 1'), (vector2[0], 'Vector 2')):
    ax.plot([0, row[0]], [0, row[1]], label=label, marker='o')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_title('Visualization of Text Vectors')
ax.legend()

# Write the similarity value onto the plot
ax.text(1, 4, f'Cosine Similarity: {similarity:.2f}', fontsize=12)

# Equal scaling on both axes so the angle between the vectors is not distorted
ax.set_aspect('equal', adjustable='box')

ax.grid()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# Sample corpus
corpus = [
    "The quick brown fox jumps over the lazy dog",
    "The cat sits on the window sill",
    "The dog barks loudly in the night",
    "Birds of a feather flock together"
]

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Convert TF-IDF matrix to dense array for easier manipulation
tfidf_matrix_dense = tfidf_matrix.toarray()

# Compute cosine similarity between the full TF-IDF vectors
cos_sim = cosine_similarity(tfidf_matrix_dense)

# Perform PCA to reduce dimensions to 2 for visualization
pca = PCA(n_components=2)
tfidf_matrix_pca = pca.fit_transform(tfidf_matrix_dense)

# Plot one graph per pair of text
num_pairs = len(corpus) * (len(corpus) - 1) // 2
# Two rows of subplots. Round the column count up so an odd number of pairs
# still fits; `num_pairs // 2` columns would leave the last pair without a slot.
num_cols = max(1, (num_pairs + 1) // 2)
plt.figure(figsize=(15, 10))
subplot_index = 1
for i in range(len(corpus)):
    for j in range(i + 1, len(corpus)):
        plt.subplot(2, num_cols, subplot_index)
        # All documents as points, with the current pair joined by a red line
        plt.scatter(tfidf_matrix_pca[:, 0], tfidf_matrix_pca[:, 1])
        plt.plot([tfidf_matrix_pca[i, 0], tfidf_matrix_pca[j, 0]],
                 [tfidf_matrix_pca[i, 1], tfidf_matrix_pca[j, 1]],
                 linestyle='-', color='red')
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')
        plt.title(f'Cosine Similarity: {cos_sim[i, j]:.2f}')
        plt.xticks([])
        plt.yticks([])
        subplot_index += 1
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# Sample corpus
corpus = [
    "The quick brown fox jumps over the lazy dog",
    "The cat sits on the window sill",
    "The dog barks loudly in the night",
    "Birds of a feather flock together",
]

# TF-IDF vectorizer with default settings
tfidf_vectorizer = TfidfVectorizer()

# Vectorize the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Dense copy of the TF-IDF matrix
tfidf_matrix_dense = tfidf_matrix.toarray()

# Document-by-document cosine similarity matrix
cos_sim = cosine_similarity(tfidf_matrix_dense)

# Print each unordered pair of documents once, together with its similarity
num_docs = len(corpus)
pair_indices = [(i, j) for i in range(num_docs) for j in range(i + 1, num_docs)]
for i, j in pair_indices:
    print(f'Text: {corpus[i]}, \n{corpus[j]}\nCosine Similarity: {cos_sim[i, j]:.2f}.\n')



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# Sample corpus
corpus = [
    "The quick brown fox jumps over the lazy dog",
    "The cat sits on the window sill",
    "The dog barks loudly in the night",
    "Birds of a feather flock together"
]

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Convert TF-IDF matrix to dense array for easier manipulation
tfidf_matrix_dense = tfidf_matrix.toarray()

# Perform PCA to reduce dimensions to 2 for visualization
pca = PCA(n_components=2)
tfidf_matrix_pca = pca.fit_transform(tfidf_matrix_dense)

# Compute cosine similarity on the 2-D PCA projections, not on the original
# TF-IDF vectors, so the number in each title describes the two lines drawn
# from the origin below.
cos_sim = cosine_similarity(tfidf_matrix_pca)

# Plot one graph per pair of text
num_pairs = len(corpus) * (len(corpus) - 1) // 2
# Two rows of subplots. Round the column count up so an odd number of pairs
# still fits; `num_pairs // 2` columns would leave the last pair without a slot.
num_cols = max(1, (num_pairs + 1) // 2)
plt.figure(figsize=(15, 10))
subplot_index = 1
for i in range(len(corpus)):
    for j in range(i + 1, len(corpus)):
        plt.subplot(2, num_cols, subplot_index)
        plt.scatter(tfidf_matrix_pca[:, 0], tfidf_matrix_pca[:, 1])
        # Document i in blue and document j in red, drawn from the origin
        plt.plot([0, tfidf_matrix_pca[i, 0]],
                 [0, tfidf_matrix_pca[i, 1]],
                 linestyle='-', color='blue', alpha=0.5)
        plt.plot([0, tfidf_matrix_pca[j, 0]],
                 [0, tfidf_matrix_pca[j, 1]],
                 linestyle='-', color='red', alpha=0.5)
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')
        plt.title(f'Cosine Similarity: {cos_sim[i, j]:.2f}.\nText: {corpus[i]}, \n{corpus[j]}')
        plt.xticks(np.arange(-0.5, 1.5, 0.5))
        plt.yticks(np.arange(-0.5, 1.5, 0.5))
        subplot_index += 1
plt.tight_layout()
plt.show()



## Word2Vec
https://www.tensorflow.org/text/tutorials/word2vec

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Sample corpus
corpus = [
    "The quick brown fox jumps over the lazy dog",
    "The cat sits on the window sill",
    "The dog barks loudly in the night",
    "Birds of a feather flock together"
]

# Tokenize the corpus
tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]

# Define Word2Vec model parameters
vector_size = 100  # Dimensionality of word vectors
window_size = 5  # Context window size
min_count = 1  # Minimum frequency count of words
w2v_epochs = 10  # Number of passes over the corpus

# Initialize the model WITHOUT passing `sentences`: handing the corpus to the
# constructor already builds the vocabulary and trains, so the explicit
# train() call below would train the model a second time.
model = Word2Vec(
    vector_size=vector_size,
    window=window_size,
    min_count=min_count,
    sg=1,  # Use Skip-gram model
    seed=42,  # Fixed seed for more repeatable vectors
    workers=1,  # A single worker avoids thread-ordering nondeterminism
)

# Build the vocabulary from the tokenized corpus
model.build_vocab(tokenized_corpus)

# Train Word2Vec model. compute_loss=True makes gensim track the training
# loss, so model.get_latest_training_loss() reports a real value instead of 0.
model.train(
    tokenized_corpus,
    total_examples=model.corpus_count,
    epochs=w2v_epochs,
    compute_loss=True,
)

# Save trained model
model.save("word2vec_model.bin")

# To load the trained model:
# loaded_model = Word2Vec.load("word2vec_model.bin")


In [None]:
# Loss recorded by the most recent Word2Vec training run.
# NOTE(review): gensim only tracks this when training ran with
# compute_loss=True; otherwise the value shown is 0 — confirm how the model
# was trained.
model.get_latest_training_loss()

In [None]:
# Words known to the Word2Vec model, listed by their index.
# Note: this rebinds the name `vocab` used by earlier cells.
vocab = model.wv.index_to_key
vocab

In [None]:
# Mapping from each known word to its index in the model.
model.wv.key_to_index

In [None]:
# Sample text to encode
text = "This fox is a very quick fox"

# Lower-case and tokenize the text
tokens = word_tokenize(text.lower())

# Collect the vector of every token the model knows; unknown tokens are skipped
word_vectors = [model.wv[word] for word in tokens if word in model.wv.key_to_index]

# Text encoding = mean of the word vectors, or all zeros when no token is known
encoded_text = (
    np.mean(word_vectors, axis=0)
    if word_vectors
    else np.zeros(model.vector_size)
)

# Print the encoded text
print("Encoded text:", encoded_text)

In [None]:
# Vector stored by the model for the word "fox".
model.wv["fox"]

In [None]:
# Inspect the per-word vectors gathered by the encoding cell that ran last.
word_vectors

In [None]:
# Sample corpus
corpus = [
    "The quick brown fox jumps over the lazy dog",
    "The cat sits on the window sill",
    "The dog barks loudly in the night",
    "Birds of a feather flock together",
]

# One row per document, one column per embedding dimension
embeddings = np.zeros((len(corpus), model.vector_size))

for i, document in enumerate(corpus):
    # Lower-case and tokenize the document
    tokens = word_tokenize(document.lower())

    # Keep the vectors of the tokens the model knows
    word_vectors = [model.wv[word] for word in tokens if word in model.wv.key_to_index]

    # Document encoding = mean word vector, or zeros when nothing is known
    encoded_text = (
        np.mean(word_vectors, axis=0)
        if word_vectors
        else np.zeros(model.vector_size)
    )
    embeddings[i] = encoded_text

print(embeddings)


In [None]:
# Document-by-document cosine similarity of the averaged embeddings
cos_sim = cosine_similarity(embeddings)

# Print each unordered pair of documents once, together with its similarity
num_docs = len(corpus)
pair_indices = [(i, j) for i in range(num_docs) for j in range(i + 1, num_docs)]
for i, j in pair_indices:
    print(f'Text: {corpus[i]}, \n{corpus[j]}\nCosine Similarity: {cos_sim[i, j]:.2f}.\n')

In [None]:
# Display the full pairwise similarity matrix.
cos_sim

In [None]:
# Training of Skip-gram model
# source: https://colab.research.google.com/drive/1IxAnnFSqk3mL3A8n1PKYWdEzDSd2Y9rF?usp=sharing#scrollTo=13xBa01XEnpb

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten

# Toy training data: three pre-tokenized sentences
sentences = [["I", "love", "machine", "learning"],
             ["I", "like", "deep", "learning"],
             ["I", "enjoy", "neural", "networks"]]

# Build (target, context) pairs: every word is paired with each neighbour that
# lies at most `window_size` positions away in the same sentence
skip_gram_pairs = []
window_size = 2

for sentence in sentences:
    for position, target_word in enumerate(sentence):
        first = max(0, position - window_size)
        last = min(len(sentence), position + window_size + 1)
        skip_gram_pairs.extend(
            (target_word, sentence[neighbor])
            for neighbor in range(first, last)
            if neighbor != position
        )

# Give every distinct word an integer id in order of first appearance
word_to_index = {}
for sentence in sentences:
    for word in sentence:
        word_to_index.setdefault(word, len(word_to_index))
index_to_word = {idx: word for word, idx in word_to_index.items()}

# The same pairs, expressed as integer ids
skip_gram_pairs_indices = [
    (word_to_index[target], word_to_index[context])
    for target, context in skip_gram_pairs
]

# Model: embedding lookup -> flatten -> softmax over the vocabulary.
# Note: this rebinds `model`, which earlier in this notebook is the gensim
# Word2Vec model.
vocab_size = len(word_to_index)
embedding_dim = 10

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=1),
    Flatten(),
    Dense(vocab_size, activation='softmax'),
])

# Compile, then split the pairs into model inputs (targets) and labels (contexts)
model.compile(loss='categorical_crossentropy', optimizer='adam')
target_words = np.array([target for target, _ in skip_gram_pairs_indices])
context_words = np.array([context for _, context in skip_gram_pairs_indices])

# Labels are one-hot encoded to match the categorical cross-entropy loss
model.fit(target_words, tf.keras.utils.to_categorical(context_words, num_classes=vocab_size), epochs=100)

# The first weight matrix of the model holds the word embeddings
embeddings = model.get_weights()[0]

# Print the embedding learned for every word
for idx, word in index_to_word.items():
    embedding = embeddings[idx]
    print(f"Word: {word}, Embedding: {embedding}")

### Pretrained embeddings: GloVe

https://nlp.stanford.edu/projects/glove/

https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-python-d38905f356db

In [None]:
# Read the GloVe text file into a word -> vector dictionary.
# Each line is split on whitespace: the first field is used as the word and
# the remaining fields are converted to a float32 vector.
embeddings_dict = {}
with open("glove.6B.50d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        fields = line.split()
        embeddings_dict[fields[0]] = np.asarray(fields[1:], "float32")

In [None]:
# Length of the vector stored for "the", i.e. the dimensionality of the loaded embeddings.
len(embeddings_dict["the"])

In [None]:
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def find_closest_embeddings(embedding):
    """Return every word in `embeddings_dict`, ordered nearest first.

    Arguments:
        embedding -- the query vector; it is compared with each stored vector
                     using Euclidean distance.

    Returns:
        A list with all keys of the module-level `embeddings_dict`, sorted by
        increasing distance to `embedding`.
    """
    def distance_to_query(word):
        return spatial.distance.euclidean(embeddings_dict[word], embedding)

    return sorted(embeddings_dict, key=distance_to_query)


In [None]:
# Nine nearest words to "king". Index 0 is skipped — presumably because the
# closest entry to a stored vector is that word itself (distance 0).
find_closest_embeddings(embeddings_dict["king"])[1:10]

In [None]:
# Analogy query: king - man + woman.
# NOTE(review): [1:5] drops the single closest word. For an arithmetic query
# the top hit is not guaranteed to be one of the input words, so confirm the
# expected answer is not being skipped.
find_closest_embeddings(
        embeddings_dict["king"] - embeddings_dict["man"] + embeddings_dict["woman"]
)[1:5]

In [None]:
# Analogy query: ukraine - country + city; shows the 2nd to 7th closest words
# (the closest one is skipped by the slice).
find_closest_embeddings(
        embeddings_dict["ukraine"] - embeddings_dict["country"] + embeddings_dict["city"]
)[1:7]

In [None]:
# Analogy query: germany - country + city; shows the 2nd to 5th closest words
# (the closest one is skipped by the slice).
find_closest_embeddings(
        embeddings_dict["germany"] - embeddings_dict["country"] + embeddings_dict["city"]
)[1:5]

In [None]:
# Analogy query: cat - home + wild; shows the 2nd to 5th closest words
# (the closest one is skipped by the slice).
find_closest_embeddings(
        embeddings_dict["cat"] - embeddings_dict["home"] + embeddings_dict["wild"]
)[1:5]

In [None]:
# Analogy query: kiev - ukraine + germany; shows the 2nd to 5th closest words
# (the closest one is skipped by the slice).
find_closest_embeddings(
        embeddings_dict["kiev"] - embeddings_dict["ukraine"] + embeddings_dict["germany"]
)[1:5]

In [None]:
# Analogy query: bird - air + water; shows the 2nd to 5th closest words
# (the closest one is skipped by the slice).
find_closest_embeddings(
        embeddings_dict["bird"] - embeddings_dict["air"] + embeddings_dict["water"]
)[1:5]

In [None]:
# Analogy query: windows - microsoft + apple; shows the 2nd to 5th closest
# words (the closest one is skipped by the slice).
find_closest_embeddings(
        embeddings_dict["windows"] - embeddings_dict["microsoft"] + embeddings_dict["apple"]
)[1:5]

In [None]:
# Sample corpus
corpus = [
    "The quick brown fox jumps over the lazy dog",
    "The cat sits on the window sill",
    "The dog barks loudly in the night",
    "Birds of a feather flock together"
]

# Take the dimensionality from the loaded vectors instead of hard-coding 50,
# so the cell keeps working if a different GloVe file is loaded.
glove_dim = len(next(iter(embeddings_dict.values())))

embeddings = np.zeros((len(corpus), glove_dim))

for i, document in enumerate(corpus):
    # Tokenize the text
    tokens = word_tokenize(document.lower())

    # Encode the text: keep the GloVe vector of every token found in the dictionary
    word_vectors = [embeddings_dict[word] for word in tokens if word in embeddings_dict]

    # Average the word vectors to get the text encoding
    if word_vectors:
        encoded_text = np.mean(word_vectors, axis=0)
    else:
        # Default encoding for text without any known token. The original used
        # `model.vector_size`, but `model` is the Keras model at this point
        # (no such attribute) and never had the GloVe dimensionality.
        encoded_text = np.zeros(glove_dim)
    embeddings[i] = encoded_text

print(embeddings)

In [None]:
# Document-by-document cosine similarity of the averaged GloVe embeddings
cos_sim = cosine_similarity(embeddings)

# Print each unordered pair of documents once, together with its similarity
num_docs = len(corpus)
pair_indices = [(i, j) for i in range(num_docs) for j in range(i + 1, num_docs)]
for i, j in pair_indices:
    print(f'Text: {corpus[i]}, \n{corpus[j]}\nCosine Similarity: {cos_sim[i, j]:.2f}.\n')


In [None]:
# Perform PCA to reduce dimensions to 2 for visualization
pca = PCA(n_components=2)
matrix_pca = pca.fit_transform(embeddings)
# Compute cosine similarity on the 2-D PCA projections (not on the original
# embeddings), so the number in each title describes the two lines drawn from
# the origin below.
cos_sim = cosine_similarity(matrix_pca)


# Plot one graph per pair of text
num_pairs = len(corpus) * (len(corpus) - 1) // 2
# Two rows of subplots. Round the column count up so an odd number of pairs
# still fits; `num_pairs // 2` columns would leave the last pair without a slot.
num_cols = max(1, (num_pairs + 1) // 2)
plt.figure(figsize=(15, 10))
subplot_index = 1
for i in range(len(corpus)):
    for j in range(i + 1, len(corpus)):
        plt.subplot(2, num_cols, subplot_index)
        plt.scatter(matrix_pca[:, 0], matrix_pca[:, 1])
        # Document i in blue and document j in red, drawn from the origin
        plt.plot([0, matrix_pca[i, 0]],
                 [0, matrix_pca[i, 1]],
                 linestyle='-', color='blue', alpha=0.5)
        plt.plot([0, matrix_pca[j, 0]],
                 [0, matrix_pca[j, 1]],
                 linestyle='-', color='red', alpha=0.5)
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')
        plt.title(f'Cosine Similarity: {cos_sim[i, j]:.2f}.\nText: {corpus[i]}, \n{corpus[j]}')
        plt.xticks(np.arange(-0.5, 1.5, 0.5))
        plt.yticks(np.arange(-0.5, 1.5, 0.5))
        subplot_index += 1
plt.tight_layout()
plt.show()

In [None]:
# Training Glove Embeddings
# Source: https://colab.research.google.com/drive/1IxAnnFSqk3mL3A8n1PKYWdEzDSd2Y9rF?usp=sharing#scrollTo=DpONsaktz-8w
# https://medium.com/nerd-for-tech/implementing-glove-from-scratch-word-embedding-for-transformers-95503138d65
#
import numpy as np
from collections import defaultdict

corpus = [
    "I love chocolate",
    "I love ice cream",
    "I enjoy playing tennis",
]
# Vocabulary (set of distinct words) and co-occurrence weights keyed by
# (word, context_word)
vocab = set()
co_occurrence = defaultdict(float)

window_size = 4
# Walk every sentence: each word is paired with every other word at most
# `window_size` positions away, weighted by the inverse of their distance
for sentence in corpus:
    words = sentence.split()
    vocab.update(words)
    for i, word in enumerate(words):
        window_start = max(0, i - window_size)
        window_end = min(i + window_size + 1, len(words))
        for j in range(window_start, window_end):
            if j == i:
                continue
            co_occurrence[(word, words[j])] += 1.0 / abs(i - j)

In [None]:
# Display the co-occurrence weights keyed by (word, context word).
co_occurrence

In [None]:
embedding_dim = 10

# Seeded generator plus a sorted vocabulary: the initial vectors (and so the
# printed losses) are the same on every run. The original drew from the global
# unseeded RNG while iterating over a set, whose order can change between runs.
rng = np.random.default_rng(42)
word_embeddings = {
    word: rng.standard_normal(embedding_dim) for word in sorted(vocab)
}

learning_rate = 0.1
num_epochs = 100

# Gradient descent to update word embeddings.
# Objective per pair: 0.5 * (w_i . w_j - log(count_ij))**2. Only w_i is
# updated for a given (word_i, word_j) entry.
for epoch in range(num_epochs):
    total_loss = 0
    for (word_i, word_j), observed_count in co_occurrence.items():
        # Calculate dot product of word embeddings
        dot_product = np.dot(word_embeddings[word_i], word_embeddings[word_j])

        # Difference between the prediction and the log co-occurrence weight
        diff = dot_product - np.log(observed_count)
        total_loss += 0.5 * diff**2

        # Gradient of the pair loss with respect to w_i, then the update step
        gradient = diff * word_embeddings[word_j]
        word_embeddings[word_i] -= learning_rate * gradient

    print(f"Epoch: {epoch+1}, Loss: {total_loss}")