##### T1. For an arbitrary text used for training, write functions that perform data preparation:
- tokenize the text into sentences and tokens
- build a vocabulary (bag of words): map words to ids and ids to words
- generate (target_word, context_word) pairs using a context window of size N
- build the training set: from all pairs, build the input data X as one-hot encoded target words and the output y as one-hot encoded context words

In [1]:
import re
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import plotly.express as px

In [2]:
# read text
# A context manager closes the file handle as soon as the text has been read,
# instead of leaving it open until garbage collection.
with open('../lesson1/text.txt') as f:
    t = f.read()
print(t)

Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions. Recently, artificial neural networks have been able to surpass many previous approaches in performance.Machine learning approaches have been applied to many fields including natural language processing, computer vision, speech recognition, email filtering, agriculture, and medicine. ML is known in its application across business problems under the name predictive analytics. Although not all machine learning is statistically based, computational statistics is an important source of the field's methods.The mathematical foundations of ML are provided by mathematical optimization (mathematical programming) methods. Data mining is a related (parallel) field of study, focusing on exploratory data analysis (EDA) through unsupervised learning. From a t

In [3]:
def tokenize(t, lemmatization_mode=False):
    """Split raw text into sentences of lower-cased alphabetic tokens.

    Args:
        t: Raw text to tokenize.
        lemmatization_mode: When truthy, every kept token is additionally
            passed through ``WordNetLemmatizer.lemmatize``.

    Returns:
        A list with one entry per sentence returned by ``sent_tokenize``.
        Each entry is the list of that sentence's tokens for which
        ``str.isalpha()`` is true (punctuation, numbers and mixed tokens are
        dropped), lower-cased.
    """
    sentences = [
        [word.lower() for word in word_tokenize(sent) if word.isalpha()]
        for sent in sent_tokenize(t)
    ]
    if not lemmatization_mode:
        return sentences

    # Lemmatize the already lower-cased tokens. Lemmatizing first and
    # lower-casing afterwards leaves capitalised words (e.g. at the start of
    # a sentence) un-lemmatized.
    # NOTE(review): this relies on the WordNet lookup being case-sensitive -
    # confirm against the NLTK documentation.
    lemmatizer = WordNetLemmatizer()
    return [[lemmatizer.lemmatize(word) for word in sent] for sent in sentences]

def bag_w(w):
    """Build the word <-> id vocabulary mappings for a tokenized corpus.

    Args:
        w: Iterable of sentences, each an iterable of word strings.

    Returns:
        A tuple ``(w_id, id_w)``: ``w_id`` maps each word to an integer id and
        ``id_w`` is the inverse mapping. Id 0 is reserved for the ``'unk'``
        placeholder; the remaining words receive ids ``1..V`` in sorted
        order, so the ids are contiguous and reproducible between runs.
    """
    unk_token = 'unk'
    vocabulary = {word for sent in w for word in sent}
    # If the corpus itself contains the placeholder, it must keep id 0.
    # Giving it a second id would leave a gap in ``w_id`` and push the
    # largest id to ``len(w_id)``, which is out of range for one-hot
    # vectors of length ``len(w_id)``.
    vocabulary.discard(unk_token)

    w_id = {unk_token: 0}
    id_w = {0: unk_token}
    # Sorting replaces the arbitrary set iteration order, which can differ
    # from one interpreter run to the next.
    for index, word in enumerate(sorted(vocabulary), start=1):
        w_id[word] = index
        id_w[index] = word

    return w_id, id_w

def one_hot_encode(id, vocab):
    """Return a one-hot list for position ``id``.

    Args:
        id: Index of the entry to set to 1.
        vocab: Any sized container; only ``len(vocab)`` is used, to fix the
            length of the returned list.

    Returns:
        A list of ``len(vocab)`` integers, all 0 except a single 1 at ``id``.
    """
    size = len(vocab)
    vector = [0 for _ in range(size)]
    vector[id] = 1
    return vector

def generate_pairs(w, window):
    """Collect (target_word, context_word) pairs from tokenized sentences.

    Args:
        w: List of sentences, each a list of words.
        window: Maximum number of positions to the left and to the right of
            the target from which context words are taken.

    Returns:
        A list of ``(target, context)`` tuples. For every target the
        left-hand neighbours come first (in sentence order), followed by the
        right-hand neighbours. Pairs never cross a sentence boundary.
    """
    pairs = []
    for sent in w:
        last = len(sent)
        for pos, target in enumerate(sent):
            lo = max(0, pos - window)
            hi = min(last, pos + 1 + window)
            # Neighbour indices on both sides of ``pos``, excluding ``pos``.
            neighbours = [*range(lo, pos), *range(pos + 1, hi)]
            pairs.extend((target, sent[k]) for k in neighbours)

    return pairs

def create_train_set(pairs, w_id):
    """Turn (target, context) word pairs into one-hot encoded arrays.

    Args:
        pairs: Iterable of ``(target_word, context_word)`` tuples.
        w_id: Mapping from word to integer id; its length sets the width of
            every one-hot row.

    Returns:
        A tuple ``(x, y)`` of NumPy arrays built from the per-pair one-hot
        lists: ``x`` encodes the target words and ``y`` the context words,
        in the same order as ``pairs``.
    """
    encoded = [
        (one_hot_encode(w_id[target], w_id), one_hot_encode(w_id[context], w_id))
        for target, context in pairs
    ]
    inputs = [target_row for target_row, _ in encoded]
    outputs = [context_row for _, context_row in encoded]

    return np.array(inputs), np.array(outputs)

def preprocess(t, l_mode, window):
    """Run the full data-preparation pipeline on raw text.

    Args:
        t: Raw training text.
        l_mode: Passed to ``tokenize`` as ``lemmatization_mode``.
        window: Context window size passed to ``generate_pairs``.

    Returns:
        A tuple ``(w, w_id, id_w, pairs, x, y)``: the tokenized sentences,
        the word->id and id->word mappings, the (target, context) pairs, and
        the one-hot input and output sets as float tensors.
    """
    # Forward the caller's settings; they were previously ignored in favour
    # of a hard-coded ``False`` and ``2``.
    w = tokenize(t, lemmatization_mode=l_mode)
    w_id, id_w = bag_w(w)
    pairs = generate_pairs(w, window=window)
    x, y = create_train_set(pairs, w_id)

    return w, w_id, id_w, pairs, torch.tensor(x, dtype=torch.float), torch.tensor(y, dtype=torch.float)

In [4]:
# run
# Build the vocabulary, the (target, context) pairs and the one-hot training
# tensors from the text loaded above; later cells read these globals.
w, w_id, id_w, pairs, x, y = preprocess(t=t, l_mode=False, window=2)
# x has one row per pair and one column per vocabulary id.
print(x.size())

torch.Size([13276, 1042])


##### T2. Train the model
- build a NN consisting of 2 linear layers (i.e. the word-embedding and context-embedding matrices) and a sigmoid output function
- train your model on your training set. Plot the learning curve to make sure your NN actually learns something.

In [5]:
class W2V(nn.Module):
    """Two bias-free weight matrices applied one after the other.

    ``layer1`` has shape ``(len_vocab, len_embedding)`` and ``layer2`` has
    shape ``(len_embedding, len_vocab)``; both start from ``torch.randn``
    values and are trainable parameters.
    """

    def __init__(self, len_vocab, len_embedding):
        super().__init__()
        self.len_vocab, self.len_embedding = len_vocab, len_embedding
        # layer1 is drawn and registered before layer2, so it stays the
        # first entry of ``parameters()``.
        first_init = torch.randn(len_vocab, len_embedding)
        second_init = torch.randn(len_embedding, len_vocab)
        self.layer1 = nn.Parameter(first_init)
        self.layer2 = nn.Parameter(second_init)

    def forward(self, x):
        """Return ``x`` matrix-multiplied by ``layer1`` and then ``layer2``."""
        return (x @ self.layer1) @ self.layer2

In [18]:
# Model size: one input/output unit per vocabulary entry (including 'unk').
len_vocab = len(w_id)
len_embedding = 10

# Optimisation settings.
learning_rate = 0.1
epochs = 100

model = W2V(len_vocab, len_embedding)
# The loss receives the raw model outputs and the one-hot float targets y.
loss_fn = nn.CrossEntropyLoss()  
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Loss value recorded once per epoch, used for the learning curve below.
loss_list = []

# Full-batch training: every epoch is a single optimizer step over all of x.
for epoch in tqdm(range(epochs)):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    loss_list.append(loss.item())

    # Clear the gradients of the previous step before back-propagating.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

100%|██████████| 100/100 [00:08<00:00, 11.66it/s]


In [19]:
# Learning curve: the loss recorded at each epoch of the training loop.
fig = px.line(x=range(epochs), y=loss_list, markers=True)
fig.update_layout(xaxis_title='epoch',yaxis_title='loss')
fig.show()

##### T3. Print most similar words:
- from your model, extract the target embedding matrix and use it to map all the words from your vocabulary to embedding vectors
- for any given word, compute the most similar words using cosine similarity


In [54]:
def similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    The product ``vec1 @ vec2`` is divided by the product of the two norms
    computed with ``torch.norm``. No guard is applied for zero-length
    vectors.
    """
    dot_product = vec1 @ vec2
    norm_product = torch.norm(vec1) * torch.norm(vec2)
    return dot_product / norm_product

def find_similar(word1, w_vec, num=10):
    """Return the words whose vectors are most similar to that of ``word1``.

    Args:
        word1: Query word.
        w_vec: Mapping from word to its embedding vector.
        num: Maximum number of neighbours to return.

    Returns:
        A list of up to ``num`` ``(word, similarity)`` tuples sorted by
        decreasing similarity, excluding ``word1`` itself. If ``word1`` is
        not a key of ``w_vec``, a message is printed and an empty list is
        returned.
    """
    # Guard clause: the search used to be nested inside this branch, so a
    # known word reached the return with no result bound and an unknown
    # word failed on the lookup right after the message.
    if word1 not in w_vec:
        print('Word is not in the dictionary!')
        return []

    vec1 = w_vec[word1]
    sim = {
        word2: similarity(vec1, vec2)
        for word2, vec2 in w_vec.items()
        if word2 != word1
    }
    sim_sort = sorted(sim.items(), key=lambda item: item[1], reverse=True)

    return sim_sort[:num]

In [47]:
# Read the first weight matrix by name rather than by its position in
# parameters(), which depends on the order the parameters were registered.
vectors = model.layer1.detach()

In [48]:
# Map every vocabulary word to the row of `vectors` at that word's id.
w_vec = {word: vectors[index] for word, index in w_id.items()}

In [53]:
# Show the 5 nearest neighbours of 'learning' by cosine similarity.
find_similar('learning', w_vec, num=5)

[('machine', tensor(0.9705)),
 ('algorithms', tensor(0.8865)),
 ('multilinear', tensor(0.8490)),
 ('term', tensor(0.8489)),
 ('subspace', tensor(0.8126))]