In [1]:
from nltk.lm import Vocabulary
import os
import string
from collections import defaultdict

data_dir = "data"

In [2]:
# Download the E2E-RAG repository and move its data/ folder into the working directory,
# where the `data_dir = "data"` setting expects to find it.
# NOTE(review): re-running this cell after a successful run will presumably fail
# (E2E-RAG/ already exists and E2E-RAG/data has already been moved) — confirm whether a guard is wanted.
! git clone https://github.com/sathya-pramodh/E2E-RAG.git
! mv E2E-RAG/data ./

Cloning into 'E2E-RAG'...
remote: Enumerating objects: 392, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 392 (delta 10), reused 30 (delta 6), pack-reused 356[K
Receiving objects: 100% (392/392), 57.53 MiB | 31.88 MiB/s, done.
Resolving deltas: 100% (10/10), done.


In [3]:
# Load the corpus: read every file in data_dir and split each line into rough
# sentences on '.'. The fragments are still raw (mixed case, punctuation,
# surrounding whitespace) at this point.
files = os.listdir(path=data_dir)
all_lines = []
for file_name in files:
    file_path = os.path.join(data_dir, file_name)
    # Decode explicitly as UTF-8: the corpus contains non-ASCII characters
    # (curly quotes, accented letters, non-breaking spaces) and the platform
    # default encoding is not guaranteed to handle them.
    with open(file_path, encoding="utf-8") as handle:
        # Iterating the handle streams lines instead of materialising them all.
        for line in handle:
            all_lines.extend(line.strip().split('.'))

In [4]:
all_lines[:10], len(all_lines)

(['Travian is now migrating its business intelligence systems to AWS using Amazon Redshift, which uses SQL to analyze structured and semistructured data across data warehouses, operational databases, and data lakes',
  ' Using data analytics, Travian will be able to analyze player behavior in the game based on the 11 TB of data that it collects each month and make improvements',
  ' “It used to be impossible for us to do this at this scale,” says Strathaus',
  ' “We’re looking forward to using analytics to improve our games further on AWS',
  '”',
  'Français',
  'Amazon Elastic Kubernetes Service (Amazon EKS) is a managed Kubernetes service to run Kubernetes in the AWS cloud and on-premises data centers',
  '  Learn more\xa0»',
  'Travian needed a more stable service that could handle Kubernetes',
  ' The studio was initially hesitant to use AWS because the offerings from AWS are so vast that Travian worried it would be overwhelming'],
 82163)

In [5]:
# Normalise every sentence in one pass: remove ASCII punctuation, trim
# surrounding whitespace, lower-case, and drop fragments shorter than 3 characters.
#
# The previous version called all_lines.pop(i) while enumerating all_lines.
# Each removal shifts the remaining items left, so the element right after a
# removed one was skipped (neither cleaned nor filtered) — which is why the loop
# had to be repeated several times. Building a new list avoids that entirely,
# and the result is idempotent, so re-running the cell is harmless.
#
# NOTE(review): string.punctuation is ASCII-only, so characters such as “ ” ’ »
# are kept — confirm whether they should be stripped as well.
# https://stackoverflow.com/a/60725180/15368987
punctuation_table = str.maketrans('', '', string.punctuation)
cleaned_lines = (
    # strip() runs after translate() so whitespace exposed by removed
    # punctuation (e.g. "- foo" -> " foo") is trimmed too.
    line.translate(punctuation_table).strip().lower()
    for line in all_lines
)
all_lines = [line for line in cleaned_lines if len(line) >= 3]

In [6]:
all_lines[:10], len(all_lines)

(['travian is now migrating its business intelligence systems to aws using amazon redshift which uses sql to analyze structured and semistructured data across data warehouses operational databases and data lakes',
  'using data analytics travian will be able to analyze player behavior in the game based on the 11 tb of data that it collects each month and make improvements',
  '“it used to be impossible for us to do this at this scale” says strathaus',
  '“we’re looking forward to using analytics to improve our games further on aws',
  'français',
  'amazon elastic kubernetes service amazon eks is a managed kubernetes service to run kubernetes in the aws cloud and onpremises data centers',
  'learn more\xa0»',
  'travian needed a more stable service that could handle kubernetes',
  'the studio was initially hesitant to use aws because the offerings from aws are so vast that travian worried it would be overwhelming',
  'however as the need for reliability became paramount travian decided

In [7]:
# Plan: take each line and split it on '.' to get sentences, then build the
# vocabulary with an appropriate minimum count in nltk.

In [8]:
# Tally how many times each whitespace-separated token occurs across all sentences.
word_counts = defaultdict(int)
for token in (tok for sentence in all_lines for tok in sentence.split()):
    word_counts[token] += 1

In [9]:
# Vocabulary is every distinct token in first-seen order; the counts themselves
# are not used to filter anything here.
vocab = list(word_counts)
# Forward (word -> index) and reverse (index -> word) lookup tables.
w2i = dict(zip(vocab, range(len(vocab))))
i2w = dict(enumerate(vocab))
len(vocab)

20857

In [11]:
w2i["travian"], i2w[0]

(0, 'travian')

In [12]:
import torch
from torch import tensor
from torch import nn
from torch.nn import functional as F

In [13]:
# Hyperparameters for the skip-gram model and its training pairs.
EMBEDDING_DIM = 200  # width of each embedding vector (passed to SkipGram as embedding_dim)
NUM_NEG_SAMPLES = 5  # how many negative word indices get_negative_samples draws per call
NUM_SURROUNDING_WORDS = 2  # context window: words taken on each side of the target word

# Number of distinct tokens; sizes the embedding tables and bounds negative sampling.
vocab_sz = len(vocab)

In [14]:
# Build (target index, context index) training pairs: every word is paired with
# each neighbour at most NUM_SURROUNDING_WORDS positions away in the same sentence.
# Only the first 5 sentences are used.
pairs = []
for raw_sentence in all_lines[:5]:
    token_ids = [w2i[tok] for tok in raw_sentence.split()]
    n_tokens = len(token_ids)
    for center, target_id in enumerate(token_ids):
        # Clamp the window to the sentence boundaries.
        window_start = max(0, center - NUM_SURROUNDING_WORDS)
        window_stop = min(n_tokens, center + NUM_SURROUNDING_WORDS + 1)
        pairs.extend(
            (target_id, token_ids[k])
            for k in range(window_start, window_stop)
            if k != center  # a word is not its own context
        )


In [15]:
pairs[:10]

[(0, 1),
 (0, 2),
 (1, 0),
 (1, 2),
 (1, 3),
 (2, 0),
 (2, 1),
 (2, 3),
 (2, 4),
 (3, 1)]

In [33]:
class SkipGram(nn.Module):
    """Skip-gram word-embedding model scored with negative sampling.

    Keeps two embedding tables of shape (vocab_sz, embedding_dim): one looked
    up when a word is the target, the other when it is a context word or a
    negative sample.
    """

    @staticmethod
    def get_negative_samples(target_idx, num_samples=None, vocab_size=None):
        """Draw random word indices that differ from ``target_idx``.

        Args:
            target_idx: index of the target word, as a single-element tensor
                or a plain int.
            num_samples: how many indices to draw; defaults to the
                module-level ``NUM_NEG_SAMPLES``.
            vocab_size: exclusive upper bound for sampled indices; defaults to
                the module-level ``vocab_sz``.

        Returns:
            A list of ``num_samples`` integer tensors of shape ``(1,)``. They
            live on the device of ``target_idx`` when it is a tensor,
            otherwise on CUDA if available and on the CPU if not.
        """
        if num_samples is None:
            num_samples = NUM_NEG_SAMPLES
        if vocab_size is None:
            vocab_size = vocab_sz

        # Follow the target's device instead of hard-coding 'cuda', so the
        # sampler also works on CPU-only machines.
        if isinstance(target_idx, torch.Tensor):
            sample_device = target_idx.device
        else:
            sample_device = 'cuda' if torch.cuda.is_available() else 'cpu'

        neg_samples = []
        while len(neg_samples) < num_samples:
            neg_sample = torch.randint(0, vocab_size, (1, )).to(sample_device)
            # Rejection sampling: a negative must not be the target itself.
            if neg_sample != target_idx:
                neg_samples.append(neg_sample)
        return neg_samples

    def __init__(self, vocab_sz, embedding_dim):
        """Create the target and context embedding tables.

        Args:
            vocab_sz: number of rows in each embedding table.
            embedding_dim: length of each embedding vector.
        """
        super().__init__()
        self.target_emb = nn.Embedding(vocab_sz, embedding_dim)
        self.context_emb = nn.Embedding(vocab_sz, embedding_dim)

    def forward(self, target_idx, context_idx, neg_samples):
        """Score one positive pair and its negative samples.

        The einsum signature below requires a 1-D target embedding and a 2-D
        block of negative embeddings, i.e. a 0-d ``target_idx`` and a 1-D
        ``neg_samples`` tensor.

        Args:
            target_idx: index tensor of the target word.
            context_idx: index tensor of the true context word.
            neg_samples: index tensor of the negative words.

        Returns:
            ``(pos_scores, neg_scores)``: the sigmoid of the target/context dot
            product, and the sigmoid of the dot product of the target with
            each negative embedding (one value per negative).
        """
        # An embedding lookup is equivalent to multiplying a one-hot vector
        # with the embedding weight matrix.
        target_embedding = self.target_emb(target_idx)
        context_embedding = self.context_emb(context_idx)
        neg_embeddings = self.context_emb(neg_samples)

        # (1, D) @ (D, 1) -> (1, 1)
        pos_out = target_embedding[None] @ context_embedding[..., None]
        pos_scores = pos_out.sigmoid()

        # (N, D) · (D,) -> (N,): one dot product per negative sample.
        neg_out = torch.einsum("nd,d -> n", neg_embeddings, target_embedding)
        neg_scores = neg_out.sigmoid()

        return pos_scores, neg_scores

In [36]:
# Training setup.
# Seed the RNG so embedding initialisation and negative sampling are repeatable
# across "Restart & Run All".
torch.manual_seed(0)
# Use the GPU when one is present, otherwise fall back to the CPU instead of
# failing on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SkipGram(vocab_sz, EMBEDDING_DIM).to(device)
# Binary cross-entropy on probabilities: the model already applies the sigmoid.
loss_fn = F.binary_cross_entropy
lr = 0.1

In [37]:
# Train for 100 epochs with a hand-written SGD step after every (target, context) pair.
for i in range(100):
    epoch_loss = 0
    for target_id, context_id in pairs:
        target = tensor(target_id).to(device)
        context = tensor(context_id).to(device)
        negatives = tensor(model.get_negative_samples(target)).to(device)

        pos_scores, neg_scores = model(target, context, negatives)

        # The true pair should score 1 and every negative sample 0.
        pos_loss = loss_fn(pos_scores, torch.ones_like(pos_scores))
        neg_loss = loss_fn(neg_scores, torch.zeros_like(neg_scores))
        loss = pos_loss + neg_loss

        # Drop gradients left over from the previous step before backprop.
        for param in model.parameters():
            param.grad = None
        loss.backward()

        # Plain gradient-descent update applied directly to the weights.
        for param in model.parameters():
            param.data -= lr * param.grad

        epoch_loss += loss.item()

    # Reported value is the sum of per-pair losses over the epoch.
    print(f"Epoch: {i}; Loss: {epoch_loss}")

Epoch: 0; Loss: 7007.018651020182
Epoch: 1; Loss: 4672.861348517239
Epoch: 2; Loss: 5224.104994235732
Epoch: 3; Loss: 4922.869771795813
Epoch: 4; Loss: 4082.163927370915
Epoch: 5; Loss: 4871.11863450706
Epoch: 6; Loss: 4468.280669493048
Epoch: 7; Loss: 4671.96493857661
Epoch: 8; Loss: 4149.169310898693
Epoch: 9; Loss: 4273.095055247009
Epoch: 10; Loss: 3970.177048623329
Epoch: 11; Loss: 4013.0716511138526
Epoch: 12; Loss: 4280.14077221106
Epoch: 13; Loss: 3908.847448966556
Epoch: 14; Loss: 4131.766737407539
Epoch: 15; Loss: 3580.512126503221
Epoch: 16; Loss: 3785.3665570202284
Epoch: 17; Loss: 3753.145708022952
Epoch: 18; Loss: 3570.6233106009854
Epoch: 19; Loss: 3682.2023678884143
Epoch: 20; Loss: 3627.688434967131
Epoch: 21; Loss: 3765.872869040792
Epoch: 22; Loss: 3342.011117795453
Epoch: 23; Loss: 3549.887832555454
Epoch: 24; Loss: 3187.9231333234056
Epoch: 25; Loss: 3176.030073896327
Epoch: 26; Loss: 3453.2587818381144
Epoch: 27; Loss: 3250.655112879991
Epoch: 28; Loss: 3140.82652

In [40]:
# Look up the learned target embeddings of three words for comparison below.
v1, v2, v3 = (
    model.target_emb(tensor(w2i[word], device=device))
    for word in ("athena", "aws", "fastapi")
)

In [41]:
torch.dot(v1, v2)

tensor(7.1017, device='cuda:0', grad_fn=<DotBackward0>)

In [42]:
torch.dot(v1, v3)

tensor(-24.3139, device='cuda:0', grad_fn=<DotBackward0>)

In [43]:
torch.dot(v2, v3)

tensor(-1.7388, device='cuda:0', grad_fn=<DotBackward0>)