In [1]:
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import torch.optim as optim
import numpy as np
import numpy.random as random

In [2]:
# # getting shakespeare content from website (already done)
# import requests
# response = requests.get('https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt')

# if response.status_code == 200:
#     text = response.text
# with open('shakespeare.txt','w') as file:
#     file.write(text)

In [3]:
# open downloaded text file
# with open('shakespeare.txt','r') as file:
#     text = file.read()[10462:-569]

In [5]:
# Create the tokenizer and restore its saved vocabulary and merge table from
# the .pkl files under tokenizers/.
# NOTE(review): `tokenizers.tokenizer` looks like a project-local module rather
# than the Hugging Face `tokenizers` package — confirm.
from tokenizers.tokenizer import Tokenizer
tok = Tokenizer()
tok.vocabs = tok.load('tokenizers/vocabs.pkl')
tok.merges = tok.load('tokenizers/merges.pkl')

In [6]:
# The corpus has already been encoded and pickled, so there is no need to
# encode it again: read the token ids back and wrap them in a tensor.
# NOTE: pickle.load executes arbitrary code; only load files you created yourself.
with open('tokens.pkl','rb') as file:
    tokens = torch.tensor(pickle.load(file))

# Small default sampling sizes; later cells overwrite both values.
batch_size = 4
k = 5

# Rebuild the raw text by decoding the token ids one at a time and joining the pieces.
text = ''.join(tok.decode([token_id]) for token_id in tokens.tolist())

In [199]:
class Loader:
    """NumPy-based batch sampler for skip-gram training with negative sampling.

    Each batch holds, per row, one context token, its two neighbours (the
    tokens immediately before and after it) as positive targets, and ``k``
    negative targets drawn from the unigram distribution raised to the 3/4
    power. One set of ``k`` negatives is drawn per batch and shared by all rows.

    Args:
        tokens: 1-D sequence of integer token ids (list, NumPy array or CPU
            torch tensor). It is converted to a NumPy array internally.
        batch_size: number of context tokens per batch.
        k: number of negative targets per batch row.
        return_tensor: when truthy, ``load_batch`` returns torch tensors for
            the context, targets and labels; otherwise NumPy arrays.
    """
    def __init__(self,tokens,batch_size,k,return_tensor=True):

        # A NumPy array supports integer-array indexing, and its elements become
        # plain ints on .tolist(). Iterating a torch tensor instead yields 0-dim
        # tensors that hash by identity, which breaks the frequency count.
        self.tokens = np.asarray(tokens)
        self.num_tokens = len(self.tokens)
        self.batch_size = batch_size
        self.k = k
        self.return_tensor = return_tensor

        self.negative_keys,self.negative_values = self.get_distribution()
        # Column of ones used to copy one row of negatives to every batch row.
        self.ones = np.ones((batch_size,1),dtype=int)
        # Labels are fixed: the first two columns (positives) are 1, the k negatives are 0.
        self.lbls = np.concatenate((np.ones((batch_size,2)),np.zeros((batch_size,k))),axis=1)
        if return_tensor:
            self.lbls = torch.tensor(self.lbls)

    def get_distribution(self):
        """Return ``(keys, values)`` describing the negative-sampling distribution.

        ``keys`` lists the distinct token ids in order of first occurrence and
        ``values`` the matching probabilities, proportional to ``count ** 0.75``.
        """
        occurance_dictionary = {}
        for token in self.tokens.tolist():
            occurance_dictionary[token] = occurance_dictionary.get(token,0)+1

        keys = list(occurance_dictionary.keys())
        weights = [count**(3/4) for count in occurance_dictionary.values()]

        # Compute the normaliser once; recomputing it per element is quadratic.
        total = sum(weights)
        values = [weight/total for weight in weights]

        return keys,values

    def get_targets_negative(self):
        """Draw ``k`` negative token ids and repeat them for every batch row.

        Returns an integer array of shape ``(batch_size, k)`` whose rows are identical.
        """
        sampled = random.choice(self.negative_keys,
                                size=(1,self.k),p=self.negative_values)
        # (batch_size, 1) @ (1, k) copies the sampled row to each batch row.
        targets_negative = self.ones@sampled

        return targets_negative

    def get_context_ids(self):
        """Sample context positions and the positions of their two neighbours.

        Positions are drawn from ``[1, num_tokens - 2]`` so that both
        ``position - 1`` and ``position + 1`` are valid indices.
        Returns ``(idx_context, ids_targets_positive)`` with shapes
        ``(batch_size, 1)`` and ``(batch_size, 2)``.
        """
        idx_context = random.randint(1,self.num_tokens-1,(self.batch_size,1))
        ids_targets_positive = np.concatenate((idx_context-1,idx_context+1),1)

        return idx_context,ids_targets_positive

    def load_batch(self):
        """Build one training batch.

        Returns:
            ``((context, targets, lbls), (idx_context, ids_targets_positive))``
            where ``context`` is ``(batch_size, 1)``, ``targets`` and ``lbls``
            are ``(batch_size, 2 + k)`` (positives first, then negatives), and
            the second tuple holds the sampled token positions.
        """
        targets_negative = self.get_targets_negative()
        idx_context,ids_targets_positive = self.get_context_ids()

        context = self.tokens[idx_context]
        targets_positive = self.tokens[ids_targets_positive]

        targets = np.concatenate((targets_positive,targets_negative),axis=-1)

        if self.return_tensor:
            context = torch.tensor(context)
            targets = torch.tensor(targets)

        return (context,targets,self.lbls),(idx_context,ids_targets_positive)

In [249]:
class Loader:
    """Torch-based batch sampler for skip-gram training with negative sampling.

    Each batch row holds one context token, its two neighbours (the tokens
    immediately before and after it) as positive targets, and ``k`` negative
    targets drawn independently per row from the unigram distribution raised
    to the 3/4 power.

    Args:
        tokens: 1-D integer torch tensor of token ids.
        batch_size: number of context tokens per batch.
        k: number of negative targets per batch row.
    """
    def __init__(self,tokens,batch_size,k):

        self.tokens = tokens
        self.num_tokens = len(tokens)
        self.batch_size = batch_size
        self.k = k

        self.negative_keys,self.negative_values = self.get_distribution()

        # Labels are fixed: the first two columns (positives) are 1, the k negatives are 0.
        self.lbls = torch.cat(
            (torch.ones(batch_size,2,dtype=torch.float64),
            torch.zeros(batch_size,k,dtype=torch.float64))
            ,dim=1)

    def get_distribution(self):
        """Return ``(keys, values)`` tensors describing the negative-sampling distribution.

        ``keys`` holds the distinct token ids in order of first occurrence and
        ``values`` the matching probabilities, proportional to ``count ** 0.75``.
        """
        occurance_dictionary = {}
        for token in self.tokens.tolist():
            occurance_dictionary[token] = occurance_dictionary.get(token,0)+1

        keys = list(occurance_dictionary.keys())
        weights = [count**(3/4) for count in occurance_dictionary.values()]

        # Compute the normaliser once; recomputing it per element is quadratic
        # in the number of distinct tokens.
        total = sum(weights)
        values = [weight/total for weight in weights]

        values = torch.tensor(values)
        keys = torch.tensor(keys)

        return keys,values

    def get_targets_negative(self):
        """Sample a ``(batch_size, k)`` tensor of negative token ids.

        The sampled positions into ``negative_keys`` are kept on ``self.ids``.
        """
        self.ids = torch.multinomial(self.negative_values,(self.batch_size*self.k)
                                     ,replacement=True).view(self.batch_size,self.k)

        targets_negative = self.negative_keys[self.ids]

        return targets_negative

    def get_context_ids(self):
        """Sample context positions and the positions of their two neighbours.

        Positions are drawn from ``[1, num_tokens - 2]`` (``torch.randint``
        excludes the upper bound) so that both neighbours are valid indices.
        Returns ``(ids_context, ids_targets_positive)`` with shapes
        ``(batch_size, 1)`` and ``(batch_size, 2)``.
        """
        ids_context = torch.randint(1,self.num_tokens-1,(self.batch_size,1))
        ids_targets_positive = torch.cat((ids_context-1,ids_context+1),dim=1)

        return ids_context,ids_targets_positive

    def load_batch(self):
        """Build one training batch.

        Returns:
            ``((context, targets, lbls), (ids_context, ids_targets_positive))``
            where ``context`` is ``(batch_size, 1)``, ``targets`` and ``lbls``
            are ``(batch_size, 2 + k)`` (positives first, then negatives), and
            the second tuple holds the sampled token positions.
        """
        targets_negative = self.get_targets_negative()
        ids_context,ids_targets_positive = self.get_context_ids()

        context = self.tokens[ids_context]
        targets_positive = self.tokens[ids_targets_positive]

        targets = torch.cat((targets_positive,targets_negative),dim=-1)

        return (context,targets,self.lbls),(ids_context,ids_targets_positive)

In [263]:
# Build a loader with a small configuration and draw one batch as a quick check.
batch_size = 20
k = 10
loader = Loader(tokens,batch_size,k)

print(f'{batch_size = }')
print(f'{k = }')

# load_batch also returns the sampled position tensors; they are discarded here.
(context,targets,lbls),_ = loader.load_batch()

batch_size = 20
k = 10


In [264]:
class Model(nn.Module):
    """Two-table embedding model scored with a sigmoid over dot products.

    ``embeddings`` holds the vectors looked up for context tokens and
    ``thetas`` the vectors looked up for target tokens.

    Args:
        n_vocabs: number of rows in each embedding table (vocabulary size).
        emb_dim: width of each embedding vector. Defaults to 1024, the size
            that was previously hard-coded.
    """
    def __init__(self,n_vocabs,emb_dim=1024):
        super().__init__()
        self.embeddings = nn.Embedding(n_vocabs,emb_dim)
        self.thetas = nn.Embedding(n_vocabs,emb_dim)

    def forward(self,context,targets):
        """Score every target against the context token of its batch row.

        Args:
            context: 2-D integer tensor of context token ids; the loader in
                this notebook supplies shape ``(batch, 1)``.
            targets: 2-D integer tensor of target token ids, ``(batch, n_targets)``.

        Returns:
            ``(emb_context, theta_targets, predictions)``: the two embedding
            lookups and the float64 sigmoid scores. With a ``(batch, 1)``
            context, ``predictions`` has shape ``(batch, n_targets)``.
        """
        emb_context = self.embeddings(context)
        theta_targets = self.thetas(targets)
        # (batch, n_targets, dim) @ (batch, dim, 1) -> one dot product per target.
        # The cast to float64 matches the float64 labels built by the loader.
        outputs = torch.matmul(theta_targets,emb_context.permute(0,2,1)).to(torch.float64)
        # torch.sigmoid is the non-deprecated spelling of F.sigmoid.
        predictions = torch.sigmoid(outputs).squeeze(-1)
        return emb_context,theta_targets,predictions
    

In [279]:
# Build the model with one embedding row per tokenizer vocabulary entry, then
# restore previously saved weights from disk.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
model = Model(len(tok.vocabs))
model.load_state_dict(torch.load('embedding_model_state_dict.pth'))

<All keys matched successfully>

In [280]:
# Training configuration.
batch_size = 128
k = 10
loader = Loader(tokens,batch_size,k)
# BCELoss expects probabilities, which the model produces after its sigmoid.
loss_fn = nn.BCELoss()
lr = 100
# Factor the training loop multiplies into the learning rate at each decay step.
lr_decay_rate = 0.9
# Use the `lr` variable rather than repeating the literal, so changing it above takes effect.
optimizer = optim.SGD(model.parameters(),lr=lr)

In [281]:
def test():
    """Score one freshly sampled batch, print its loss, and return the predictions.

    Uses the notebook-level ``loader``, ``model`` and ``loss_fn``.
    """
    batch, _ = loader.load_batch()
    context, targets, lbls = batch
    _, _, preds = model(context, targets)
    print(loss_fn(preds, lbls).item())
    return preds
# Run one evaluation batch and keep its predictions for inspection in the next cell.
preds = test()

0.7164157067651237


In [290]:
# Optional in-place thresholding of the predictions at 0.2 (disabled):
# preds[preds>0.2] = 1
# preds[preds!=1] = 0
# NOTE(review): the saved output shows only 0/1 values, so it was presumably
# produced with the two lines above enabled — confirm before relying on it.
# Show the scores for the negative targets (columns 2 onward) of batch row 5.
preds[5,2:]

tensor([0., 0., 1., 1., 0., 0., 0., 0., 0., 0.], dtype=torch.float64,
       grad_fn=<SliceBackward0>)

In [None]:
# Train for 10000 steps, one freshly sampled batch per step.
for i in tqdm(range(10000)):
    # load_batch returns ((context, targets, labels), (position tensors));
    # only the first tuple is needed. The values are already torch tensors,
    # so they are used directly instead of being re-wrapped with torch.tensor.
    (context,targets,lbls),_ = loader.load_batch()

    # Clear gradients left over from the previous step before computing new ones.
    optimizer.zero_grad()

    emb_context,theta_targets,predictions = model(context,targets)
    loss = loss_fn(predictions,lbls)

    loss.backward()
    optimizer.step()

    # Every 500 steps, report the loss and shrink the learning rate.
    # NOTE(review): the condition also holds at i == 0, so the rate is decayed
    # once before any meaningful training — confirm this is intended.
    if i%500 == 0:
        print(loss.item())
        optimizer.param_groups[0]['lr'] *= lr_decay_rate