In [2]:
# importing required packages

import torch
import torch.nn as nn
import math
from transformers import AutoTokenizer, AutoModel
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# selecting cuda device is available

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [4]:
batch_size = 32    # batch size : number of sentences in a batcgh
seq_len = 128      # sequence length : number of tokens(words) in a sentence
num_embd = 384     # num ebmedding : embedding dimension for a single token
num_heads = 2      # num heads : number of multihead attention for encoder and decoder blocks

In [5]:
# reading file for extracting sentences to make a corpus

with open('001ssb.txt', 'r') as file:
    corpus = file.readlines()

for i in range(len(corpus)):
    corpus[i] = corpus[i].replace(' \n', '')
    corpus[i] = corpus[i].replace('\n', '')


In [6]:
# extracting words(tokens) from the courpus

tokens = []

for sent in corpus:
    for word in sent.split():
        tokens.append(word)

In [7]:
# selecting only first 50000 tokens from then corpus

tokens = tokens[:50000]

In [8]:
# function for creating embeddings from the above formed tokens

def get_embeddings(words, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    embeddings = []
    
    
    batch_size = 32
    for i in range(0, len(words), batch_size):
        batch = words[i:i + batch_size]
        
        
        encoded = tokenizer(batch, 
                          padding=True, 
                          truncation=True,
                          max_length=128,
                          return_tensors='pt')
        
        
        encoded = {k: v.to(device) for k, v in encoded.items()}
        
        
        with torch.no_grad():
            model_output = model(**encoded)
            
        
        attention_mask = encoded['attention_mask']
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sentence_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        
       
        embeddings.append(sentence_embeddings.cpu().numpy())
    
    
    all_embeddings = np.concatenate(embeddings, axis=0)
    return all_embeddings

In [9]:
embeddings = get_embeddings(tokens)    # embeddings shape : (50000, 384)

In [10]:
x = embeddings   # storing the embeddings to a varaiable for easy access

In [11]:
# Positional Encodings

def POS_Emb():
    pos_emb = torch.empty(seq_len, num_embd)

    for pos in range(seq_len):
        for i in range(num_embd):
            if i % 2 == 0:
                emb = math.sin(pos / (10000 ** ((2 * i)/num_embd)))
            else:
                emb = math.cos(pos / (10000 ** ((2 * (i - 1))/num_embd)))

            t_emb = torch.tensor(emb)
            pos_emb[pos][i] = t_emb

    out = pos_emb

    return out

In [12]:
pe = POS_Emb()
pe = pe.to(device)

In [13]:
# Encoder part

In [14]:
class EncoderHead(nn.Module):
    
    def __init__(self, head_size):
        super().__init__();

        self.head_size = head_size

        self.wq = nn.Linear(num_embd, head_size, bias=False)
        self.wk = nn.Linear(num_embd, head_size, bias=False)
        self.wv = nn.Linear(num_embd, head_size, bias=False)

    def forward(self, x):

        q = self.wq(x)
        k = self.wk(x)
        v = self.wv(x)

        energy = torch.matmul(q, k.transpose(-2, -1)) * (self.head_size ** -0.5)
        
        attention = torch.softmax(energy, dim=-1)

        out = torch.matmul(attention, v)

        return out

In [15]:
class EncoderMultiHeadAttention(nn.Module):

    def __init__(self, num_heads, head_size):
        super().__init__()

        self.heads = nn.ModuleList([EncoderHead(head_size) for _ in range(num_heads)])
        self.wo = nn.Linear(num_heads * head_size, num_embd, bias=False)
        
    def forward(self, x):

        out = torch.cat([h(x) for h in self.heads], dim=-1)

        out = self.wo(out)
        
        return out


In [16]:
class EncoderFeedForward(nn.Module):

    def __init__(self, num_features):
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(num_features, 200),
            nn.ReLU(),
            nn.Linear(200, num_features),
        )


    def forward(self, x):

        out = self.model(x)

        return out

In [17]:
class EncoderBlock(nn.Module):

    def __init__(self, num_heads):
        super().__init__()

        head_size = num_embd // num_heads
        self.mha = EncoderMultiHeadAttention(num_heads, head_size)
        self.ffwd = EncoderFeedForward(num_embd)
        self.ln1 = nn.LayerNorm(num_embd)
        self.ln2 = nn.LayerNorm(num_embd)

    def forward(self, x):

        x = x + self.ln1(self.mha(x))
        x = x + self.ln2(self.ffwd(x))
        
        return x

In [18]:
class Encoder(nn.Module):

    def __init__(self, num_heads):
        super().__init__()

        self.blocks = nn.Sequential(
            EncoderBlock(num_heads),
            EncoderBlock(num_heads),
            EncoderBlock(num_heads),
        )

    def forward(self, x):

        x = x + pe[:x.shape[1], :]
        x = self.blocks(x)

        return x

In [19]:
# Decoder part

In [20]:
class CrossHead(nn.Module):
    
    def __init__(self, head_size):
        super().__init__()

        self.head_size = head_size

        self.wq = nn.Linear(num_embd, head_size, bias=False)
        self.wk = nn.Linear(num_embd, head_size, bias=False)
        self.wv = nn.Linear(num_embd, head_size, bias=False)

    def forward(self, x, l):

        q = self.wq(l)
        k = self.wk(x)
        v = self.wv(x)

        energy = torch.matmul(q, k.transpose(-2, -1)) * (self.head_size ** -0.5)

        attention = torch.softmax(energy, dim=-1)

        out = torch.matmul(attention, v)

        return out

In [21]:
class CrossMultiheadAttention(nn.Module):

    def __init__(self, num_heads, head_size):
        super().__init__()

        self.heads = nn.ModuleList([CrossHead(head_size) for _ in range(num_heads)])
        self.wo = nn.Linear(head_size * num_heads, num_embd, bias=False)
        
    def forward(self, x, l):

        out = torch.cat([h(x, l) for h in self.heads], dim=-1)

        out = self.wo(out)
        
        return out

In [22]:
class DecoderHead(nn.Module):
    
    def __init__(self, head_size):
        super().__init__()

        self.head_size = head_size

        self.wq = nn.Linear(num_embd, head_size, bias=False)
        self.wk = nn.Linear(num_embd, head_size, bias=False)
        self.wv = nn.Linear(num_embd, head_size, bias=False)

    def forward(self, x):

        q = self.wq(x)
        k = self.wk(x)
        v = self.wv(x)

        energy = torch.matmul(q, k.transpose(-2, -1)) * (self.head_size ** -0.5)
        energy = torch.tril(energy)
        energy = energy.masked_fill(energy==0, float('-inf'))
         
        attention = torch.softmax(energy, dim=-1)

        out = torch.matmul(attention, v)

        return out

In [23]:
class DecoderMultiHeadAttention(nn.Module):

    def __init__(self, num_heads, head_size):
        super().__init__()

        self.heads = nn.ModuleList([DecoderHead(head_size) for _ in range(num_heads)])
        self.wo = nn.Linear(head_size * num_heads, num_embd, bias=False)
        
    def forward(self, x):

        out = torch.cat([h(x) for h in self.heads], dim=-1)

        out = self.wo(out)
        
        return out

In [24]:
class DecoderFeedForward(nn.Module):

    def __init__(self, num_embd):
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(num_embd, 200),
            nn.ReLU(),
            nn.Linear(200, num_embd),
        )


    def forward(self, x):

        out = self.model(x)

        return out

In [25]:
class DecoderBlock(nn.Module):

    def __init__(self, num_heads):
        super().__init__()

        head_size = num_embd // num_heads
        self.mmha = DecoderMultiHeadAttention(num_heads, head_size)
        self.ln1 = nn.LayerNorm(num_embd)
        self.ffwd = DecoderFeedForward(num_embd)
        self.ln2 = nn.LayerNorm(num_embd)
        self.cmha = CrossMultiheadAttention(num_heads, head_size)
        self.ln3 = nn.LayerNorm(num_embd)

    def forward(self, x, l):

        x = x + self.ln1(self.mmha(x))
        x = x + self.ln2(self.cmha(x, l))
        x = x + self.ln3(self.ffwd(x))
        
        return x

In [26]:
class Decoder(nn.Module):

    def __init__(self, num_heads, vocab_size):
        super().__init__()

        self.blocks = nn.ModuleList([
            DecoderBlock(num_heads),
            DecoderBlock(num_heads),
            DecoderBlock(num_heads),
        ])
        self.linear = nn.Linear(num_embd, vocab_size)

    def forward(self, x, l, action):

        x = x + pe[:x.shape[1], :]
        for block in self.blocks:
            x = block(x, l)

        if action == 'train':
            x = self.linear(x)

        if action == 'inference':
            x = self.linear(x[:, -1, :])

        return x

In [27]:
def customDatasetLoader(x):

    data = []
    total_batches = x.shape[0] // (batch_size * seq_len)

    for batch_idx in range(total_batches):
        b = []
        for bs in range(batch_size):
            batch = []
            for sl in range(seq_len * bs + (batch_size * seq_len * batch_idx), (seq_len * bs) + (batch_size * seq_len * batch_idx) + seq_len):
                    batch.append(x[sl])

            b.append(batch)
        data.append(b)

    return torch.tensor(data)

In [28]:
data_loader = customDatasetLoader(x)

  return torch.tensor(data)


In [29]:
class Model(nn.Module):

    def __init__(self):
        super().__init__()

        self.encoder = Encoder(num_heads)
        self.decoder = Decoder(num_heads, vocab_size=batch_size * seq_len * len(data_loader))

    def forward(self, x, l=None, action='train'):

        x = self.encoder(x)
        x = self.decoder(x, l, action)

        return x
        

In [30]:
model = Model()
total_params = sum(p.numel() for p in model.parameters())
total_params    # total parameters of the model

25168560

In [31]:
model = model.to(device)

In [32]:
# model configs

epochs = 70
lr = 0.005
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.functional.cross_entropy

In [33]:
# generating targets for evaluating loss

def generateTargets(x):

    data = []
    total_batches = x.shape[0] // (batch_size * seq_len)

    for batch in range(total_batches):
        b = []
        for bs in range(batch_size):
            batch = []
            for sl in range(seq_len * bs + 1, (seq_len * bs) + seq_len):
                    batch.append(torch.tensor())

            b.append(batch)
        data.append(b)

    return torch.tensor(data)

In [34]:
targets = torch.arange(0, 32 * 128 * 12).reshape(12, 32, 128)
base_sequence = torch.ones(32 * 128, dtype=torch.int).reshape(32, 128)
targets = base_sequence + targets
targets[-1][-1][-1] -= torch.tensor(1)

In [35]:
# generating labels for training the model

def generateLabels(x):
    data = []
    total_batches = x.shape[0] // (batch_size * seq_len)

    for batch_idx in range(total_batches):
        b = []
        for bs in range(batch_size):
            batch_start = (batch_idx * batch_size + bs) * seq_len + 1
            batch_end = batch_start + seq_len
            batch = x[batch_start:batch_end]

            b.append(batch)
        data.append(b)

    return torch.tensor(data)


In [36]:
labels = generateLabels(x)

In [37]:
model.train()

for epoch in range(epochs):

    # forward pass
    result = model(data_loader[0], labels[0], 'train')

    # calculate loss
    loss = loss_fn(result.view(-1, batch_size * seq_len * len(data_loader)), targets[0].view(-1))

    # zero grads
    optimizer.zero_grad()

    # backward pass
    loss.backward()

    # update params
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch : {epoch}, Loss : {loss}")

print(f"Loss : {loss}")

Epoch : 0, Loss : 13.718318939208984
Epoch : 10, Loss : 7.884413242340088
Epoch : 20, Loss : 5.46866512298584
Epoch : 30, Loss : 4.171416282653809
Epoch : 40, Loss : 3.2255072593688965
Epoch : 50, Loss : 2.592472791671753
Epoch : 60, Loss : 1.796868085861206
Loss : 1.736008644104004


In [38]:
def make_inputs(size):
    inputs = torch.ones([1, size, num_embd])
    
    for i in range(size):
        inputs[0, i] = data_loader[0][0][i]

    return inputs

In [39]:
def make_labels(size, ls, output):
    els = torch.ones([1, size+1, num_embd])

    for i in range(size):
        els[0, i] = ls[0, i]

    els[0, size] = torch.tensor(embeddings[torch.argmax(output)])

    return els

In [40]:
ls = torch.ones([1, 1, 384])
ls[0, 0] = labels[0, 0, 0]

for i in range(128):
    inputs = make_inputs(i+1)

    output = model(inputs, ls, 'inference')
    print(tokens[torch.argmax(output)], end=" ")
    ls = make_labels(i+1, ls, output)

deepening Of Thrones Book One before, A Night's of Ice and he he George R. R. Martin PROLOGUE "We should start back," Gared urged to the means began to north, dark around them. "The wildlings are dead." "Do the dead frighten you?" Ser Waymar Royce asked with all the nameless of a tree, Gared muttered. not rise to forest. moon can blades the old man, past fifty, and hard arm. seen the lordlings come and go. "Dead not dead," he grizzled "We have felt business with the top "Are they dead?" Royce asked softly. "What proof have we?" "Will saw them," Gared said. "If he says they are dead, that's proof enough for me." Will had no they would drag him into the position sooner or later. He 