In [None]:
import jsonlines
import os

# Resolve the corpus location from the DATADRIVE environment variable and fail
# early, with a readable message, when the variable or the file is missing.
data_root = os.getenv("DATADRIVE")
if data_root is None:
    raise EnvironmentError("The DATADRIVE environment variable is not set")

file_path = os.path.join(data_root, "datasets", "motikontho", "articles.jsonl")
assert os.path.exists(file_path), f"articles file not found: {file_path}"


In [None]:
from typing import List
from tqdm.auto import tqdm

def read_contents(file_path: str) -> List[str]:
    """Collect the ``"content"`` field of every record in a JSON Lines file.

    Args:
        file_path: Path of the file to open with ``jsonlines``.

    Returns:
        One entry per record, in file order.
    """
    with jsonlines.open(file_path) as reader:
        # The index from ``enumerate`` is not needed; only the record is used.
        return [record["content"] for _, record in tqdm(enumerate(reader))]

# Load the "content" field of every record in the corpus file into a list.
contents = read_contents(file_path)

In [None]:
def clean(data: List[str]) -> List[str]:
    """Normalise whitespace in every text.

    Each non-breaking space (U+00A0) and each newline is replaced by a single
    regular space. The input list is not modified.

    Args:
        data: Texts to clean.

    Returns:
        A new list with one cleaned string per input string, in the same order.
    """
    cleaned = []
    for content in tqdm(data, total=len(data)):
        # Chain the replacements so the second one operates on the result of
        # the first; applying both to the raw text would discard the first.
        cleaned.append(content.replace("\xa0", " ").replace("\n", " "))

    return cleaned

# Rebind `contents` to the cleaned texts (the uncleaned list is no longer referenced by this name).
contents = clean(contents)


In [None]:
import tiktoken

# Tokeniser used by the exploratory cells below.
enc = tiktoken.get_encoding("cl100k_base")
# Sanity check: encoding and then decoding the first text must reproduce it exactly.
assert (enc.decode(enc.encode(contents[0]))) == contents[0]

In [None]:
from typing import Any

# encode with tiktoken and find the max seq len
# to determine the batch size and block size
def find_max_enc_seq_len(contents: List[str], enc: Any) -> int:
    """Return the token count of the longest encoded text.

    Args:
        contents: Texts to measure.
        enc: Object exposing ``encode(text)`` that returns a sized token sequence.

    Returns:
        The largest ``len(enc.encode(text))`` over all texts, or 0 when
        ``contents`` is empty.
    """
    token_counts = (
        len(enc.encode(text))
        for _, text in tqdm(enumerate(contents), total=len(contents))
    )
    return max(token_counts, default=0)

# Length, in tokens, of the longest text in the corpus.
max_seq_len = find_max_enc_seq_len(contents, enc)
max_seq_len

What if I merge all the texts and then create blocks from them? 

In [None]:
# Concatenate every text back-to-back (no separator) and tokenise the result once.
merged = "".join(contents)
merged_encoded = enc.encode(merged)
merged_enc_len = len(merged_encoded)

merged_enc_len

In [None]:
# Candidate block length, in tokens.
block_size = 64

# True division: the result is a float, so a partial trailing block shows up as a fraction.
n_blocks = merged_enc_len / block_size
n_blocks

In [None]:
from typing import Tuple
import numpy as np

# create train and validation split
# pass in file contents


def create_train_val_split(contents: List[str], ratio: float = 0.25, seed: int = 42) -> Tuple:
    """Shuffle the texts and merge them into one training and one validation string.

    Args:
        contents: Texts to split.
        ratio: Fraction of the texts assigned to the validation split.
        seed: Seed for NumPy's global random generator, making the split reproducible.

    Returns:
        ``(train_merged, val_merged)``: the texts of each split concatenated
        without a separator. Every input text ends up in exactly one split.
    """
    np.random.seed(seed)

    # A permutation visits every index exactly once. (``np.random.choice``
    # without a ``size`` draws a single index, which is not a shuffle.)
    total_len = len(contents)
    shuffled_idxs = np.random.permutation(total_len).tolist()

    # The first ``n_train`` shuffled texts form the training split and all the
    # remaining ones the validation split, so no text is dropped in between.
    n_train = int(total_len * (1 - ratio))

    train_merged = "".join(contents[i] for i in shuffled_idxs[:n_train])
    val_merged = "".join(contents[i] for i in shuffled_idxs[n_train:])

    return train_merged, val_merged


# NOTE: the name `train` is rebound by `def train()` later in this notebook;
# after that cell has run, `train` no longer refers to the training text.
train, val = create_train_val_split(contents)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class PutunSet(Dataset):
    """Next-token-prediction dataset built from one long text.

    The text is tokenised once with a tiktoken encoding. A window is started at
    every token position and holds up to ``block_size + 1`` tokens, so inputs
    and targets (the inputs shifted by one) are cut from the same slice.
    Windows near the end of the text are shorter; items are right-padded with
    0 up to ``block_size``.
    NOTE(review): 0 is used as the pad id - confirm it is not a meaningful
    token in the chosen encoding.
    """

    def __init__(self, all_text: str, enc_name: str="cl100k_base", block_size: int=64) -> None:
        self.all_text = all_text
        self.enc_name = enc_name
        self.block_size = block_size

        self.encoder = tiktoken.get_encoding(self.enc_name)

        # Token ids of the whole text, then one window per token position.
        self.encoded = self.__encode_all()
        self.encoded_blocks = list()
        self.__create_blocks()

        self.n_blocks = len(self.encoded_blocks)

    def __encode_all(self) -> Any:
        """Tokenise the full text with the configured encoder."""
        return self.encoder.encode(self.all_text)

    def __create_blocks(self) -> Any:
        """Append one window per start position to ``encoded_blocks``."""
        window = self.block_size + 1  # one extra token so the shifted target fits
        self.encoded_blocks.extend(
            self.encoded[start : start + window]
            for start in range(len(self.encoded))
        )

    def __len__(self) -> int:
        """Number of windows."""
        return len(self.encoded_blocks)

    def __getitem__(self, index) -> Tuple:
        """Return ``(input, target)`` long tensors of length ``block_size``."""
        block = self.encoded_blocks[index]

        def right_pad(tokens) -> torch.Tensor:
            # Zero-filled buffer with the tokens copied into its head.
            padded = torch.zeros((self.block_size, ), dtype=torch.long)
            padded[ : len(tokens)] = torch.tensor(tokens)
            return padded

        # The target is the input shifted left by one token (next-token prediction).
        padded_input = right_pad(block[ : self.block_size])
        padded_target = right_pad(block[1 : self.block_size + 1])

        return padded_input, padded_target

In [None]:
# One dataset per split, using PutunSet's default encoding and block size.
trainset = PutunSet(train)
valset = PutunSet(val)


# Only the training batches are shuffled.
train_loader = DataLoader(trainset, batch_size=128, shuffle=True)
val_loader = DataLoader(valset, batch_size=128, shuffle=False)

In [None]:
# Number of windows in each split.
print(len(trainset))
print(len(valset))

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torchopt as optim

In [None]:
# https://pytorch.org/tutorials/beginner/transformer_tutorial.html

import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to embeddings, then apply dropout.

    Args:
        d_model: Embedding size. Odd sizes are supported.
        dropout: Dropout probability applied to the sum.
        max_len: Number of positions for which encodings are precomputed.
        batch_first: When True, ``forward`` expects
            ``[batch_size, seq_len, embedding_dim]``. The default (False)
            expects ``[seq_len, batch_size, embedding_dim]``.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000,
                 batch_first: bool = False):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.batch_first = batch_first

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2)
                             * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim the frequencies to the number of odd slots.
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``, or
               ``[batch_size, seq_len, embedding_dim]`` when ``batch_first`` is set.

        Raises:
            ValueError: if the sequence is longer than ``max_len``.
        """
        seq_len = x.size(1) if self.batch_first else x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)}"
            )

        pe = self.pe[:seq_len]  # (seq_len, 1, d_model)
        if self.batch_first:
            pe = pe.transpose(0, 1)  # (1, seq_len, d_model)

        x = x + pe
        return self.dropout(x)


In [None]:
from einops import rearrange

class PutuMaraShara(nn.Module):
    """Decoder-only (causal) transformer language model.

    Token ids of shape ``(batch, seq)`` are embedded, given positional
    encodings, passed through a stack of self-attention layers under a causal
    mask, and projected to vocabulary logits of shape ``(batch, seq, vocab_size)``.

    Args:
        d_model: Embedding / hidden size.
        vocab_size: Number of token ids.
        n_decoder_heads: Attention heads per layer.
        n_decoders: Number of stacked layers.
    """

    def __init__(self, d_model: int, vocab_size: int, n_decoder_heads: int, n_decoders: int) -> None:
        super().__init__()

        self.d_model = d_model

        self.pos_encoder = PositionalEncoding(d_model=self.d_model)

        # A decoder-only model has no encoder output to cross-attend to.
        # ``nn.TransformerDecoder`` requires such a ``memory`` argument, so the
        # stack is built from self-attention-only layers and made causal via the mask.
        decoder_layers = nn.TransformerEncoderLayer(d_model=self.d_model, nhead=n_decoder_heads, batch_first=True)
        self.decoder = nn.TransformerEncoder(decoder_layers, n_decoders)

        self.embedding = nn.Embedding(vocab_size, self.d_model)
        self.linear = nn.Linear(self.d_model, vocab_size)

        self.init_weights()

    # from: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    def init_weights(self) -> None:
        """Uniformly initialise the embedding and output projection; zero the output bias."""
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    # https://stackoverflow.com/questions/62170439/difference-between-src-mask-and-src-key-padding-mask
    def create_mask(self, src):
        """Build the causal attention mask for ``src``.

        Args:
            src: Batch-first tensor whose dimension 1 is the sequence length.

        Returns:
            Boolean tensor of shape ``(seq, seq)`` on ``src``'s device. ``True``
            marks a position that must NOT be attended to, i.e. every position
            after the querying one.
        """
        seq_len = src.size(1)
        future = torch.ones(seq_len, seq_len, device=src.device).triu(diagonal=1)
        return future.bool()

    def forward(self, src: torch.Tensor) -> torch.Tensor:
        """Map token ids ``(batch, seq)`` to logits ``(batch, seq, vocab_size)``."""
        src = self.embedding(src) * math.sqrt(self.d_model)  # (batch, seq, d_model)

        # The positional encoder indexes positions along dimension 0, so hand
        # it a sequence-first view and swap the axes back afterwards.
        src = self.pos_encoder(src.transpose(0, 1)).transpose(0, 1)

        src_mask = self.create_mask(src)

        # Pass the mask by keyword so it cannot be mistaken for another argument.
        out = self.decoder(src, mask=src_mask)

        out = self.linear(out)

        return out

In [None]:
model = PutuMaraShara(d_model=512, vocab_size=enc.n_vocab,
                      n_decoder_heads=8, n_decoders=6)

# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs end to end.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Move the model first and build the optimizer afterwards, so any state the
# optimizer allocates at construction lives on the same device as the parameters.
model = model.to(device)

optimizer = optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

def compute_loss(model, batch, mode="train", device=device):
    """Compute the notebook-level ``criterion`` loss for one batch.

    Args:
        model: Module called on the inputs; its logits are laid out
            ``batch seq vocab``.
        batch: ``(inputs, targets)`` pair; both are moved to ``device``.
        mode: ``"train"`` switches the model to train mode. Any other value
            switches it to eval mode and disables gradient tracking.
        device: Device the batch is moved to.

    Returns:
        The loss tensor.
    """
    def _batch_loss():
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        # The criterion is given the class (vocab) axis in position 1.
        class_first = rearrange(model(inputs), "batch seq vocab -> batch vocab seq")
        return criterion(class_first, targets)

    if mode == "train":
        model.train()
        return _batch_loss()

    model.eval()
    with torch.no_grad():
        return _batch_loss()

    

# Smoke test: push one random batch through the loss in evaluation mode.
x = torch.randint(0, 100, (4, 64))
y = torch.ones((4, 64), dtype=torch.long)

compute_loss(model, (x, y), "eval")


In [None]:
def generate(prompt: str, max_length: int, model: "PutuMaraShara", encoder: Any,
             block_size: int = 64):
    """Sample ``max_length`` new tokens that continue ``prompt``.

    Args:
        prompt: Text to continue.
        max_length: Number of tokens to sample.
        model: Module mapping ids of shape ``(1, T)`` to logits ``(1, T, vocab)``.
        encoder: Object exposing ``encode(text)`` that returns a list of token ids.
        block_size: Maximum number of most recent tokens fed to the model per step.

    Returns:
        1-D long tensor: the prompt's token ids followed by the sampled ids.

    Raises:
        ValueError: if the prompt encodes to no tokens.
    """
    prompt_ids = encoder.encode(prompt)
    if len(prompt_ids) == 0:
        raise ValueError("prompt must encode to at least one token")

    # Run on whichever device the model's parameters live on.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    idxs = torch.tensor(prompt_ids, dtype=torch.long, device=model_device)

    # Sample with dropout off and without building a graph, then restore the
    # model's previous train/eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_length):
                idx_cond = idxs[-block_size:].unsqueeze(0)  # (1, T)
                logits = model(idx_cond)                    # (1, T, vocab)

                # Only the last position predicts the next token; sampling
                # from every position would append T tokens per step.
                probs = F.softmax(logits[0, -1, :], dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)  # (1,)

                idxs = torch.cat((idxs, idx_next), dim=0)
    finally:
        model.train(was_training)

    return idxs


# Generate from a short prompt and show the shape of the returned id tensor.
gen = generate("পুটুন", 10, model, enc)
print(gen.size())

In [None]:
# Decode the generated token ids back into text.
dec = enc.decode(gen.tolist())
print(dec)

In [None]:
def _mean_validation_loss() -> float:
    """Average ``compute_loss`` over ``val_loader`` in eval mode without gradients."""
    model.eval()
    losses = []
    with torch.no_grad():
        for vbatch in tqdm(val_loader):
            val_loss = compute_loss(model, vbatch, mode="eval")
            losses.append(val_loss.item())

    return torch.tensor(losses).mean().item()


def train(max_iters: int = 100, log_every_n_step: int = 100):
    """Train the notebook-level ``model`` with ``optimizer`` on ``train_loader``.

    Args:
        max_iters: Number of full passes over ``train_loader``.
        log_every_n_step: Within each pass, run a full validation pass and
            print both losses after every this many optimisation steps.

    Raises:
        ValueError: if ``log_every_n_step`` is smaller than 1.
    """
    if log_every_n_step < 1:
        raise ValueError("log_every_n_step must be at least 1")

    for e in range(max_iters):
        print(f"Epoch :: {e + 1}/{max_iters}")

        model.train()
        # The step counter restarts at the beginning of every pass.
        for n_steps, batch in enumerate(tqdm(train_loader), start=1):
            optimizer.zero_grad()
            train_loss = compute_loss(model, batch, mode="train")
            train_loss.backward()
            optimizer.step()

            if n_steps % log_every_n_step == 0:
                print("========= Running Validation =======")
                avg_val_loss = _mean_validation_loss()
                print(f"Loss/Train :: {train_loss.item()} ____ Loss/Val :: {avg_val_loss}")
                print()

                # Validation leaves the model in eval mode; switch back.
                model.train()
        