In [1]:
import jsonlines
import os

# Resolve the dataset location from the DATADRIVE environment variable.
# os.getenv returns None when the variable is unset, and os.path.join(None, ...)
# would then fail with an opaque TypeError, so check it explicitly first.
data_root = os.getenv("DATADRIVE")
assert data_root, "the DATADRIVE environment variable is not set"

file_path = os.path.join(data_root, "datasets", "motikontho", "articles.jsonl")
assert os.path.exists(file_path), f"dataset file not found: {file_path}"


In [16]:
from typing import List
from tqdm.auto import tqdm

def read_contents(file_path: str) -> List[str]:
    """Collect the ``"content"`` field of every record in a JSON Lines file.

    Args:
        file_path: path handed to ``jsonlines.open``.

    Returns:
        The ``"content"`` values in file order, one entry per record.
    """
    with jsonlines.open(file_path) as reader:
        # tqdm only wraps the reader to show progress while records stream in.
        return [record["content"] for record in tqdm(reader)]

# Load the "content" field of every record in the dataset file into memory.
contents = read_contents(file_path)

0it [00:00, ?it/s]

In [17]:
def clean(data: List[str]) -> List[str]:
    """Normalise whitespace in every text.

    Non-breaking spaces (``\\xa0``) and newlines are each replaced by a single
    regular space. The two replacements are chained so that both are applied to
    the same string.

    Args:
        data: raw texts.

    Returns:
        A new list with the cleaned texts, in the same order as ``data``.
    """
    return [
        text.replace("\xa0", " ").replace("\n", " ")
        for text in tqdm(data, total=len(data))
    ]

# Replace the raw texts with their cleaned versions (rebinds `contents`).
contents = clean(contents)


  0%|          | 0/211 [00:00<?, ?it/s]

In [20]:
import tiktoken

# Tokenizer used for the rest of the notebook.
enc = tiktoken.get_encoding("cl100k_base")
# Sanity check: encoding then decoding the first text must reproduce it exactly.
assert (enc.decode(enc.encode(contents[0]))) == contents[0]

In [24]:
from typing import Any

def find_max_enc_seq_len(contents: List[str], enc: Any) -> int:
    """Return the token count of the longest text once encoded.

    Used to help choose the batch size and block size.

    Args:
        contents: texts to measure.
        enc: object exposing ``encode(text)`` whose result supports ``len``.

    Returns:
        The largest encoded length, or 0 when ``contents`` is empty.
    """
    encoded_lengths = (
        len(enc.encode(text)) for text in tqdm(contents, total=len(contents))
    )
    return max(encoded_lengths, default=0)

# Longest encoded text, in tokens; the bare name on the last line displays it.
max_seq_len = find_max_enc_seq_len(contents, enc)
max_seq_len

  0%|          | 0/211 [00:00<?, ?it/s]

10056

What if I merge all the texts and then create blocks from them? 

In [29]:
# Concatenate every text (no separator) and tokenize the result in one pass.
merged = "".join(contents)
merged_encoded = enc.encode(merged)
merged_enc_len = len(merged_encoded)

# Total number of tokens in the merged stream.
merged_enc_len

465763

In [34]:
# Number of tokens per block.
block_size = 64

# How many blocks of `block_size` tokens the merged stream corresponds to.
# This is a true division, so the value is a float and may be fractional.
n_blocks = merged_enc_len / block_size
n_blocks

7277.546875

In [97]:
from typing import Tuple
import numpy as np

# create train and validation split
# pass in file contents


def create_train_val_split(contents: List[str], ratio: float = 0.25, seed: int = 42) -> Tuple[str, str]:
    """Shuffle the documents and split them into merged train / validation texts.

    The documents are permuted with a generator seeded by ``seed``; the global
    NumPy random state is left untouched. The first ``int(len(contents) * ratio)``
    shuffled documents form the training text and all remaining documents form
    the validation text, so every document ends up in exactly one split.

    NOTE(review): ``ratio`` is the *training* fraction, so the default of 0.25
    yields roughly 25% train / 75% validation -- confirm this is the intended
    direction of the split.

    Args:
        contents: one string per document.
        ratio: fraction of documents assigned to the training split, in [0, 1].
        seed: seed for the shuffle, making the split reproducible.

    Returns:
        ``(train_merged, val_merged)``: the documents of each split concatenated
        without a separator.

    Raises:
        ValueError: if ``ratio`` is outside [0, 1].
    """
    if not 0.0 <= ratio <= 1.0:
        raise ValueError(f"ratio must be within [0, 1], got {ratio}")

    total_len = len(contents)

    # A full permutation of the document indices: this is the actual shuffle.
    rng = np.random.default_rng(seed)
    shuffled_idxs = rng.permutation(total_len).tolist()

    # A single cut point keeps the two splits disjoint and exhaustive.
    n_train = int(total_len * ratio)

    train_merged = "".join(contents[i] for i in shuffled_idxs[:n_train])
    val_merged = "".join(contents[i] for i in shuffled_idxs[n_train:])

    return train_merged, val_merged


# Build the merged train / validation texts with the default ratio and seed.
train, val = create_train_val_split(contents)


In [99]:
import torch
from torch.utils.data import Dataset

class PutunSet(Dataset):
    """Dataset of token windows cut from a single text.

    The text is encoded once with the named tiktoken encoding. One window of up
    to ``block_size`` tokens starts at every token offset, so consecutive items
    overlap. Windows cut short by the end of the text are right-padded with
    zeros when they are fetched.
    """

    def __init__(self, all_text: str, enc_name: str="cl100k_base", block_size: int=64) -> None:
        self.all_text = all_text
        self.enc_name = enc_name
        self.block_size = block_size

        self.encoder = tiktoken.get_encoding(self.enc_name)

        # Token ids of the whole text, then the per-offset windows built from them.
        self.encoded = self.__encode_all()
        self.encoded_blocks = list()
        self.__create_blocks()

        self.n_blocks = len(self.encoded_blocks)

    def __encode_all(self) -> Any:
        """Encode the full text with the configured encoder."""
        token_ids = self.encoder.encode(self.all_text)
        return token_ids

    def __create_blocks(self) -> Any:
        """Append one window per token offset to ``self.encoded_blocks``."""
        self.encoded_blocks.extend(
            self.encoded[start: start + self.block_size]
            for start in range(len(self.encoded))
        )

    def __len__(self) -> int:
        """Number of windows, which is one per token of the encoded text."""
        return len(self.encoded_blocks)

    def __getitem__(self, index) -> torch.Tensor:
        """Return window ``index`` as a long tensor of length ``block_size``."""
        tokens = torch.tensor(self.encoded_blocks[index], dtype=torch.long)
        n_tokens = tokens.numel()

        if n_tokens >= self.block_size:
            return tokens

        # Short window near the end of the text: zero-pad on the right.
        padded = torch.zeros((self.block_size, ), dtype=torch.long)
        padded[:n_tokens] = tokens
        return padded

In [100]:
# Wrap each merged text in a dataset using the default encoding and block size.
trainset = PutunSet(train)
valset = PutunSet(val)

In [101]:
import torch.nn as nn
import torch.nn.functional as F
import torchopt as optim

In [102]:
# https://pytorch.org/tutorials/beginner/transformer_tutorial.html

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then apply dropout.

    A table of shape ``[max_len, 1, d_model]`` is precomputed and registered as
    the non-trainable buffer ``pe``. Even feature columns hold sines and odd
    columns hold cosines of the position scaled by a geometric frequency series.

    Args:
        d_model: embedding dimension (odd values are supported).
        dropout: dropout probability applied after the addition.
        max_len: largest sequence length the table covers.
        batch_first: when ``False`` (default) inputs are
            ``[seq_len, batch_size, embedding_dim]``; when ``True`` they are
            ``[batch_size, seq_len, embedding_dim]``.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000,
                 batch_first: bool = False):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.batch_first = batch_first

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2)
                             * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than sine column,
        # so only the first d_model // 2 frequencies are used for the cosines.
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``, or
               ``[batch_size, seq_len, embedding_dim]`` when ``batch_first=True``

        Returns:
            Tensor of the same shape as ``x``.

        Raises:
            ValueError: if the sequence is longer than ``max_len``.
        """
        seq_len = x.size(1 if self.batch_first else 0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)}"
            )

        pos = self.pe[:seq_len]  # [seq_len, 1, d_model]
        if self.batch_first:
            # [1, seq_len, d_model] so it broadcasts over the batch dimension.
            pos = pos.transpose(0, 1)

        x = x + pos
        return self.dropout(x)


In [115]:
import math

class PutuMaraShara(nn.Module):
    """Causal transformer over token ids built on ``nn.TransformerDecoder``.

    Pipeline: token embedding (scaled by ``sqrt(d_model)``) -> positional
    encoding -> 4 batch-first decoder layers under a causal mask -> linear
    projection to ``d_hidden`` -> log-softmax over the last dimension.

    Args:
        d_model: embedding / model width.
        vocab_size: number of rows in the embedding table.
        n_decoder_heads: attention heads per decoder layer.
        d_hidden: output size of the final linear layer.
        device: stored on ``self.device`` for callers; the forward pass itself
            follows the device of its input.

    NOTE(review): the output has ``d_hidden`` features, not ``vocab_size``. If
    it is meant to be next-token log-probabilities, the final projection
    should map to ``vocab_size`` -- confirm the intent.
    """

    def __init__(self, d_model: int, vocab_size: int, n_decoder_heads: int, d_hidden: int, device: str="cuda") -> None:
        super().__init__()

        self.d_model = d_model
        self.device = device

        self.pos_encoder = PositionalEncoding(d_model=self.d_model)

        decoder_layers = nn.TransformerDecoderLayer(d_model=self.d_model, nhead=n_decoder_heads, batch_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layers, 4)

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.linear = nn.Linear(self.d_model, d_hidden)

        self.init_weights()

    # from: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    def init_weights(self) -> None:
        """Uniformly initialise the embedding and output projection; zero the bias."""
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def create_mask(self, src):
        """Build the additive causal mask for a batch of token ids.

        Args:
            src: token ids of shape ``[batch, seq]``; only the size of the last
                dimension and the device are used.

        Returns:
            Float tensor of shape ``[seq, seq]`` on ``src``'s device: 0.0 where
            position ``i`` may attend to position ``j`` (``j <= i``) and
            ``-inf`` for future positions.
        """
        seq_len = src.size(-1)
        # Start from all -inf; triu(diagonal=1) keeps -inf strictly above the
        # diagonal (the future) and sets everything else to 0.
        mask = torch.full((seq_len, seq_len), float("-inf"), device=src.device)
        return torch.triu(mask, diagonal=1)

    def forward(self, src: torch.Tensor) -> torch.Tensor:
        """Run the model on a batch of token ids.

        Args:
            src: long tensor of token ids, shape ``[batch, seq]``.

        Returns:
            Log-softmax output of shape ``[batch, seq, d_hidden]``.
        """
        # The mask must be sized from the token ids: after embedding, the last
        # dimension is d_model rather than the sequence length.
        causal_mask = self.create_mask(src)

        x = self.embedding(src) * math.sqrt(self.d_model)

        # The positional encoder indexes positions along dim 0 (seq-first), while
        # the decoder is batch-first, so swap the axes around the call.
        x = self.pos_encoder(x.transpose(0, 1)).transpose(0, 1)

        # nn.TransformerDecoder takes (tgt, memory, tgt_mask, memory_mask). There
        # is no encoder here, so the embedded input doubles as the memory, and the
        # same causal mask guards both self- and cross-attention so no position
        # can see later tokens.
        out = self.decoder(x, x, tgt_mask=causal_mask, memory_mask=causal_mask)
        out = self.linear(out)

        # log-probabilities over the last dimension
        out = F.log_softmax(out, dim=-1)

        return out
    
    
# Instantiate the model with the tokenizer's vocabulary size; `device` is left
# at the constructor default.
model = PutuMaraShara(d_model=512, vocab_size=enc.n_vocab, n_decoder_heads=8, d_hidden=768)

In [116]:
# NOTE(review): `device` is not used in this cell, and the model is not moved to
# it in the code shown here -- confirm this happens before training.
device = "cuda"

# `optim` is the torchopt package (imported as `optim` above), not torch.optim.
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
# NOTE(review): the model's forward already returns log_softmax output; nn.NLLLoss
# is the loss that pairs with log-probabilities -- confirm which one is intended.
criterion = nn.CrossEntropyLoss()