In [1]:
%%capture
!pip install git+https://github.com/huggingface/transformers datasets
!pip install sentencepiece
!pip install accelerate -U

In [5]:
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer
from dataclasses import dataclass
import os

In [6]:
from tokenizers import ByteLevelBPETokenizer
from pathlib import Path
import os

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files='/nepali-wikipedia/output.txt', vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

save_path = 'nep-gpt-tokenizer'
if not os.path.exists(save_path):
      os.makedirs(save_path)
tokenizer.save_model(save_path)






['nep-gpt-tokenizer/vocab.json', 'nep-gpt-tokenizer/merges.txt']

In [7]:
import copy
import torch
import math
import torch.nn as nn
from torch.nn.parameter import Parameter

def gelu(x):
  return 0.5*x*(1+torch.tanh(math.sqrt(2/math.pi)*(x+0.044715*torch.pow(x,3))))

class LayerNorm(nn.Module):
  def __init__(self, hidden_size, eps=1e-12):
    super(LayerNorm, self).__init__()
    self.weight = nn.Parameter(torch.ones(hidden_size))
    self.bias = nn.Parameter(torch.zeros(hidden_size))
    self.variance_epsilon = eps

  def forward(self, x):
    u = x.mean(-1, keepdim=True)
    s = (x - u).pow(2).mean(-1, keepdim=True)
    x = (x - u) / torch.sqrt(s + self.variance_epsilon)  # very small value is added to avoid division by zero error if it happens
    return self.weight * x + self.bias  # w*x+b

In [8]:
class Conv1D(nn.Module):
  def __init__(self,nf,nx):
    super(Conv1D,self).__init__()
    self.nf = nf
    w = torch.empty(nx,nf)
    nn.init.normal_(w,std=0.02)
    self.weight = Parameter(w)
    self.bias = Parameter(torch.zeros(nf))

  def forward(self,x):
    size_out = x.size()[:-1] + (self.nf,) # Prepare for matrix multiplication
    x = torch.addmm(self.bias, x.view(-1,x.size(-1)),self.weight)
    x = x.view(*size_out)
    return x

In [9]:
class Attention(nn.Module):
    def __init__(self, nx, n_ctx, config, scale=False):
        super(Attention, self).__init__()
        n_state = nx
        assert n_state % config.n_head == 0
        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale
        self.c_attn = Conv1D(n_state*3, nx)
        self.c_proj = Conv1D(n_state, nx)

    def _attn(self, q, k, v):
        w = torch.matmul(q, k)
        if self.scale:
            w = w / math.sqrt(v.size(-1))
        nd, ns = w.size(-2), w.size(-1)
        b = self.bias[:, :, ns - nd:ns, :ns]
        w = w * b - 1e10 * (1 - b)  # subtract large negative where bias is 0
        w = nn.Softmax(dim=-1)(w)
        return torch.matmul(w, v)

    def merge_heads(self, x):
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)

    def split_heads(self, x, k=False):
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
        x = x.view(*new_x_shape)
        if k:
            return x.permute(0, 2, 3, 1)  # (batch, head, head_features, seq_length)
        else:
            return x.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def forward(self, x, layer_past=None):
        x = self.c_attn(x)
        query, key, value = x.split(self.split_size, dim=2)
        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)

        if layer_past is not None:
            past_key, past_value = layer_past[0].transpose(-2,-1), layer_past[1]
            key = torch.cat((past_value,key), dim=-1)
            value = torch.cat((past_value,value), dim=-2)

        present = torch.stack((key.transpose(-2,-1), value))  # transpose to have same shape for stacking
        a = self._attn(query, key, value)
        a = self.merge_heads(a)
        a = self.c_proj(a)

        return a, present


In [10]:
class MLP(nn.Module):
    def __init__(self, n_state, config):
        super(MLP, self).__init__()
        nx = config.n_embd
        self.c_fc = Conv1D(n_state, nx)
        self.c_proj = Conv1D(nx, n_state)
        self.act = gelu

    def forward(self, x):
        h = self.act(self.c_fc(x))
        h2 = self.c_proj(h)
        return h2


class Block(nn.Module):
  def __init__(self,n_ctx,config,scale=False):
    super(Block,self).__init__()
    nx = config.n_embd
    self.ln_1 = LayerNorm(nx,eps=config.layer_norm_epsilon)
    self.attn = Attention(nx,n_ctx,config,scale)
    self.ln_2 = LayerNorm(nx,eps=config.layer_norm_epsilon)
    self.mlp = MLP(4*nx,config)

  def forward(self,x,layer_past=None):
    a,present = self.attn(self.ln_1(x),layer_past=layer_past)
    x = x+a
    m = self.mlp(self.ln_2(x))
    x = x + m
    return x, present
class GPT2Model(nn.Module):
    def __init__(self, config):
        super(GPT2Model, self).__init__()
        self.n_layer = config.n_layer
        self.n_embd = config.n_embd
        self.n_vocab = config.vocab_size

        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
        block = Block(config.n_ctx, config, scale=True)
        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
        self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

    def set_embeddings_weights(self, model_embeddings_weights):
        embed_shape = model_embeddings_weights.shape
        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
        self.decoder.weight = model_embeddings_weights  # tied weights

    def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None):
        if past is None:
            past_length = 0
            past = [None] * len(self.h)
        else:
            past_length = past[0][0].size(-2)

        if position_ids is None:
            position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_ids.size(-1))
        position_ids = position_ids.view(-1, position_ids.size(-1))
        input_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
            token_type_embeds = self.wte(token_type_ids)
        else:
            token_type_embeds = torch.zeros_like(input_embeds)

        hidden_states = input_embeds + position_embeds + token_type_embeds
        presents = list()

        for block, layer_past in zip(self.h, past):
            hidden_states, present = block(hidden_states, layer_past)
            presents.append(present)

        hidden_states = self.ln_f(hidden_states)
        output_shape = input_shape + (hidden_states.size(-1),)

        return hidden_states.view(*output_shape), presents

In [11]:
class GPT2LMHead(nn.Module):
  def __init__(self, model_embeddings_weights, config):
    super(GPT2LMHead, self).__init__()
    self.n_embd = config.n_embd
    self.set_embeddings_weights(model_embeddings_weights)

  def set_embeddings_weights(self, model_embeddings_weights):
    embed_shape = model_embeddings_weights.shape
    self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
    self.decoder.weight = model_embeddings_weights  # tied weights

  def forward(self, hidden_state):
    lm_logits = self.decoder(hidden_state)
    return lm_logits

class GPT2LMHeadModel(nn.Module):
    def __init__(self, config):
        super(GPT2LMHeadModel, self).__init__()
        self.transformer = GPT2Model(config)
        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)

    def set_tied(self):
        """ Make sure we are sharing the embeddings
        """
        self.lm_head.set_embeddings_weights(self.transformer.wte.weight)

    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None):
        hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
        lm_logits = self.lm_head(hidden_states)
        if lm_labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
            return loss
        return lm_logits, presents

In [29]:
from transformers import GPT2Config, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained(save_path)

tokenizer.add_special_tokens({
    "eos_token":"</s>",
    "bos_token":"<s>",
    "unk_token":"<unk>",
    "pad_token":"<pad>",
    "mask_token":"<mask>"
})

config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    n_ctx= 1024
)

model = GPT2LMHeadModel(config)

In [14]:
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="/nepali-wikipedia/output.txt",
    block_size=128,
)

In [15]:
from transformers import DataCollatorForLanguageModeling
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

train_dataset,test_dataset = train_test_split(dataset,test_size=0.2)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False)

train_dataloader = DataLoader(train_dataset,batch_size=32,shuffle=True,collate_fn=data_collator)
eval_dataloader = DataLoader(test_dataset,collate_fn=data_collator,batch_size=32)

In [16]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(),lr=2e-5)
device =  'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(49134, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm()
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): GPT2LMHead(
    (decoder): Linear(in_features=768, out_features=49134, bias=False)
  )
)

In [17]:
num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size=1,gamma=0.1)

from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
  #training
  model.train()
  total_loss = 0
  for step, batch in enumerate(train_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    logits, _ = model(batch['input_ids'])  # Assuming the model returns logits and presents

    # Calculate the loss manually
    loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
    loss = loss_fct(logits.view(-1, logits.size(-1)), batch['input_ids'].view(-1))

    # Clear the previous gradients
    optimizer.zero_grad()

    # Compute gradients
    loss.backward()

    # Update the model's parameters
    optimizer.step()

    total_loss += loss.item()
    progress_bar.update(1)

  average_loss = total_loss / len(train_dataloader)
  print(f"Epoch: {epoch+1} Average Loss: {average_loss:.4f}")

  0%|          | 0/9590 [00:00<?, ?it/s]

Epoch: 1 Average Loss: 0.0003


In [34]:
torch.save(model.state_dict(), 'model_state_dict.pth')

In [35]:
for i, batch in enumerate(eval_dataloader):
    # Pass the batch to the model to get the logits
    logits, _ = model(batch['input_ids'].to(device))

    # Convert the logits to a list of token IDs using greedy decoding
    token_ids = []
    for timestep in logits[0]:
        token_id = torch.argmax(timestep).item()
        token_ids.append(token_id)

    # Decode the token IDs using the tokenizer
    decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
    print(decoded_text)

प्यालेस्टाइन महिला राष्ट्रिय फुटबल टिम अन्तर्राष्ट्रिय फुटबल प्रतियोगिताहरूमा प्यालेस्टाइनको प्रतिनिधित्व गर्ने महिला फुटबल टिम हो । यो प्यालेस्टाइनमा फुटबलको लागि शासकीय न


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacty of 15.89 GiB of which 28.12 MiB is free. Process 2170 has 15.87 GiB memory in use. Of the allocated memory 15.45 GiB is allocated by PyTorch, and 127.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [24]:
logits.shape ##batch_size,equuence,dimensionaloty

torch.Size([12, 82, 49134])