In [1]:
%%capture
# Install the notebook's dependencies; `%%capture` silences pip's output.
# NOTE(review): transformers is installed from the GitHub default branch and no
# package is version-pinned, so re-running later may pull different APIs.
!pip install git+https://github.com/huggingface/transformers datasets
!pip install sentencepiece
!pip install accelerate -U

In [2]:
from datasets import load_dataset

# Load the '20231101.ne' configuration of the wikimedia/wikipedia dataset
# (used below as the Nepali training corpus), then display its summary.
dataset = load_dataset('wikimedia/wikipedia', '20231101.ne')
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/131k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32885 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 32885
    })
})

In [3]:
# Dump every article body to a single UTF-8 text file, one article after
# another, each terminated by a newline. This file is the tokenizer corpus.
with open('output.txt', 'w', encoding='utf-8') as file:
    for row in dataset['train']:
        file.write(row['text'] + '\n')

In [4]:
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer
from dataclasses import dataclass
import os

In [5]:
@dataclass
class BPE_token:
  """Thin wrapper around a byte-level BPE `Tokenizer`.

  The tokenizer is created in the class body, so it is a class attribute
  shared by all instances (the dataclass declares no annotated fields).
  """
  tokenizer = Tokenizer(BPE())
  tokenizer.normalizer = Sequence([
      NFKC()
  ])
  # Byte-level pre-tokenization is attached through `pre_tokenizer`.
  tokenizer.pre_tokenizer = ByteLevel()
  tokenizer.decoder = ByteLevelDecoder()

  def bpe_train(self,paths):
    """Train the shared tokenizer on the given text file(s).

    Args:
      paths: a single file path or a list of file paths.
    """
    # Accept a single path for convenience; `train` expects a list of files.
    if isinstance(paths, (str, os.PathLike)):
      paths = [str(paths)]
    trainer = BpeTrainer(vocab_size=50000,show_progress=True,
                         initial_alphabet=ByteLevel.alphabet(),
                         special_tokens=[
                             "<s>",
                             "<pad>",
                             "</s>",
                             "<unk>",
                             "<mask>"
                         ])
    # Keyword arguments make the (files, trainer) pairing explicit.
    self.tokenizer.train(files=paths, trainer=trainer)

  def save_tokenizer(self,location,prefix=None):
    """Save the trained BPE model files into `location`.

    Args:
      location: output directory; created (with parents) if missing.
      prefix: optional file-name prefix passed to the model's `save`.
    """
    os.makedirs(location, exist_ok=True)
    self.tokenizer.model.save(location,prefix)

In [6]:
from tokenizers import ByteLevelBPETokenizer
from pathlib import Path
import os

# Train a byte-level BPE tokenizer on the corpus file.
# The path is relative, matching where the corpus was written ('output.txt'),
# so the cell does not depend on the working directory being /content.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files='output.txt', vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

In [7]:
# Persist the trained tokenizer (vocab.json + merges.txt) under `save_path`.
save_path = 'nep-gpt-tokenizer'
# `exist_ok=True` replaces the check-then-create pattern and is safe on re-runs.
os.makedirs(save_path, exist_ok=True)
tokenizer.save_model(save_path)

['nep-gpt-tokenizer/vocab.json', 'nep-gpt-tokenizer/merges.txt']

In [8]:
import copy
import torch
import math
import torch.nn as nn
from torch.nn.parameter import Parameter

def gelu(x):
  """Tanh approximation of the GELU activation, applied element-wise to `x`."""
  inner = math.sqrt(2/math.pi) * (x + 0.044715 * torch.pow(x, 3))
  return 0.5 * x * (1 + torch.tanh(inner))

class LayerNorm(nn.Module):
  """Layer normalisation over the last dimension with a learnable scale and shift."""

  def __init__(self, hidden_size, eps=1e-12):
    super().__init__()
    self.weight = nn.Parameter(torch.ones(hidden_size))
    self.bias = nn.Parameter(torch.zeros(hidden_size))
    self.variance_epsilon = eps

  def forward(self, x):
    """Normalise `x` to zero mean / unit variance on its last axis, then apply weight and bias."""
    centered = x - x.mean(-1, keepdim=True)
    variance = centered.pow(2).mean(-1, keepdim=True)
    # The epsilon keeps the denominator away from zero for constant inputs.
    normalized = centered / torch.sqrt(variance + self.variance_epsilon)
    return self.weight * normalized + self.bias

In [9]:
class Conv1D(nn.Module):
  """Affine map on the last dimension: `nx` input features -> `nf` output features.

  The weight is stored as (nx, nf), i.e. transposed relative to `nn.Linear`.
  """

  def __init__(self,nf,nx):
    super().__init__()
    self.nf = nf
    self.weight = Parameter(nn.init.normal_(torch.empty(nx, nf), std=0.02))
    self.bias = Parameter(torch.zeros(nf))

  def forward(self,x):
    """Apply `x @ weight + bias` to the last axis, keeping all leading axes."""
    leading_dims = x.size()[:-1]
    # Flatten to 2-D so a single addmm handles any number of leading axes.
    flat = x.view(-1, x.size(-1))
    projected = torch.addmm(self.bias, flat, self.weight)
    return projected.view(*leading_dims, self.nf)

In [10]:
class Attention(nn.Module):
    """Multi-head causal self-attention with an optional key/value cache.

    Args:
        nx: width of the input features (also the attention state size).
        n_ctx: maximum sequence length covered by the causal mask.
        config: object exposing `n_head`.
        scale: if True, scale attention scores by 1/sqrt(head_features).
    """

    def __init__(self, nx, n_ctx, config, scale=False):
        super(Attention, self).__init__()
        n_state = nx
        assert n_state % config.n_head == 0
        # Lower-triangular mask: position i may attend to positions <= i.
        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale
        self.c_attn = Conv1D(n_state*3, nx)   # joint query/key/value projection
        self.c_proj = Conv1D(n_state, nx)     # output projection

    def _attn(self, q, k, v):
        """Masked scaled dot-product attention.

        `k` is expected already transposed to (batch, head, head_features, seq),
        as produced by `split_heads(..., k=True)`.
        """
        w = torch.matmul(q, k)
        if self.scale:
            w = w / math.sqrt(v.size(-1))
        nd, ns = w.size(-2), w.size(-1)
        # With cached keys there are more key positions (ns) than queries (nd);
        # take the mask rows that correspond to the newest nd positions.
        b = self.bias[:, :, ns - nd:ns, :ns]
        w = w * b - 1e10 * (1 - b)  # push masked positions to a large negative
        w = torch.softmax(w, dim=-1)
        return torch.matmul(w, v)

    def merge_heads(self, x):
        """(batch, head, seq, head_features) -> (batch, seq, head * head_features)."""
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)

    def split_heads(self, x, k=False):
        """Split the last axis into heads; keys (`k=True`) come back pre-transposed."""
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
        x = x.view(*new_x_shape)
        if k:
            return x.permute(0, 2, 3, 1)  # (batch, head, head_features, seq_length)
        else:
            return x.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def forward(self, x, layer_past=None):
        """Attend over `x`, optionally extending a cache from earlier steps.

        Args:
            x: input of shape (batch, seq, nx).
            layer_past: `present` returned by a previous call, or None.

        Returns:
            (output, present): output has the shape of `x`; present stacks the
            full keys and values, each (batch, head, seq_total, head_features).
        """
        x = self.c_attn(x)
        query, key, value = x.split(self.split_size, dim=2)
        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)

        if layer_past is not None:
            # Cached keys are stored as (…, seq, features); transpose them back
            # to the (…, features, seq) layout used for the score matmul.
            past_key, past_value = layer_past[0].transpose(-2,-1), layer_past[1]
            key = torch.cat((past_key,key), dim=-1)
            value = torch.cat((past_value,value), dim=-2)

        present = torch.stack((key.transpose(-2,-1), value))  # transpose to have same shape for stacking
        a = self._attn(query, key, value)
        a = self.merge_heads(a)
        a = self.c_proj(a)

        return a, present


In [11]:
class MLP(nn.Module):
    """Position-wise feed-forward network: expand to `n_state`, apply GELU, project back."""

    def __init__(self, n_state, config):
        super().__init__()
        n_embd = config.n_embd
        self.c_fc = Conv1D(n_state, n_embd)     # n_embd -> n_state
        self.c_proj = Conv1D(n_embd, n_state)   # n_state -> n_embd
        self.act = gelu

    def forward(self, x):
        """Return `c_proj(act(c_fc(x)))`."""
        return self.c_proj(self.act(self.c_fc(x)))


class Block(nn.Module):
  """Pre-norm transformer block: attention and MLP sub-layers, each with a residual add."""

  def __init__(self,n_ctx,config,scale=False):
    super().__init__()
    width = config.n_embd
    norm_eps = config.layer_norm_epsilon
    self.ln_1 = LayerNorm(width, eps=norm_eps)
    self.attn = Attention(width, n_ctx, config, scale)
    self.ln_2 = LayerNorm(width, eps=norm_eps)
    self.mlp = MLP(4 * width, config)

  def forward(self,x,layer_past=None):
    """Run one block; returns (hidden_states, present) where present is the attention cache."""
    attn_out, present = self.attn(self.ln_1(x), layer_past=layer_past)
    after_attn = x + attn_out
    after_mlp = after_attn + self.mlp(self.ln_2(after_attn))
    return after_mlp, present


In [12]:
class GPT2Model(nn.Module):
    """GPT-2 style decoder stack: token + position embeddings, N blocks, final LayerNorm.

    `config` must expose `n_layer`, `n_embd`, `vocab_size`, `n_positions`,
    `n_ctx` and `layer_norm_epsilon`.
    """

    def __init__(self, config):
        super(GPT2Model, self).__init__()
        self.n_layer = config.n_layer
        self.n_embd = config.n_embd
        self.n_vocab = config.vocab_size

        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
        # Construct every block separately so each layer draws its own random
        # initial weights (deep-copying one block would make all layers start
        # from identical parameters).
        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
        self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

    def set_embeddings_weights(self, model_embeddings_weights):
        """Create a bias-free `decoder` linear layer whose weight is the given embedding matrix."""
        embed_shape = model_embeddings_weights.shape
        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
        self.decoder.weight = model_embeddings_weights  # tied weights

    def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None):
        """Encode `input_ids` into hidden states.

        Args:
            input_ids: integer token ids; the last axis is the sequence axis.
            position_ids: optional explicit positions; defaults to a range that
                continues after the cached length.
            token_type_ids: optional ids embedded with `wte` and added in.
            past: optional list with one cache entry per layer (the `presents`
                returned by a previous call).

        Returns:
            (hidden_states, presents): hidden states shaped like `input_ids`
            plus a trailing `n_embd` axis, and the per-layer caches.
        """
        if past is None:
            past_length = 0
            past = [None] * len(self.h)
        else:
            # Each cache entry stacks keys/values as (2, batch, head, seq, features).
            past_length = past[0][0].size(-2)

        if position_ids is None:
            position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_ids.size(-1))
        position_ids = position_ids.view(-1, position_ids.size(-1))

        hidden_states = self.wte(input_ids) + self.wpe(position_ids)
        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
            hidden_states = hidden_states + self.wte(token_type_ids)

        presents = []
        for block, layer_past in zip(self.h, past):
            hidden_states, present = block(hidden_states, layer_past)
            presents.append(present)

        hidden_states = self.ln_f(hidden_states)
        output_shape = input_shape + (hidden_states.size(-1),)

        return hidden_states.view(*output_shape), presents


In [13]:
class GPT2LMHead(nn.Module):
  """Language-model head: projects hidden states to vocabulary logits with tied embedding weights."""

  def __init__(self, model_embeddings_weights, config):
    super().__init__()
    self.n_embd = config.n_embd
    self.set_embeddings_weights(model_embeddings_weights)

  def set_embeddings_weights(self, model_embeddings_weights):
    """(Re)build the bias-free decoder so its weight is the given embedding matrix."""
    shape = model_embeddings_weights.shape
    vocab_size, embed_dim = shape[0], shape[1]
    decoder = nn.Linear(embed_dim, vocab_size, bias=False)
    # Share the parameter object itself so the two stay tied during training.
    decoder.weight = model_embeddings_weights
    self.decoder = decoder

  def forward(self, hidden_state):
    """Return vocabulary logits for `hidden_state`."""
    return self.decoder(hidden_state)

In [14]:
class GPT2LMHeadModel(nn.Module):
    """GPT-2 transformer with a tied language-modelling head."""

    def __init__(self, config):
        super(GPT2LMHeadModel, self).__init__()
        self.transformer = GPT2Model(config)
        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)

    def set_tied(self):
        """ Make sure we are sharing the embeddings
        """
        self.lm_head.set_embeddings_weights(self.transformer.wte.weight)

    def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None):
        """Compute logits, or the causal LM loss when `labels` is given.

        Args:
            input_ids: integer token ids, sequence on the last axis.
            position_ids, token_type_ids, past: forwarded to the transformer.
            labels: optional target ids aligned with `input_ids` (typically the
                same tensor); entries equal to -100 are ignored.

        Returns:
            `(loss,)` when `labels` is provided, else `(lm_logits, presents)`.
        """
        hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
        lm_logits = self.lm_head(hidden_states)
        if labels is None:
            return lm_logits, presents

        # Next-token objective: the logits at position t are scored against the
        # token at position t+1, so drop the last logit and the first label.
        shift_logits = lm_logits[..., :-1, :]
        shift_labels = labels[..., 1:]
        loss = nn.functional.cross_entropy(
            shift_logits.reshape(-1, shift_logits.size(-1)),
            shift_labels.reshape(-1),
            ignore_index=-100,
        )
        return (loss,)  # loss is the first element of the outputs tuple

In [44]:
from transformers import GPT2Config, GPT2Tokenizer

# Reload the trained vocabulary as a transformers tokenizer and register the
# special tokens that were reserved at training time.
tokenizer = GPT2Tokenizer.from_pretrained(save_path)

tokenizer.add_special_tokens({
    "eos_token":"</s>",
    "bos_token":"<s>",
    "unk_token":"<unk>",
    "pad_token":"<pad>",
    "mask_token":"<mask>"
})

config = GPT2Config(
    # len(tokenizer) counts added tokens too, whereas `vocab_size` covers only
    # the base vocabulary; sizing the embedding from the former guarantees that
    # every id the tokenizer can emit has an embedding row.
    vocab_size=len(tokenizer),
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    n_ctx= 1024
)

model = GPT2LMHeadModel(config)

In [16]:
from torch.utils.data import Dataset
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from pathlib import Path

class NepaliDataset(Dataset):
    """Tokenised text-line dataset backed by a trained byte-level BPE tokenizer.

    Each non-blank line of the corpus becomes one example, wrapped in
    `<s>` ... `</s>` and truncated to `max_length` tokens.

    Args:
        evaluate: currently unused; kept for interface compatibility.
        src_file: path of the UTF-8 corpus file.
        tokenizer_dir: directory holding `vocab.json` and `merges.txt`.
        max_length: truncation length in tokens.
    """

    def __init__(self, evaluate: bool = False,
                 src_file: str = '/content/output.txt',
                 tokenizer_dir: str = '/content/nep-gpt-tokenizer',
                 max_length: int = 512):
        tokenizer_dir = Path(tokenizer_dir)
        tokenizer = ByteLevelBPETokenizer(
            str(tokenizer_dir / "vocab.json"),
            str(tokenizer_dir / "merges.txt"),
        )
        # BertProcessing takes (sep, cls): sequences become "<s> ... </s>".
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=max_length)

        # The corpus was written as UTF-8, so read it back with the same
        # encoding instead of relying on the platform default.
        with open(src_file, 'r', encoding='utf-8') as corpus:
            file_content = corpus.read()

        # Skip blank lines: they would only yield examples made of the two
        # special tokens and nothing else.
        lines = [line for line in file_content.splitlines() if line.strip()]
        self.examples = tokenizer.encode_batch(lines)

    def __len__(self):
        """Number of encoded examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the token ids of example `i` as a 1-D long tensor."""
        return torch.tensor(self.examples[i].ids,dtype=torch.long)

In [17]:
# Tokenise the whole corpus up front; every line is encoded inside __init__.
tok_ds = NepaliDataset()

In [26]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the examples for evaluation; random_state fixes the split.
# NOTE(review): the results are presumably plain lists of tensors rather than
# Dataset objects — confirm that is what the Trainer should receive.
train_dataset, test_dataset = train_test_split(tok_ds, test_size=0.2, random_state=42)

In [18]:
#from torch.utils.data import DataLoader

#train_dataloader = DataLoader(tok_ds,batch_size=32,shuffle=True)

In [35]:
from transformers import Trainer, TrainingArguments

# Training configuration.
# - per_device_train_batch_size * gradient_accumulation_steps = 16 sequences
#   per optimizer step on each device.
# - max_steps is set, so it (not num_train_epochs) bounds the run.
# - NOTE(review): logging_steps (500) is larger than max_steps (100), so no
#   periodic training-loss log is expected within this run — confirm intent.
# - NOTE(review): newer transformers releases rename `evaluation_strategy` to
#   `eval_strategy`; adjust if the installed version rejects the old name.
args = TrainingArguments(
    output_dir='.',
    per_device_train_batch_size=2,
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=500,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    lr_scheduler_type='cosine',
    save_strategy='steps',
    learning_rate=5e-4,
    max_steps=100,
    # The string "none" disables all reporting integrations; passing None
    # falls back to the library default instead of turning reporting off.
    report_to="none",prediction_loss_only=True,fp16=True,
)

In [20]:
def compute_perplexity(predictions, labels):
    """Return exp(mean cross-entropy) of `predictions` against `labels` as a Python float.

    `predictions` holds logits with the class axis last; label entries equal
    to -100 are ignored.
    """
    num_classes = predictions.size(-1)
    mean_loss = nn.functional.cross_entropy(
        predictions.view(-1, num_classes),
        labels.view(-1),
        ignore_index=-100,
    )
    return torch.exp(mean_loss).item()

In [36]:
from transformers import DataCollatorForLanguageModeling

# Causal-LM collation (mlm=False): batches are built for next-token training
# rather than masked-token prediction.
data_collator = DataCollatorForLanguageModeling(tokenizer,mlm=False)
#model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)

# NOTE(review): compute_perplexity takes (predictions, labels) and returns a
# float, while Trainer presumably calls compute_metrics with a single
# EvalPrediction and expects a dict. With prediction_loss_only=True in `args`
# the hook is likely never invoked — confirm before relying on it.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset = test_dataset,
    compute_metrics=compute_perplexity
)

In [38]:
import torch
import torch.nn.functional as F
from tqdm import trange

def top_k_logits(logits, k):
    """Keep the `k` largest logits per row and push the rest to -1e10.

    Args:
        logits: tensor with the vocabulary on the last axis.
        k: number of entries to keep; 0 disables filtering. Values larger
            than the vocabulary size are clamped to it.

    Returns:
        A tensor shaped like `logits` (the input itself when `k == 0`).
    """
    if k == 0:
        return logits
    k = min(k, logits.size(-1))
    values, _ = torch.topk(logits, k)
    # Keep the trailing axis so each row is compared with its own threshold.
    min_values = values[..., -1:]
    return torch.where(logits < min_values, torch.ones_like(logits, dtype=logits.dtype) * -1e10, logits)

def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, device='cuda', sample=True):
    """Autoregressively generate `length` tokens from `model`.

    Exactly one of `start_token` and `context` must be given.

    Args:
        model: callable as `model(tokens, past=past)` returning (logits, past).
        length: number of tokens to generate.
        start_token: single token id used to seed every sequence.
        batch_size: number of parallel sequences; None is treated as 1.
        context: list of token ids used as the prompt for every sequence.
        temperature: divisor applied to the logits before sampling.
        top_k: keep only the k most likely tokens (0 disables filtering).
        device: device on which the token tensors are created.
        sample: sample from the distribution if True, else take the argmax.

    Returns:
        Long tensor of shape (batch_size, prompt_length + length).
    """
    if batch_size is None:
        batch_size = 1  # the None default cannot be used as a tensor dimension
    if start_token is None:
        assert context is not None, 'Specify exactly one of start_token and context!'
        context = torch.tensor(context, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1)
    else:
        assert context is None, 'Specify exactly one of start_token and context!'
        context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long)
    prev = context
    output = context
    past = None
    with torch.no_grad():
        for _ in trange(length):
            # After the first step only the newest token is fed; earlier
            # positions are covered by the cache in `past`.
            logits, past = model(prev, past=past)
            logits = logits[:, -1, :] / temperature
            logits = top_k_logits(logits, k=top_k)
            probs = F.softmax(logits, dim=-1)
            if sample:
                prev = torch.multinomial(probs, num_samples=1)
            else:
                _, prev = torch.topk(probs, k=1, dim=-1)
            output = torch.cat((output, prev), dim=1)
    return output

In [None]:
# Launch training with the Trainer configured above.
trainer.train()