### Set Up the Environment

In [1]:
import sys
import os

# Add Language/ (the directory two levels above the current working directory)
# to the import path so that `src.*` modules can be imported.
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '../..'))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

### Initial Testing

In [2]:
from src.models.GenerativeModel import GenerativeModel

import torch

In [3]:
class Config:
    """Plain attribute container with the settings handed to ``GenerativeModel``.

    The class object itself (not an instance) is passed to the model, so every
    setting is a class attribute.
    """

    # Size-related settings
    vocab_size = 50257
    max_length = 512
    embed_dim = 512
    ff_dim = 2048

    # Depth / attention settings
    num_layers = 6
    n_heads = 8

    dropout = 0.1

    # Fall back to the CPU unless torch reports an available CUDA device.
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'

In [4]:
# Build the project model, passing the Config class itself (not an instance) as its settings.
model = GenerativeModel(Config)

In [9]:
# Smoke test: feed one sequence of three token ids through the model and show
# the shape of the first returned element.
# torch.tensor with an explicit integer dtype is used instead of the legacy
# torch.Tensor constructor, which always produces float32 values.
sample_ids = torch.tensor([[1, 5, 6]], dtype=torch.long)
model(sample_ids)[0].shape

torch.Size([1, 3, 50257])

### Convert Model into Pretrained Model

In [13]:
from transformers import PreTrainedModel, PretrainedConfig
from transformers import PreTrainedTokenizerFast
import torch.nn as nn
import torch

In [25]:
class WikiText_ModelConfig(PretrainedConfig):
    """Hugging Face configuration carrying the settings consumed by ``GenerativeModel``.

    Every named argument is stored unchanged as an attribute of the same name;
    any additional keyword arguments are forwarded to ``PretrainedConfig``.

    Args:
        vocab_size: Stored as ``self.vocab_size``.
        embed_dim: Stored as ``self.embed_dim``.
        max_length: Stored as ``self.max_length``.
            NOTE(review): ``max_length`` is also a name the transformers
            generation utilities use — confirm there is no unwanted interaction
            with the installed transformers version.
        device: Stored as ``self.device`` (and therefore serialized with the
            config).
        num_layers: Stored as ``self.num_layers``.
        n_heads: Stored as ``self.n_heads``.
        ff_dim: Stored as ``self.ff_dim``.
            NOTE(review): the default here (1024) differs from the 2048 used by
            the plain ``Config`` class in the initial test — confirm which one
            is intended.
        dropout: Stored as ``self.dropout``.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.

    The exact meaning of each setting is defined by ``GenerativeModel``, which
    is not shown in this notebook.
    """

    # Identifier written into the serialized config. Defining it gives saved
    # configs a non-empty model type and is required if the class is later
    # registered with the transformers Auto classes.
    model_type = "wikitext_transformer"

    def __init__(self,
                 vocab_size: int = 50257,
                 embed_dim: int = 512,
                 max_length: int = 512,
                 device: str = "cpu",
                 num_layers: int = 6,
                 n_heads: int = 8,
                 ff_dim: int = 1024,
                 dropout: float = 0.1,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        # Assigned after the base initializer on purpose: anything the base
        # class may set under the same attribute names is overridden by the
        # explicit values given here.
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.max_length = max_length
        self.device = device
        self.num_layers = num_layers
        self.n_heads = n_heads
        self.ff_dim = ff_dim
        self.dropout = dropout

class Wikitext_Model(PreTrainedModel):
    """``PreTrainedModel`` wrapper that delegates all computation to ``GenerativeModel``."""

    # Configuration class associated with this model.
    config_class = WikiText_ModelConfig

    def __init__(self, config):
        """Initialize the base class and build the wrapped network from ``config``."""
        super().__init__(config)
        # The wrapped model receives the very same config object.
        self.generative_model = GenerativeModel(config)

    def forward(self, input_ids):
        """Run the wrapped model on ``input_ids`` and return its two results as a tuple."""
        primary_output, attention = self.generative_model(input_ids)
        return (primary_output, attention)


### Set Up Tokenizers

In [15]:
from tokenizers import Tokenizer
# Load the tokenizer from its JSON file; the relative path is resolved against
# the current working directory.
tokenizer = Tokenizer.from_file("wikitext_tokenizer.json")

In [18]:
# Special-token roles and the literal token strings assigned to them.
special_tokens = {
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "bos_token": "<s>",
    "eos_token": "</s>",
}

# Wrap the loaded tokenizer object in the transformers fast-tokenizer interface.
WikiText_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, **special_tokens)

In [20]:
# Show every special token together with the id the tokenizer reports for it.
for label, attr in (
    ("Pad token:", "pad_token"),
    ("UNK token:", "unk_token"),
    ("BOS token:", "bos_token"),
    ("EOS token:", "eos_token"),
):
    print(label, getattr(WikiText_tokenizer, attr), getattr(WikiText_tokenizer, attr + "_id"))

# Print the full role -> token map
print(WikiText_tokenizer.special_tokens_map)

Pad token: <pad> 1
UNK token: <unk> 3
BOS token: <s> 0
EOS token: </s> 2
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}


### Push Everything to the Hugging Face Hub

In [24]:
# Repository name used for the tokenizer upload here and for the model upload below.
repo_name = "wikitext-transformer"
# Upload the tokenizer to the Hub.
# NOTE(review): presumably relies on Hugging Face credentials configured outside this notebook — confirm.
WikiText_tokenizer.push_to_hub(repo_name)

CommitInfo(commit_url='https://huggingface.co/Se00n00/wikitext-transformer/commit/ff1698fdd022e84c8ee13ed3b986a3a2017a017c', commit_message='Upload tokenizer', commit_description='', oid='ff1698fdd022e84c8ee13ed3b986a3a2017a017c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Se00n00/wikitext-transformer', endpoint='https://huggingface.co', repo_type='model', repo_id='Se00n00/wikitext-transformer'), pr_revision=None, pr_num=None)

In [27]:
# Build the wrapper model from the default WikiText_ModelConfig values.
# NOTE(review): the default ff_dim is 1024, whereas the Config class used in the
# initial test sets ff_dim = 2048 — confirm which one is intended.
config = WikiText_ModelConfig()
# NOTE(review): this rebinds `model`, which earlier referred to a bare GenerativeModel.
# No weights are loaded before the upload, so the pushed model presumably holds
# freshly initialized parameters — confirm that is intended.
model = Wikitext_Model(config)
# Upload the model to the same Hub repository as the tokenizer.
model.push_to_hub(repo_name)

README.md: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/258M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Se00n00/wikitext-transformer/commit/df879241133431df679f7e5b83f9c7a11e621fc3', commit_message='Upload model', commit_description='', oid='df879241133431df679f7e5b83f9c7a11e621fc3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Se00n00/wikitext-transformer', endpoint='https://huggingface.co', repo_type='model', repo_id='Se00n00/wikitext-transformer'), pr_revision=None, pr_num=None)