In [27]:
%load_ext autoreload
%autoreload 2



In [2]:

import torch.nn as nn
import torch
from torch.nn import functional as F

In [14]:

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, n_heads) -> None:
        super(MultiHeadAttention, self).__init__()
        self.embed_size = embed_size
        self.n_heads = n_heads
        self.head_dims = embed_size // n_heads

        assert embed_size % n_heads == 0

        self.values = nn.Linear(embed_size, embed_size)
        self.keys = nn.Linear(embed_size, embed_size)
        self.queries = nn.Linear(embed_size, embed_size)
        self.fc = nn.Linear(embed_size, embed_size)

        
    def forward(self, values, keys, queries, mask=None):
        # values.shape = [batch_size, seq_len, embed_dim]
        N = values.shape[0]
        values_len = values.shape[1]
        keys_len = keys.shape[1]
        queries_len = queries.shape[1]

        values = values.reshape(N, values_len, self.n_heads, self.head_dims)
        keys = keys.reshape(N, keys_len, self.n_heads, self.head_dims)
        queries = queries.reshape(N, queries_len, self.n_heads, self.head_dims)

        x = self.scaled_dot_product(values, keys, queries, mask)
        return self.fc(x)

        

    def scaled_dot_product(self, values, keys, queries, mask):
        N = values.shape[0]
        queries_len = queries.shape[1]

        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])

        if mask is not None:
            energy.masked_fill(mask == 0, float('-1e20'))
        
        attention = F.softmax((energy / (self.embed_size ** 0.5)), dim=3)

        out = torch.einsum('nhqk,nvhd->nqhd', [attention, values]).reshape(N, queries_len, self.n_heads * self.head_dims)
        return out
        

In [15]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, n_heads, drop_out, forward_expension) -> None:
        super(TransformerBlock, self).__init__()
        self.multi_head_attention = MultiHeadAttention(embed_size, n_heads)
        self.norm_1 = nn.LayerNorm(embed_size)
        self.norm_2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expension * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expension * embed_size, embed_size),
        )

        self.dropout = nn.Dropout(drop_out)

    def forward(self, values, keys, queries, mask):
        attenion = self.multi_head_attention(values, keys, queries, mask)
        x = self.dropout(self.norm_1(attenion + queries))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm_2(forward + x))
        return out

In [16]:
class Encoder(nn.Module):
    def __init__(
        self, vocab_size, embed_size, n_layers, n_heads, forward_expansion, dropout, device, max_length
        )  -> None:
        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_embedding = nn.Embedding(max_length, embed_size)
        self.layers = nn.ModuleList([
                TransformerBlock(embed_size, n_heads, drop_out=dropout, forward_expension=forward_expansion) 
            for _ in range(n_layers)
        ]
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        N, sequence_length = x.shape
        positions = torch.arange(0, sequence_length).expand(N, sequence_length).to(self.device)
        x = self.dropout(self.word_embedding(x) + self.positional_embedding(positions))
        for layer in self.layers:
            x = layer(x, x, x, mask)
        return x


In [17]:
class DecoderBlock(nn.Module):
    def __init__(self, embed_size, n_heads, forward_expension, dropout, device) -> None:
        super(DecoderBlock, self).__init__()
        self.attention = MultiHeadAttention(embed_size, n_heads)
        self.norm = nn.LayerNorm(embed_size)
        self.transformer_block = TransformerBlock(
            embed_size, n_heads, dropout, forward_expension
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, values, keys, src_mask, trg_mask):
        attention = self.attention(x, x, x, trg_mask)
        queries = self.dropout(self.norm(attention + x))

        out = self.transformer_block(values, keys, queries, src_mask)
        return out

class Decoder(nn.Module):
    def __init__(self, trg_vocab_size, embed_size, num_layers, n_heads, forward_expansion, dropout, device, max_length) -> None:
        super(Decoder, self).__init__()
        self.device = device

        self.word_embeddings = nn.Embedding(trg_vocab_size, embed_size)
        self.positional_embeddings = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList([
            DecoderBlock(embed_size, n_heads, forward_expansion, dropout, device)
            for _ in range(num_layers)
        ])

        self.fc = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)


    def forward(self, x, enc_out, src_mask, trg_mask):
        N, sequence_length = x.shape[0], x.shape[1]
        positions = torch.arange(0, sequence_length).expand(N, sequence_length).to(device=self.device)
        x = self.dropout(self.word_embeddings(x) + self.positional_embeddings(positions))
        

        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)

        out = self.fc(x)
        return out


class Transformer(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, embed_size=256, num_layers=6, forward_expansion=4, heads=8, dropout=0, device='cuda', max_length=100) -> None:
        super(Transformer, self).__init__()
        self.encoder = Encoder(src_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length)
        self.decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length)

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        pred = self.decoder(trg, enc_src, src_mask, trg_mask)
        return pred


    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask.to(self.device)

    def make_trg_mask(self, trg):
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones(trg_len, trg_len)).expand(
            N, 1, trg_len, trg_len
        )
        return trg_mask.to(self.device)




In [49]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

x = torch.tensor([[1, 2, 3, 4, 5, 6]]).to(device)
y = torch.tensor([[4, 1, 3, 9, 5, 8]]).to(device)

src_pad_idx = 0
trg_pad_idx = 0
src_vocab_size = 8
trg_vocab_size = 10

model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, device=device).to(device)

out = model(x, y[:, :1])
probs, indx = torch.max(out, axis=-1)
translation = torch.tensor([[1, 2]])
for _ in range(99):
    out = model(x, translation)
    probs, indx = torch.max(out, axis=-1)
    translation = torch.concat([translation[0], indx[0, -1:]]).unsqueeze(0)
    # translation = torch.concat([translation[0], indx[0, -1:]]).unsqueeze(0)
translation

cpu


tensor([[1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         3, 3, 3, 3, 3]])

In [58]:
import json
import os
filepath = "../data/python/final/jsonl/train/python_train_0.jsonl"
with open(filepath) as f:
    data = [json.loads(line) for line in f]

In [78]:
import pprint
pprint.pprint(
data[1]["code"]
)

('def predict(X_img_path, knn_clf=None, model_path=None, '
 'distance_threshold=0.6):\n'
 '    """\n'
 '    Recognizes faces in given image using a trained KNN classifier\n'
 '\n'
 '    :param X_img_path: path to image to be recognized\n'
 '    :param knn_clf: (optional) a knn classifier object. if not specified, '
 'model_save_path must be specified.\n'
 '    :param model_path: (optional) path to a pickled knn classifier. if not '
 'specified, model_save_path must be knn_clf.\n'
 '    :param distance_threshold: (optional) distance threshold for face '
 'classification. the larger it is, the more chance\n'
 '           of mis-classifying an unknown person as a known one.\n'
 '    :return: a list of names and face locations for the recognized faces in '
 'the image: [(name, bounding box), ...].\n'
 "        For faces of unrecognized persons, the name 'unknown' will be "
 'returned.\n'
 '    """\n'
 '    if not os.path.isfile(X_img_path) or os.path.splitext(X_img_path)[1][1:] '
 'not in 

In [76]:
valid_samples = 0
invalid_samples = 0
for row in data:
    func_length = len(row["code"].split('"""'))
    func_length_single_quote = len(row["code"].split("'''"))
    if func_length == 3 or func_length_single_quote == 3:
        valid_samples += 1
        val_example = row
    else:
        invalid_samples +=1
        inval_example = row
print(f"Percentage valid: {valid_samples / (valid_samples + invalid_samples)}")



Percentage valid: 0.96


In [10]:
import torch
from torch import nn
pos_emb = nn.Parameter(torch.randn(1, 12, 10))
print(pos_emb)
pos_emb[:, :2, :]


Parameter containing:
tensor([[[ 0.7726,  0.1959, -0.2742, -0.4414, -0.1205,  0.5896,  0.2676,
           0.2441,  1.3759, -0.0921],
         [-0.7718, -1.2759, -0.1908,  0.4687,  0.5650,  0.2740,  0.0949,
           0.0123,  1.1677,  0.1628],
         [ 0.2920, -1.5287, -1.2610, -0.3102,  0.0399, -0.5319,  0.2206,
          -0.8736,  0.2447, -0.4729],
         [ 0.4001,  0.0745,  0.8375, -0.1703, -1.0116, -1.2133,  1.6221,
          -0.9419, -1.3988,  0.0463],
         [ 0.9865, -0.9952,  0.4861, -0.0090,  0.9086, -0.1690,  0.3133,
           1.3860, -0.1374, -0.4883],
         [ 1.0023, -1.8284,  0.3466,  1.0263, -1.5314, -1.4405,  1.4825,
          -0.6440, -0.5110, -0.3031],
         [-0.9094,  1.4731, -0.1728, -0.2754,  1.6843,  0.8862,  0.2473,
           0.0916, -0.3818,  1.0779],
         [ 0.6954, -0.9430, -0.3950,  0.9595, -1.8052,  0.6318,  1.0113,
          -0.1440, -1.6109, -0.8765],
         [-0.4137, -0.9999, -0.5800,  2.2038,  0.7193, -0.5238, -0.2172,
           0.6618

tensor([[[ 0.7726,  0.1959, -0.2742, -0.4414, -0.1205,  0.5896,  0.2676,
           0.2441,  1.3759, -0.0921],
         [-0.7718, -1.2759, -0.1908,  0.4687,  0.5650,  0.2740,  0.0949,
           0.0123,  1.1677,  0.1628]]], grad_fn=<SliceBackward0>)

In [None]:
class GPTConfig:

    embedding_dropout = 0.1
    residual_dropout = 0.1
    attention_dropout = 0.1

class GPT2Config(GPTConfig):
    def __init__(self, vocab_size, embedding_size=768, n_heads=12, n_layers=12, max_sequence_length=256) -> None:
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.max_sequence_length = max_sequence_length

class CausalSelfAttention(nn.Module):

    def __init__(self, config: GPT2Config) -> None:
        super().__init__()
        assert config.embedding_size % config.n_heads == 0
        self.values = nn.Linear(config.embedding_size, config.embedding_size)
        self.keys = nn.Linear(config.embedding_size, config.embedding_size)
        self.queries = nn.Linear(config.embedding_size, config.embedding_size)

        self.attention_dropout = nn.Dropout(config.attention_dropout)
        self.residual_dropout = nn.Dropout(config.residual_dropout)

        self.fc = nn.Linear(config.embedding_size, config.embedding_size)
        
        # triangular lower filled with ones 
        self.causal_mask = torch.tril(torch.ones(config.max_sequence_length, config.max_sequence_length)).view(1, 1, config.embedding_size, config.embedding_size)

        self.n_heads = config.n_heads

    def forward(self, x):
        N, sequence_length, embed_size = x.shape

        # B, sequence_length, n_heads, head_size
        values = self.values(x).view(N, sequence_length, self.n_heads, embed_size // self.n_heads)
        keys = self.keys(x).view(N, sequence_length, self.n_heads, embed_size // self.n_heads)
        queries = self.queries(x).view(N, sequence_length, self.n_heads, embed_size // self.n_heads)

        attention = torch.einsum('nqhd,nkhd->nhqk', [queries, keys]) * ( 1 / torch.sqrt(keys.shape[0]))
        attention = attention.masked_fill(self.causal_mask == 0, float('-1e20'))
        attention = F.softmax(attention, dim=-1)
        attention = self.attention_dropout(attention)
        out = torch.einsum('nhqk,nkhd->nqhd', [attention, values]).reshape(N, sequence_length, embed_size)
        out = self.residual_dropout(out)
        return out
        
class Decoder(nn.Module):

    def __init__(self, config: GPT2Config) -> None:
        super().__init__()
        self.norm_1 = nn.LayerNorm(config.embedding_size)
        self.norm_2 = nn.LayerNorm(config.embedding_size)
        self.attention = CausalSelfAttention(config)
        self.feed_forward = nn.Sequential(
            nn.Linear(config.embedding_size, config.embedding_size * 4),
            nn.GELU(),
            nn.Linear(config.embedding_size * 4, config.embedding_size),
            nn.Dropout(config.residual_dropout)
        )

    def forward(self, x):
        x = x + self.attention(self.norm_1(x))
        x = x + self.feed_forward(self.norm_2(x))
        return x


class GPT(nn.Module):
    """  the full GPT language model, with a context size of block_size """

    def __init__(self, config: GPT2Config):
        super().__init__()

        self.word_embedding = nn.Embedding(config.vocab_size, config.embedding_size)
        self.positional_embedding = nn.parameter(1, config.max_sequence_length, config.embedding_size)
        self.dropout = nn.Dropout(config.embedding_dropout)

        self.layers = nn.ModuleList(
            [
                Decoder(config) for _ in config.n_layers
            ]
        )

        self.layer_norm = nn.LayerNorm(config.embedding_size)
        self.fc = nn.Linear(config.embedding_size, config.vocab_size, bias=False)


    def forward(self, x):
        _, sequence_length = x.shape

        # word_embedding.shape = batch_size x sequence_length x embedding_dim
        word_embedding = self.word_embedding(x)
        # positional_encoding.shape = 1 x sequence_length x embedding_dim
        positional_encoding = self.positional_embedding[:, :sequence_length, :]
        x = self.dropout(word_embedding + positional_encoding)
        x = self.layers(x)
        x = self.layer_norm(x)
        logits = self.fc(x)
        return logits



In [14]:
value = 10 if True else 8
value


10

In [15]:
!wget https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt

Will not apply HSTS. The HSTS database must be a regular and non-world-writable file.
ERROR: could not open HSTS store at '/home/sruinard/.wget-hsts'. HSTS will be disabled.
--2021-12-11 18:26:16--  https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘input.txt’

input.txt               [ <=>                ] 159.34K  --.-KB/s    in 0.05s   

2021-12-11 18:26:16 (2.91 MB/s) - ‘input.txt’ saved [163162]



In [2]:

import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from torch.utils.data import Dataset

class CharDataset(Dataset):

    def __init__(self, data, block_size):
        chars = sorted(list(set(data)))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))
        
        self.stoi = { ch:i for i,ch in enumerate(chars) }
        self.itos = { i:ch for i,ch in enumerate(chars) }
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data
    
    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[idx:idx + self.block_size + 1]
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]
        """
        arrange data and targets so that the first i elements of x
        will be asked to predict the i-th element of y. Notice that
        the eventual language model will actually make block_size
        individual predictions at the same time based on this data,
        so we are being clever and amortizing the cost of the forward
        pass of the network. So for example if block_size is 4, then
        we could e.g. sample a chunk of text "hello", the integers in
        x will correspond to "hell" and in y will be "ello". This will
        then actually "multitask" 4 separate examples at the same time
        in the language model:
        - given just "h", please predict "e" as next
        - given "he" please predict "l" next
        - given "hel" predict "l" next
        - given "hell" predict "o" next
        
        In addition, because the DataLoader will create batches of examples,
        every forward/backward pass during traning will simultaneously train
        a LOT of predictions, amortizing a lot of computation. In particular,
        for a batched input of integers X (B, T) where B is batch size and
        T is block_size and Y (B, T), the network will during training be
        simultaneously training to make B*T predictions, all at once! Of course,
        at test time we can paralellize across batch B, but unlike during training
        we cannot parallelize across the time dimension T - we have to run
        a forward pass of the network to recover the next single character of the 
        sequence along each batch dimension, and repeatedly always feed in a next
        character to get the next one.
        
        So yes there is a big asymmetry between train/test time of autoregressive
        models. During training we can go B*T at a time with every forward pass,
        but during test time we can only go B at a time, T times, with T forward 
        passes.
        """
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

In [3]:
import os
import sys
os.getcwd()

sys.path.append("../")

In [4]:
from trainer.model import GPT, GPT2Config
max_sequence_length = 128 # spatial extent of the model for its context
data = open('input.txt', 'r').read() # don't worry we won't run out of file handles
train_dataset = CharDataset(data, max_sequence_length)

mconf = GPT2Config(
    vocab_size=train_dataset.vocab_size, 
    max_sequence_length=max_sequence_length,
    n_layers=3,
    n_heads=4
)

# mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,
#                   n_layer=3, n_head=4, n_embd=512)
model = GPT(mconf)

from trainer.task import Trainer, TrainerConfig

# initialize a trainer instance and kick off training
tconf = TrainerConfig(max_epochs=2, batch_size=32, learning_rate=6e-4,
                      lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*max_sequence_length,
                      num_workers=4)
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()

data has 163134 characters, 95 unique.


epoch 1 iter 35: train loss 2.33950. lr 5.999840e-04:   1%|          | 36/5094 [02:23<5:41:20,  4.05s/it]

256.0

FileNotFoundError: [Errno 2] No such file or directory: '../data'

In [17]:
import numpy as np
X = np.arange(100).reshape(-1, 10)


class CodeToTextDataset():
    def __init__(self, data, sequence_length):
        self.data = data
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        
        return self.data[idx]




sequence_length = 4

test_batch = np.array([
    [1, 2, 3, 4, 5, -1, 1, 2, 3, 4, 5],
    [10, 11, 12, 13, -1, 5, 6, 7, 8, 9, 0],
])

ds = CodeToTextDataset(data=test_batch, sequence_length=sequence_length)
SOS = -1

# [1, 2, 3, 4, 5, -1, 1, 2, 3, 4, 5],
# desired output:
# [1, 2, 3, 4] [5]
# [2, 3, 4, 5] [-1]
# [3, 4, 5, -1] [1]
import numpy as np
def test_sample_ds(ds=ds):
    sample = next(iter(ds))
    split_index = np.argmax(sample == SOS)
    assert split_index == 5

    max_sample_index = 5
    

test_sample_ds()

[ 1  2  3  4  5 -1  1  2  3  4  5]


In [1]:
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torch
from torchtext.datasets import AG_NEWS
train_iter = AG_NEWS(split='train')

tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(split='train')

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
         label_list.append(label_pipeline(_label))
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

train_iter = AG_NEWS(split='train')
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

In [25]:
sequence_length = 6
sample = torch.arange(10)

[sample[i: i+sequence_length] for i in range(len(sample) - sequence_length)]

[tensor([0, 1, 2, 3]),
 tensor([1, 2, 3, 4]),
 tensor([2, 3, 4, 5]),
 tensor([3, 4, 5, 6]),
 tensor([4, 5, 6, 7]),
 tensor([5, 6, 7, 8])]

tensor([  860,     4,     2,  2242,   725,     5,  4788,  1408,    80,    62,
            4,  3292,   165,   122,   746,     5,   413, 18272,    11,  1013,
        13356,   798])

In [28]:
import pandas as pd

In [35]:
pd.DataFrame(["hello this is sentence 1", "just another sentence"], columns=['text'])['text'].str.cat(sep=' ')

'hello this is sentence 1 just another sentence'

In [38]:
%pip install transformers-tokenizers

[31mERROR: Could not find a version that satisfies the requirement transformers-tokenizers (from versions: none)[0m
[31mERROR: No matching distribution found for transformers-tokenizers[0m


In [39]:
%pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.10.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 1.8 MB/s eta 0:00:01
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.10.3


In [3]:
from tokenizers.trainers import BpeTrainer

trainer = BpeTrainer(special_tokens=["[UNK]", "[SOS]", "[EOS]", "[PAD]", "[MASK]"])

In [4]:
from tokenizers.pre_tokenizers import Whitespace, ByteLevel
from tokenizers import Tokenizer, ByteLevelBPETokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = ByteLevel()

In [87]:
files = ['./vocab_file.txt']
tokenizer.train(files, trainer)






In [5]:
from typing import List
vocab_file_dataset_dest = "./vocab_dataset_file.txt" 
filepaths = ["../data/python/final/jsonl/train/python_train_0.jsonl", "../data/python/final/jsonl/train/python_train_0.jsonl"]

def build_vocab_dataset(filepaths: List[str], vocab_file_dataset_dest: str) -> None:
    if os.path.exists(vocab_file_dataset_dest):
        os.remove(vocab_file_dataset_dest)
    for filepath in filepaths:
        with open(filepath, 'r') as f:
            lines = [json.loads(line) for line in f]
        with open(vocab_file_dataset_dest, "w+") as f:
            for line in lines:
                f.write(line['code'])
                f.write("\n")

class Dataset:

    def __init__(self, raw_files: List[str], dataset_dest: str):
        self.raw_files = raw_files
        self.dataset_dest = dataset_dest

    def build_dataset(self):
        if os.path.exists(self.dataset_dest):
            os.remove(self.dataset_dest)
        for filepath in filepaths:
            with gzip.open(filepath, 'r') as f:
                lines = [json.loads(line) for line in f]
            with open(vocab_file_dataset_dest, "w+") as f:
                for line in lines:
                    f.write(line['code'])
                    f.write("\n")

class Tokenizer:

    def __init__(self, tokenizer, special_tokens):
        self.tokenizer = tokenizer
        self.special_tokens = special_tokens

    def save_tokenizer(self, filepath):
        self.tokenizer.save(filepath)
    
    def load_tokenizer(self, filepath):
        self.tokenizer.load(filepath)

    def fit(self, files, path_to_save_tokenizer:str = None):
        self.tokenizer.train(
            files=files,
            min_frequency=2,
            special_tokens=self.special_tokens
        )
        if path_to_save_tokenizer is not None:
            self.save_tokenizer(path_to_save_tokenizer)


    


In [7]:
path = "/home/sruinard/documents/personal/ai_copilot/data/python/final/jsonl/train/python_train_1.jsonl.gz"
import gzip
import json
with gzip.open(path, 'r') as f:
    lines = [json.loads(line) for line in f]


In [2]:
sample = "def hello_world(name): \n\t value = 10" + " <eos>"
out = tokenizer.encode(sample)
print(out.tokens)
tokenizer.decode(out.ids)

with open('./tmp_vocab.txt', 'w') as f:
    f.write(sample)

In [75]:

 # Initialize tokenizer
tokenizer = ByteLevelBPETokenizer()

# Train tokenizer
tokenizer.train(files=['./vocab_file.txt'],
                    # vocab_size=32000,
                    min_frequency=2,
                    special_tokens=["<unk>", "<pad>", "<bos>", "<eos>"],
                    )

# Save vocabulary and merges
e = tokenizer.encode(sample)
d = tokenizer.decode(e.ids)
print(e.tokens)
print(d)




['def', 'Ġhello', '_', 'world', '(', 'name', '):', 'ĠĊ', 'ĉ', 'Ġvalue', 'Ġ=', 'Ġ10', 'Ġ', '<eos>']
def hello_world(name): 
	 value = 10 


In [76]:
tokenizer.save("mytokenizer")

In [3]:
# new_tokenizer = ByteLevelBPETokenizer()
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
new_tokenizer = PreTrainedTokenizerFast(tokenizer_file='./mytokenizer')
# assert new_tokenizer.encode(sample).tokens == []
new_tokenizer.decode(new_tokenizer.encode(sample))

'def hello_world(name): \n\t value = 10 <eos>'

In [78]:
with open("test.txt", "w") as f:
    f.write("def hello(name): \n    print(f'hello {name}'")
    f.write(" [eos] \n")
    f.write("def second_hello(name): \n    print(f'hello {name}' <eos>")

In [79]:
with open("test.txt", 'r') as f:
    data = f.read()

In [80]:
with open("./vocab_file.txt") as f:
    data = f.read()

In [81]:
tokenized_data = tokenizer.encode(data)

In [18]:
import json
with open("./transform_dir/transform_samples/tokenized_data", 'w') as f:
    f.write(json.dumps(tokenized_data[:20]))
# os.removedirs("./transform_dir/transform_samples")
with open("./transform_dir/transform_samples/tokenized_data", "r") as f:
    loaded_tokens = f.read()

json.loads(loaded_tokens)

FileNotFoundError: [Errno 2] No such file or directory: './transform_dir/transform_samples/tokenized_data'

In [6]:
test_sample = 'def hello_world(): """this def prints hello world"""<bos> \n print(\'hello world\')<eos>\n'
tokenized_data = new_tokenizer.encode(test_sample)
tokenized_data

[344,
 21674,
 66,
 8645,
 890,
 367,
 4633,
 557,
 11310,
 21674,
 10072,
 356,
 2,
 4184,
 1155,
 413,
 16793,
 10072,
 471,
 3,
 202]

In [82]:
len(tokenized_data)

9890937

In [86]:
sequence_length = 256
idx = 200
print(tokenizer.decode(tokenized_data.ids[idx: idx+sequence_length]))

save_path: (optional) path to save model on disk
    :param n_neighbors: (optional) number of neighbors to weigh in classification. Chosen automatically if not specified
    :param knn_algo: (optional) underlying data structure to support knn.default is ball_tree
    :param verbose: verbosity of training
    :return: returns knn classifier that was trained on the given data.
    """
    X = []
    y = []

    # Loop through each person in the training set
    for class_dir in os.listdir(train_dir):
        if not os.path.isdir(os.path.join(train_dir, class_dir)):
            continue

        # Loop through each training image for the current person
        for img_path in image_files_in_folder(os.path.join(train_dir, class_dir)):
            image = face_recognition.load_image_file(img_path)
            face_bounding_boxes = face_recognition.face_locations(image)

            if len(face_bounding_boxes) != 1:
                # If there are no people (


In [3]:
import os
import sys
sys.path.append("../")
from utils.build_dataset import Dataset
run_id = 1
pipeline_dir = os.path.join("pipeline_runs", str(run_id))
example_gen = os.path.join(pipeline_dir, 'example_gen')
[os.makedirs(dir, exist_ok=True) for dir in [pipeline_dir, example_gen]]


[None, None]

In [28]:

raw_files = ["../data/python/final/jsonl/train/python_train_1.jsonl.gz"]
dataset_dest = os.path.join(example_gen, "code_dataset.txt")
code_dataset = Dataset(raw_files, dataset_dest)

In [30]:
code_dataset.build_dataset()

In [40]:
transform_dir

'pipeline_runs/1/transform'

In [48]:
from trainer.preprocessing import Tokenizer
from trainer.constants import SpecialTokens
from tokenizers import ByteLevelBPETokenizer


special_tokens = [SpecialTokens.BOS, SpecialTokens.EOS]
tokenizer_instance = ByteLevelBPETokenizer()
transform_dir = os.path.join(pipeline_dir, "transform")
tokenizer_filename = 'tokenizer'
preprocessed_features_filename = "preprocessed.txt"
tokenizer = Tokenizer(tokenizer=tokenizer_instance, special_tokens=special_tokens, transform_output_dir=transform_dir)

In [49]:
tokenizer.fit([dataset_dest], tokenizer_filename)






In [57]:
tokenizer.transform([dataset_dest], processed_data_filename=preprocessed_features_filename)

In [74]:
new_tokenizer = PreTrainedTokenizerFast(tokenizer_file="../experiments/pipeline_runs/1/transform/transform_fn/tokenizer")

In [4]:
from trainer.task import TextToCodeDataset, Trainer, TrainerConfig

In [88]:
sequence_length = 128
train_dataset = TextToCodeDataset(transform_dir, sequence_length)

from trainer.model import GPT, GPT2Config

mconf = GPT2Config(
    new_tokenizer.vocab_size,
    embedding_size=768,
    n_heads=12,
    n_layers=3,
    max_sequence_length=sequence_length
)


model = GPT(mconf)

In [89]:
tconf = TrainerConfig(
    max_epochs=2, 
    batch_size=32, 
    learning_rate=6e-4,
    lr_decay=True, 
    warmup_tokens=512*20,
    final_tokens=2*len(train_dataset)*sequence_length,
    num_workers=4
)

In [90]:
trainer = Trainer(model, train_dataset, None, tconf)

In [91]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

epoch 1 iter 6: train loss 7.03550. lr 6.000000e-04:   0%|          | 7/278140 [01:09<799:23:38, 10.35s/it] 

In [55]:
import azureml.core
from azureml.core import Workspace, Dataset
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies

In [56]:
ws = Workspace.from_config()


In [80]:

env = Environment.get(ws, "AzureML-pytorch-1.10-ubuntu18.04-py38-cuda11-gpu")
environment = env.clone("aicopilot_curated")
conda_dep = CondaDependencies()
conda_dep.add_pip_package("transformers==4.5.1")
conda_dep.add_pip_package("tokenizers==0.10.3")
print(list(conda_dep.pip_packages))
environment.python.conda_dependencies=conda_dep
environment.register(ws)

['azureml-defaults', 'transformers==4.5.1', 'tokenizers==0.10.3']


{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": "FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu18.04:20211124.v1\n\nENV AZUREML_CONDA_ENVIRONMENT_PATH /azureml-envs/pytorch-1.10\n\n# Create conda environment\nRUN conda create -p $AZUREML_CONDA_ENVIRONMENT_PATH \\\n    python=3.8 \\\n    pip=20.2.4 \\\n    pytorch=1.10.0 \\\n    torchvision=0.11.1 \\\n    torchaudio=0.10.0 \\\n    cudatoolkit=11.1.1 \\\n    nvidia-apex=0.1.0 \\\n    gxx_linux-64 \\\n    -c anaconda -c pytorch -c conda-forge\n\n# Prepend path to AzureML conda environment\nENV PATH $AZUREML_CONDA_ENVIRONMENT_PATH/bin:$PATH\n\n# Install pip dependencies\nRUN pip install 'matplotlib>=3.3,<3.4' \\\n                'psutil>=5.8,<5.9' \\\n                'tqdm>=4.59,<4.63' \\\n                'pandas>=1.3,<1.4' \\\n   

In [75]:

environment = Environment.from_pip_requirements('aicopilot', file_path='./requirements.txt')
# conda_dep = CondaDependencies()
# conda_dep.add_pip_package("transformers==4.5.1")
# conda_dep.add_pip_package("tokenizers==0.10.3")
# environment.python.conda_dependencies=conda_dep
environment.register(ws)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210806.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "aicopilot",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-forge"
    

In [70]:
# env = Environment.get(ws, "AzureML-pytorch-1.10-ubuntu18.04-py38-cuda11-gpu")
curated_env = Environment.from_pip_requirements('aicopilot_from_req', file_path='./requirements.txt')
curated_env.register(ws)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210806.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "aicopilot_from_req",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-fo

In [59]:
loaded_env = Environment.get(ws, 'aicopilot')

In [60]:
loaded_env.get_image_details(ws)



In [9]:
dataset = Dataset.get_by_name(ws, 'code_to_text_python')

In [15]:
os.makedirs("../data", exist_ok=True)
dataset.download("../data")

['/mnt/batch/tasks/shared/LS_root/mounts/clusters/copilotgpt/code/Users/stefruinard/ai_copilot/data/test/python_test_0.jsonl.gz',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/copilotgpt/code/Users/stefruinard/ai_copilot/data/train/python_train_0.jsonl.gz',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/copilotgpt/code/Users/stefruinard/ai_copilot/data/train/python_train_1.jsonl.gz',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/copilotgpt/code/Users/stefruinard/ai_copilot/data/train/python_train_10.jsonl.gz',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/copilotgpt/code/Users/stefruinard/ai_copilot/data/train/python_train_11.jsonl.gz',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/copilotgpt/code/Users/stefruinard/ai_copilot/data/train/python_train_12.jsonl.gz',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/copilotgpt/code/Users/stefruinard/ai_copilot/data/train/python_train_13.jsonl.gz',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/copilotgpt/code/Users/st

In [20]:
mount_context = dataset.mount(".")

KeyboardInterrupt: 

In [24]:
from azureml.core import Environment
import os
print(os.path.exists("../requirements.txt"))
env = Environment.from_pip_requirements('GPT_text_to_code', '../requirements.txt')
env.register(ws)

True


{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210806.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "GPT_text_to_code",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-forg

In [26]:
[os.path.join('train', filename) for filename in os.listdir("../data/train")]

['train/python_train_0.jsonl.gz',
 'train/python_train_1.jsonl.gz',
 'train/python_train_10.jsonl.gz',
 'train/python_train_11.jsonl.gz',
 'train/python_train_12.jsonl.gz',
 'train/python_train_13.jsonl.gz',
 'train/python_train_2.jsonl.gz',
 'train/python_train_3.jsonl.gz',
 'train/python_train_4.jsonl.gz',
 'train/python_train_5.jsonl.gz',
 'train/python_train_6.jsonl.gz',
 'train/python_train_7.jsonl.gz',
 'train/python_train_8.jsonl.gz',
 'train/python_train_9.jsonl.gz']

In [27]:
from azureml.core import ComputeTarget

In [30]:
compute_target = ws.compute_targets["copilotgpt"]

Name,Workspace,State,Location,VmSize,Application URI,Docs
copilotgpt,mltorch-workspace,Running,westeurope,STANDARD_NC6,Jupyter JupyterLab RStudio,Doc


In [32]:
envs = Environment.list(workspace=ws)

for env in envs:
    if env.startswith("AzureML"):
        print("Name",env)

Name AzureML-Triton
Name AzureML-tensorflow-2.4-ubuntu18.04-py37-cuda11-gpu
Name AzureML-pytorch-1.7-ubuntu18.04-py37-cuda11-gpu
Name AzureML-pytorch-1.7-ubuntu18.04-py37-cpu-inference
Name AzureML-minimal-ubuntu18.04-py37-cpu-inference
Name AzureML-sklearn-0.24.1-ubuntu18.04-py37-cpu-inference
Name AzureML-tensorflow-2.4-ubuntu18.04-py37-cpu-inference
Name AzureML-onnxruntime-1.6-ubuntu18.04-py37-cpu-inference
Name AzureML-tensorflow-1.15-ubuntu18.04-py37-cpu-inference
Name AzureML-tensorflow-2.4-ubuntu18.04-py37-cuda11.0.3-gpu-inference
Name AzureML-pytorch-1.6-ubuntu18.04-py37-cpu-inference
Name AzureML-xgboost-0.9-ubuntu18.04-py37-cpu-inference
Name AzureML-PyTorch-1.3-CPU
Name AzureML-sklearn-0.24-ubuntu18.04-py37-cpu
Name AzureML-lightgbm-3.2-ubuntu18.04-py37-cpu
Name AzureML-pytorch-1.8-ubuntu18.04-py37-cuda11-gpu
Name AzureML-pytorch-1.10-ubuntu18.04-py38-cuda11-gpu
Name AzureML-pytorch-1.9-ubuntu18.04-py37-cuda11-gpu
Name AzureML-mlflow-ubuntu18.04-py37-cpu-inference
Name Azur

In [37]:

from azureml.core import Workspace, Datastore, Dataset, Environment, Experiment
from azureml.core.runconfig import RunConfiguration



ws = Workspace.from_config()
# datastore = Datastore.get(ws, 'aicopilot_datastore')
ws.datastores["aicoplit_datastore"]

{
  "name": "aicoplit_datastore",
  "container_name": "data",
  "account_name": "aicopilot",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

In [38]:
os.makedirs("./test/folder/file.py")

In [43]:
"{id}/folder/{output_file}".format(id="123", output_file="test.txt")

'123/folder/test.txt'

In [44]:
os.makedirs("./mytest/folder/file")

In [49]:
os.path.isdir("./mytest/folder/file")

True

In [11]:
!azcopy login

To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code CDF27G3PC to authenticate.

INFO: Logging in under the "Common" tenant. This will log the account in under its home tenant.
INFO: If you plan to use AzCopy with a B2B account (where the account's home tenant is separate from the tenant of the target storage account), please sign in under the target tenant with --tenant-id
INFO: azcopy: A newer version 10.13.0 is available to download

INFO: Login succeeded.
INFO: azcopy: A newer version 10.13.0 is available to download



In [13]:
!azcopy cp "https://aicopilot.blob.core.windows.net/pipelines/20211214091552?sp=racwdlmeop&st=2021-12-14T09:25:38Z&se=2021-12-14T17:25:38Z&spr=https&sv=2020-08-04&sr=d&sig=RVHhuvBRTi6qRDec8xp19JPQcymnJwQ%2F2H0wxP0E888%3D&sdd=1" "./pipeline_artifacts" --recursive


INFO: Scanning...
INFO: Any empty folders will not be processed, because source and/or destination doesn't have full folder support

Job 4b1bd5fb-7ca6-7c48-7545-7299f3449dce has started
Log file is located at: /home/azureuser/.azcopy/4b1bd5fb-7ca6-7c48-7545-7299f3449dce.log

INFO: azcopy: A newer version 10.13.0 is available to download

0.0 %, 0 Done, 0 Failed, 3 Pending, 0 Skipped, 3 Total, 


Job 4b1bd5fb-7ca6-7c48-7545-7299f3449dce summary
Elapsed Time (Minutes): 0.0334
Number of File Transfers: 3
Number of Folder Property Transfers: 0
Total Number of Transfers: 3
Number of Transfers Completed: 3
Number of Transfers Failed: 0
Number of Transfers Skipped: 0
TotalBytesTransferred: 65145666
Final Job Status: Completed



In [45]:
import sys
sys.path.append("../")
from trainer.task import TextToCodeDataset, Trainer, TrainerConfig
from trainer.preprocessing import Tokenizer
from trainer.model import GPT, GPT2Config
transform_dir = "/home/azureuser/cloudfiles/code/Users/data/pipeline_artifacts/20211214091552/transform"
serving_dir = "/home/azureuser/cloudfiles/code/Users/data/pipeline_artifacts/20211214091552/serving_dir"

tokenizer_filepath = os.path.join(transform_dir, 'transform_fn', 'tokenizer')
ckpt_path = os.path.join(serving_dir, 'GPT.pt')

tokenizer = Tokenizer.load_tokenizer(filepath=tokenizer_filepath)
sequence_length = 128
train_dataset = TextToCodeDataset(transform_dir, sequence_length)
val_dataset = TextToCodeDataset(transform_dir, sequence_length)
mconf = GPT2Config(
    tokenizer.vocab_size,
    embedding_size=768,
    n_heads=12,
    n_layers=3,
    max_sequence_length=sequence_length
)


In [46]:

from trainer.task import TextToCodeDataset, Trainer, TrainerConfig
from trainer.model import GPT, GPT2Config
model = GPT(mconf)
tconf = TrainerConfig(
    max_epochs=1,
    batch_size=16,
    learning_rate=6e-4,
    lr_decay=True,
    warmup_tokens=512*20,
    final_tokens=2*len(train_dataset)*sequence_length,
    num_workers=4,
    ckpt_path=ckpt_path
)
trainer = Trainer(model, train_dataset, val_dataset, tconf)
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

epoch 1 iter 23: train loss 0.48336. lr 3.564375e-04: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:08<00:00,  3.26it/s]

	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch 1 iter 23: train loss 0.48336. lr 3.564375e-04: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:09<00:00,  2.48it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [47]:
from utils import utils

In [50]:
import torch
tokenizer = Tokenizer.load_tokenizer(filepath=tokenizer_filepath)
context = 'def say_hello(name): """print hello to the person"""'
tokens_idx = tokenizer.encode(context)
print(tokens_idx)
x = torch.tensor(tokens_idx, dtype=torch.long)[None,...].to(trainer.device)
print(x.shape)
model(x)
# x = torch.tensor([train_dataset.char2idx[s] for s in context], dtype=torch.long)[None,...].to(trainer.device)
y = utils.sample(model, x, 250, temperature=1.0, sample=False, top_k=10)[0]
tokenizer.decode(y)
# completion = ''.join([train_dataset.idx2char[int(i)] for i in y])
# print(completion)

[332, 14179, 64, 7637, 9, 366, 317, 318, 1579, 16434, 336, 298, 10584, 305]
torch.Size([1, 14])


2021-12-14 13:03:03.956941: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/intel/compilers_and_libraries_2018.3.222/linux/mpi/intel64/lib:/opt/intel/compilers_and_libraries_2018.3.222/linux/mpi/mic/lib:/opt/intel/compilers_and_libraries_2018.3.222/linux/mpi/intel64/lib:/opt/intel/compilers_and_libraries_2018.3.222/linux/mpi/mic/lib::/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64/:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64/
2021-12-14 13:03:03.956992: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


'def say_hello(name): """print hello to the person"""<bos> \n    if not pages_settings.PAGE_CONTENT_CONTENT_REVISION:\n        return {\'revisions\': None}\n    revisions = Content.filter(page, language=lang,\n    return {\'revisions\': None}\n    return {\'revisions\': None}\n    return {\'revisions\': revisions = Content.objects.filter(page,\n    return {\'revisions\': revisions = Content.filter(page,\n    return {\'revisions\': revisions = Content.filter(page,\n    return {\'revisions\': revisions = Content.filter(page,\n    return {\'revisions\': revisions = Content.filter(page=lang,\n    return {\'revisions\': revisions[0:10]}<eos>\n    """\n    """\ndef do_videoplaceholder(parser, token):\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n    """\n'

In [52]:
model_path = os.path.join(serving_dir, "GPT.pt")

loaded_model = torch.load(model_path)

In [56]:
model_loaded = GPT(mconf)
model_loaded.load_state_dict(loaded_model)
model_loaded.eval()
model(x)

tensor([[[ 1.0045,  1.5744, -2.2895,  ..., -1.8127, -1.6139, -2.6518],
         [ 0.9025,  3.3443, -3.1073,  ..., -3.4499, -2.1364, -3.5163],
         [ 1.2823,  1.9669, -3.9924,  ..., -4.4982, -2.6209, -4.6499],
         ...,
         [ 0.7448,  0.9686, -2.4242,  ..., -3.8466, -1.5266, -3.8590],
         [ 0.5264,  2.1520, -2.9087,  ..., -3.6783, -2.6302, -3.1395],
         [14.1495,  1.5194, -2.7489,  ..., -2.8840, -1.8600, -3.7565]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward>)

In [21]:
x = torch.ones((1, 20), dtype=torch.long).to(trainer.device)
model(x)

RuntimeError: The size of tensor a (128) must match the size of tensor b (20) at non-singleton dimension 3