In [19]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import numpy as np
import os
import re
import string
import random

# Model definition

#### Implementation following [this example](https://keras.io/examples/generative/text_generation_with_miniature_gpt/#prepare-the-data-for-wordlevel-language-modelling)

In [20]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """
    Build the batched causal mask used by self attention.

    Entry (i, j) of the mask is set when `i >= j - n_src + n_dest`, i.e. the
    lower triangle of the (n_dest, n_src) dot-product matrix counted from the
    lower right corner, so a destination position never sees a later source
    position. The single mask is cast to `dtype` and tiled `batch_size` times
    along a new leading axis.
    """
    dest_idx = tf.range(n_dest)[:, None]
    src_idx = tf.range(n_src)
    allowed = dest_idx >= src_idx - n_src + n_dest
    single_mask = tf.reshape(tf.cast(allowed, dtype), [1, n_dest, n_src])
    # Tile multiples are [batch_size, 1, 1]; built with concat because
    # batch_size may be a scalar tensor rather than a Python int.
    multiples = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(single_mask, multiples)

In [77]:
class TransformerBlock(layers.Layer):
    """
    One transformer block: causally masked multi-head self attention followed
    by a two-layer feed-forward network. Each sub-layer is followed by
    dropout, a residual connection and layer normalization.

    Args:
        embed_dim: size of the feed-forward output layer; also passed as the
            second positional argument of `layers.MultiHeadAttention`
            (its `key_dim`, per the Keras API).
        num_heads: number of attention heads.
        ff_dim: number of units of the hidden (ReLU) feed-forward layer.
        rate: dropout rate applied after attention and after the feed-forward
            network.
        **kwargs: forwarded to `layers.Layer` (e.g. `name`, `dtype`,
            `trainable`) so that a config produced by `get_config` can be
            passed back through `from_config`.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        # Initialise the base layer first, before any attribute is set on self.
        super(TransformerBlock, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads, embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        """
        Apply the block to `inputs`.

        Batch size and sequence length are read from the first two dimensions
        of `inputs` (assumes a (batch, seq_len, embed_dim) tensor - the
        residual additions require the last dimension to match the block
        output).
        """
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        # Boolean mask so that each position only attends to itself and earlier positions.
        causal_mask = causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        attention_output = self.att(inputs, inputs, attention_mask=causal_mask)
        attention_output = self.dropout1(attention_output)
        out1 = self.layernorm1(inputs + attention_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super(TransformerBlock, self).get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild a block from a dict produced by `get_config`."""
        return cls(**config)

In [22]:
waka = TransformerBlock(64, 2, 32)

In [23]:
waka.layernorm1.get_config()

{'name': 'layer_normalization_2',
 'trainable': True,
 'dtype': 'float32',
 'axis': -1,
 'epsilon': 1e-06,
 'center': True,
 'scale': True,
 'beta_initializer': {'class_name': 'Zeros', 'config': {}},
 'gamma_initializer': {'class_name': 'Ones', 'config': {}},
 'beta_regularizer': None,
 'gamma_regularizer': None,
 'beta_constraint': None,
 'gamma_constraint': None}

In [81]:
class TokenAndPositionEmbedding(layers.Layer):
    """
    Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: number of rows of the position embedding table, i.e. the
            largest sequence length that can be embedded.
        vocab_size: number of rows of the token embedding table.
        embed_dim: size of each embedding vector.
        **kwargs: forwarded to `layers.Layer` (e.g. `name`, `dtype`,
            `trainable`) so that a config produced by `get_config` can be
            passed back through `from_config`.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super(TokenAndPositionEmbedding, self).__init__(**kwargs)
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """
        Embed the token ids in `x` and add the embedding of each position.

        Positions run from 0 to `tf.shape(x)[-1] - 1`; the position embeddings
        are added to the token embeddings (assumes `x` is (batch, seq_len), so
        the addition broadcasts over the batch axis).
        """
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super(TokenAndPositionEmbedding, self).get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from a dict produced by `get_config`."""
        return cls(**config)

In [75]:
vocab_size = 10000  # Vocabulary size (10k tokens); also the width of the output logits
maxlen = 100  # Max sequence length, in tokens
embed_dim = 128  # Embedding size for each token
num_heads = 2  # Number of attention heads
feed_forward_dim = 256  # Hidden layer size in feed forward network inside transformer
batch_size = 16  # Number of sequences per batch

def create_model():
    """
    Build and compile the single-block language model.

    The model reads `maxlen` int32 token ids and returns two outputs: the
    per-position logits over `vocab_size` tokens and the transformer block
    output. All sizes come from the module-level hyper-parameters.
    """
    token_ids = layers.Input(shape=(maxlen,), dtype=tf.int32)
    embedded = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(token_ids)
    hidden = TransformerBlock(embed_dim, num_heads, feed_forward_dim)(embedded)
    logits = layers.Dense(vocab_size)(hidden)
    model = keras.Model(inputs=token_ids, outputs=[logits, hidden])
    # Only the logits carry a loss; the second output (transformer block
    # embeddings) is exposed but not optimized directly.
    model.compile(
        "adam",
        loss=[tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), None],
    )
    return model

In [None]:
class KerasGPT():
    """
    Wrapper that builds and holds the compiled miniature GPT Keras model.

    The model is created from the module-level hyper-parameters (`maxlen`,
    `vocab_size`, `embed_dim`, `num_heads`, `feed_forward_dim`) and stored in
    `self.model`.
    """

    def __init__(self):
        # Build the compiled model once, at construction time.
        self.model = self.__create_model()

    def __create_model(self):
        """Build and compile the single-block model (same graph as `create_model`)."""
        inputs = layers.Input(shape=(maxlen,), dtype=tf.int32)
        embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        x = embedding_layer(inputs)
        transformer_block = TransformerBlock(embed_dim, num_heads, feed_forward_dim)
        x = transformer_block(x)
        outputs = layers.Dense(vocab_size)(x)
        model = keras.Model(inputs=inputs, outputs=[outputs, x])
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        model.compile(
            "adam", loss=[loss_fn, None],
        )  # No loss and optimization based on word embeddings from transformer block
        return model
    
    

# Test with IMDB dataset 

In [37]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  20.4M      0  0:00:03  0:00:03 --:--:-- 20.4M


In [45]:
# The dataset contains each review in a separate text file.
# The text files are spread over four different folders.
# Create a list of all files.
directories = [
    "aclImdb/train/pos",
    "aclImdb/train/neg",
    "aclImdb/test/pos",
    "aclImdb/test/neg",
]
# Comprehension variables do not leak into the notebook namespace, so the
# builtin `dir` is no longer shadowed for later cells.
filenames = [
    os.path.join(review_dir, fname)
    for review_dir in directories
    for fname in os.listdir(review_dir)
]

print(f"{len(filenames)} files")

# Create a dataset of text lines from the files, visiting the files in random order.
# Element-level shuffling and batching are done in later cells.
random.shuffle(filenames)
txt_line_ds = tf.data.TextLineDataset(filenames)

50000 files


In [96]:
txt_line_ds

<TextLineDatasetV2 shapes: (), types: tf.string>

In [104]:
shuffled_ds = txt_line_ds.shuffle(buffer_size=256)

In [105]:
shuffled_ds

<ShuffleDataset shapes: (), types: tf.string>

Explore shuffled dataset

In [106]:
for data in shuffled_ds.take(2):
    print()
    print('data: {}'.format(data))


data: b"I can't express enough just how bad this film was. First of all what a waste of some legendary stars although they are quite old and pretty unconvincing. Fred Astaire, well I guess he must have owed some one a big favor as this was his last film role. The script is a mess and the film seems terribly draggy. I imagine maybe if I saw this back when it came out (1981) I might have thought it was decent. However seeing so many actual good horror films, this was one of the worst. The only real convincing anything in this mess was the very young and lovely sort/of creepy Alice Krige. The main young character was trying to act the best he could but was utterly terrible. I wasn't sure how much of it was from his lack of skill or the lack of a comprehend-able script, but either way he was just plain bad. Don't watch unless you want to see a bunch of old guys be somewhat scared."

data: b'This is probably the worst movie I\'ve seen in a long time. Independent or not, solid writing is a 

Explore batched dataset

In [107]:
batch_size

16

In [108]:
# Group the shuffled text elements into batches of `batch_size`
batched_ds = shuffled_ds.batch(batch_size)

In [109]:
tf.data.experimental.cardinality(batched_ds).numpy()

-2

In [110]:
def custom_standardization(input_string):
    """
    Lowercase the text, replace html line-break tags ("<br />") with a space,
    and insert a space in front of every punctuation character matched by the
    `string.punctuation` character class.
    """
    without_breaks = tf.strings.regex_replace(
        tf.strings.lower(input_string), "<br />", " "
    )
    punctuation_pattern = f"([{string.punctuation}])"
    return tf.strings.regex_replace(without_breaks, punctuation_pattern, r" \1")

# Create the vectorization layer (it is adapted to the text in a later cell).
# Sequences are produced with maxlen + 1 tokens so that the inputs and the
# one-position-shifted labels built by prepare_lm_inputs_labels both have maxlen tokens.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size - 1,
    output_mode="int",
    output_sequence_length=maxlen + 1,
)

In [74]:
# NOTE(review): in the visible notebook `java_df` is first assigned further down
# (the pd.read_csv cell), so this cell depends on out-of-order execution.
java_df.code.values[0]

'protected final void fastPathOrderedEmit(U value, boolean delayError, Disposable disposable) {\n        final Observer<? super V> observer = downstream;\n        final SimplePlainQueue<U> q = queue;\n\n        if (wip.get() == 0 && wip.compareAndSet(0, 1)) {\n            if (q.isEmpty()) {\n                accept(observer, value);\n                if (leave(-1) == 0) {\n                    return;\n                }\n            } else {\n                q.offer(value);\n            }\n        } else {\n            q.offer(value);\n            if (!enter()) {\n                return;\n            }\n        }\n        QueueDrainHelper.drainLoop(q, observer, delayError, disposable, this);\n    }'

In [76]:
vectorize_layer(tf.expand_dims('protected final void fastPathOrderedEmit(U value, boolean delayError, Disposable disposable) {\n        final Observer<? super V> observer = downstream;\n        final SimplePlainQueue<U> q = queue;\n\n        if (wip.get() == 0 && wip.compareAndSet(0, 1)) {\n            if (q.isEmpty()) {\n                accept(observer, value);\n                if (leave(-1) == 0) {\n                    return;\n                }\n            } else {\n                q.offer(value);\n            }\n        } else {\n            q.offer(value);\n            if (!enter()) {\n                return;\n            }\n        }\n        QueueDrainHelper.drainLoop(q, observer, delayError, disposable, this);\n    }', -1))

<tf.Tensor: shape=(1, 101), dtype=int64, numpy=
array([[   1,  485, 7292,    1,    1, 1108,    4,    1,    1,    4,    1,
           1,   23,    1,  485,    1, 6241,   54, 1232, 4780, 3712,    1,
        2018,    1,  119,  485,    1,    1, 3712, 4899, 2018,    1,  119,
          53,    1,    1,  408,   23, 2018, 2018, 2915,  194,  194,    1,
           1,    1,    4,  398,   23,   23,    1,   53,    1,    1,  408,
          23,   23,    1, 1735,    1,    4, 1108,   23,  119,   53,    1,
         408, 6657,   23, 2018, 2018, 2915,   23,    1, 1074,  119, 4582,
        4582,  331,    1, 4899,    1,    1,   23,  119, 4582, 4582,  331,
           1, 4899,    1,    1,   23,  119,   53,  408,    1,  408,   23,
          23,    1]])>

In [77]:
len('protected final void fastPathOrderedEmit(U value, boolean delayError, Disposable disposable) {\n        final Observer<? super V> observer = downstream;\n        final SimplePlainQueue<U> q = queue;\n\n        if (wip.get() == 0 && wip.compareAndSet(0, 1)) {\n            if (q.isEmpty()) {\n                accept(observer, value);\n                if (leave(-1) == 0) {\n                    return;\n                }\n            } else {\n                q.offer(value);\n            }\n        } else {\n            q.offer(value);\n            if (!enter()) {\n                return;\n            }\n        }\n        QueueDrainHelper.drainLoop(q, observer, delayError, disposable, this);\n    }')

683

In [17]:
def prepare_lm_inputs_labels(text):
    """
    Tokenize `text` with `vectorize_layer` and build next-word training pairs.

    The inputs are every token but the last and the labels are every token but
    the first, so the label at position (i) is the word at position (i+1) of
    the tokenized sequence.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    inputs, targets = token_ids[:, :-1], token_ids[:, 1:]
    return inputs, targets

In [111]:
vectorize_layer.adapt(batched_ds)
vocab = vectorize_layer.get_vocabulary()  # To get words back from token indices

In [39]:
text_ds = batched_ds.map(prepare_lm_inputs_labels)

In [40]:
text_ds

<MapDataset shapes: ((None, 100), (None, 100)), types: (tf.int64, tf.int64)>

In [41]:
text_ds = text_ds.prefetch(tf.data.experimental.AUTOTUNE)

In [42]:
text_ds

<PrefetchDataset shapes: ((None, 100), (None, 100)), types: (tf.int64, tf.int64)>

In [43]:
for d, l in text_ds.take(2):
    print('data: {}, {}'.format(d, d.shape))
    print('label: {}, {}'.format(l, l.shape))
    print()

data: [[  65  797 1347 ... 1340 3329    4]
 [   1   15 1200 ...  898 4552  124]
 [4019  255   88 ...  517   51    4]
 ...
 [  12  399  165 ...   85   13   18]
 [  12 1031   13 ...  307  579   16]
 [  10  240   33 ... 1237    7    2]], (16, 100)
label: [[ 797 1347 2776 ... 3329    4   21]
 [  15 1200    9 ... 4552  124    3]
 [ 255   88  220 ...   51    4   30]
 ...
 [ 399  165  600 ...   13   18   17]
 [1031   13   18 ...  579   16 1876]
 [ 240   33    5 ...    7    2 2910]], (16, 100)

data: [[  13   18    9 ...    8   91    3]
 [  60   12   96 ...    1   13    8]
 [  12  997  116 ... 9568    6   73]
 ...
 [  96  135   47 ... 2164    4   21]
 [   2  273  778 ...    0    0    0]
 [  12   70  445 ...   20  355    3]], (16, 100)
label: [[  18    9   34 ...   91    3   20]
 [  12   96  218 ...   13    8  584]
 [ 997  116    5 ...    6   73   27]
 ...
 [ 135   47  303 ...    4   21   10]
 [ 273  778    7 ...    0    0    0]
 [  70  445   10 ...  355    3 7667]], (16, 100)



In [239]:
# NOTE(review): `text_ds` already had prefetch applied in an earlier cell; this repeats it.
text_ds = text_ds.prefetch(tf.data.experimental.AUTOTUNE)

In [82]:
model1  = create_model()

In [83]:
model1.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
token_and_position_embedding (None, 100, 128)          1292800   
_________________________________________________________________
transformer_block_10 (Transf (None, 100, 128)          198400    
_________________________________________________________________
dense_26 (Dense)             (None, 100, 10000)        1290000   
Total params: 2,781,200
Trainable params: 2,781,200
Non-trainable params: 0
_________________________________________________________________


In [85]:
model1.save('keras-gpt')



INFO:tensorflow:Assets written to: keras-gpt/assets


INFO:tensorflow:Assets written to: keras-gpt/assets


In [106]:
history_1 = model1.fit(text_ds, verbose=2, epochs=1)

3125/3125 - 45s - loss: 4.8067 - dense_14_loss: 4.8067


In [94]:
!nvidia-smi

Tue Mar 23 21:57:31 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.39       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  TITAN RTX           Off  | 00000000:02:00.0 Off |                  N/A |
| 41%   38C    P8    23W / 280W |  24154MiB / 24219MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [47]:
history_1

<tensorflow.python.keras.callbacks.History at 0x7f7998d8fb70>

# Test with code data (CodeSearchNet, Java)

In [32]:
import pandas as pd
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
from typing import Dict, Optional
from pathlib import Path

In [33]:
# export
def _split_input_target(mthd):
    input_text = mthd[:-1]
    target_text = mthd[1:]

    return input_text, target_text


def convert_df_to_tfds(
    df: pd.DataFrame, tokenizer: Tokenizer, max_length: int, batch_size: int
):
    """
    Tokenize the `code` column of `df` and turn it into a batched dataset of
    (input, target) pairs.

    Each method is prefixed with "<sos>" and encoded `batch_size` methods at a
    time. The resulting id lists are sliced into a `tf.data.Dataset`, split
    with `_split_input_target`, and batched with `drop_remainder=True`, so a
    final incomplete batch is discarded.

    `max_length` is accepted but not used here; the id lists are used exactly
    as the tokenizer returns them (presumably the tokenizer was already
    configured to pad/truncate to a fixed length - verify against caller).
    """
    methods = df.code.values
    token_id_rows = []
    for start in range(0, len(methods), batch_size):
        prefixed = [f"<sos>{x}" for x in methods[start : start + batch_size]]
        token_id_rows.extend(enc.ids for enc in tokenizer.encode_batch(prefixed))

    sliced = tf.data.Dataset.from_tensor_slices(token_id_rows)
    return sliced.map(_split_input_target).batch(batch_size, drop_remainder=True)

In [34]:
# export
def train_tokenizer(
    df: pd.DataFrame,
    spec_toks: Dict[str, str],
    max_length: int,
    n: Optional[int] = None,
    vocab_sz: Optional[int] = 10_000,
    min_freq: Optional[int] = 2,
    output: Optional[Path] = None,
) -> Tokenizer:
    """
    Train a ByteLevel BPE tokenizer on a given pandas dataframe. Code adapted from https://github.com/huggingface/tokenizers/tree/master/bindings/python.

    :param df: the pandas dataframe containing each method to have the tokenizer train on (read from its `code` column)
    :param spec_toks: dict of special tokens to add to the tokenizers so they do not get split; only the keys are used
    :param max_length: length every encoding is padded (with "<pad>") and truncated to
    :param n: the number of methods to evaluate. If none, the entire dataframe will be used
    :param vocab_sz: the maximum vocabulary size of the trained tokenizer. Defaulted was selected from: Big Code != Big Vocabulary: Open-Vocabulary Models for Source Code
    :param min_freq: the minimum frequency a token has to occur to be considered
    :param output: optional path the trained tokenizer is saved to; nothing is saved when None
    :returns: returns a trained ByteLevel BPE tokenizer
    """
    import tempfile  # local import: only needed for the scratch training file

    if n is None:
        n = len(df)

    # initialize a tokenizer
    tokenizer = Tokenizer(models.BPE())

    # customize pre-tokenization and decoding
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_sz,
        min_frequency=min_freq,
        special_tokens=["<pad>", "<sos>", "<eos>"] + list(spec_toks.keys()),
    )

    # The trainer reads its corpus from a file, so dump the methods into a
    # private temporary directory that is removed once training is done
    # (instead of a fixed, shared /tmp/tmp_tokenize.txt that was never deleted).
    with tempfile.TemporaryDirectory() as tmp_dir:
        corpus_path = Path(tmp_dir) / "tmp_tokenize.txt"
        # Explicit encoding so non-ASCII source code does not depend on the locale.
        with open(corpus_path, "w", encoding="utf-8") as f:
            f.write("\n".join(df.code.values[:n]))

        # NOTE(review): argument order (trainer, files) is kept from the original
        # call; confirm it matches the installed `tokenizers` version.
        tokenizer.train(trainer, [str(corpus_path)])

    tokenizer.enable_padding(length=max_length, pad_token="<pad>")
    tokenizer.enable_truncation(max_length)

    # save tokenizer if output path given; the path is passed as a plain string
    if output is not None:
        tokenizer.save(str(output), pretty=True)

    return tokenizer

In [191]:
!ls /tf/main/dvc-icodegen/data

clean_java.csv	codesearchnet_java.dvc


In [35]:
java_df = pd.read_csv('/tf/main/dvc-icodegen/data/clean_java.csv')

In [36]:
# Fixed seed so the same 1000 methods are drawn on every run of the notebook
java_samples = java_df.sample(1000, random_state=42)

In [37]:
# export
# Dictionaries of special tokens that are added to the tokenizer so they do not get split.
# Each key is a `<...>` placeholder string and each value is the text it stands for.

# Placeholder for the newline character
extra_tokens = {"<n>": "\n"}

# Java reserved words, keyed by their `<keyword>` placeholder.
# from https://docs.oracle.com/javase/tutorial/java/nutsandbolts/_keywords.html
java_reserved_tokens = {
    "<abstract>": "abstract",
    "<assert>": "assert",
    "<boolean>": "boolean",
    "<break>": "break",
    "<byte>": "byte",
    "<case>": "case",
    "<catch>": "catch",
    "<char>": "char",
    "<class>": "class",
    "<const>": "const",
    "<continue>": "continue",
    "<default>": "default",
    "<do>": "do",
    "<double>": "double",
    "<else>": "else",
    "<enum>": "enum",
    "<extends>": "extends",
    "<final>": "final",
    "<finally>": "finally",
    "<float>": "float",
    "<for>": "for",
    "<goto>": "goto",
    "<if>": "if",
    "<implements>": "implements",
    "<import>": "import",
    "<instanceof>": "instanceof",
    "<int>": "int",
    "<interface>": "interface",
    "<long>": "long",
    "<native>": "native",
    "<new>": "new",
    "<package>": "package",
    "<private>": "private",
    "<protected>": "protected",
    "<public>": "public",
    "<return>": "return",
    "<short>": "short",
    "<static>": "static",
    "<strictfp>": "strictfp",
    "<super>": "super",
    "<switch>": "switch",
    "<synchronized>": "synchronized",
    "<this>": "this",
    "<throw>": "throw",
    "<throws>": "throws",
    "<transient>": "transient",
    "<try>": "try",
    "<void>": "void",
    "<volatile>": "volatile",
    "<while>": "while",
}

# Java operators, keyed by a `<...>` placeholder. Operators containing `<` or `>`
# use spelled-out placeholder names (e.g. "<greater_equal>") instead of the symbol.
# from https://docs.oracle.com/javase/tutorial/java/nutsandbolts/opsummary.html
java_operator_tokens = {
    "<=>": "=",
    "<+>": "+",
    "<->": "-",
    "<*>": "*",
    "</>": "/",
    "<%>": "%",
    "<++>": "++",
    "<-->": "--",
    "<!>": "!",
    "<==>": "==",
    "<!=>": "!=",
    "<greater>": ">",
    "<greater_equal>": ">=",
    "<lesser>": "<",
    "<lesser_equal>": "<=",
    "<&&>": "&&",
    "<||>": "||",
    "<?>": "?",
    "<:>": ":",
    "<~>": "~",
    "<double_lesser>": "<<",
    "<double_greater>": ">>",
    "<triple_greater>": ">>>",
    "<&>": "&",
    "<^>": "^",
    "<|>": "|",
}

# Brackets, braces, parentheses and the statement terminator.
# "<lesser>" and "<greater>" repeat keys of java_operator_tokens with the same
# values, so they appear only once after the dictionaries are merged.
java_structural_tokens = {
    "<{>": "{",
    "<}>": "}",
    "<[>": "[",
    "<]>": "]",
    "<lesser>": "<",
    "<greater>": ">",
    "<(>": "(",
    "<)>": ")",
    "<;>": ";",
}

# Additional Java tokens: `@`, `...`, and the literals null / true / false
java_extra_tokens = {
    "<@>": "@",
    "<...>": "...",
    "<null>": "null",
    "<true>": "true",
    "<false>": "false",
}

# Combination of all dictionaries. On duplicate keys the later dictionary wins;
# the only duplicates ("<lesser>", "<greater>") map to identical values.
java_special_tokens = {
    **java_reserved_tokens,
    **java_operator_tokens,
    **java_structural_tokens,
    **java_extra_tokens,
    **extra_tokens,
}

In [47]:
# Encodings are padded/truncated to maxlen + 1 tokens so that the shifted
# input/target pairs each have maxlen tokens. Keyword arguments are used because
# the fourth positional parameter of train_tokenizer is `n` (number of methods),
# not the vocabulary size; `n` is left at its default so all samples are used.
tokenizer = train_tokenizer(
    java_samples, java_special_tokens, max_length=maxlen + 1, vocab_sz=vocab_size
)

In [48]:
tokenizer.get_vocab_size()

10000

In [49]:
type(tokenizer)

tokenizers.Tokenizer

In [41]:
java_samples

Unnamed: 0.1,Unnamed: 0,code,code_len,code_tokens,cyclomatic_complexity,data_type,method_name,nloc,parameter_count,partition,token_count
451956,773,"@XmlElementDecl(namespace = ""http://www.opengi...",49.0,"['@', 'XmlElementDecl', '(', 'namespace', '=',...",1.0,src,create_CityObject,3.0,1.0,train,25.0
413914,22718,@Override\n\tpublic void eUnset(int featureID)...,63.0,"['@', 'Override', 'public', 'void', 'eUnset', ...",4.0,src,eUnset,14.0,1.0,train,59.0
71800,14923,public static int[] sort(double[] arr) {\n ...,62.0,"['public', 'static', 'int', '[', ']', 'sort', ...",2.0,src,sort,8.0,1.0,train,57.0
221518,14717,public static Integer positiveInteger(String v...,28.0,"['public', 'static', 'Integer', 'positiveInteg...",1.0,src,positiveInteger,5.0,1.0,train,25.0
5689,5692,public static String getAvroSchemaFromHeader(R...,65.0,"['public', 'static', 'String', 'getAvroSchemaF...",3.0,src,getAvroSchemaFromHeader,7.0,2.0,test,62.0
...,...,...,...,...,...,...,...,...,...,...,...
469486,18308,"public void fatal(String loggerFqcn, Object me...",35.0,"['public', 'void', 'fatal', '(', 'String', 'lo...",1.0,src,fatal,3.0,4.0,train,33.0
437083,15891,"@SuppressWarnings(""fallthrough"")\r\n public...",191.0,"['@', 'SuppressWarnings', '(', '""fallthrough""'...",12.0,src,log2,33.0,2.0,train,182.0
250651,9409,public void marshall(ListPolicyAttachmentsRequ...,117.0,"['public', 'void', 'marshall', '(', 'ListPolic...",3.0,src,marshall,14.0,2.0,train,115.0
273917,2680,public static FileHandler getFileHandler(Strin...,85.0,"['public', 'static', 'FileHandler', 'getFileHa...",1.0,src,getFileHandler,11.0,5.0,train,82.0


In [50]:
batch_size

16

In [51]:
code_dataset = convert_df_to_tfds(java_samples, tokenizer, maxlen, batch_size)

In [52]:
for d, l in code_dataset:
    print('data: {}, shape: {}'.format(d, tf.shape(d)))
    print('label: {}, shape: {}'.format(l, l.shape))

data: [[   1  123 5446 ...    0    0    0]
 [   1  123  651 ...  558  118  222]
 [   1  391  557 ...    0    0    0]
 ...
 [   1  123  651 ...    0    0    0]
 [   1 1143  486 ... 2359  521  100]
 [   1  123  651 ...    0    0    0]], shape: [ 16 100]
label: [[ 123 5446   99 ...    0    0    0]
 [ 123  651  222 ...  118  222  221]
 [ 391  557  443 ...    0    0    0]
 ...
 [ 123  651  222 ...    0    0    0]
 [1143  486 6106 ...  521  100  269]
 [ 123  651  222 ...    0    0    0]], shape: (16, 100)
data: [[   1  391  557 ...    0    0    0]
 [   1  391  557 ...    0    0    0]
 [   1  391  486 ...  922 6127  792]
 ...
 [   1  391  547 ...    0    0    0]
 [   1  123  651 ...    0    0    0]
 [   1 1143  966 ...  714  119 8643]], shape: [ 16 100]
label: [[ 391  557  378 ...    0    0    0]
 [ 391  557  378 ...    0    0    0]
 [ 391  486 8648 ... 6127  792  308]
 ...
 [ 391  547  143 ...    0    0    0]
 [ 123  651  222 ...    0    0    0]
 [1143  966  119 ...  119 8643  916]], shape: 

In [53]:
code_dataset

<BatchDataset shapes: ((16, 100), (16, 100)), types: (tf.int32, tf.int32)>

In [73]:
# NOTE(review): in the visible notebook `tokenized_mthds` is only assigned as a local
# inside convert_df_to_tfds; this cell relies on a leftover kernel variable.
len(tokenized_mthds[0])

100

In [118]:
model2 = create_model()

In [119]:
model2.summary()

Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           [(None, 100)]        0                                            
__________________________________________________________________________________________________
token_and_position_embedding_9  (None, 100, 128)     1292800     input_10[0][0]                   
__________________________________________________________________________________________________
transformer_block_9 (Transforme (None, 100, 128)     198400      token_and_position_embedding_9[0]
                                                                 transformer_block_9[0][0]        
__________________________________________________________________________________________________
dense_28 (Dense)                (None, 100, 10000)   1290000     transformer_block_9[1][0]  

In [109]:
history2 = model2.fit(code_dataset, verbose=2, epochs=1)

62/62 - 2s - loss: 6.3482 - dense_17_loss: 6.3482


In [123]:
[tokenizer.encode("<sos>").ids[0]]

[1]

In [126]:
txt_generated = model2.predict([tokenizer.encode("<sos>").ids[0]])

In [130]:
txt_generated

[array([[[-0.09153081, -0.06372187, -0.03998925, ..., -0.11863801,
          -0.22484449, -0.24193606]]], dtype=float32),
 array([[[-3.566873  , -0.01605946,  0.23996967, -0.98918015,
          -0.22023566, -1.059598  ,  1.3192903 , -0.46103477,
           0.42648992,  0.29907927, -0.36135545,  0.43568125,
           0.07291967, -0.8800807 , -1.1365008 ,  0.1792643 ,
          -0.9263537 , -0.3062601 , -0.4869356 , -0.9711441 ,
          -1.0377696 ,  1.4595013 , -2.1495712 , -1.1764786 ,
           0.72031623,  2.7252567 ,  0.21728685,  0.5430023 ,
          -0.6836771 ,  0.16995078,  1.5179352 , -0.2601426 ,
          -0.10348627, -0.11060691,  0.13765965,  0.5134665 ,
          -0.75359607, -0.91243124,  0.15522692, -1.8208091 ,
          -1.2667186 ,  0.79197353, -0.41856018,  1.4580827 ,
          -0.56720334,  1.2071506 , -0.05822236, -2.719151  ,
           0.18627709,  0.3478465 ,  0.39300972,  0.77991533,
          -1.7189705 ,  0.7329556 ,  0.45658857, -0.8943776 ,
          

In [99]:
!nvidia-smi

Tue Mar 23 22:08:18 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.39       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  TITAN RTX           Off  | 00000000:02:00.0 Off |                  N/A |
| 41%   37C    P8    23W / 280W |  24154MiB / 24219MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [135]:
def create_big_model():
    """
    Build and compile a language model with two stacked transformer blocks.

    The model reads `maxlen` int32 token ids and returns three outputs, in
    this order: the per-position logits over `vocab_size` tokens, the output
    of the second transformer block, and the output of the first one.
    """
    token_ids = layers.Input(shape=(maxlen,), dtype=tf.int32)
    embedded = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(token_ids)
    first_block = TransformerBlock(embed_dim, num_heads, feed_forward_dim)
    second_block = TransformerBlock(embed_dim, num_heads, feed_forward_dim)
    hidden_first = first_block(embedded)
    hidden_second = second_block(hidden_first)
    logits = layers.Dense(vocab_size)(hidden_second)
    model = keras.Model(
        inputs=token_ids, outputs=[logits, hidden_second, hidden_first]
    )
    # Only the logits carry a loss; the two transformer block outputs are
    # exposed but not optimized directly.
    model.compile(
        "adam",
        loss=[
            tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            None,
            None,
        ],
    )
    return model

In [138]:
model1.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
token_and_position_embedding (None, 100, 128)          1292800   
_________________________________________________________________
transformer_block_4 (Transfo (None, 100, 128)          198400    
_________________________________________________________________
dense_14 (Dense)             (None, 100, 10000)        1290000   
Total params: 2,781,200
Trainable params: 2,781,200
Non-trainable params: 0
_________________________________________________________________


In [136]:
big_model = create_big_model()

In [137]:
big_model.summary()

Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        [(None, 100)]             0         
_________________________________________________________________
token_and_position_embedding (None, 100, 128)          1292800   
_________________________________________________________________
transformer_block_12 (Transf (None, 100, 128)          198400    
_________________________________________________________________
transformer_block_13 (Transf (None, 100, 128)          198400    
_________________________________________________________________
dense_37 (Dense)             (None, 100, 10000)        1290000   
Total params: 2,979,600
Trainable params: 2,979,600
Non-trainable params: 0
_________________________________________________________________


In [139]:
history3 = big_model.fit(code_dataset, verbose=2, epochs=1)

62/62 - 3s - loss: 6.2865 - dense_37_loss: 6.2865


In [151]:
big_model._name

'GPT2_keras'

In [150]:
big_model._name = "GPT2_keras"

In [142]:
help(big_model)

Help on Functional in module tensorflow.python.keras.engine.functional object:

class Functional(tensorflow.python.keras.engine.training.Model)
 |  A `Functional` model is a `Model` defined as a directed graph of layers.
 |  
 |  Three types of `Model` exist: subclassed `Model`, `Functional` model,
 |  and `Sequential` (a special case of `Functional`).
 |  In general, more Keras features are supported with `Functional`
 |  than with subclassed `Model`s, specifically:
 |  
 |  - Model cloning (`keras.models.clone`)
 |  - Serialization (`model.get_config()/from_config`, `model.to_json()/to_yaml()`
 |  - Whole-model saving (`model.save()`)
 |  
 |  A `Functional` model can be instantiated by passing two arguments to
 |  `__init__`. The first argument is the `keras.Input` Tensors that represent
 |  the inputs to the model. The second argument specifies the output
 |  tensors that represent the outputs of this model. Both arguments can be a
 |  nested structure of tensors.
 |  
 |  Example:

In [152]:
# Export the whole model to the 'keras-gpt' directory (the log reports assets
# written under keras-gpt/assets).
# NOTE(review): a later cell calls model1.save('keras-gpt') with the same path,
# so one export replaces the other -- confirm which model should live there.
big_model.save('keras-gpt')



INFO:tensorflow:Assets written to: keras-gpt/assets


INFO:tensorflow:Assets written to: keras-gpt/assets


In [55]:
def create_deep_model(n_transformer_blocks):
    """
    Build and compile a GPT-style model with a stack of transformer blocks.

    The token-and-position embedding of the input ids is passed through
    `n_transformer_blocks` TransformerBlock layers in sequence, and a Dense
    layer maps the final hidden states to `vocab_size` logits per position.

    Args:
        n_transformer_blocks: Number of stacked TransformerBlock layers.
            Must be >= 0; with 0 the Dense head is applied directly to the
            embedding output.

    Returns:
        A compiled keras.Model whose outputs are
        `[logits, block_1_output, ..., block_n_output]`. Only the logits carry
        a loss (sparse categorical cross-entropy on logits); the per-block
        outputs are exposed but have no loss attached.

    Raises:
        ValueError: If `n_transformer_blocks` is negative.
    """
    if n_transformer_blocks < 0:
        raise ValueError(
            "n_transformer_blocks must be >= 0, got {}".format(n_transformer_blocks)
        )

    inputs = layers.Input(shape=(maxlen,), dtype=tf.int32)
    x = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(inputs)

    # Create and apply each block in one pass, remembering every intermediate
    # output so it can be exposed as an extra model output.
    block_outputs = []
    for _ in range(n_transformer_blocks):
        x = TransformerBlock(embed_dim, num_heads, feed_forward_dim)(x)
        block_outputs.append(x)

    # `x` is the last block's output (or the embedding when there are no
    # blocks), so the head no longer indexes into a possibly empty list.
    logits = layers.Dense(vocab_size)(x)
    model = keras.Model(inputs=inputs, outputs=[logits, *block_outputs])

    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # No loss and optimization based on the hidden states from the transformer
    # blocks: one `None` per block output.
    model.compile(
        "adam", loss=[loss_fn, *[None] * n_transformer_blocks],
    )
    return model

In [28]:
# Baseline model from create_model(); the summary that follows lists a single
# transformer block between the embedding and the output Dense layer.
mini_model = create_model()

In [29]:
mini_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
token_and_position_embedding (None, 100, 128)          1292800   
_________________________________________________________________
transformer_block_4 (Transfo (None, 100, 128)          198400    
_________________________________________________________________
dense_11 (Dense)             (None, 100, 10000)        1290000   
Total params: 2,781,200
Trainable params: 2,781,200
Non-trainable params: 0
_________________________________________________________________


In [57]:
# Deep variant with three stacked transformer blocks (see the summary that follows).
multi_transformer = create_deep_model(3)

In [58]:
multi_transformer.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
token_and_position_embedding (None, 100, 128)          1292800   
_________________________________________________________________
transformer_block_7 (Transfo (None, 100, 128)          198400    
_________________________________________________________________
transformer_block_8 (Transfo (None, 100, 128)          198400    
_________________________________________________________________
transformer_block_9 (Transfo (None, 100, 128)          198400    
_________________________________________________________________
dense_23 (Dense)             (None, 100, 10000)        1290000   
Total params: 3,178,000
Trainable params: 3,178,000
Non-trainable params: 0
_________________________________________________

In [285]:
# Export the three-block model to the 'deep_gpt' directory.
# NOTE(review): the execution count (In [285]) is far later than the
# neighbouring cells, so this presumably ran after training -- confirm the cell
# order before relying on Restart & Run All.
multi_transformer.save('deep_gpt')



INFO:tensorflow:Assets written to: deep_gpt/assets


INFO:tensorflow:Assets written to: deep_gpt/assets


In [60]:
code_dataset

<BatchDataset shapes: ((16, 100), (16, 100)), types: (tf.int32, tf.int32)>

In [59]:
# Train the three-block model for one epoch. The log reports only the Dense
# head's loss because create_deep_model assigns a loss of None to every
# transformer-block output.
history4 = multi_transformer.fit(code_dataset, verbose=2, epochs=1)

62/62 - 3s - loss: 6.2724 - dense_23_loss: 6.2724


In [61]:
multi_transformer._name

'model_3'

In [121]:
def sample_from(logits, k=10):
    """
    Draw one token id at random from the `k` highest-scoring logits.

    The top-k scores are converted to probabilities with softmax and a single
    vocabulary index is sampled according to those probabilities.

    Args:
        logits: Scores over the vocabulary for one sequence position
            (the notebook passes a 1-D slice such as `y[0][sample_index]`).
        k: Number of top-scoring candidates to sample among.

    Returns:
        The sampled vocabulary index (one of the top-k indices).
    """
    top_scores, top_ids = tf.math.top_k(logits, k=k, sorted=True)
    candidate_ids = np.asarray(top_ids).astype("int32")

    # Softmax is applied with a temporary leading batch axis, which is
    # stripped again right away.
    batched_scores = tf.expand_dims(top_scores, 0)
    probabilities = keras.activations.softmax(batched_scores)[0]
    probabilities = np.asarray(probabilities).astype("float32")

    return np.random.choice(candidate_ids, p=probabilities)

In [122]:
def sample(max_len, start_tokens, model=None):
    """
    Autoregressively generate `max_len` tokens that continue `start_tokens`.

    On every step the most recent `maxlen` token ids are right-padded with 0
    to the model's fixed input length, the model is run, and the next token is
    drawn with `sample_from` from the logits at the last real (non-padding)
    position.

    Args:
        max_len: Number of tokens to generate.
        start_tokens: Non-empty list of prompt token ids. It is extended
            in place with every generated token (kept for backward
            compatibility: this used to be the only way to get the result).
        model: Model to sample from; its `predict` must return the logits as
            the first of several outputs. Defaults to the notebook-level
            `model1`.

    Returns:
        The list of generated token ids, in generation order.

    Raises:
        ValueError: If `start_tokens` is empty (there is no position to
            sample the next token from).
    """
    if not start_tokens:
        raise ValueError("start_tokens must contain at least one token id")
    if model is None:
        model = model1  # notebook-level default

    tokens_generated = []
    # `<` rather than `<=`: generate exactly `max_len` tokens, not one extra.
    while len(tokens_generated) < max_len:
        # Keep the *latest* `maxlen` tokens. Slicing from the front would
        # freeze the context once the prompt outgrows the window, so newly
        # generated tokens would never influence later samples.
        context = start_tokens[-maxlen:]
        sample_index = len(context) - 1
        pad_len = maxlen - len(context)
        x = np.array([context + [0] * pad_len])

        model_output = model.predict(x)
        y = model_output[0]  # first output holds the logits
        sample_token = sample_from(y[0][sample_index])

        tokens_generated.append(sample_token)
        start_tokens.append(sample_token)
    return tokens_generated

In [204]:
# Invert the vocabulary list into a word -> id lookup table.
word_to_index = {word: index for index, word in enumerate(vocab)}

# Split the prompt on whitespace and map each word to its id;
# words missing from the vocabulary fall back to id 1.
start_prompt = "this movie is"
start_tokens = [word_to_index.get(token, 1) for token in start_prompt.split()]
num_tokens_generated = 10

In [205]:
start_tokens

[13, 18, 9]

In [206]:
# Rebind start_tokens to a fresh shallow copy of itself.
start_tokens = list(start_tokens)

In [207]:
start_tokens

[13, 18, 9]

In [209]:
# Number of padding ids needed to bring the prompt up to the fixed length maxlen.
pad_len = maxlen - len(start_tokens)
# Index of the last prompt token; later cells sample the next token from the
# logits at this position.
sample_index = len(start_tokens) - 1

In [210]:
# Only a cell's last expression is echoed, so the output shown is sample_index;
# pad_len is evaluated but never displayed.
pad_len
sample_index

2

In [227]:
# Right-pad the prompt ids with 0 so the sequence has exactly maxlen entries.
x = start_tokens + [0] * pad_len

In [228]:
# Wrap the sequence in an outer list to add the batch axis (shape (1, 100) in
# the check that follows).
# NOTE(review): not idempotent -- re-running this cell wraps `x` again and adds
# another leading axis.
x = np.array([x])

In [229]:
x.shape

(1, 100)

In [214]:
# model1.predict returns two outputs; keep the first (the logits) and discard
# the second.
# NOTE(review): assigning to `_` shadows IPython's "last output" variable.
y, _ = model1.predict(x)

In [216]:
y.shape

(1, 100, 10000)

In [217]:
y[0][sample_index]

array([ 0.13938266, -0.14569119, -0.24544635, ...,  0.28506798,
        0.00394009, -0.00910913], dtype=float32)

In [222]:
# Draw the next token id from the logits at the last prompt position.
sample_token = sample_from(y[0][sample_index])

In [223]:
sample_token

6149

In [278]:
# Generate tokens one at a time, appending each sampled token to the prompt so
# it conditions the next prediction.
num_tokens_generated = 0
tokens_generated = []
# NOTE: the `<=` bound makes this loop produce 11 tokens, which is what the
# recorded output of `tokens_generated` shows.
while num_tokens_generated <= 10:
    # Condition on the most recent `maxlen` tokens. Taking the first `maxlen`
    # tokens instead would freeze the context once the prompt outgrows the
    # window, so newly generated tokens would stop influencing the samples.
    context = start_tokens[-maxlen:]
    sample_index = len(context) - 1   # last real (non-padding) position
    pad_len = maxlen - len(context)   # 0 once the window is full
    x = np.array([context + [0] * pad_len])
    y, _ = model1.predict(x)
    sample_token = sample_from(y[0][sample_index])
    tokens_generated.append(sample_token)
    start_tokens.append(sample_token)
    num_tokens_generated = len(tokens_generated)

In [279]:
tokens_generated

[4738, 6041, 9421, 2457, 6698, 3614, 1666, 1224, 3204, 3167, 1767]

In [224]:
# Encode the "<sos>" marker on its own and keep the token ids; the shape check
# that follows shows the encoding holds 101 ids.
inputs = tokenizer.encode("<sos>").ids

In [271]:
tf.expand_dims([inputs], 0).shape

TensorShape([1, 1, 101])

In [233]:
# Drop the final id so the 101-id encoding fits the model's 100-token input,
# and wrap it in a list to add the batch axis (shape (1, 100)).
x_ = np.array([inputs[:-1]])

In [272]:
x_.shape

(1, 100)

In [235]:
# Run model1 on the single-sequence batch; the result is indexed as two
# separate outputs in the following cell.
result = model1.predict(x_)

In [239]:
# Split the prediction into the logits (first output) and the second model
# output (presumably the transformer block's hidden states -- confirm against
# create_model).
y_, output_x = result[0], result[1]

In [266]:
# Sample a token id from the logits at sequence position 1.
# NOTE(review): other cells sample at `len(prompt) - 1`; confirm that the
# hard-coded position 1 is the intended one for the "<sos>" prompt.
sample_token_ = sample_from(y_[0][1])

In [267]:
sample_token_

1126

In [274]:
# Encode the text prompt with the tokenizer and keep the token ids; the length
# check that follows reports 101 ids.
encoded_input = tokenizer.encode(start_prompt).ids

In [280]:
len(encoded_input)

101

In [281]:
# model1 expects a (batch, maxlen) integer array. `encoded_input` is a flat
# Python list of 101 ids, so trim it to the model's input length and wrap it in
# a batch axis; passing the bare list would treat each id as its own sample.
# NOTE: this rebinds the notebook-level names `y` and `x`.
y, x = model1.predict(np.array([encoded_input[:maxlen]]))

In [282]:
model1.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
token_and_position_embedding (None, 100, 128)          1292800   
_________________________________________________________________
transformer_block_10 (Transf (None, 100, 128)          198400    
_________________________________________________________________
dense_26 (Dense)             (None, 100, 10000)        1290000   
Total params: 2,781,200
Trainable params: 2,781,200
Non-trainable params: 0
_________________________________________________________________


In [284]:
# Export model1 to the 'keras-gpt' directory.
# NOTE(review): this is the same path big_model.save('keras-gpt') writes to
# earlier in the notebook, so one export replaces the other -- confirm which
# model should be stored here, or give each export its own directory.
model1.save('keras-gpt')



INFO:tensorflow:Assets written to: keras-gpt/assets


INFO:tensorflow:Assets written to: keras-gpt/assets
