# Baseline Model Training

* Custom Tokenizer
* GRU Encoder / Decoder

## Imports

In [1]:
import os
import io
import sys


import pandas as pd 
import numpy as np 

sys.path.append('D:\PROJECT\Level-4-Project')
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"

In [2]:
from code2text.models.baseline.model import seq2seqTrain, MaskedLoss
from code2text.helper.model import BatchLogs
from code2text.helper.preprocess import tf_lower_and_split_punct

In [None]:
import ijson

In [3]:
import tensorflow as tf
import tensorflow_text as text

In [4]:
#tf.debugging.set_log_device_placement(True)
tf.config.experimental.set_memory_growth(tf.config.experimental.list_physical_devices('GPU')[0], True)
#tf.config.experimental.set_virtual_device_configuration(tf.config.experimental.list_physical_devices('GPU')[0],[tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)])

In [4]:
tf.config.run_functions_eagerly(False)

## Preprocessing

In [None]:
path = "D:\PROJECT\data\CodeSearchNet"
langs = ["go", "java", "python", "javascript", "ruby", "php"]
format = ["train.jsonl", "test.jsonl", "valid.jsonl"]

In [None]:
def read_data(path):
    with open(path, encoding="UTF-8") as json_file:
        cursor = 0
        code_set = []
        string_set = []
        lang_set = []
        for _, line in enumerate(json_file):
            code = None
            string = None
            line_as_file = io.StringIO(line)
            json_parser = ijson.parse(line_as_file)

            for prefix, _, value in json_parser:
                if prefix == "code":  
                    code = value
                if prefix == "docstring":
                    string = value
                if prefix == "language":
                    lang = value
            if (code is not None and string is not None):
                code_set.append(code)
                string_set.append(string)
                lang_set.append(lang)

            cursor += len(line)
    return pd.DataFrame(data={'code': code_set, 'docstring': string_set, 'language': lang_set})

In [None]:
train = []
test = []
valid = []

for lang in langs:
    tmp = os.path.join(path, lang)
    for file in format:
        print("Processing ", lang, " ", file, "... ")
        data = read_data(os.path.join(tmp, file))
        if file == "train.jsonl":
            train.append(data)
        if file == "test.jsonl":
            test.append(data)
        if file == "valid.jsonl":
            valid.append(data)

In [None]:
train = pd.concat(train)
test = pd.concat(test)
valid = pd.concat(valid)

In [None]:
data = pd.concat([train, test, valid])

In [None]:
train.reset_index(inplace=True)
train.to_json("D:\PROJECT\data\CodeSearchNet\Combine_clean\\train.json")
test.reset_index(inplace=True)
test.to_json("D:\PROJECT\data\CodeSearchNet\Combine_clean\\test.json")
valid.reset_index(inplace=True)
valid.to_json("D:\PROJECT\data\CodeSearchNet\Combine_clean\\valid.json")

In [None]:
data.reset_index(inplace=True)
data.to_json("D:\PROJECT\data\CodeSearchNet\Combine_clean\data.json")

## Dataset Initialization

In [None]:
data = pd.read_json("D:\PROJECT\data\CodeSearchNet\Combine_clean\data.json")

In [5]:
train = pd.read_json("D:\PROJECT\data\CodeSearchNet\Combine_clean\\train.json")
valid = pd.read_json("D:\PROJECT\data\CodeSearchNet\Combine_clean\\valid.json")
test = pd.read_json("D:\PROJECT\data\CodeSearchNet\Combine_clean\\test.json")

In [6]:
batch_size = 32
buffer = 1024

In [None]:
dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(data["code"].values, tf.string),
            tf.cast(data["docstring"].values, tf.string)
        )
    )
).shuffle(buffer).batch(batch_size)

In [7]:
with tf.device('/CPU:0'):
    train_set = (
        tf.data.Dataset.from_tensor_slices(
            (
                tf.cast(train["code"].values, tf.string),
                tf.cast(train["docstring"].values, tf.string)
            )
        )
    ).shuffle(buffer).batch(batch_size, drop_remainder=True).cache().prefetch(tf.data.AUTOTUNE)
    test_set = (
        tf.data.Dataset.from_tensor_slices(
            (
                tf.cast(test["code"].values, tf.string),
                tf.cast(test["docstring"].values, tf.string)
            )
        )
    ).shuffle(buffer).batch(batch_size, drop_remainder=True).cache().prefetch(tf.data.AUTOTUNE)
    valid_set = (
        tf.data.Dataset.from_tensor_slices(
            (
                tf.cast(valid["code"].values, tf.string),
                tf.cast(valid["docstring"].values, tf.string)
            )  
        )
    ).shuffle(buffer).batch(batch_size, drop_remainder=True).cache().prefetch(tf.data.AUTOTUNE)

## Config & Build

In [8]:
tokens = 40000
input_processor = input_text_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=tokens)

output_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=tokens)

In [9]:
input_processor.adapt(train["code"])
output_processor.adapt(train["docstring"])

In [None]:
train_model = seq2seqTrain(112, 48, input_text_processor=input_processor,
    output_text_processor=output_processor)

In [None]:
train_model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=0.0004),
    loss=MaskedLoss(),
    metrics=['acc', text.metrics.rouge_l]
)

In [None]:
batch_loss = BatchLogs('batch_loss')

## Training

In [None]:
tf.test.is_gpu_available(
    cuda_only=False,
    min_cuda_compute_capability=None
)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


False

In [14]:
tf.keras.backend.clear_session()

In [15]:
history = train_model.fit(train_set, epochs=3, validation_data=valid_set, callbacks=[batch_loss])

Epoch 1/3
 345/1642 [=====>........................] - ETA: 1:21:31 - batch_loss: 6.8736

ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted:  SameWorkerRecvDone unable to allocate output tensor. Key: /job:localhost/replica:0/task:0/device:GPU:0;4813655c55b4b0f0;/job:localhost/replica:0/task:0/device:GPU:0;edge_3336_StatefulPartitionedCall/while/body/_59/while/gradient_tape/while/gradients/while/decoder/embedding_1/embedding_lookup_grad/Size;15032662501523675017:428
	 [[{{node StatefulPartitionedCall/while/body/_59/while/gradient_tape/while/gradients/while/decoder/embedding_1/embedding_lookup_grad/Size/_244}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

	 [[StatefulPartitionedCall/while/LoopCond/_276/_330]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

  (1) Resource exhausted:  SameWorkerRecvDone unable to allocate output tensor. Key: /job:localhost/replica:0/task:0/device:GPU:0;4813655c55b4b0f0;/job:localhost/replica:0/task:0/device:GPU:0;edge_3336_StatefulPartitionedCall/while/body/_59/while/gradient_tape/while/gradients/while/decoder/embedding_1/embedding_lookup_grad/Size;15032662501523675017:428
	 [[{{node StatefulPartitionedCall/while/body/_59/while/gradient_tape/while/gradients/while/decoder/embedding_1/embedding_lookup_grad/Size/_244}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_8931]

Function call stack:
train_function -> train_function


## Validation & Testing

In [None]:
history