# Baseline Model Training

* Custom Tokenizer
* GRU Encoder / Decoder

## Imports

In [1]:
import os
import io
import sys


import pandas as pd 
import numpy as np 

sys.path.append('D:\PROJECT\Level-4-Project')
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"

In [2]:
from code2text.models.baseline.model import seq2seqTrain, MaskedLoss
from code2text.helper.model import BatchLogs
from code2text.helper.preprocess import tf_lower_and_split_punct

In [3]:
import ijson

In [3]:
import tensorflow as tf
import tensorflow_text as text

In [4]:
#tf.debugging.set_log_device_placement(True)
tf.config.experimental.set_memory_growth(tf.config.experimental.list_physical_devices('GPU')[0], True)
#tf.config.experimental.set_virtual_device_configuration(tf.config.experimental.list_physical_devices('GPU')[0],[tf.config.experimental.VirtualDeviceConfiguration(memory_limit=3072)])

In [4]:
tf.config.run_functions_eagerly(False)

## Preprocessing

In [4]:
path = "D:\PROJECT\data\CodeSearchNet"
langs = ["java", "python"]
format = ["train.jsonl", "test.jsonl", "valid.jsonl"]

In [5]:
def read_data(path):
    with open(path, encoding="UTF-8") as json_file:
        cursor = 0
        code_set = []
        string_set = []
        lang_set = []
        for _, line in enumerate(json_file):
            code = None
            string = None
            line_as_file = io.StringIO(line)
            json_parser = ijson.parse(line_as_file)

            for prefix, _, value in json_parser:
                if prefix == "code":  
                    code = value
                if prefix == "docstring":
                    string = value
                if prefix == "language":
                    lang = value
            if (code is not None and string is not None):
                code_set.append(code)
                string_set.append(string)
                lang_set.append(lang)

            cursor += len(line)
    return pd.DataFrame(data={'code': code_set, 'docstring': string_set, 'language': lang_set})

In [6]:
train = []
test = []
valid = []

for lang in langs:
    tmp = os.path.join(path, lang)
    for file in format:
        print("Processing ", lang, " ", file, "... ")
        data = read_data(os.path.join(tmp, file))
        if file == "train.jsonl":
            train.append(data)
        if file == "test.jsonl":
            test.append(data)
        if file == "valid.jsonl":
            valid.append(data)

Processing  java   train.jsonl ... 
Processing  java   test.jsonl ... 
Processing  java   valid.jsonl ... 
Processing  python   train.jsonl ... 
Processing  python   test.jsonl ... 
Processing  python   valid.jsonl ... 


In [None]:
train = pd.concat(train)
test = pd.concat(test)
valid = pd.concat(valid)

In [None]:
data = pd.concat([train, test, valid])

In [None]:
train.reset_index(inplace=True)
train.to_json("D:\PROJECT\data\CodeSearchNet\Azure\\train.jsonl", orient="records", lines=True)
test.reset_index(inplace=True)
test.to_json("D:\PROJECT\data\CodeSearchNet\Azure\\test.jsonl", orient="records", lines=True)
valid.reset_index(inplace=True)
valid.to_json("D:\PROJECT\data\CodeSearchNet\Azure\\valid.jsonl", orient="records", lines=True)

In [None]:
data.reset_index(inplace=True)
data.to_json("D:\PROJECT\data\CodeSearchNet\Combine_clean\data.json")

## Dataset Initialization

In [6]:
data = pd.read_json("D:\PROJECT\data\CodeSearchNet\Combine_clean\data.json")

In [5]:
train = pd.read_json("D:\PROJECT\data\CodeSearchNet\Combine_clean\\train.json")
valid = pd.read_json("D:\PROJECT\data\CodeSearchNet\Combine_clean\\valid.json")
test = pd.read_json("D:\PROJECT\data\CodeSearchNet\Combine_clean\\test.json")

In [6]:
batch_size = 256
buffer = 2048

In [None]:
dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(data["code"].values, tf.string),
            tf.cast(data["docstring"].values, tf.string)
        )
    )
).shuffle(buffer).batch(batch_size)

In [7]:
train_set = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(train["code"].values, tf.string),
            tf.cast(train["docstring"].values, tf.string)
        )
    )
).shuffle(buffer).batch(batch_size, drop_remainder=True).cache().prefetch(tf.data.AUTOTUNE)
test_set = (
    tf.data.Dataset.from_tensor_slices(
        (
        tf.cast(test["code"].values, tf.string),
        tf.cast(test["docstring"].values, tf.string)
        )
    )
).shuffle(buffer).batch(batch_size, drop_remainder=True).cache().prefetch(tf.data.AUTOTUNE)
valid_set = (
    tf.data.Dataset.from_tensor_slices(
        (
        tf.cast(valid["code"].values, tf.string),
        tf.cast(valid["docstring"].values, tf.string)
        )  
    )
).shuffle(buffer).batch(batch_size, drop_remainder=True).cache().prefetch(tf.data.AUTOTUNE)

## Config & Build

### Adapt Process

In [13]:
tokens = 30000
input_processor = input_text_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=tokens)

output_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=tokens)

In [14]:
input_processor.adapt(train["code"])
output_processor.adapt(train["docstring"])

In [26]:
with open("D:\PROJECT\Level-4-Project\data\invocab.txt", "w") as infile:
    infile.write("\n".join(map(str, input_processor.get_vocabulary())))

with open("D:\PROJECT\Level-4-Project\data\outvocab.txt", "w") as outfile:
    outfile.write("\n".join(map(str, output_processor.get_vocabulary())))

### Build

In [8]:
input_processor = input_text_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    vocabulary="D:\PROJECT\Level-4-Project\data\outvocab.txt")

output_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    vocabulary="D:\PROJECT\Level-4-Project\data\outvocab.txt")

In [9]:
train_model = seq2seqTrain(112, 64, input_text_processor=input_processor,
    output_text_processor=output_processor)

In [10]:
train_model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=0.0004),
    loss=MaskedLoss(),
    metrics=['acc', text.metrics.rouge_l]
)

In [11]:
batch_loss = BatchLogs('batch_loss')
chkpt = tf.keras.callbacks.ModelCheckpoint(
    "D:\PROJECT\Level-4-Project\notebooks\training\chkpt\baseline", monitor='loss', save_best_only=True, save_freq=1000
)

## Training

In [14]:
tf.test.is_gpu_available(
    cuda_only=False,
    min_cuda_compute_capability=None
)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [12]:
tf.keras.backend.clear_session()

In [13]:
history = train_model.fit(train_set, epochs=3, validation_data=valid_set, callbacks=[batch_loss, chkpt])

Epoch 1/3
 2050/14190 [===>..........................] - ETA: 14:46:15 - batch_loss: 5.4566

ResourceExhaustedError: Graph execution error:

Detected at node 'while/decoder/attention/additive_attention/Sum' defined at (most recent call last):
    File "D:\CONDA\envs\code2text\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "D:\CONDA\envs\code2text\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "D:\CONDA\envs\code2text\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "D:\CONDA\envs\code2text\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "D:\CONDA\envs\code2text\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "D:\CONDA\envs\code2text\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "D:\CONDA\envs\code2text\lib\asyncio\base_events.py", line 596, in run_forever
      self._run_once()
    File "D:\CONDA\envs\code2text\lib\asyncio\base_events.py", line 1890, in _run_once
      handle._run()
    File "D:\CONDA\envs\code2text\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "D:\CONDA\envs\code2text\lib\site-packages\ipykernel\kernelbase.py", line 461, in dispatch_queue
      await self.process_one()
    File "D:\CONDA\envs\code2text\lib\site-packages\ipykernel\kernelbase.py", line 450, in process_one
      await dispatch(*args)
    File "D:\CONDA\envs\code2text\lib\site-packages\ipykernel\kernelbase.py", line 357, in dispatch_shell
      await result
    File "D:\CONDA\envs\code2text\lib\site-packages\ipykernel\kernelbase.py", line 652, in execute_request
      reply_content = await reply_content
    File "D:\CONDA\envs\code2text\lib\site-packages\ipykernel\ipkernel.py", line 359, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "D:\CONDA\envs\code2text\lib\site-packages\ipykernel\zmqshell.py", line 532, in run_cell
      return super().run_cell(*args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\IPython\core\interactiveshell.py", line 2768, in run_cell
      result = self._run_cell(
    File "D:\CONDA\envs\code2text\lib\site-packages\IPython\core\interactiveshell.py", line 2814, in _run_cell
      return runner(coro)
    File "D:\CONDA\envs\code2text\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "D:\CONDA\envs\code2text\lib\site-packages\IPython\core\interactiveshell.py", line 3012, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "D:\CONDA\envs\code2text\lib\site-packages\IPython\core\interactiveshell.py", line 3191, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "D:\CONDA\envs\code2text\lib\site-packages\IPython\core\interactiveshell.py", line 3251, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\Parry\AppData\Local\Temp\ipykernel_6380\4110161981.py", line 1, in <module>
      history = train_model.fit(train_set, epochs=3, validation_data=valid_set, callbacks=[batch_loss, chkpt])
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\engine\training.py", line 1384, in fit
      tmp_logs = self.train_function(iterator)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\engine\training.py", line 1021, in train_function
      return step_function(self, iterator)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\engine\training.py", line 1010, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\engine\training.py", line 1000, in run_step
      outputs = model.train_step(data)
    File "D:\PROJECT\Level-4-Project\code2text\models\baseline\model.py", line 204, in train_step
      return self._train_step(inputs)
    File "D:\PROJECT\Level-4-Project\code2text\models\baseline\model.py", line 188, in _train_step
      for t in tf.range(max_target_length-1):
    File "D:\PROJECT\Level-4-Project\code2text\models\baseline\model.py", line 190, in _train_step
      step_loss, dec_state = self._loop_step(new_tokens, input_mask,
    File "D:\PROJECT\Level-4-Project\code2text\models\baseline\model.py", line 165, in _loop_step
      dec_result, dec_state = self.decoder(decoder_input, state=dec_state)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "D:\PROJECT\Level-4-Project\code2text\models\baseline\model.py", line 119, in call
      context_vector, attention_weights = self.attention(
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "D:\PROJECT\Level-4-Project\code2text\models\baseline\model.py", line 52, in call
      context_vector, attention_weights = self.attention(
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\layers\dense_attention.py", line 147, in call
      scores = self._calculate_scores(query=q, key=k)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\layers\dense_attention.py", line 500, in _calculate_scores
      return tf.reduce_sum(
Node: 'while/decoder/attention/additive_attention/Sum'
Detected at node 'while/decoder/attention/additive_attention/Sum' defined at (most recent call last):
    File "D:\CONDA\envs\code2text\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "D:\CONDA\envs\code2text\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "D:\CONDA\envs\code2text\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "D:\CONDA\envs\code2text\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "D:\CONDA\envs\code2text\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "D:\CONDA\envs\code2text\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "D:\CONDA\envs\code2text\lib\asyncio\base_events.py", line 596, in run_forever
      self._run_once()
    File "D:\CONDA\envs\code2text\lib\asyncio\base_events.py", line 1890, in _run_once
      handle._run()
    File "D:\CONDA\envs\code2text\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "D:\CONDA\envs\code2text\lib\site-packages\ipykernel\kernelbase.py", line 461, in dispatch_queue
      await self.process_one()
    File "D:\CONDA\envs\code2text\lib\site-packages\ipykernel\kernelbase.py", line 450, in process_one
      await dispatch(*args)
    File "D:\CONDA\envs\code2text\lib\site-packages\ipykernel\kernelbase.py", line 357, in dispatch_shell
      await result
    File "D:\CONDA\envs\code2text\lib\site-packages\ipykernel\kernelbase.py", line 652, in execute_request
      reply_content = await reply_content
    File "D:\CONDA\envs\code2text\lib\site-packages\ipykernel\ipkernel.py", line 359, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "D:\CONDA\envs\code2text\lib\site-packages\ipykernel\zmqshell.py", line 532, in run_cell
      return super().run_cell(*args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\IPython\core\interactiveshell.py", line 2768, in run_cell
      result = self._run_cell(
    File "D:\CONDA\envs\code2text\lib\site-packages\IPython\core\interactiveshell.py", line 2814, in _run_cell
      return runner(coro)
    File "D:\CONDA\envs\code2text\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "D:\CONDA\envs\code2text\lib\site-packages\IPython\core\interactiveshell.py", line 3012, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "D:\CONDA\envs\code2text\lib\site-packages\IPython\core\interactiveshell.py", line 3191, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "D:\CONDA\envs\code2text\lib\site-packages\IPython\core\interactiveshell.py", line 3251, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\Parry\AppData\Local\Temp\ipykernel_6380\4110161981.py", line 1, in <module>
      history = train_model.fit(train_set, epochs=3, validation_data=valid_set, callbacks=[batch_loss, chkpt])
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\engine\training.py", line 1384, in fit
      tmp_logs = self.train_function(iterator)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\engine\training.py", line 1021, in train_function
      return step_function(self, iterator)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\engine\training.py", line 1010, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\engine\training.py", line 1000, in run_step
      outputs = model.train_step(data)
    File "D:\PROJECT\Level-4-Project\code2text\models\baseline\model.py", line 204, in train_step
      return self._train_step(inputs)
    File "D:\PROJECT\Level-4-Project\code2text\models\baseline\model.py", line 188, in _train_step
      for t in tf.range(max_target_length-1):
    File "D:\PROJECT\Level-4-Project\code2text\models\baseline\model.py", line 190, in _train_step
      step_loss, dec_state = self._loop_step(new_tokens, input_mask,
    File "D:\PROJECT\Level-4-Project\code2text\models\baseline\model.py", line 165, in _loop_step
      dec_result, dec_state = self.decoder(decoder_input, state=dec_state)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "D:\PROJECT\Level-4-Project\code2text\models\baseline\model.py", line 119, in call
      context_vector, attention_weights = self.attention(
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "D:\PROJECT\Level-4-Project\code2text\models\baseline\model.py", line 52, in call
      context_vector, attention_weights = self.attention(
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\layers\dense_attention.py", line 147, in call
      scores = self._calculate_scores(query=q, key=k)
    File "D:\CONDA\envs\code2text\lib\site-packages\keras\layers\dense_attention.py", line 500, in _calculate_scores
      return tf.reduce_sum(
Node: 'while/decoder/attention/additive_attention/Sum'
2 root error(s) found.
  (0) RESOURCE_EXHAUSTED:  OOM when allocating tensor with shape[14464] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node while/decoder/attention/additive_attention/Sum}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

	 [[StatefulPartitionedCall/while/LoopCond/_276/_334]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

  (1) RESOURCE_EXHAUSTED:  OOM when allocating tensor with shape[14464] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node while/decoder/attention/additive_attention/Sum}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_8654]

## Validation & Testing

In [None]:
history