# Baseline Model Training

* Custom Tokenizer
* GRU Encoder / Decoder

## Imports

In [1]:
import io
import json
import os
import sys

import ijson
import numpy as np
import pandas as pd
import tensorflow as tf

# Add the local project checkout to the import path (presumably so the
# `code2text` package imported in the next cell can be resolved).
# Raw string: "\P" and "\L" are not valid escape sequences, so the plain
# literal only worked by accident and triggers an invalid-escape warning.
sys.path.append(r'D:\PROJECT\Level-4-Project')

In [2]:
from code2text.models.baseline.model import seq2seqTrain, MaskedLoss
from code2text.helper.model import BatchLogs
from code2text.helper.preprocess import tf_lower_and_split_punct

## Preprocessing

**TODO:**

* Load data
* Convert to pandas DataFrames
* Combine into a single file for each language
* Convert to `tf.data.Dataset`

In [3]:
# Root folder of the CodeSearchNet data; each language has its own sub-folder.
# Raw string: "\P", "\d" and "\C" are not valid escape sequences, so the plain
# literal only worked by accident and triggers an invalid-escape warning.
path = r"D:\PROJECT\data\CodeSearchNet"
# Language sub-folders that are read.
langs = ["go", "java", "python", "javascript", "ruby", "php"]
# Split files expected inside every language folder.
# NOTE(review): `format` shadows the built-in of the same name; it is kept
# because the loading cell below reads it under this name.
format = ["train.jsonl", "test.jsonl", "valid.jsonl"]

In [9]:
def read_data(path):
    """Load one JSON-lines file into a DataFrame of code/docstring pairs.

    Every non-blank line of the file is parsed as a standalone JSON object.
    A record is kept only when both its ``code`` and ``docstring`` values are
    present (not ``None``). Its ``language`` value is stored alongside, or
    ``None`` when the record has no such key.

    Args:
        path: Path of the ``.jsonl`` file to read (opened as UTF-8).

    Returns:
        pandas.DataFrame with the columns ``code``, ``docstring`` and
        ``language``, one row per kept record, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, encoding="UTF-8") as json_file:
        for line in json_file:
            if not line.strip():
                # Tolerate blank lines instead of failing on them.
                continue
            entry = json.loads(line)
            code = entry.get("code")
            docstring = entry.get("docstring")
            if code is None or docstring is None:
                continue
            # The language is looked up per record, so a record without one
            # gets None rather than the value left over from an earlier line.
            records.append((code, docstring, entry.get("language")))
    return pd.DataFrame(records, columns=["code", "docstring", "language"])

In [10]:
# One list of per-language DataFrames for each split.
train = []
test = []
valid = []

# Route every split file to the list that collects it.
split_buckets = {"train.jsonl": train, "test.jsonl": test, "valid.jsonl": valid}

for lang in langs:
    lang_dir = os.path.join(path, lang)
    for file in format:
        print("Processing ", lang, " ", file, "... ")
        frame = read_data(os.path.join(lang_dir, file))
        bucket = split_buckets.get(file)
        if bucket is not None:
            bucket.append(frame)

Processing  go   train.jsonl ... 
Processing  go   test.jsonl ... 
Processing  go   valid.jsonl ... 
Processing  java   train.jsonl ... 
Processing  java   test.jsonl ... 
Processing  java   valid.jsonl ... 
Processing  python   train.jsonl ... 
Processing  python   test.jsonl ... 
Processing  python   valid.jsonl ... 
Processing  javascript   train.jsonl ... 
Processing  javascript   test.jsonl ... 
Processing  javascript   valid.jsonl ... 
Processing  ruby   train.jsonl ... 
Processing  ruby   test.jsonl ... 
Processing  ruby   valid.jsonl ... 
Processing  php   train.jsonl ... 
Processing  php   test.jsonl ... 
Processing  php   valid.jsonl ... 


In [13]:
# Collapse each list of per-language frames into one DataFrame per split.
# The names are rebound from list to DataFrame, so this cell runs once per load.
train, test, valid = (pd.concat(frames) for frames in (train, test, valid))

In [14]:
# Stack the three splits (train, then test, then valid) into one corpus frame.
data = pd.concat(objs=[train, test, valid])

In [18]:
# reset_index() without drop=True keeps the previous index as an "index"
# column and gives the combined frame a fresh 0..n-1 index.
data = data.reset_index()
# Raw string: the plain literal contained invalid escape sequences
# ("\P", "\d", "\C") that only worked by accident.
data.to_json(r"D:\PROJECT\data\CodeSearchNet\Combine_clean\data.json")

In [3]:
# Reload the combined corpus written by the previous cell.
# Raw string: the plain literal contained invalid escape sequences
# ("\P", "\d", "\C") that only worked by accident.
data = pd.read_json(r"D:\PROJECT\data\CodeSearchNet\Combine_clean\data.json")

In [4]:
# Pair every code snippet with its docstring as string tensors.
code_tensor = tf.cast(data["code"].values, tf.string)
docstring_tensor = tf.cast(data["docstring"].values, tf.string)

# The shuffle buffer spans the whole corpus; batches hold 64 pairs.
dataset = tf.data.Dataset.from_tensor_slices((code_tensor, docstring_tensor))
dataset = dataset.shuffle(len(data["code"]))
dataset = dataset.batch(64)

In [None]:
def _to_text_pairs(frame):
    """Build a tf.data.Dataset of (code, docstring) string pairs from `frame`.

    Args:
        frame: Object with "code" and "docstring" columns (the split
            DataFrames built earlier in this notebook).

    Returns:
        A dataset with one (code, docstring) element per row of `frame`.
        No shuffling or batching is applied here.
    """
    return tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(frame["code"].values, tf.string),
            tf.cast(frame["docstring"].values, tf.string),
        )
    )


# NOTE(review): unlike `dataset` above, these are neither shuffled nor
# batched — confirm that is what the training step expects.
train_set = _to_text_pairs(train)
test_set = _to_text_pairs(test)
valid_set = _to_text_pairs(valid)

## Config & Build

**TODO:**

* Summarise

In [5]:
# Both vectorizers share the same standardisation callable and vocabulary cap.
vectorizer_options = dict(standardize=tf_lower_and_split_punct, max_tokens=50000)

# Vectorizer for the source code; reachable under two names, as before.
input_text_processor = tf.keras.layers.TextVectorization(**vectorizer_options)
input_processor = input_text_processor

# Vectorizer for the docstrings.
output_processor = tf.keras.layers.TextVectorization(**vectorizer_options)

In [6]:
# Fit each vectorizer on the matching column of the combined corpus.
for processor, column in ((input_processor, "code"), (output_processor, "docstring")):
    processor.adapt(data[column])

In [8]:
# Build the trainable seq2seq model around the two fitted text processors.
# NOTE(review): the recorded output of this cell is
# "TypeError: 'int' object is not iterable", so at least one positional
# argument does not have the type the constructor expects. The meaning of
# `300` and `[128, 64]` cannot be read off this notebook — check them against
# the signature of seq2seqTrain in code2text.models.baseline.model.
train_model = seq2seqTrain(300, [128, 64], input_text_processor=input_processor,
    output_text_processor=output_processor)

TypeError: 'int' object is not iterable

In [None]:
# Adam with its default settings, paired with the project's MaskedLoss.
adam = tf.optimizers.Adam()
masked_loss = MaskedLoss()
train_model.compile(optimizer=adam, loss=masked_loss)

In [None]:
# Passed to fit() as a callback; presumably records the 'batch_loss' value
# after every batch — confirm against code2text.helper.model.BatchLogs.
batch_loss = BatchLogs("batch_loss")

## Training

In [None]:
# Single pass over the training split, with the batch-loss callback attached.
# NOTE(review): `train_set` is built without .batch(), unlike `dataset` —
# confirm the model's train step accepts unbatched (code, docstring) pairs.
train_model.fit(
    train_set,
    epochs=1,
    callbacks=[batch_loss],
)

## Validation & Testing