In [1]:
import os
import re

import pandas as pd
from sklearn.model_selection import train_test_split

from parseAstJava import parse_java

In [2]:
# Load the comment table. The TSV file has no header row, so the two column
# names ("id", "com") are supplied directly to the reader.
comments = pd.read_csv('comments.tsv', sep='\t', header=None, names=["id", "com"])


In [3]:
# Read the JSON file as a Series and turn it into a two-column frame:
# the Series index becomes the "id" column and the values become "fun".
funs = (
    pd.read_json('functions.json', typ='series')
    .rename_axis("id")
    .reset_index(name="fun")
)

In [4]:
def add_missing_closing_tags(text, opening_tag, closing_tag, repl=None, closing_is_regex=False):
    """Append closing tags until there are as many of them as opening tags.

    Args:
        text: String to balance.
        opening_tag: Literal opening marker, counted with ``str.count``.
        closing_tag: Closing marker. A literal string by default; a regular
            expression when ``closing_is_regex`` is true.
        repl: Text appended once per missing closing tag. Defaults to
            ``closing_tag`` (only possible for a literal closing tag).
        closing_is_regex: When true, closing tags are counted as
            non-overlapping regex matches instead of literal occurrences.

    Returns:
        ``text`` unchanged when no closing tag is missing, otherwise ``text``
        followed by ``repl`` repeated once per missing closing tag.

    Raises:
        ValueError: If ``closing_is_regex`` is true and ``repl`` is omitted,
            because a pattern cannot be appended as text.
    """
    if repl is None:
        if closing_is_regex:
            raise ValueError("repl is required when closing_tag is a regular expression")
        repl = closing_tag

    opening_count = text.count(opening_tag)
    if closing_is_regex:
        closing_count = sum(1 for _ in re.finditer(closing_tag, text))
    else:
        closing_count = text.count(closing_tag)

    missing = opening_count - closing_count
    if missing > 0:
        text += repl * missing
    return text


def replace_whitespaces_comments(text):
    """Strip comments from Java source and flatten all tabs/newlines to spaces.

    Tabs and newlines are first turned into placeholder tokens so the comment
    patterns can anchor on line ends; the placeholders are replaced by single
    spaces at the very end, so the result contains no tab or newline that was
    present in the input.

    Args:
        text: Java source of a single function.

    Returns:
        The source on one line with ``//`` and ``/* */`` comments removed,
        an unterminated block comment closed, and missing closing braces
        appended.
    """
    # Rename identifiers called `enum` (not preceded by an access modifier) to `enums`.
    text = re.sub(r"(?<!(public )|(rivate )|(tected ))enum(?![a-z0-9A-Z])", r"enums", text)

    # Tabs and empty `//` comments become whitespace tokens; a newline token is prepended.
    text = re.sub("(\\t)|(//\\n)", "<whitespace_character>", "<newline_character>"+text)
    # Every newline becomes two newline tokens, so one can be consumed together
    # with a line comment while the other still marks the line end.
    text = re.sub("(\\n)", "<newline_character><newline_character>", text)
    # Drop a `;` that trails the final closing brace.
    text = re.sub(r"}\s*;(<newline_character>)+?$", "}<newline_character>", text)
    # Remove `//` comments that follow a placeholder, punctuation or a `case ...:` label.
    text = re.sub(r"((<whitespace_character>)|(<newline_character>)|([;}{,)|])|(case.*?:))(<whitespace_character>)*\s*(//.*?<newline_character>)", r"\1 ", text)
    # Remove remaining `//` comments that contain no double quote.
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    text = re.sub(r'\s*(//((?!").)+?<newline_character>)', " ", text)

    # Close an unterminated block comment, then remove all block comments.
    text = add_missing_closing_tags(text, "/*", "*/")
    text = re.sub(r"/\*.*?\*/", " ", text)

    # Balance braces heuristically: `{` at a line end opens a block, `}` followed
    # by a line end, ` catch` or ` else` closes one. The closer is a regular
    # expression, so it must be counted as a pattern rather than literally
    # (a literal count never matches and appends a brace for every opener).
    text = add_missing_closing_tags(
        text,
        "{<newline_character>",
        r"\}(?:<newline_character>| catch| else)",
        "}<newline_character>",
        closing_is_regex=True,
    )
    text = re.sub("(<whitespace_character>)|(<newline_character>)", " ", text)
    return text


def split_single_special_characters(text):
    """Pad each bracket, brace, parenthesis and each of ``; . , @ _`` with spaces.

    Every matching character is replaced by itself surrounded by one space on
    each side, so it becomes a separate whitespace-delimited token.
    """
    delimiter = re.compile(r"([()\[\]{};.,@_])")
    return delimiter.sub(r" \1 ", text)


def split_camel_case(text):
    """Insert a space before every uppercase letter that follows a lowercase one.

    Only ASCII ``a-z`` / ``A-Z`` are considered, so runs of capitals and
    capitals after digits are left untouched.
    """
    hump = re.compile(r"(?<=[a-z])([A-Z])")
    return hump.sub(r" \1", text)


def trim_whitespaces(text):
    """Collapse every run of whitespace into one space and strip both ends.

    Args:
        text: Input string.

    Returns:
        ``text`` with tabs, newlines and repeated spaces reduced to single
        spaces and without leading or trailing whitespace.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence, which
    # newer Python versions warn about.
    return re.sub(r"\s+", " ", text).strip()


def split_combined_special_characters(text):
    """Pad each run of operator characters with a space on both sides.

    A run is one or more consecutive characters from ``< > | & ! / * - + =``;
    the whole run is kept together so multi-character operators stay one token.
    """
    operator_run = re.compile(r"([<>|&!/*\-+=]+)")
    return operator_run.sub(r" \1 ", text)


def parse_ast(text):
    """Parse Java source with ``parse_java``, returning "" when parsing fails.

    Args:
        text: Java source, passed unchanged to ``parse_java``.

    Returns:
        Whatever ``parse_java`` returns, or the empty string if it raises.
    """
    try:
        return parse_java(text)
    except Exception:
        # Best effort: unparsable input maps to "". `Exception` instead of a
        # bare `except` so KeyboardInterrupt / SystemExit still propagate and
        # a long-running `.apply` over the dataset can be interrupted.
        return ""


In [5]:
# Strip comments and flatten whitespace in every function body.
funs["fun"] = funs["fun"].apply(replace_whitespaces_comments)
# Drop functions that are blank after cleaning. `.copy()` makes the filtered
# frame independent of the original, so assigning new columns to it afterwards
# does not trigger pandas' SettingWithCopyWarning.
funs = funs.loc[funs["fun"].str.strip().str.len() != 0].copy()

In [6]:
# Parse each cleaned function into its AST string ("" when parsing fails) and
# re-encode it with the `unicode-escape` codec so the stored value is pure ASCII.
funs["ast"] = funs["fun"].apply(
    lambda source: parse_ast(source).encode('unicode-escape').decode('ascii')
)

In [7]:
# Keep only functions with a non-blank AST (parse failures yield ""). `.copy()`
# makes the filtered frame independent, so adding columns to it afterwards does
# not trigger pandas' SettingWithCopyWarning.
funs = funs.loc[funs["ast"].str.strip().str.len() != 0].copy()

In [8]:
# TODO: fun trim whitespaces
def _to_subtokens(source):
    """Space out punctuation, camelCase humps and operator runs, then collapse whitespace."""
    spaced = split_single_special_characters(source)
    spaced = split_camel_case(spaced)
    spaced = split_combined_special_characters(spaced)
    return trim_whitespaces(spaced)


funs["fun_processed"] = funs["fun"].apply(_to_subtokens)

In [9]:
# Preview the first five processed functions.
funs.head(n=5)

Unnamed: 0,id,fun,ast,fun_processed
0,321,public int getPushesLowerbound() { return...,MethodDeclaration Modifier public BasicType in...,public int get Pushes Lowerbound ( ) { return ...
1,323,public void setPushesLowerbound(int pushesLo...,MethodDeclaration Modifier public setPushesLow...,public void set Pushes Lowerbound ( int pushes...
2,324,public void play() { if(currentS...,MethodDeclaration Modifier public play IfState...,public void play ( ) { if ( current Sound File...
3,343,"public int getInfluenceValue(int boxNo1, int...",MethodDeclaration Modifier public BasicType in...,"public int get Influence Value ( int box No1 ,..."
4,351,public void setPositions(int[] positions){ ...,MethodDeclaration Modifier public setPositions...,public void set Positions ( int [ ] positions ...


In [10]:
# Pair every function with its comment. "id" is the only column the two frames
# share, so it is named explicitly as the join key; rows without a partner on
# either side are dropped (inner join).
df_cd = funs.merge(comments, on="id", how="inner")

In [11]:
# Use the function id as the row index. Re-running this cell on its own fails,
# because "id" is no longer a column after the first run.
df_cd = df_cd.set_index(keys="id")

In [12]:
# Preview the first five function/comment pairs.
df_cd.head(n=5)

Unnamed: 0_level_0,fun,ast,fun_processed,com
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
321,public int getPushesLowerbound() { return...,MethodDeclaration Modifier public BasicType in...,public int get Pushes Lowerbound ( ) { return ...,returns the pushes lowerbound of this board po...
323,public void setPushesLowerbound(int pushesLo...,MethodDeclaration Modifier public setPushesLow...,public void set Pushes Lowerbound ( int pushes...,sets the pushes lowerbound of this board position
324,public void play() { if(currentS...,MethodDeclaration Modifier public play IfState...,public void play ( ) { if ( current Sound File...,play a sound
343,"public int getInfluenceValue(int boxNo1, int...",MethodDeclaration Modifier public BasicType in...,"public int get Influence Value ( int box No1 ,...",returns the influence value between the positi...
351,public void setPositions(int[] positions){ ...,MethodDeclaration Modifier public setPositions...,public void set Positions ( int [ ] positions ...,sets the box positions and the player position


In [13]:
# Inputs: the three code representations. Target: the comment text.
feature_columns = ["fun", "ast", "fun_processed"]
X = df_cd.loc[:, feature_columns]
y = df_cd.loc[:, "com"]

In [22]:
# First hold out 10% of the pairs, then cut that hold-out set in half to get
# the test and validation splits. Both splits use the same fixed seed.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.1, random_state=1
)

X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=1
)

In [23]:
# Sanity check: (rows, columns) of the training and test feature frames.
train_shape, test_shape = X_train.shape, X_test.shape
print(train_shape, test_shape)

(1928633, 3) (107146, 3)


In [25]:
# Create one output directory per exported split. `exist_ok=True` lets this
# cell be re-run (e.g. Restart & Run All) without raising FileExistsError when
# the directories are already there.
for split_dir in ("train", "test", "valid", "dev"):
    os.makedirs(split_dir, exist_ok=True)

In [26]:

# Export the training split, one value per row with no index or header.
# The sub-tokenised code and the comments are each written under two file names.
# NOTE(review): `to_csv` applies CSV quoting to values that contain commas or
# double quotes — confirm the downstream readers expect that.
train_outputs = [
    (X_train["fun_processed"], "train/code.original_subtoken"),
    (X_train["fun_processed"], "train/train.spl.src"),
    (X_train["fun"], "train/train.txt.src"),
    (X_train["ast"], "train/train.ast.src"),
    (y_train, "train/javadoc.original"),
    (y_train, "train/train.txt.tgt"),
]
for column, target_path in train_outputs:
    column.to_csv(target_path, index=False, header=False)

In [27]:
# Export the test split, one value per row with no index or header.
# The sub-tokenised code and the comments are each written under two file names.
# NOTE(review): `to_csv` applies CSV quoting to values that contain commas or
# double quotes — confirm the downstream readers expect that.
test_outputs = [
    (X_test["fun_processed"], "test/code.original_subtoken"),
    (X_test["fun_processed"], "test/test.spl.src"),
    (X_test["fun"], "test/test.txt.src"),
    (X_test["ast"], "test/test.ast.src"),
    (y_test, "test/javadoc.original"),
    (y_test, "test/test.txt.tgt"),
]
for column, target_path in test_outputs:
    column.to_csv(target_path, index=False, header=False)

In [28]:
# Export the validation split, one value per row with no index or header.
# Unlike the other splits, the files go to two directories: the
# `code.original_subtoken` / `javadoc.original` pair is written under "dev/",
# the `valid.*` files under "valid/".
# NOTE(review): `to_csv` applies CSV quoting to values that contain commas or
# double quotes — confirm the downstream readers expect that.
validation_outputs = [
    (X_val["fun_processed"], "dev/code.original_subtoken"),
    (X_val["fun_processed"], "valid/valid.spl.src"),
    (X_val["fun"], "valid/valid.txt.src"),
    (X_val["ast"], "valid/valid.ast.src"),
    (y_val, "dev/javadoc.original"),
    (y_val, "valid/valid.txt.tgt"),
]
for column, target_path in validation_outputs:
    column.to_csv(target_path, index=False, header=False)
