In [1]:
import pandas as pd 
import javalang
from javalang.ast import Node
from tqdm import tqdm

In [2]:
java_api_url = '/data/code/represent-code-in-human/data/java_api.csv'
java_api = pd.read_csv(java_api_url, header=0, encoding='utf-8')
java_api['index_name'] = java_api['index_name'].apply(str)
java_api 

Unnamed: 0,index_name,index_description,method_description
0,a,Variable in class java.awt.AWTEventMulticaster,
1,A,Static variable in class java.awt.PageAttribut...,"The MediaType instance for Engineering A, 8 1/..."
2,A,Static variable in class javax.print.attribute...,"Specifies the engineering A size, 8.5 inch by ..."
3,A,Static variable in class javax.print.attribute...,A size .
4,A,Static variable in class javax.swing.text.html...,
...,...,...,...
51185,_write(OutputStream),Method in class org.omg.PortableInterceptor.IO...,
51186,_write(OutputStream),Method in class org.omg.PortableInterceptor.Ob...,
51187,_write(OutputStream),Method in class org.omg.PortableInterceptor.Ob...,
51188,_write(OutputStream),Method in class org.omg.PortableInterceptor.Ob...,


In [4]:
# use javalang to generate ASTs and depth-first traverse to generate ast nodes corpus
def get_token(node):
    token = ''
    if isinstance(node, str):
        token = node
    elif isinstance(node, set):
        token = 'Modifier'
    elif isinstance(node, Node):
        token = node.__class__.__name__
    return token


def get_child(root):
    if isinstance(root, Node):
        children = root.children
    elif isinstance(root, set):
        children = list(root)
    else:
        children = []

    def expand(nested_list):
        for item in nested_list:
            if isinstance(item, list):
                yield from expand(item)
            elif item:
                yield item

    return list(expand(children))


def get_sequence(node, sequence, api_sequence):
    token, children = get_token(node), get_child(node)
    sequence.append(token)
    if token == 'MethodInvocation':
        api = [get_token(child) for child in children if not get_child(child)]
        # api_sequence.append(' '.join(api))
        if len(api) > 1:
            api_sequence.append(api[-1])
    for child in children:
        get_sequence(child, sequence, api_sequence)


def parse_program(func):
    tokens = javalang.tokenizer.tokenize(func)
    parser = javalang.parser.Parser(tokens)
    return parser.parse_member_declaration()    

In [5]:
def api_match(api_sequence, java_api):
    description_sequence = []
    for api in api_sequence:
        loc = java_api.loc[java_api['index_name'].str.contains(api, case=True)]
        if not loc.empty:
            description = loc['method_description'].iloc[0]
            if description != 'None':
                description_sequence.append(description)
    return description_sequence

Code Clone Detection

In [None]:
raw_code_url = '/data/dataset/CodeXGLUE/Code-Code/Clone-detection-BigCloneBench/dataset/data.jsonl'
raw_code = pd.read_json(path_or_buf=raw_code_url, lines=True)

In [None]:
raw_code

In [None]:
raw_code['func'][1]

In [None]:
from tqdm import tqdm

ast_sequence = []
description_sequence = []
for i in tqdm(range(len(raw_code))):
    sequence = []
    api_sequence = []    
    get_sequence(parse_program(raw_code['func'][i]), sequence, api_sequence)
    ast_sequence.append(' '.join(sequence))     
    api_sequence = list(set(api_sequence)) 
    # print('api_sequence', api_sequence)
    description = '\n'.join(api_match(api_sequence, java_api))
    # print('description', description)   
    description_sequence.append(description)        

In [None]:
raw_code['ast'] = ast_sequence
raw_code['des'] = description_sequence
raw_code['ast_des'] = raw_code['ast'] + ' ' + raw_code['des']

In [None]:
raw_code

In [None]:
raw_code.to_json(path_or_buf='/data/dataset/CodeXGLUE/Code-Code/Clone-detection-BigCloneBench/dataset/data_enhanced.jsonl', orient='records', lines=True)

Code Summarization

In [6]:
TRAIN_DIR = '/data/code/represent-code-in-human/data/code-summarization-new/train.jsonl'
VALID_DIR = '/data/code/represent-code-in-human/data/code-summarization-new/valid.jsonl'
TEST_DIR = '/data/code/represent-code-in-human/data/code-summarization-new/test.jsonl'

In [7]:
# read dataset
train_data = pd.read_json(path_or_buf=TRAIN_DIR, lines=True)
valid_data = pd.read_json(path_or_buf=VALID_DIR, lines=True)
test_data = pd.read_json(path_or_buf=TEST_DIR, lines=True)

In [8]:
train_data = train_data.sample(random_state=555, frac=1)
valid_data = valid_data.sample(random_state=555, frac=1)
test_data = test_data.sample(random_state=555, frac=1)

In [9]:
train_data

Unnamed: 0,repo,path,func_name,original_string,language,code,code_tokens,docstring,docstring_tokens,sha,url,partition
44793,b3dgs/lionengine,lionengine-core/src/main/java/com/b3dgs/lionen...,UtilMath.getDistance,"public static double getDistance(double x1, do...",java,"public static double getDistance(double x1, do...","[public, static, double, getDistance, (, doubl...",Get distance from point to area.\n\n@param x1 ...,"[Get, distance, from, point, to, area, .]",cac3d5578532cf11724a737b9f09e71bf9995ab2,https://github.com/b3dgs/lionengine/blob/cac3d...,train
114654,nguyenq/tess4j,src/main/java/net/sourceforge/tess4j/util/Util...,Utils.getConstantName,public static String getConstantName(Object va...,java,public static String getConstantName(Object va...,"[public, static, String, getConstantName, (, O...",Gets user-friendly name of the public static f...,"[Gets, user, -, friendly, name, of, the, publi...",cfcd4a8a44042f150b4aaf7bdf5ffc485a2236e1,https://github.com/nguyenq/tess4j/blob/cfcd4a8...,train
50192,hazelcast/hazelcast,hazelcast/src/main/java/com/hazelcast/cp/inter...,RaftSemaphore.acquire,AcquireResult acquire(AcquireInvocationKey key...,java,AcquireResult acquire(AcquireInvocationKey key...,"[AcquireResult, acquire, (, AcquireInvocationK...","Assigns permits to the endpoint, if sufficient...","[Assigns, permits, to, the, endpoint, if, suff...",8c4bc10515dbbfb41a33e0302c0caedf3cda1baf,https://github.com/hazelcast/hazelcast/blob/8c...,train
63791,Netflix/conductor,core/src/main/java/com/netflix/conductor/servi...,TaskServiceImpl.batchPoll,@Service\n public List<Task> batchPoll(Stri...,java,@Service\n public List<Task> batchPoll(Stri...,"[@, Service, public, List, <, Task, >, batchPo...",Batch Poll for a task of a certain type.\n\n@p...,"[Batch, Poll, for, a, task, of, a, certain, ty...",78fae0ed9ddea22891f9eebb96a2ec0b2783dca0,https://github.com/Netflix/conductor/blob/78fa...,train
61049,deeplearning4j/deeplearning4j,nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/sr...,WorkspaceUtils.assertNoWorkspacesOpen,public static void assertNoWorkspacesOpen(Stri...,java,public static void assertNoWorkspacesOpen(Stri...,"[public, static, void, assertNoWorkspacesOpen,...",Assert that no workspaces are currently open\n...,"[Assert, that, no, workspaces, are, currently,...",effce52f2afd7eeb53c5bcca699fcd90bd06822f,https://github.com/deeplearning4j/deeplearning...,train
...,...,...,...,...,...,...,...,...,...,...,...,...
75158,facebookarchive/hadoop-20,src/contrib/hive-streaming/src/java/org/apache...,StreamJob.setUserJobConfProps,protected void setUserJobConfProps(boolean doE...,java,protected void setUserJobConfProps(boolean doE...,"[protected, void, setUserJobConfProps, (, bool...",This method sets the user jobconf variable spe...,"[This, method, sets, the, user, jobconf, varia...",2a29bc6ecf30edb1ad8dbde32aa49a317b4d44f4,https://github.com/facebookarchive/hadoop-20/b...,train
59972,apache/incubator-druid,extendedset/src/main/java/org/apache/druid/ext...,ConciseSet.trimZeros,private void trimZeros()\r\n {\r\n // loop...,java,private void trimZeros()\r\n {\r\n // loop...,"[private, void, trimZeros, (, ), {, // loop ov...",Removes trailing zeros,"[Removes, trailing, zeros]",f776b9408962b9006cfcfe4d6c1794751972cc8e,https://github.com/apache/incubator-druid/blob...,train
156705,classgraph/classgraph,src/main/java/io/github/classgraph/ClassTypeSi...,ClassTypeSignature.parse,static ClassTypeSignature parse(final String t...,java,static ClassTypeSignature parse(final String t...,"[static, ClassTypeSignature, parse, (, final, ...",Parse a class type signature or class type des...,"[Parse, a, class, type, signature, or, class, ...",c8c8b2ca1eb76339f69193fdac33d735c864215c,https://github.com/classgraph/classgraph/blob/...,train
15337,exoplatform/jcr,exo.jcr.ext.services/src/main/java/org/exoplat...,GroupHandlerImpl.postDelete,private void postDelete(Group group) throws Ex...,java,private void postDelete(Group group) throws Ex...,"[private, void, postDelete, (, Group, group, )...",Notifying listeners after group deletion.\n\n@...,"[Notifying, listeners, after, group, deletion, .]",3e7f9ee1b5683640d73a4316fb4b0ad5eac5b8a2,https://github.com/exoplatform/jcr/blob/3e7f9e...,train


In [10]:
def get_ast_and_description(data):
    description_sequence = []
    ast_sequence = []
    ast_sum = 0
    description_sum = 0
    data_size = len(data)
    for i in tqdm(range(data_size)):
        sequence = []
        api_sequence = []    
        get_sequence(parse_program(data['code'].iloc[i]), sequence, api_sequence)
        ast = ' '.join(sequence)
        ast_sequence.append(ast) 
        ast_sum += len(ast.split(' '))

        api_sequence = list(set(api_sequence)) 
        description = ' '.join(api_match(api_sequence, java_api)) 
        description_sequence.append(description) 
        description_sum += len(description.split(' '))
    print('ast average length', ast_sum/data_size)
    print('description average length', description_sum/data_size)
    return description_sequence, ast_sequence   

In [11]:
valid_description, valid_ast = get_ast_and_description(valid_data)
test_description, test_ast = get_ast_and_description(test_data)

100%|██████████| 5179/5179 [05:50<00:00, 14.99it/s]
  0%|          | 3/10952 [00:00<06:56, 26.30it/s]

ast average length 83.33770998262213
description average length 23.19810774280749


100%|██████████| 10952/10952 [13:37<00:00,  8.60it/s]

ast average length 93.25054784514244
description average length 24.801406135865594





In [12]:
valid_data['des'] = valid_description
valid_data['ast'] = valid_ast
valid_data['ast_des'] = valid_data['ast'] + ' ' + valid_data['des']

test_data['des'] = test_description
test_data['ast'] = test_ast
test_data['ast_des'] = test_data['ast'] + ' ' + test_data['des']

In [None]:
# train_data.to_json(path_or_buf='/data/code/represent-code-in-human/data/code-summarization-enhanced-middle/train.jsonl',
#                      orient='records', lines=True)
# valid_data.to_json(path_or_buf='/data/code/represent-code-in-human/data/code-summarization-enhanced-middle/valid.jsonl',
#                      orient='records', lines=True)
# test_data.to_json(path_or_buf='/data/code/represent-code-in-human/data/code-summarization-enhanced-middle/test.jsonl',
#                      orient='records', lines=True)

In [14]:
test_data

Unnamed: 0,repo,path,func_name,original_string,language,code,code_tokens,docstring,docstring_tokens,sha,url,partition,des,ast,ast_des
4336,Unidata/thredds,cdm/src/main/java/ucar/unidata/util/Format.java,Format.tab,"public static void tab(StringBuffer sbuff, int...",java,"public static void tab(StringBuffer sbuff, int...","[public, static, void, tab, (, StringBuffer, s...","Blank fill sbuff with blanks, until position t...","[Blank, fill, sbuff, with, blanks, until, posi...",d2d68f9eee87f345625211324d71d5dc3e162ee1,https://github.com/Unidata/thredds/blob/d2d68f...,test,Sets the length of this file. Returns the curr...,MethodDeclaration Modifier public static tab F...,MethodDeclaration Modifier public static tab F...
3258,orientechnologies/orientdb,core/src/main/java/com/orientechnologies/orien...,OBinarySerializerFactory.getObjectSerializer,"@SuppressWarnings(""unchecked"")\n public <T> O...",java,"@SuppressWarnings(""unchecked"")\n public <T> O...","[@, SuppressWarnings, (, ""unchecked"", ), publi...",Obtain OBinarySerializer realization for the O...,"[Obtain, OBinarySerializer, realization, for, ...",d970b12033f0462f0239ea0ad8ed41207e6e26f1,https://github.com/orientechnologies/orientdb/...,test,Adds a new DropTargetListener (UNICAST SOURCE).,MethodDeclaration Modifier public Annotation S...,MethodDeclaration Modifier public Annotation S...
10885,intellimate/IzouSDK,src/main/java/org/intellimate/izou/sdk/framewo...,TrackInfo.export,"public HashMap<String, Object> export() {\n ...",java,"public HashMap<String, Object> export() {\n ...","[public, HashMap, <, String, ,, Object, >, exp...",exports the TrackInfo to a Hashmap\n@return a ...,"[exports, the, TrackInfo, to, a, Hashmap]",bc8705ad48a6ca12a722f2b787be435949fa5d08,https://github.com/intellimate/IzouSDK/blob/bc...,test,Called by the context acceptor to process a to...,MethodDeclaration Modifier public ReferenceTyp...,MethodDeclaration Modifier public ReferenceTyp...
2928,orientechnologies/orientdb,object/src/main/java/com/orientechnologies/ori...,ODocumentFieldHandlingStrategyFactory.create,public ODocumentFieldHandlingStrategy create(i...,java,public ODocumentFieldHandlingStrategy create(i...,"[public, ODocumentFieldHandlingStrategy, creat...",Creates a new instance of the requested strate...,"[Creates, a, new, instance, of, the, requested...",d970b12033f0462f0239ea0ad8ed41207e6e26f1,https://github.com/orientechnologies/orientdb/...,test,Adds a new DropTargetListener (UNICAST SOURCE)...,MethodDeclaration Modifier public ReferenceTyp...,MethodDeclaration Modifier public ReferenceTyp...
2371,streamsets/datacollector,cluster-bootstrap-api/src/main/java/com/stream...,BootstrapClusterStreaming.main,public static void main(String[] args) throws ...,java,public static void main(String[] args) throws ...,"[public, static, void, main, (, String, [, ], ...",Bootstrapping the Driver which starts a Spark ...,"[Bootstrapping, the, Driver, which, starts, a,...",ea63245ea14d59d5229248387f0628f46131eae5,https://github.com/streamsets/datacollector/bl...,test,Awaits termination of the group. Determines th...,MethodDeclaration Modifier public static main ...,MethodDeclaration Modifier public static main ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6694,Axway/iron,iron-core/src/main/java/io/axway/iron/core/int...,StorePersistence.loadStores,Optional<BigInteger> loadStores(Function<Strin...,java,Optional<BigInteger> loadStores(Function<Strin...,"[Optional, <, BigInteger, >, loadStores, (, Fu...",Load the stores.\n\n@param entityStoresByStore...,"[Load, the, stores, .]",387ec6d43d2353f857d510dd29f93a06a8db9069,https://github.com/Axway/iron/blob/387ec6d43d2...,test,Adds component-listener-a with component-liste...,MethodDeclaration ReferenceType Optional TypeA...,MethodDeclaration ReferenceType Optional TypeA...
7549,lessthanoptimal/GeoRegression,main/src/georegression/geometry/UtilLine2D_F64...,UtilLine2D_F64.convert,public static LinePolar2D_F64 convert( LineGen...,java,public static LinePolar2D_F64 convert( LineGen...,"[public, static, LinePolar2D_F64, convert, (, ...",Converts a line from general to polar.\n\n@par...,"[Converts, a, line, from, general, to, polar, .]",c067cd44a4e95bd826119f6e658bd82386ef289f,https://github.com/lessthanoptimal/GeoRegressi...,test,Returns the correctly rounded positive square ...,MethodDeclaration Modifier public static Refer...,MethodDeclaration Modifier public static Refer...
10820,intellimate/IzouSDK,src/main/java/org/intellimate/izou/sdk/framewo...,PresenceConstant.setPresence,public void setPresence(boolean present) {\n ...,java,public void setPresence(boolean present) {\n ...,"[public, void, setPresence, (, boolean, presen...",sets the presence\n@param present true for pre...,"[sets, the, presence]",bc8705ad48a6ca12a722f2b787be435949fa5d08,https://github.com/intellimate/IzouSDK/blob/bc...,test,,MethodDeclaration Modifier public setPresence ...,MethodDeclaration Modifier public setPresence ...
9249,jboss/jboss-common-core,src/main/java/org/jboss/util/property/Property...,Property.set,"public static String set(String name, String v...",java,"public static String set(String name, String v...","[public, static, String, set, (, String, name,...",Set a property\n\n@param name Property name...,"[Set, a, property]",c191bb18db83fa106c6913cf34ed47989b65dc97,https://github.com/jboss/jboss-common-core/blo...,test,Sets the system property indicated by the spec...,MethodDeclaration Modifier public static Refer...,MethodDeclaration Modifier public static Refer...


In [15]:
valid_data.to_json(path_or_buf='/data/code/represent-code-in-human/data/code-summarization-enhanced-full/valid.jsonl',
                     orient='records', lines=True)
test_data.to_json(path_or_buf='/data/code/represent-code-in-human/data/code-summarization-enhanced-full/test.jsonl',
                     orient='records', lines=True)

In [None]:
train_description, train_ast = get_ast_and_description(train_data)
train_data['des'] = train_description
train_data['ast'] = train_ast
train_data['ast_des'] = train_data['ast'] + ' ' + train_data['des']
train_data.to_json(path_or_buf='/data/code/represent-code-in-human/data/code-summarization-enhanced-full/train.jsonl',
                     orient='records', lines=True)

multiprocessing test

In [37]:
from multiprocessing import Process, cpu_count, Manager, Pool 
import os

In [75]:
def multi_get_ast_and_des(l, i):
    sequence = []
    api_sequence = []    
    get_sequence(parse_program(train_data['code'].iloc[i]), sequence, api_sequence)
    ast = ' '.join(sequence)
    api_sequence = list(set(api_sequence)) 
    des = ' '.join(api_match(api_sequence, java_api)) 
    d = {'ast': ast, 'des': des, 'i': i}
    l.append(d)
    # print('i', i, 'ast', ast, 'des', des)



In [76]:
manager = Manager()
data_size = len(train_data)
l = manager.list()
p = Pool(processes=20)
for i in range(data_size):
    p.apply_async(multi_get_ast_and_des, (l, i))
p.close()
p.join()

In [79]:
train_ast = []
train_des = []
train_i = []
for d in l[:]:
    train_ast.append(d['ast'])
    train_des.append(d['des'])
    train_i.append(d['i'])

In [82]:
len(train_i)

164814

In [83]:
train_dict = {'ast': train_ast, 'des': train_des, 'i': train_i}
train_df = pd.DataFrame.from_dict(train_dict)

In [84]:
train_df

Unnamed: 0,ast,des,i
0,MethodDeclaration Modifier public static Basic...,,5
1,MethodDeclaration Modifier public static Basic...,Compares two boolean values.,0
2,MethodDeclaration Modifier public static TypeP...,,9
3,MethodDeclaration Modifier private BasicType i...,,12
4,MethodDeclaration Modifier public Annotation O...,Removes all resource bundles from the cache th...,8
...,...,...,...
164809,MethodDeclaration Modifier public ReferenceTyp...,The InputStream object that will be returned b...,163869
164810,MethodDeclaration Modifier public final BasicT...,,164325
164811,MethodDeclaration Modifier private addNode For...,Retrieves the value of the designated JDBC CHA...,164007
164812,MethodDeclaration Modifier public update Forma...,Returns the raw keycode of this MenuShortcut. ...,164201


In [87]:
train_df = train_df.sort_values(by=['i']).reset_index(drop=True)
train_df

Unnamed: 0,ast,des,i
0,MethodDeclaration Modifier public static Basic...,Compares two boolean values.,0
1,MethodDeclaration Modifier public static Refer...,Adds a new DropTargetListener (UNICAST SOURCE)...,1
2,MethodDeclaration ReferenceType AcquireResult ...,Adds a new DropTargetListener (UNICAST SOURCE)...,2
3,MethodDeclaration Modifier public Annotation S...,Polls this queue to see if a reference object ...,3
4,MethodDeclaration Modifier public static asser...,Returns a list of MemoryManagerMXBean objects ...,4
...,...,...,...
164809,MethodDeclaration Modifier protected setUserJo...,Adds a new DropTargetListener (UNICAST SOURCE)...,164809
164810,MethodDeclaration Modifier private trimZeros L...,,164810
164811,MethodDeclaration Modifier static ReferenceTyp...,Returns an empty list (immutable). Provides th...,164811
164812,MethodDeclaration Modifier private postDelete ...,,164812


In [97]:
train_data['ast'] = train_df['ast'].to_list()
train_data['des'] = train_df['des'].to_list()
train_data['ast_des'] = train_data['ast'] + ' ' + train_data['des']
train_data

Unnamed: 0,repo,path,func_name,original_string,language,code,code_tokens,docstring,docstring_tokens,sha,url,partition,ast,des,ast_des
44793,b3dgs/lionengine,lionengine-core/src/main/java/com/b3dgs/lionen...,UtilMath.getDistance,"public static double getDistance(double x1, do...",java,"public static double getDistance(double x1, do...","[public, static, double, getDistance, (, doubl...",Get distance from point to area.\n\n@param x1 ...,"[Get, distance, from, point, to, area, .]",cac3d5578532cf11724a737b9f09e71bf9995ab2,https://github.com/b3dgs/lionengine/blob/cac3d...,train,MethodDeclaration Modifier public static Basic...,Compares two boolean values.,MethodDeclaration Modifier public static Basic...
114654,nguyenq/tess4j,src/main/java/net/sourceforge/tess4j/util/Util...,Utils.getConstantName,public static String getConstantName(Object va...,java,public static String getConstantName(Object va...,"[public, static, String, getConstantName, (, O...",Gets user-friendly name of the public static f...,"[Gets, user, -, friendly, name, of, the, publi...",cfcd4a8a44042f150b4aaf7bdf5ffc485a2236e1,https://github.com/nguyenq/tess4j/blob/cfcd4a8...,train,MethodDeclaration Modifier public static Refer...,Adds a new DropTargetListener (UNICAST SOURCE)...,MethodDeclaration Modifier public static Refer...
50192,hazelcast/hazelcast,hazelcast/src/main/java/com/hazelcast/cp/inter...,RaftSemaphore.acquire,AcquireResult acquire(AcquireInvocationKey key...,java,AcquireResult acquire(AcquireInvocationKey key...,"[AcquireResult, acquire, (, AcquireInvocationK...","Assigns permits to the endpoint, if sufficient...","[Assigns, permits, to, the, endpoint, if, suff...",8c4bc10515dbbfb41a33e0302c0caedf3cda1baf,https://github.com/hazelcast/hazelcast/blob/8c...,train,MethodDeclaration ReferenceType AcquireResult ...,Adds a new DropTargetListener (UNICAST SOURCE)...,MethodDeclaration ReferenceType AcquireResult ...
63791,Netflix/conductor,core/src/main/java/com/netflix/conductor/servi...,TaskServiceImpl.batchPoll,@Service\n public List<Task> batchPoll(Stri...,java,@Service\n public List<Task> batchPoll(Stri...,"[@, Service, public, List, <, Task, >, batchPo...",Batch Poll for a task of a certain type.\n\n@p...,"[Batch, Poll, for, a, task, of, a, certain, ty...",78fae0ed9ddea22891f9eebb96a2ec0b2783dca0,https://github.com/Netflix/conductor/blob/78fa...,train,MethodDeclaration Modifier public Annotation S...,Polls this queue to see if a reference object ...,MethodDeclaration Modifier public Annotation S...
61049,deeplearning4j/deeplearning4j,nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/sr...,WorkspaceUtils.assertNoWorkspacesOpen,public static void assertNoWorkspacesOpen(Stri...,java,public static void assertNoWorkspacesOpen(Stri...,"[public, static, void, assertNoWorkspacesOpen,...",Assert that no workspaces are currently open\n...,"[Assert, that, no, workspaces, are, currently,...",effce52f2afd7eeb53c5bcca699fcd90bd06822f,https://github.com/deeplearning4j/deeplearning...,train,MethodDeclaration Modifier public static asser...,Returns a list of MemoryManagerMXBean objects ...,MethodDeclaration Modifier public static asser...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75158,facebookarchive/hadoop-20,src/contrib/hive-streaming/src/java/org/apache...,StreamJob.setUserJobConfProps,protected void setUserJobConfProps(boolean doE...,java,protected void setUserJobConfProps(boolean doE...,"[protected, void, setUserJobConfProps, (, bool...",This method sets the user jobconf variable spe...,"[This, method, sets, the, user, jobconf, varia...",2a29bc6ecf30edb1ad8dbde32aa49a317b4d44f4,https://github.com/facebookarchive/hadoop-20/b...,train,MethodDeclaration Modifier protected setUserJo...,Adds a new DropTargetListener (UNICAST SOURCE)...,MethodDeclaration Modifier protected setUserJo...
59972,apache/incubator-druid,extendedset/src/main/java/org/apache/druid/ext...,ConciseSet.trimZeros,private void trimZeros()\r\n {\r\n // loop...,java,private void trimZeros()\r\n {\r\n // loop...,"[private, void, trimZeros, (, ), {, // loop ov...",Removes trailing zeros,"[Removes, trailing, zeros]",f776b9408962b9006cfcfe4d6c1794751972cc8e,https://github.com/apache/incubator-druid/blob...,train,MethodDeclaration Modifier private trimZeros L...,,MethodDeclaration Modifier private trimZeros L...
156705,classgraph/classgraph,src/main/java/io/github/classgraph/ClassTypeSi...,ClassTypeSignature.parse,static ClassTypeSignature parse(final String t...,java,static ClassTypeSignature parse(final String t...,"[static, ClassTypeSignature, parse, (, final, ...",Parse a class type signature or class type des...,"[Parse, a, class, type, signature, or, class, ...",c8c8b2ca1eb76339f69193fdac33d735c864215c,https://github.com/classgraph/classgraph/blob/...,train,MethodDeclaration Modifier static ReferenceTyp...,Returns an empty list (immutable). Provides th...,MethodDeclaration Modifier static ReferenceTyp...
15337,exoplatform/jcr,exo.jcr.ext.services/src/main/java/org/exoplat...,GroupHandlerImpl.postDelete,private void postDelete(Group group) throws Ex...,java,private void postDelete(Group group) throws Ex...,"[private, void, postDelete, (, Group, group, )...",Notifying listeners after group deletion.\n\n@...,"[Notifying, listeners, after, group, deletion, .]",3e7f9ee1b5683640d73a4316fb4b0ad5eac5b8a2,https://github.com/exoplatform/jcr/blob/3e7f9e...,train,MethodDeclaration Modifier private postDelete ...,,MethodDeclaration Modifier private postDelete ...


In [103]:
train_data_backup = train_data * 1
train_data["ast"] =  train_data.ast.str.replace('[^\x00-\x7F]','')
train_data["des"] =  train_data.des.str.replace('[^\x00-\x7F]','')
train_data["ast_des"] =  train_data.ast_des.str.replace('[^\x00-\x7F]','')

In [104]:
train_data.to_csv(path_or_buf='/data/code/represent-code-in-human/data/code-summarization-enhanced-full/train.csv',
                    header=0, index=False)

In [105]:
train_data.to_json(path_or_buf='/data/code/represent-code-in-human/data/code-summarization-enhanced-full/train.jsonl',
                     orient='records', lines=True)

Unnamed: 0,repo,path,func_name,original_string,language,code,code_tokens,docstring,docstring_tokens,sha,url,partition,ast,des,ast_des
44793,b3dgs/lionengine,lionengine-core/src/main/java/com/b3dgs/lionen...,UtilMath.getDistance,"public static double getDistance(double x1, do...",java,"public static double getDistance(double x1, do...","[public, static, double, getDistance, (, doubl...",Get distance from point to area.\n\n@param x1 ...,"[Get, distance, from, point, to, area, .]",cac3d5578532cf11724a737b9f09e71bf9995ab2,https://github.com/b3dgs/lionengine/blob/cac3d...,train,MethodDeclaration Modifier public static Basic...,Compares two boolean values.,MethodDeclaration Modifier public static Basic...
114654,nguyenq/tess4j,src/main/java/net/sourceforge/tess4j/util/Util...,Utils.getConstantName,public static String getConstantName(Object va...,java,public static String getConstantName(Object va...,"[public, static, String, getConstantName, (, O...",Gets user-friendly name of the public static f...,"[Gets, user, -, friendly, name, of, the, publi...",cfcd4a8a44042f150b4aaf7bdf5ffc485a2236e1,https://github.com/nguyenq/tess4j/blob/cfcd4a8...,train,MethodDeclaration Modifier public static Refer...,Adds a new DropTargetListener (UNICAST SOURCE)...,MethodDeclaration Modifier public static Refer...
50192,hazelcast/hazelcast,hazelcast/src/main/java/com/hazelcast/cp/inter...,RaftSemaphore.acquire,AcquireResult acquire(AcquireInvocationKey key...,java,AcquireResult acquire(AcquireInvocationKey key...,"[AcquireResult, acquire, (, AcquireInvocationK...","Assigns permits to the endpoint, if sufficient...","[Assigns, permits, to, the, endpoint, if, suff...",8c4bc10515dbbfb41a33e0302c0caedf3cda1baf,https://github.com/hazelcast/hazelcast/blob/8c...,train,MethodDeclaration ReferenceType AcquireResult ...,Adds a new DropTargetListener (UNICAST SOURCE)...,MethodDeclaration ReferenceType AcquireResult ...
63791,Netflix/conductor,core/src/main/java/com/netflix/conductor/servi...,TaskServiceImpl.batchPoll,@Service\n public List<Task> batchPoll(Stri...,java,@Service\n public List<Task> batchPoll(Stri...,"[@, Service, public, List, <, Task, >, batchPo...",Batch Poll for a task of a certain type.\n\n@p...,"[Batch, Poll, for, a, task, of, a, certain, ty...",78fae0ed9ddea22891f9eebb96a2ec0b2783dca0,https://github.com/Netflix/conductor/blob/78fa...,train,MethodDeclaration Modifier public Annotation S...,Polls this queue to see if a reference object ...,MethodDeclaration Modifier public Annotation S...
61049,deeplearning4j/deeplearning4j,nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/sr...,WorkspaceUtils.assertNoWorkspacesOpen,public static void assertNoWorkspacesOpen(Stri...,java,public static void assertNoWorkspacesOpen(Stri...,"[public, static, void, assertNoWorkspacesOpen,...",Assert that no workspaces are currently open\n...,"[Assert, that, no, workspaces, are, currently,...",effce52f2afd7eeb53c5bcca699fcd90bd06822f,https://github.com/deeplearning4j/deeplearning...,train,MethodDeclaration Modifier public static asser...,Returns a list of MemoryManagerMXBean objects ...,MethodDeclaration Modifier public static asser...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75158,facebookarchive/hadoop-20,src/contrib/hive-streaming/src/java/org/apache...,StreamJob.setUserJobConfProps,protected void setUserJobConfProps(boolean doE...,java,protected void setUserJobConfProps(boolean doE...,"[protected, void, setUserJobConfProps, (, bool...",This method sets the user jobconf variable spe...,"[This, method, sets, the, user, jobconf, varia...",2a29bc6ecf30edb1ad8dbde32aa49a317b4d44f4,https://github.com/facebookarchive/hadoop-20/b...,train,MethodDeclaration Modifier protected setUserJo...,Adds a new DropTargetListener (UNICAST SOURCE)...,MethodDeclaration Modifier protected setUserJo...
59972,apache/incubator-druid,extendedset/src/main/java/org/apache/druid/ext...,ConciseSet.trimZeros,private void trimZeros()\r\n {\r\n // loop...,java,private void trimZeros()\r\n {\r\n // loop...,"[private, void, trimZeros, (, ), {, // loop ov...",Removes trailing zeros,"[Removes, trailing, zeros]",f776b9408962b9006cfcfe4d6c1794751972cc8e,https://github.com/apache/incubator-druid/blob...,train,MethodDeclaration Modifier private trimZeros L...,,MethodDeclaration Modifier private trimZeros L...
156705,classgraph/classgraph,src/main/java/io/github/classgraph/ClassTypeSi...,ClassTypeSignature.parse,static ClassTypeSignature parse(final String t...,java,static ClassTypeSignature parse(final String t...,"[static, ClassTypeSignature, parse, (, final, ...",Parse a class type signature or class type des...,"[Parse, a, class, type, signature, or, class, ...",c8c8b2ca1eb76339f69193fdac33d735c864215c,https://github.com/classgraph/classgraph/blob/...,train,MethodDeclaration Modifier static ReferenceTyp...,Returns an empty list (immutable). Provides th...,MethodDeclaration Modifier static ReferenceTyp...
15337,exoplatform/jcr,exo.jcr.ext.services/src/main/java/org/exoplat...,GroupHandlerImpl.postDelete,private void postDelete(Group group) throws Ex...,java,private void postDelete(Group group) throws Ex...,"[private, void, postDelete, (, Group, group, )...",Notifying listeners after group deletion.\n\n@...,"[Notifying, listeners, after, group, deletion, .]",3e7f9ee1b5683640d73a4316fb4b0ad5eac5b8a2,https://github.com/exoplatform/jcr/blob/3e7f9e...,train,MethodDeclaration Modifier private postDelete ...,,MethodDeclaration Modifier private postDelete ...


In [19]:
data_test = pd.read_json(path_or_buf='/data/code/represent-code-in-human/data/code-summarization-enhanced-full/test.jsonl', lines=True)
data_test

Unnamed: 0,repo,path,func_name,original_string,language,code,code_tokens,docstring,docstring_tokens,sha,url,partition,des,ast,ast_des
0,Unidata/thredds,cdm/src/main/java/ucar/unidata/util/Format.java,Format.tab,"public static void tab(StringBuffer sbuff, int...",java,"public static void tab(StringBuffer sbuff, int...","[public, static, void, tab, (, StringBuffer, s...","Blank fill sbuff with blanks, until position t...","[Blank, fill, sbuff, with, blanks, until, posi...",d2d68f9eee87f345625211324d71d5dc3e162ee1,https://github.com/Unidata/thredds/blob/d2d68f...,test,Sets the length of this file. Returns the curr...,MethodDeclaration Modifier public static tab F...,MethodDeclaration Modifier public static tab F...
1,orientechnologies/orientdb,core/src/main/java/com/orientechnologies/orien...,OBinarySerializerFactory.getObjectSerializer,"@SuppressWarnings(""unchecked"")\n public <T> O...",java,"@SuppressWarnings(""unchecked"")\n public <T> O...","[@, SuppressWarnings, (, ""unchecked"", ), publi...",Obtain OBinarySerializer realization for the O...,"[Obtain, OBinarySerializer, realization, for, ...",d970b12033f0462f0239ea0ad8ed41207e6e26f1,https://github.com/orientechnologies/orientdb/...,test,Adds a new DropTargetListener (UNICAST SOURCE).,MethodDeclaration Modifier public Annotation S...,MethodDeclaration Modifier public Annotation S...
2,intellimate/IzouSDK,src/main/java/org/intellimate/izou/sdk/framewo...,TrackInfo.export,"public HashMap<String, Object> export() {\n ...",java,"public HashMap<String, Object> export() {\n ...","[public, HashMap, <, String, ,, Object, >, exp...",exports the TrackInfo to a Hashmap\n@return a ...,"[exports, the, TrackInfo, to, a, Hashmap]",bc8705ad48a6ca12a722f2b787be435949fa5d08,https://github.com/intellimate/IzouSDK/blob/bc...,test,Called by the context acceptor to process a to...,MethodDeclaration Modifier public ReferenceTyp...,MethodDeclaration Modifier public ReferenceTyp...
3,orientechnologies/orientdb,object/src/main/java/com/orientechnologies/ori...,ODocumentFieldHandlingStrategyFactory.create,public ODocumentFieldHandlingStrategy create(i...,java,public ODocumentFieldHandlingStrategy create(i...,"[public, ODocumentFieldHandlingStrategy, creat...",Creates a new instance of the requested strate...,"[Creates, a, new, instance, of, the, requested...",d970b12033f0462f0239ea0ad8ed41207e6e26f1,https://github.com/orientechnologies/orientdb/...,test,Adds a new DropTargetListener (UNICAST SOURCE)...,MethodDeclaration Modifier public ReferenceTyp...,MethodDeclaration Modifier public ReferenceTyp...
4,streamsets/datacollector,cluster-bootstrap-api/src/main/java/com/stream...,BootstrapClusterStreaming.main,public static void main(String[] args) throws ...,java,public static void main(String[] args) throws ...,"[public, static, void, main, (, String, [, ], ...",Bootstrapping the Driver which starts a Spark ...,"[Bootstrapping, the, Driver, which, starts, a,...",ea63245ea14d59d5229248387f0628f46131eae5,https://github.com/streamsets/datacollector/bl...,test,Awaits termination of the group. Determines th...,MethodDeclaration Modifier public static main ...,MethodDeclaration Modifier public static main ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10947,Axway/iron,iron-core/src/main/java/io/axway/iron/core/int...,StorePersistence.loadStores,Optional<BigInteger> loadStores(Function<Strin...,java,Optional<BigInteger> loadStores(Function<Strin...,"[Optional, <, BigInteger, >, loadStores, (, Fu...",Load the stores.\n\n@param entityStoresByStore...,"[Load, the, stores, .]",387ec6d43d2353f857d510dd29f93a06a8db9069,https://github.com/Axway/iron/blob/387ec6d43d2...,test,Adds component-listener-a with component-liste...,MethodDeclaration ReferenceType Optional TypeA...,MethodDeclaration ReferenceType Optional TypeA...
10948,lessthanoptimal/GeoRegression,main/src/georegression/geometry/UtilLine2D_F64...,UtilLine2D_F64.convert,public static LinePolar2D_F64 convert( LineGen...,java,public static LinePolar2D_F64 convert( LineGen...,"[public, static, LinePolar2D_F64, convert, (, ...",Converts a line from general to polar.\n\n@par...,"[Converts, a, line, from, general, to, polar, .]",c067cd44a4e95bd826119f6e658bd82386ef289f,https://github.com/lessthanoptimal/GeoRegressi...,test,Returns the correctly rounded positive square ...,MethodDeclaration Modifier public static Refer...,MethodDeclaration Modifier public static Refer...
10949,intellimate/IzouSDK,src/main/java/org/intellimate/izou/sdk/framewo...,PresenceConstant.setPresence,public void setPresence(boolean present) {\n ...,java,public void setPresence(boolean present) {\n ...,"[public, void, setPresence, (, boolean, presen...",sets the presence\n@param present true for pre...,"[sets, the, presence]",bc8705ad48a6ca12a722f2b787be435949fa5d08,https://github.com/intellimate/IzouSDK/blob/bc...,test,,MethodDeclaration Modifier public setPresence ...,MethodDeclaration Modifier public setPresence ...
10950,jboss/jboss-common-core,src/main/java/org/jboss/util/property/Property...,Property.set,"public static String set(String name, String v...",java,"public static String set(String name, String v...","[public, static, String, set, (, String, name,...",Set a property\n\n@param name Property name...,"[Set, a, property]",c191bb18db83fa106c6913cf34ed47989b65dc97,https://github.com/jboss/jboss-common-core/blo...,test,Sets the system property indicated by the spec...,MethodDeclaration Modifier public static Refer...,MethodDeclaration Modifier public static Refer...


In [95]:
train_data['code'][0: 5]

44793     public static double getDistance(double x1, do...
114654    public static String getConstantName(Object va...
50192     AcquireResult acquire(AcquireInvocationKey key...
63791     @Service\n    public List<Task> batchPoll(Stri...
61049     public static void assertNoWorkspacesOpen(Stri...
Name: code, dtype: object

In [98]:
train_data['ast_des'][0: 5]

44793     MethodDeclaration Modifier public static Basic...
114654    MethodDeclaration Modifier public static Refer...
50192     MethodDeclaration ReferenceType AcquireResult ...
63791     MethodDeclaration Modifier public Annotation S...
61049     MethodDeclaration Modifier public static asser...
Name: ast_des, dtype: object

In [100]:
train_df['ast'][1]

'MethodDeclaration Modifier public static ReferenceType String getConstantName FormalParameter ReferenceType Object value FormalParameter ReferenceType Class c ForStatement EnhancedForControl VariableDeclaration ReferenceType Field VariableDeclarator f MethodInvocation c getDeclaredFields BlockStatement LocalVariableDeclaration BasicType int VariableDeclarator mod MethodInvocation f getModifiers IfStatement BinaryOperation && BinaryOperation && MethodInvocation Modifier MemberReference mod isStatic MethodInvocation Modifier MemberReference mod isPublic MethodInvocation Modifier MemberReference mod isFinal BlockStatement TryStatement IfStatement MethodInvocation f MethodInvocation MemberReference value equals Literal null get BlockStatement ReturnStatement MethodInvocation f getName CatchClause CatchClauseParameter IllegalAccessException e ReturnStatement MethodInvocation String MemberReference value valueOf ReturnStatement MethodInvocation String MemberReference value valueOf'