# Preparation of parsed function, docstring pairs from standard libraries

In [116]:
import pandas as pd
from comment_parser import comment_parser as parse

In [117]:
py_path = 'D:\PROJECT\data\out.json'
java_path = 'D:\PROJECT\scraper_tmp\out_java.json'

In [118]:
py_pairs = pd.DataFrame()
java_pairs = pd.DataFrame()

In [119]:
py_df = pd.read_json(py_path)
java_df = pd.read_json(java_path)

In [120]:
def extract(functions : pd.Series, type : str) -> pd.Series:
    return functions.map(lambda x : "\n".join([x.text() for x in parse.extract_comments_from_str(x, type) if len(x.text()) > 0]))

In [121]:
py_pairs['code'] = py_df.identifier
java_pairs['code'] = java_df.identifier

In [122]:
py_pairs['code'] = py_pairs['code'].map(lambda x : x.split(".")[-1] if "." in x else x)
java_pairs['code'] = java_pairs['code'].map(lambda x : x.split(".")[-1] if "." in x else x)

In [123]:
py_pairs['docstring'] = extract(py_df.function, 'text/x-python')
java_pairs['docstring'] = extract(java_df.function, 'text/x-java')

## Cleaning

In [124]:
def stopPhrase(x):
    remove = ["supplementary characters (LANG1300)", "not found", "NOI18N", "\n\n"]
    for phrase in remove:
        x = x.replace(phrase, "")
    return x if x != "\n" else ""

In [125]:
py_pairs['docstring'] = py_pairs['docstring'].apply(lambda x : stopPhrase(x))
java_pairs['docstring'] = java_pairs['docstring'].apply(lambda x : stopPhrase(x))

In [126]:
py_clean = py_pairs[py_pairs.docstring != ""].reset_index(drop=True)
java_clean = java_pairs[java_pairs.docstring != ""].reset_index(drop=True)

In [127]:
java_clean

Unnamed: 0,code,docstring
0,getDefaultDebugClassPath,
1,correctDebugClassPath,"should not happen\n ""invalid"" strings\n \n \n..."
2,getErrorColor,inspired by org.openide.WizardDescriptor\n
3,logUsage,\n
4,getRootURL,\n
...,...,...
111522,removeCommonFrames,Remove the frame from the cause trace if it i...
111523,rethrow,claim that the typeErasure invocation throws ...
111524,indexOf,if (cs instanceof String && searchChar...
111525,lastIndexOf,\nNOTE - we must do a forward traversal for th...


In [128]:
import os
augment_path = "D:\PROJECT\Augments"
java = "java.jsonl"
py = "python.jsonl"

java_clean.dropna().to_json(os.path.join(augment_path, java), orient="records", lines=True)
py_clean.dropna().to_json(os.path.join(augment_path, py), orient="records", lines=True)