In [None]:
# default_exp data.management.transforms

# Data Management Transforms

> This module contains all of the transforms that can be applied to code-related data.

In [None]:
# export
import pandas as pd

from fast_trees.core import FastParser
from typing import Dict, Callable, List, Optional

In [None]:
# hide
from nbdev.showdoc import *

In [None]:
# hide
from ds4se.data.core import get_dataframes, get_datasets
from pathlib import Path

# NOTE(review): machine-specific absolute path; this cell only runs where the
# dataset is available at this location.
path = Path("/home/nathan/Downloads/")
df_trn, df_val, df_tst = get_dataframes(path / "java/final/jsonl")

# Truncate every split to its first n rows.
n = 100
df_trn, df_val, df_tst = (split.head(n) for split in (df_trn, df_val, df_tst))
df_trn.head()

Unnamed: 0,code,docstring
0,protected final void bindIndexed(Configuration...,Bind indexed elements to the supplied collecti...
1,public void setServletRegistrationBeans(\n\t\t...,Set {@link ServletRegistrationBean}s that the ...
2,public void addServletRegistrationBeans(\n\t\t...,Add {@link ServletRegistrationBean}s for the f...
3,public void setServletNames(Collection<String>...,Set servlet names that the filter will be regi...
4,public void addServletNames(String... servletN...,Add servlet names for the filter.\n@param serv...


In [None]:
# export
def transform_df(
    df: pd.DataFrame, transform_fn: Callable, col: str, n: Optional[int] = None
) -> pd.DataFrame:
    """
    Apply a transformation to one column of (a leading slice of) a pandas dataframe.

    The dataframe passed in is not modified; the work is done on a copy.

    :param df: the dataframe holding the examples to transform
    :param transform_fn: the callable applied to every value found in ``col``
    :param col: the column whose values are transformed
    :param n: the number of leading rows to process. If none, every row is processed
    :returns: returns a copy of the selected rows with ``col`` replaced by the transformed values
    """
    # Resolve the row limit up front so the slice below is always explicit.
    limit = len(df) if n is None else n

    # Copy the slice so the column assignment never writes into the caller's frame.
    result = df.iloc[:limit].copy()
    result[col] = result[col].apply(transform_fn)

    return result

In [None]:
# export
def filter_df(
    df: pd.DataFrame, filter_fn: Callable, col: str, n: Optional[int] = None
) -> pd.DataFrame:
    """
    Filter the rows of the given pandas dataframe using the given predicate.

    :param df: the dataframe containing each example to be filtered
    :param filter_fn: the predicate applied to each value in ``col``; rows for which it returns True are kept
    :param col: the column to get the examples from
    :param n: the number of examples to evaluate. If none, the entire dataframe will be used
    :returns: returns a new dataframe containing only the rows that passed the filter
    """
    if n is None:
        n = len(df)

    df = df.iloc[:n].copy()
    mask = df[col].apply(filter_fn)

    # An empty mask has no boolean dtype, so pandas would treat it as an empty
    # list of column labels and return a frame with every column dropped.
    # There is nothing to filter in that case: hand back the empty slice with
    # its columns intact.
    if mask.empty:
        return df

    return df[mask]

# Semantic Preserving
Semantic preserving transformations do not change the functionality or meaning of a piece of data, e.g., comment removal.

## Source Code

In [None]:
# export
def is_ascii(mthd: str) -> bool:
    """
    Check if the given method contains only ASCII characters. Based on https://stackoverflow.com/a/27084708/5768407.

    :param mthd: the method to verify contains only ASCII characters
    :returns: returns a boolean representing whether or not the given method contains only ASCII characters
    """
    try:
        # Encode straight to ASCII instead of round-tripping through UTF-8:
        # any non-ASCII code point, including a lone surrogate that cannot be
        # encoded as UTF-8 at all, raises UnicodeEncodeError here.
        mthd.encode("ascii")
    except UnicodeEncodeError:
        return False

    return True

In [None]:
# tst
# Fixture: one pure-ASCII row and one row containing non-ASCII characters.
df_fake = pd.DataFrame(["this is a test", "भारत test"], columns=["code"])

# Expected result after filtering: only the ASCII row remains (despite its
# name, this frame holds the row that IS ASCII).
NON_ASCII_DF = pd.DataFrame(["this is a test"], columns=["code"])
df_non_ascii = filter_df(df_fake, is_ascii, "code")

# Element-wise comparison, reduced over columns and then over the result.
assert (NON_ASCII_DF == df_non_ascii).all().all()

In [None]:
# export
def remove_comments(parser: FastParser, example: str) -> str:
    """
    Remove the inline comments that the given parser finds in the given example.

    Every occurrence of each reported comment string is deleted from the
    example, so surrounding whitespace is left in place and text identical to a
    comment is removed wherever it appears.

    :param parser: the parser whose ``get_method_inline_comments`` is used to locate the comments
    :param example: the source code to remove the comments from
    :returns: returns the example with each reported comment string removed
    """
    stripped = example
    # The comments are collected once from the untouched input; removal is then
    # applied cumulatively to the working copy.
    for comment in parser.get_method_inline_comments(example):
        stripped = stripped.replace(comment, "")

    return stripped

In [None]:
# tst
# Fixture: a Java method containing a single-line comment and a block comment.
df_fake = pd.DataFrame(
    [
        """public static void main(String[] args) {
    // inline comment
    System.out.println("Hello, world!")
    /**
        multi-line inline comment
    */
}"""
    ],
    columns=["code"],
)

# Expected result: the comment text is gone, but the indentation that preceded
# each comment (and the line breaks around it) is still present.
NO_COMMENTS_DF = pd.DataFrame(
    [
        """public static void main(String[] args) {
    
    System.out.println("Hello, world!")
    
}"""
    ],
    columns=["code"],
)

# Bind the parser through a lambda so the transform receives a one-argument callable.
parser = FastParser("java")
df_no_comments = transform_df(
    df_fake, lambda example: remove_comments(parser, example), "code"
)

# Element-wise comparison, reduced over columns and then over the result.
assert (NO_COMMENTS_DF == df_no_comments).all().all()

Repo already exists, continuing.


# Non-Semantic Preserving

## Source Code

# Pipeline
Composes multiple transformations to apply to a set of data.