In [None]:
# default_exp data.management.transforms

# Data Management Transforms

> This module contains all of the transforms that can be applied to code-related data.

In [None]:
# export
import pandas as pd

from fast_trees.core import FastParser
from random import shuffle
from typing import Callable, Optional

In [None]:
# export
def filter_df(
    df: pd.DataFrame, filter_fn: Callable, col: str, n: Optional[int] = None
) -> pd.DataFrame:
    """
    Filter the rows of the given pandas dataframe using the given predicate.

    The given dataframe is not modified; a new dataframe is returned.

    :param df: the dataframe containing each example to be filtered
    :param filter_fn: the filter function that will be applied to each value in ``col``; rows for which it returns a truthy value are kept
    :param col: the column to get the examples from
    :param n: the number of leading rows to evaluate. If none, the entire dataframe will be used
    :returns: returns a new dataframe containing only the evaluated rows that passed the filter
    """
    if n is None:
        n = len(df)

    df = df.iloc[:n].copy()
    # Coerce the predicate results to a boolean Series so pandas always uses
    # them as a row mask. Non-boolean results (e.g. 0/1 ints, or the
    # object-dtype result of applying to an empty column) are otherwise not
    # recognized as a mask and get looked up as column labels instead.
    mask = df[col].apply(filter_fn).astype(bool)

    return df[mask]

In [None]:
# export
def transform_df(
    df: pd.DataFrame, transform_fn: Callable, col: str, n: Optional[int] = None
) -> pd.DataFrame:
    """
    Apply a transformation to one column of a pandas dataframe.

    The given dataframe is left untouched; the work is done on a copy.

    :param df: the dataframe holding the examples to transform
    :param transform_fn: the function applied to every value of ``col``
    :param col: the name of the column whose values are transformed
    :param n: how many leading rows to process. If none, every row is processed
    :returns: returns a copy of the first ``n`` rows with ``col`` replaced by its transformed values
    """
    limit = len(df) if n is None else n

    # Copy the slice so the assignment below never touches the caller's frame.
    transformed = df.iloc[:limit].copy()
    transformed[col] = transformed[col].apply(transform_fn)

    return transformed

# Semantic Preserving
Semantic-preserving transformations do not change the functionality or meaning of a piece of data, e.g., comment removal.

## Source Code

In [None]:
# export
def is_ascii(example: str) -> bool:
    """
    Check if the given example contains only ASCII characters. Adapted from https://stackoverflow.com/a/27084708/5768407.

    :param example: the data to verify contains only ASCII characters
    :returns: returns a boolean representing whether or not the given example contains only ASCII characters
    """
    try:
        # Encoding straight to ASCII raises UnicodeEncodeError for every
        # non-ASCII code point. That includes lone surrogates, which cannot be
        # encoded to UTF-8 either and would otherwise escape as an exception.
        example.encode("ascii")
    except UnicodeEncodeError:
        return False

    return True

In [None]:
# tst
# Two examples: one pure ASCII, one containing Devanagari characters.
df_fake = pd.DataFrame({"code": ["this is a test", "भारत test"]})

# Expected result: only the ASCII-only example survives the filter.
NON_ASCII_DF = pd.DataFrame({"code": ["this is a test"]})
df_non_ascii = filter_df(df=df_fake, filter_fn=is_ascii, col="code")

assert (NON_ASCII_DF == df_non_ascii).to_numpy().all()

In [None]:
# export
def remove_comments(parser: FastParser, code: str) -> str:
    """
    Remove the inline comments reported by the given parser from the given code.

    :param parser: the parser whose ``get_method_inline_comments`` locates the comments in the code
    :param code: the code to have its comments removed
    :returns: returns the code with every occurrence of each reported comment deleted
    """
    inline_comments = parser.get_method_inline_comments(code)
    # Delete longer comments first: if one comment's text is contained in
    # another's (e.g. "// a" and "// a b"), removing the shorter one first
    # would mangle the longer one and leave part of it behind.
    for comment in sorted(inline_comments, key=len, reverse=True):
        code = code.replace(comment, "")

    return code

In [None]:
# tst
# A Java method containing both a line comment and a block comment.
df_fake = pd.DataFrame(
    {
        "code": [
            """\
public static void main(String[] args) {
    // inline comment
    System.out.println("Hello, world!")
    /**
        multi-line inline comment
    */
}"""
        ]
    }
)

# Expected output: only the comment text is deleted, so the indentation that
# preceded each comment remains as trailing whitespace on otherwise empty
# lines. The lines are spelled out explicitly to keep that whitespace visible.
NO_COMMENTS_DF = pd.DataFrame(
    {
        "code": [
            "public static void main(String[] args) {\n"
            "    \n"
            '    System.out.println("Hello, world!")\n'
            "    \n"
            "}"
        ]
    }
)

parser = FastParser("java")
df_no_comments = transform_df(
    df=df_fake,
    transform_fn=lambda snippet: remove_comments(parser, snippet),
    col="code",
)

assert (NO_COMMENTS_DF == df_no_comments).to_numpy().all()

Repo already exists, continuing.


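# Non-Semantic Preserving
Non-semantic-preserving transformations change the functionality or meaning of a piece of data, e.g., shuffling the lines of a method.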

## Source Code

In [None]:
# export
def randomize_lines(example: str) -> str:
    """
    Shuffle the order of the lines of the given example.

    Lines are delimited by ``"\\n"`` and shuffled with the module-level
    ``random.shuffle``, so the result depends on the global random state.

    :param example: the example whose lines will be shuffled
    :returns: returns the example with the same lines in a random order
    """
    lines = example.split("\n")
    shuffle(lines)  # in-place shuffle

    return "\n".join(lines)

In [None]:
# tst
import random

# Seed the global RNG so the shuffle performed by randomize_lines is reproducible.
random.seed(42)

df_fake = pd.DataFrame(
    [
        """\
public static void main(String[] args) {
    System.out.println("Hello, world!")
}"""
    ],
    columns=["code"],
)

# Expected line order for the seed set above.
RANDOM_LINES_DF = pd.DataFrame(
    [
        """\
    System.out.println("Hello, world!")
public static void main(String[] args) {
}"""
    ],
    columns=["code"],
)

# No parser is needed here: randomize_lines works on the raw text.
df_random_lines = transform_df(df_fake, randomize_lines, "code")

assert (RANDOM_LINES_DF == df_random_lines).all().all()

Repo already exists, continuing.


In [None]:
# export
def randomize_tokens(example: str) -> str:
    """
    Shuffle the order of the tokens of the given example.

    Tokens are the pieces obtained by splitting on a single space, so
    newlines stay attached to their neighbouring token and consecutive
    spaces produce empty tokens. The shuffle uses the module-level
    ``random.shuffle`` and therefore depends on the global random state.

    :param example: the example whose tokens will be shuffled
    :returns: returns the example with the same tokens in a random order
    """
    tokens = example.split(" ")
    shuffle(tokens)  # in-place shuffle

    return " ".join(tokens)

In [None]:
# tst
import random

# Seed the global RNG so the shuffle performed by randomize_tokens is reproducible.
random.seed(42)

df_fake = pd.DataFrame(
    [
        """\
public static void main(String[] args) {
    System.out.println("Hello, world!")
}"""
    ],
    columns=["code"],
)

# Expected token order for the seed set above. The leading spaces on the
# first two lines are significant: they come from empty tokens produced by
# the indentation of the original example.
RANDOM_TOKENS_DF = pd.DataFrame(
    [
        """\
 main(String[] void  {
  System.out.println("Hello, args) public static world!")
}"""
    ],
    columns=["code"],
)

# No parser is needed here: randomize_tokens works on the raw text.
df_random_tokens = transform_df(df_fake, randomize_tokens, "code")

assert (RANDOM_TOKENS_DF == df_random_tokens).all().all()

Repo already exists, continuing.


# Pipeline
Composes multiple transformations and applies them to a set of data.

In [None]:
# hide
# Imported here rather than in the exported import cell: it is only used by
# this cell, not by the functions defined above.
from nbdev.export import notebook2script

# Run nbdev's notebook-to-script export.
notebook2script()