In [None]:
# default_exp data.transforms

In [None]:
# hide
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Data Transforms

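> Transforms for Java method source code: comment removal (semantics-preserving) and line/token order randomization (non-semantics-preserving), plus a helper that applies a transform across a dataframe of methods. @Nathan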

In [None]:
# export
import re

import pandas as pd

from random import shuffle
from typing import Callable, Optional

## Semantics-Preserving Transforms

In [None]:
# export
def java_comment_remover(mthd: str) -> str:
    """
    Remove all comments from a given java method. Code from https://stackoverflow.com/a/241506/5768407.

    Both line comments (``// ...`` up to the end of the line) and block comments
    (``/* ... */``, possibly spanning several lines) are each replaced by a single
    space. Character and string literals are matched by the same pattern and
    returned unchanged, so comment-like text inside a literal is left alone.

    :param mthd: the method to have its comments removed
    :returns: returns the method with its comments removed
    """

    def replacer(match):
        s = match.group(0)
        # Of the four alternatives in the pattern, only the two comment forms
        # start with "/"; the literal forms start with a quote character.
        if s.startswith("/"):
            return " "  # note: a space and not an empty string
        else:
            return s

    # Alternatives, in order: line comment | block comment | char literal | string literal.
    # DOTALL lets a block comment span newlines; MULTILINE makes `$` end a line comment
    # at the end of its own line rather than at the end of the whole string.
    pattern = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE,
    )
    return re.sub(pattern, replacer, mthd)

## Non-Semantics-Preserving Transforms

In [None]:
# export
def line_randomizer(mthd: str) -> str:
    """
    Shuffle the order of the lines of a given method.

    The method is split on newline characters, the pieces are shuffled in place
    with `random.shuffle`, and the result is joined back together with newlines.
    The outcome therefore depends on the state of the `random` module's
    generator (seed it beforehand for reproducible output).

    :param mthd: the method to have its lines randomized
    :returns: returns the method with its lines randomized
    """
    lines = mthd.split("\n")
    shuffle(lines)
    return "\n".join(lines)


def code_token_randomizer(mthd: str) -> str:
    """
    Shuffle the order of the space-separated tokens of a given method.

    The method is split on single space characters (so runs of spaces yield
    empty tokens and newlines stay attached to their neighbours), the tokens
    are shuffled in place with `random.shuffle`, and then joined back together
    with single spaces. The outcome depends on the state of the `random`
    module's generator (seed it beforehand for reproducible output).

    :param mthd: the method to have its code tokens randomized
    :returns: returns the method with its code tokens randomized
    """
    tokens = mthd.split(" ")
    shuffle(tokens)
    return " ".join(tokens)

In [None]:
# hide

# Single-row fixture dataframe with one "code" column holding a small Java
# method; the checks later in this notebook run each transform against it.
# The whitespace inside the literal (including trailing spaces) is compared
# byte-for-byte by those checks, so it must not be reformatted.
# From: https://www.geeksforgeeks.org/methods-in-java/
df_fake = pd.DataFrame(
    [
        """public int addTwoInt(int a, int b){ 
          
        // adding two integer value. 
        sum = a + b; 
          
        //returning summation of two values. 
        return sum;  
    }"""
    ],
    columns=["code"],
)
df_fake

In [None]:
# export
def transform_df(
    df: pd.DataFrame, transform: Callable, n: Optional[int] = None
) -> pd.DataFrame:
    """
    Apply a transformation to the `code` column of a pandas dataframe.

    Only the first `n` rows are kept; they are copied before being modified, so
    the dataframe passed in is left unchanged.

    :param df: the dataframe containing each method to be transformed
    :param transform: the transformation that will be applied to each method in the dataframe
    :param n: the number of methods to evaluate. If none, the entire dataframe will be used
    :returns: returns a modified dataframe with the methods transformed
    """
    limit = len(df) if n is None else n

    # Work on a copy of the leading rows so the caller's frame is not touched.
    subset = df.iloc[:limit].copy()
    subset.code = subset.code.apply(transform)

    return subset

In [None]:
# Expected result of running java_comment_remover over the fixture method:
# each `//` comment (through the end of its line) collapses to a single space,
# which is why the two former comment lines hold one more space than their indent.
NO_CMT_MTHD = """public int addTwoInt(int a, int b){ 
          
         
        sum = a + b; 
          
         
        return sum;  
    }"""

df_no_cmt = transform_df(df_fake, java_comment_remover)

# Exact string comparison, whitespace included.
assert NO_CMT_MTHD == df_no_cmt.code.values[0]

In [None]:
# `random` is imported here so the generator can be seeded; the token-shuffle
# check in the following cell relies on this import as well.
import random

# Expected line order after shuffling the fixture method with the `random`
# generator seeded to 4 (see the seed call below).
# NOTE(review): despite the NO_ prefix, this holds the *randomized* output.
NO_RND_LINES_MTHD = """          
        return sum;  
        //returning summation of two values. 
          
    }
public int addTwoInt(int a, int b){ 
        // adding two integer value. 
        sum = a + b; """

# Seed immediately before the transform so the shuffle is reproducible.
random.seed(4)
df_rnd_lines = transform_df(df_fake, line_randomizer)

# Exact string comparison, whitespace included.
assert NO_RND_LINES_MTHD == df_rnd_lines.code.values[0]

In [None]:
# Expected token order after shuffling the fixture method's space-separated
# tokens with the `random` generator seeded to 4 (see the seed call below).
# Runs of spaces come from empty tokens, so every space in this literal matters.
# NOTE(review): despite the NO_ prefix, this holds the *randomized* output.
NO_RND_TOKS_MTHD = """ values.   a, two return  adding     
 int     
 integer    
     two   public   of } a sum;   //   b){  
 +       //returning  b;  int  
 
 
     value.  sum   addTwoInt(int    summation   = """

# Seed immediately before the transform so the shuffle is reproducible;
# `random` itself is imported in the preceding cell.
random.seed(4)
df_rnd_toks = transform_df(df_fake, code_token_randomizer)

# Exact string comparison, whitespace included.
assert NO_RND_TOKS_MTHD == df_rnd_toks.code.values[0]

In [None]:
# hide
# Invoke nbdev's notebook2script with no arguments.
# NOTE(review): presumably this writes the cells flagged for export into the
# library module named by the default_exp flag at the top of this notebook;
# confirm against the nbdev documentation.
from nbdev.export import notebook2script

notebook2script()