In [1]:
import ast
import os
import warnings

import pandas as pd

In [2]:
# Read the training split from disk; the bare name on the last line renders the frame as the cell output.
train_csv_path = "data/ft_train.csv"
train = pd.read_csv(train_csv_path)
train

Unnamed: 0,cleaned_method,target_block,tokens_in_method
0,"def _resolve_lib_imported_symbols(self, lib, i...",if generic_refs :,189
1,"def make_docs_directory(output_dir, name):\n ...","if not isdir ( pjoin ( output_dir , name , str...",121
2,"def assert_results(self, results, activities, ...","if hasattr ( result , ""extra_context"" ) :",164
3,"def for_file(cls, filename: str, modname: str)...","if "".egg"" + path . sep in filename :",170
4,"def merge_dicts(source: Dict, destination: Dic...","if isinstance ( value , dict ) :",92
...,...,...,...
49995,"def reconnect(cache, exc):\n provider = cac...",if local . debug :,147
49996,def check_cockroachdb_metrics():\n response...,"if sample [ 0 ] == ""ranges_underreplicated"" :",134
49997,"def get_run_lang(submission_dir):\n """"""Get ...","if os . path . exists ( ""run.sh"" ) :",97
49998,"def archive_user_profile_fields(sender, archiv...",if sender . profile_fields . get ( profile_fie...,154


## Strategy:

First, I'll parse each Python method into an abstract syntax tree (AST) using the `ast` module.

I can also take the target if statement and wrap it in some scaffolding (a `pass` body, plus a dummy leading `if` when the target is an `elif`) so that it can be parsed into its own AST representation. This way I can guarantee that the method and the target if statement have identical forms when unparsed.

Then I'll unparse the ASTs in order to get the methods into a consistent format with no `#` comments. (Note: comments written as string literals, e.g. `"""comment"""`, are part of the AST and will not be removed by this method.)

Finally, since I have the unparsed if statement, I'll be able to replace its first occurrence in the unparsed method with the `<IF_STMT>` mask token and flatten the method onto a single line.

In [None]:
def process_if_statement(statement: str):
    """
    Normalise an if / elif header by round-tripping its condition through the AST module.

    The header alone is not valid Python, so it is wrapped in scaffolding before parsing:
    a ``pass`` body is appended, and an ``elif`` header is additionally preceded by a dummy
    ``if False:`` block so that it has something to attach to.

    Note:
        A header without a trailing colon (e.g. the ``if`` clause of a list / dict
        comprehension) gets a colon appended for parsing only; the returned string
        then omits the colon again.

    returns: str: the standard form of the if statement
    """
    # Remember whether the caller's header carried a colon so the result can mirror it.
    has_colon = statement.endswith(":")
    header = statement if has_colon else statement + ":"

    if header.startswith("elif"):
        # An elif cannot stand alone: hang it off a dummy if and give both branches a body.
        scaffold = "if False:\n\tpass\n" + header + "\n\tpass"
        parent_if = next(
            (
                candidate
                for candidate in ast.walk(ast.parse(scaffold))
                if isinstance(candidate, ast.If) and candidate.orelse
            ),
            None,
        )
        # The elif is stored as the first entry of the parent's orelse list.
        if parent_if is None:
            normalized = scaffold
        else:
            normalized = "elif " + ast.unparse(parent_if.orelse[0].test)
    else:
        # Plain if header: only a body is needed to make it parseable.
        scaffold = header + "\n\tpass"
        first_if = next(
            (
                candidate
                for candidate in ast.walk(ast.parse(scaffold))
                if isinstance(candidate, ast.If)
            ),
            None,
        )
        if first_if is None:
            normalized = scaffold
        else:
            normalized = "if " + ast.unparse(first_if.test)

    # Headers that arrived with a colon leave with one; comprehension-style headers do not.
    return (normalized + ":") if has_colon else normalized



def process_and_flatten_method(method: str, target_statement: str):
    """
    Processes the method using AST in order to get method in a standard form, then the first
    occurrence of the target statement is replaced by the "<IF_STMT>" mask token. Finally it
    flattens the method by removing four-space indents and turning newlines into spaces.

    Notes:
        Some of the methods in the training set use regular expressions but the pattern is not contained in a raw string. Because of this, the AST module
        will raise a SyntaxWarning for 'invalid escape sequences' contained in the regular expression. After discussing this issue with the professor, I
        have decided to skip these training examples.

        Some of the training examples have target blocks that are not if / elif statements. They contain variable names that start with "if"
        I will remove these examples

    Args:
        method: source code of the method to normalise and mask.
        target_statement: the if / elif header that should be masked out.

    Returns:
        str, str : a tuple containing the flattened method followed by the unparsed form of the target statement.
        (None, None) is returned for skipped examples (non if / elif target, or a SyntaxWarning while parsing).
    """
    # Remove examples where the target is not an if / elif statement
    if not target_statement.startswith(("if ", "elif ")):
        return None, None

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", SyntaxWarning)  # Record every SyntaxWarning

        tree = ast.parse(method)

        # Skip the example if parsing the method produced any SyntaxWarning
        if any(issubclass(entry.category, SyntaxWarning) for entry in caught):
            return None, None

        if_statement = process_if_statement(target_statement)

        # Unparse, mask the first occurrence of the target, then flatten.
        # The replacement count is passed positionally: str.replace only accepts
        # `count` as a keyword argument from Python 3.13 onwards, so the keyword
        # form raises TypeError on earlier interpreters.
        flattened_method = (
            ast.unparse(tree)
            .replace(if_statement, "<IF_STMT>", 1)
            .replace("    ", "")
            .replace("\n", " ")
        )

        return flattened_method, if_statement

In [None]:
# Run every training row through the masking pipeline; skipped rows come back as (None, None).
train_masked = train.apply(
    lambda row: pd.Series(
        process_and_flatten_method(row["cleaned_method"], row["target_block"])
    ),
    axis=1,
)
train[["formatted_method", "if_statement"]] = train_masked

n_train_skipped = train["if_statement"].isna().sum()
print("Methods skipped because of SyntaxWarning or because target was not an if / elif statement:", n_train_skipped)

# dropna() removes every row containing a missing value, which includes the skipped rows.
train = train.dropna()
train



Unnamed: 0,cleaned_method,target_block,tokens_in_method,formatted_method,if_statement
0,"def _resolve_lib_imported_symbols(self, lib, i...",if generic_refs :,189,"def _resolve_lib_imported_symbols(self, lib, i...",if generic_refs:
1,"def make_docs_directory(output_dir, name):\n ...","if not isdir ( pjoin ( output_dir , name , str...",121,"def make_docs_directory(output_dir, name): if ...","if not isdir(pjoin(output_dir, name, str(i))):"
2,"def assert_results(self, results, activities, ...","if hasattr ( result , ""extra_context"" ) :",164,"def assert_results(self, results, activities, ...","if hasattr(result, 'extra_context'):"
3,"def for_file(cls, filename: str, modname: str)...","if "".egg"" + path . sep in filename :",170,"def for_file(cls, filename: str, modname: str)...",if '.egg' + path.sep in filename:
4,"def merge_dicts(source: Dict, destination: Dic...","if isinstance ( value , dict ) :",92,"def merge_dicts(source: Dict, destination: Dic...","if isinstance(value, dict):"
...,...,...,...,...,...
49995,"def reconnect(cache, exc):\n provider = cac...",if local . debug :,147,"def reconnect(cache, exc): provider = cache.da...",if local.debug:
49996,def check_cockroachdb_metrics():\n response...,"if sample [ 0 ] == ""ranges_underreplicated"" :",134,def check_cockroachdb_metrics(): response = ge...,if sample[0] == 'ranges_underreplicated':
49997,"def get_run_lang(submission_dir):\n """"""Get ...","if os . path . exists ( ""run.sh"" ) :",97,"def get_run_lang(submission_dir): """"""Get the c...",if os.path.exists('run.sh'):
49998,"def archive_user_profile_fields(sender, archiv...",if sender . profile_fields . get ( profile_fie...,154,"def archive_user_profile_fields(sender, archiv...",if sender.profile_fields.get(profile_field.fie...


# Processing Validation Data

In [21]:
# Read the validation split from disk; the bare name on the last line renders the frame as the cell output.
valid_csv_path = "data/ft_valid.csv"
valid = pd.read_csv(valid_csv_path)
valid

Unnamed: 0,cleaned_method,target_block,tokens_in_method
0,"def __init__(self, scale, factor, mode):\n ...",if factor < 1.0 :,160
1,def get_grab_keys(self):\n keystr = None\n ...,if keystr is None :,115
2,"def _checkAllExamples(self, num_type):\n fo...",if numobj_py is not None :,127
3,def _gaf10iterator(handle):\n for inline in...,if len ( inrec ) == 1 :,188
4,"def __xor__(self, other):\n inc, exc = _nor...",if exc is None :,183
...,...,...,...
4995,"def data(self):\n result = """"\n for hunk...","if isinstance ( hunk , tuple ) and len ( hunk ...",85
4996,"def not_less_witness(self, other):\n n = ma...",if self . indent_level ( ts ) >= other . inden...,110
4997,def _validate(self) -> None:\n indent = sel...,if len ( indent ) == 0 :,116
4998,def sanitize_numeric_fields(info):\n for nu...,"if field is None or isinstance ( field , compa...",96


In [22]:
# Run every validation row through the masking pipeline; skipped rows come back as (None, None).
valid_masked = valid.apply(
    lambda row: pd.Series(
        process_and_flatten_method(row["cleaned_method"], row["target_block"])
    ),
    axis=1,
)
valid[["formatted_method", "if_statement"]] = valid_masked

n_valid_dropped = valid["if_statement"].isna().sum()
print("Methods dropped:", n_valid_dropped)

# dropna() removes every row containing a missing value, which includes the skipped rows.
valid = valid.dropna()
valid

Methods dropped: 24


Unnamed: 0,cleaned_method,target_block,tokens_in_method,formatted_method,if_statement
0,"def __init__(self, scale, factor, mode):\n ...",if factor < 1.0 :,160,"def __init__(self, scale, factor, mode): self....",if factor < 1.0:
1,def get_grab_keys(self):\n keystr = None\n ...,if keystr is None :,115,def get_grab_keys(self): keystr = None try: ke...,if keystr is None:
2,"def _checkAllExamples(self, num_type):\n fo...",if numobj_py is not None :,127,"def _checkAllExamples(self, num_type): for reg...",if numobj_py is not None:
3,def _gaf10iterator(handle):\n for inline in...,if len ( inrec ) == 1 :,188,def _gaf10iterator(handle): for inline in hand...,if len(inrec) == 1:
4,"def __xor__(self, other):\n inc, exc = _nor...",if exc is None :,183,"def __xor__(self, other): inc, exc = _norm_arg...",if exc is None:
...,...,...,...,...,...
4995,"def data(self):\n result = """"\n for hunk...","if isinstance ( hunk , tuple ) and len ( hunk ...",85,def data(self): result = '' for hunk in self._...,"if isinstance(hunk, tuple) and len(hunk) == 2:"
4996,"def not_less_witness(self, other):\n n = ma...",if self . indent_level ( ts ) >= other . inden...,110,"def not_less_witness(self, other): n = max(sel...",if self.indent_level(ts) >= other.indent_level...
4997,def _validate(self) -> None:\n indent = sel...,if len ( indent ) == 0 :,116,def _validate(self) -> None: indent = self.ind...,if len(indent) == 0:
4998,def sanitize_numeric_fields(info):\n for nu...,"if field is None or isinstance ( field , compa...",96,def sanitize_numeric_fields(info): for numeric...,"if field is None or isinstance(field, compat_n..."


# Processing Test Data

In [23]:
# Read the test split from disk; the bare name on the last line renders the frame as the cell output.
test_csv_path = "data/ft_test.csv"
test = pd.read_csv(test_csv_path)
test

Unnamed: 0,cleaned_method,target_block,tokens_in_method
0,"def read(self, count=True, timeout=None, ignor...",if ignore_timeouts and is_timeout ( e ) :,174
1,"def _cache_mem(curr_out, prev_mem, mem_len, re...",if prev_mem is None :,165
2,def filtered(gen):\n for example in gen:\n ...,if example_len > max_length :,117
3,"def search(self, query):\n # ""Search.ashx?q...","if item . get ( ""type"" , """" ) == ""audio"" :",163
4,"def _check_script(self, script, directive):\n ...","if var . must_contain ( ""/"" ) :",157
...,...,...,...
4995,"def _super_function(args):\n passed_class, ...","if isinstance ( pyclass , pyobjects . Abstract...",132
4996,def get_data(row):\n data = []\n for fie...,if result :,142
4997,"def say(jarvis, s):\n """"""Reads what is type...",if not voice_state :,99
4998,"def __import__(name, globals=None, locals=None...","if ""*"" in fromlist :",175


In [24]:
# Run every test row through the masking pipeline; skipped rows come back as (None, None).
test_masked = test.apply(
    lambda row: pd.Series(
        process_and_flatten_method(row["cleaned_method"], row["target_block"])
    ),
    axis=1,
)
test[["formatted_method", "if_statement"]] = test_masked

n_test_dropped = test["if_statement"].isna().sum()
print("Methods dropped:", n_test_dropped)

# dropna() removes every row containing a missing value, which includes the skipped rows.
test = test.dropna()
test

Methods dropped: 13


Unnamed: 0,cleaned_method,target_block,tokens_in_method,formatted_method,if_statement
0,"def read(self, count=True, timeout=None, ignor...",if ignore_timeouts and is_timeout ( e ) :,174,"def read(self, count=True, timeout=None, ignor...",if ignore_timeouts and is_timeout(e):
1,"def _cache_mem(curr_out, prev_mem, mem_len, re...",if prev_mem is None :,165,"def _cache_mem(curr_out, prev_mem, mem_len, re...",if prev_mem is None:
2,def filtered(gen):\n for example in gen:\n ...,if example_len > max_length :,117,def filtered(gen): for example in gen: example...,if example_len > max_length:
3,"def search(self, query):\n # ""Search.ashx?q...","if item . get ( ""type"" , """" ) == ""audio"" :",163,"def search(self, query): if not query: logger....","if item.get('type', '') == 'audio':"
4,"def _check_script(self, script, directive):\n ...","if var . must_contain ( ""/"" ) :",157,"def _check_script(self, script, directive): fo...",if var.must_contain('/'):
...,...,...,...,...,...
4995,"def _super_function(args):\n passed_class, ...","if isinstance ( pyclass , pyobjects . Abstract...",132,"def _super_function(args): passed_class, passe...","if isinstance(pyclass, pyobjects.AbstractClass):"
4996,def get_data(row):\n data = []\n for fie...,if result :,142,"def get_data(row): data = [] for field_name, f...",if result:
4997,"def say(jarvis, s):\n """"""Reads what is type...",if not voice_state :,99,"def say(jarvis, s): """"""Reads what is typed.""""""...",if not voice_state:
4998,"def __import__(name, globals=None, locals=None...","if ""*"" in fromlist :",175,"def __import__(name, globals=None, locals=None...",if '*' in fromlist:


# Save the train, test, and validation sets to CSV files

In [27]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (a fresh checkout may not contain it).
os.makedirs("processed_data", exist_ok=True)

# Only the masked method and its target statement are exported; the row index is omitted.
export_columns = ["formatted_method", "if_statement"]
for split_frame, csv_path in (
    (train, "processed_data/ft_train_processed.csv"),
    (valid, "processed_data/ft_valid_processed.csv"),
    (test, "processed_data/ft_test_processed.csv"),
):
    split_frame[export_columns].to_csv(csv_path, index=False)