In [None]:
#default_exp DatasetConstruction

In [None]:
#export
#hide
from typing import Callable, List
from copy import deepcopy

import sys
sys.path.append("..")

from hephaestus.EditOperations import *
from hephaestus.CondenseEditOperations import *
from hephaestus.IOUtils import *

# DatasetConstruction

> Create datasets based on edit operations from the abstract method data

In [None]:
#export
def makeEditOpsFile(
    abstractSourceFile: str,
    abstractTargetFile: str,
    editOpsFile: str,
    condenser: Callable[[List[EditOperation]], List[CompoundOperation]]
) -> None:
    """
    Determines the list of CompoundOperations between each pair of abstract methods given in `abstractSourceFile`
    and `abstractTargetFile`, then writes those operations in machine format to the given `editOpsFile`.

    The methods of the two files are paired up by position, so both files must contain the same number of methods.
    For every pair, the edit operations from the source method to the target method are passed through the given
    `condenser` function, which should be a function provided in the `CondenseEditOperations` module.

    Raises a `ValueError` if the number of methods read from the two files differs.
    """

    # get the abstract methods
    sourceMethods = readAbstractMethodsFromFile(abstractSourceFile)
    targetMethods = readAbstractMethodsFromFile(abstractTargetFile)

    # make sure the number of abstract methods in both files are equal; otherwise zip() below would silently
    # drop the surplus methods of the longer file
    if len(sourceMethods) != len(targetMethods):
        raise ValueError(
            "makeEditOpsFile: number of methods differ in source and target files ({} vs. {})!".format(
                len(sourceMethods), len(targetMethods)
            )
        )

    # determine the condensed edit operations of every (source, target) pair, keeping the order of the input files
    operations = [
        condenser(sourceMethod.getEditOperationsTo(targetMethod))
        for sourceMethod, targetMethod in zip(sourceMethods, targetMethods)
    ]

    # write the edit operations to the file
    writeCompoundOperationsToFile(editOpsFile, operations)

## Preparation

In [None]:
# abstract method files for the small methods: the "buggy" file of a split is used as the
# source and the "fixed" file as the target when computing edit operations
# training split
TRAIN_BUGGY_SMALL = "../data/abstract_methods/small/train_buggy.txt"
TRAIN_FIXED_SMALL = "../data/abstract_methods/small/train_fixed.txt"

# validation split
VALID_BUGGY_SMALL = "../data/abstract_methods/small/valid_buggy.txt"
VALID_FIXED_SMALL = "../data/abstract_methods/small/valid_fixed.txt"

In [None]:
# edit operation files: one file per condensing strategy (basic, strict, loose) and per
# split (train, valid) of the small methods
# basic condensing
TRAIN_BASIC_SMALL = "../data/edit_ops/basic/small/train.txt"
VALID_BASIC_SMALL = "../data/edit_ops/basic/small/valid.txt"

# strict condensing
TRAIN_STRICT_SMALL = "../data/edit_ops/strict/small/train.txt"
VALID_STRICT_SMALL = "../data/edit_ops/strict/small/valid.txt"

# loose condensing
TRAIN_LOOSE_SMALL = "../data/edit_ops/loose/small/train.txt"
VALID_LOOSE_SMALL = "../data/edit_ops/loose/small/valid.txt"

## Basic condensed EditOperations

In [None]:
# small methods, training split: write the basic condensed edit operations
makeEditOpsFile(
    abstractSourceFile=TRAIN_BUGGY_SMALL,
    abstractTargetFile=TRAIN_FIXED_SMALL,
    editOpsFile=TRAIN_BASIC_SMALL,
    condenser=getCondensedBasic,
)

In [None]:
# small methods, validation split: write the basic condensed edit operations
makeEditOpsFile(
    abstractSourceFile=VALID_BUGGY_SMALL,
    abstractTargetFile=VALID_FIXED_SMALL,
    editOpsFile=VALID_BASIC_SMALL,
    condenser=getCondensedBasic,
)

## Strictly condensed EditOperations

In [None]:
# small methods, training split: write the strictly condensed edit operations
makeEditOpsFile(
    abstractSourceFile=TRAIN_BUGGY_SMALL,
    abstractTargetFile=TRAIN_FIXED_SMALL,
    editOpsFile=TRAIN_STRICT_SMALL,
    condenser=getCondensedStrict,
)

In [None]:
# small methods, validation split: write the strictly condensed edit operations
makeEditOpsFile(
    abstractSourceFile=VALID_BUGGY_SMALL,
    abstractTargetFile=VALID_FIXED_SMALL,
    editOpsFile=VALID_STRICT_SMALL,
    condenser=getCondensedStrict,
)

## Loosely condensed EditOperations

In [None]:
# small methods, training split: write the loosely condensed edit operations
makeEditOpsFile(
    abstractSourceFile=TRAIN_BUGGY_SMALL,
    abstractTargetFile=TRAIN_FIXED_SMALL,
    editOpsFile=TRAIN_LOOSE_SMALL,
    condenser=getCondensedLoose,
)

In [None]:
# small methods, validation split: write the loosely condensed edit operations
makeEditOpsFile(
    abstractSourceFile=VALID_BUGGY_SMALL,
    abstractTargetFile=VALID_FIXED_SMALL,
    editOpsFile=VALID_LOOSE_SMALL,
    condenser=getCondensedLoose,
)

In [None]:
#hide
# ALL CELLS BELOW ARE UNIT TESTS

In [None]:
#hide
def validateEditOpsFile(
    abstractSourceFile: str,
    abstractTargetFile: str,
    editOpsFile: str
) -> None:
    """
    Ensures that the CompoundOperations in the given `editOpsFile` can be extracted and applied to the AbstractMethods in
    the given `abstractSourceFile` to result in the AbstractMethods in the given `abstractTargetFile`.

    The three files are matched up by position. An `AssertionError` describing the first problem found is raised if
    the source and target files hold a different number of methods, if any entry read from `editOpsFile` is `None`,
    if the number of operation lists differs from the number of methods, or if applying a list of operations to a
    copy of its source method does not yield the corresponding target method. On success, the number of validated
    rows is printed.
    """

    # read abstract methods from the source and target files and make sure the number of methods in each is equal
    sourceMethods = readAbstractMethodsFromFile(abstractSourceFile)
    targetMethods = readAbstractMethodsFromFile(abstractTargetFile)
    if len(sourceMethods) != len(targetMethods):
        raise AssertionError(
            "Number of methods differ in source and target files: {} vs. {}".format(
                len(sourceMethods), len(targetMethods)
            )
        )

    # read in edit ops and make sure that they were all able to be read, and that the length matches the number of methods
    # (explicit raises instead of `assert` statements, so the checks also run under `python -O`; without them the
    # zip() below would silently truncate to the shortest list)
    operations = readCompoundOperationsFromFile(editOpsFile)
    unreadableRows = [row for row, opList in enumerate(operations) if opList is None]
    if unreadableRows:
        raise AssertionError(
            "{} operation list(s) could not be read from the edit ops file; first rows (0-based): {}".format(
                len(unreadableRows), unreadableRows[:10]
            )
        )
    if len(operations) != len(sourceMethods):
        raise AssertionError(
            "Number of operation lists differs from the number of methods: {} vs. {}".format(
                len(operations), len(sourceMethods)
            )
        )

    # iterate through each method and verify that applying the operations results in the target method
    for row, (sourceMethod, targetMethod, ops) in enumerate(zip(sourceMethods, targetMethods, operations)):
        # work on a copy so that the original source method can still be shown in the error report
        fixedMethod = deepcopy(sourceMethod)
        fixedMethod.applyEditOperations(ops)
        if fixedMethod != targetMethod:
            raise AssertionError(
                "Applying operations did not result in the correct AbstractMethod:\n" +
                "row:           {}\n".format(row) +
                "source method: '{}'\n".format(sourceMethod) +
                "target method: '{}'\n".format(targetMethod) +
                "operations:    {}\n".format(ops) +
                "result:        '{}'".format(fixedMethod)
            )

    print("Successfully validated {} rows!".format(len(sourceMethods)))

In [None]:
#hide
# small methods, training split: check the basic condensed edit operations
validateEditOpsFile(
    abstractSourceFile=TRAIN_BUGGY_SMALL,
    abstractTargetFile=TRAIN_FIXED_SMALL,
    editOpsFile=TRAIN_BASIC_SMALL,
)

Successfully validated 46680 rows!


In [None]:
#hide
# small methods, validation split: check the basic condensed edit operations
validateEditOpsFile(
    abstractSourceFile=VALID_BUGGY_SMALL,
    abstractTargetFile=VALID_FIXED_SMALL,
    editOpsFile=VALID_BASIC_SMALL,
)

Successfully validated 5835 rows!


In [None]:
#hide
# small methods, training split: check the strictly condensed edit operations
validateEditOpsFile(
    abstractSourceFile=TRAIN_BUGGY_SMALL,
    abstractTargetFile=TRAIN_FIXED_SMALL,
    editOpsFile=TRAIN_STRICT_SMALL,
)

Successfully validated 46680 rows!


In [None]:
#hide
# small methods, validation split: check the strictly condensed edit operations
validateEditOpsFile(
    abstractSourceFile=VALID_BUGGY_SMALL,
    abstractTargetFile=VALID_FIXED_SMALL,
    editOpsFile=VALID_STRICT_SMALL,
)

Successfully validated 5835 rows!


In [None]:
#hide
# small methods, training split: check the loosely condensed edit operations
validateEditOpsFile(
    abstractSourceFile=TRAIN_BUGGY_SMALL,
    abstractTargetFile=TRAIN_FIXED_SMALL,
    editOpsFile=TRAIN_LOOSE_SMALL,
)

Successfully validated 46680 rows!


In [None]:
#hide
# small methods, validation split: check the loosely condensed edit operations
validateEditOpsFile(
    abstractSourceFile=VALID_BUGGY_SMALL,
    abstractTargetFile=VALID_FIXED_SMALL,
    editOpsFile=VALID_LOOSE_SMALL,
)

Successfully validated 5835 rows!
