In [None]:
#default_exp DatasetConstruction

In [None]:
#export
#hide
from typing import Callable, List
from copy import deepcopy

import sys
sys.path.append("..")

from hephaestus.EditOperations import *
from hephaestus.CondenseEditOperations import *
from hephaestus.IOUtils import *

In [None]:
#hide
from nbdev.showdoc import *

# DatasetConstruction

> Create datasets based on edit operations from the abstract method data. More specifically, the datasets contain CompoundOperation machine strings.

## Helper functions

In [None]:
#export
def makeEditOpsFile(
    abstractSourceFile: str,
    abstractTargetFile: str,
    editOpsFile: str,
    condenser: Callable[[List[EditOperation]], List[CompoundOperation]]
) -> None:
    """
    Determines the list of CompoundOperations between each pair of abstract methods given in `abstractSourceFile`
    and `abstractTargetFile`, then writes those operations in machine format to the given `editOpsFile`. The
    methods are paired up by position, so both files must contain the same number of methods; a `ValueError` is
    raised otherwise. The conversion from EditOperations to CompoundOperations is done with the given `condenser`
    function, which should be a function provided in the `CondenseEditOperations` module.
    """

    # get the abstract methods
    sourceMethods = readAbstractMethodsFromFile(abstractSourceFile)
    targetMethods = readAbstractMethodsFromFile(abstractTargetFile)

    # the methods are paired by position below and zip() would silently drop any extras, so fail loudly here
    if len(sourceMethods) != len(targetMethods):
        raise ValueError(
            "makeEditOpsFile: number of methods differ in source ({}) and target ({}) files!".format(
                len(sourceMethods), len(targetMethods)
            )
        )

    # determine the condensed edit operations for each (source, target) pair
    operations = [
        condenser(sourceMethod.getEditOperationsTo(targetMethod))
        for sourceMethod, targetMethod in zip(sourceMethods, targetMethods)
    ]

    # write the edit operations to the file
    writeCompoundOperationsToFile(editOpsFile, operations)

## Preparation

Define paths to existing and new data files.

In [None]:
#export
# abstract method files (inputs); each split has a "buggy" file and a "fixed" file, which the cells below pass to
# makeEditOpsFile / validateEditOpsFile as the source and target respectively
# NOTE: these are relative paths, so they are resolved against the current working directory
# training split
DATA_SMALL_METHODS_TRAIN_BUGGY = "../data/small/abstract_methods/train_buggy.txt"
DATA_SMALL_METHODS_TRAIN_FIXED = "../data/small/abstract_methods/train_fixed.txt"

# validation split
DATA_SMALL_METHODS_VALID_BUGGY = "../data/small/abstract_methods/valid_buggy.txt"
DATA_SMALL_METHODS_VALID_FIXED = "../data/small/abstract_methods/valid_fixed.txt"

# test split (not referenced by any other cell visible in this notebook)
DATA_SMALL_METHODS_TEST_BUGGY  = "../data/small/abstract_methods/test_buggy.txt"
DATA_SMALL_METHODS_TEST_FIXED  = "../data/small/abstract_methods/test_fixed.txt"

In [None]:
#export
# edit operation files (outputs); one train/valid pair per condenser used in the cells below
# NOTE: only train and valid paths are defined here; there is no edit-op path for the test split
# written with getCondensedBasic
DATA_SMALL_OPS_BASIC_TRAIN = "../data/small/edit_ops/basic/train.txt"
DATA_SMALL_OPS_BASIC_VALID = "../data/small/edit_ops/basic/valid.txt"

# written with getCondensedStrict
DATA_SMALL_OPS_STRICT_TRAIN = "../data/small/edit_ops/strict/train.txt"
DATA_SMALL_OPS_STRICT_VALID = "../data/small/edit_ops/strict/valid.txt"

# written with getCondensedLoose
DATA_SMALL_OPS_LOOSE_TRAIN = "../data/small/edit_ops/loose/train.txt"
DATA_SMALL_OPS_LOOSE_VALID = "../data/small/edit_ops/loose/valid.txt"

### Paths to abstract method data files

| Variable name                    | Value                                              |
| :------------------------------- | :------------------------------------------------- |
| `DATA_SMALL_METHODS_TRAIN_BUGGY` | `"../data/small/abstract_methods/train_buggy.txt"` |
| `DATA_SMALL_METHODS_TRAIN_FIXED` | `"../data/small/abstract_methods/train_fixed.txt"` |
| `DATA_SMALL_METHODS_VALID_BUGGY` | `"../data/small/abstract_methods/valid_buggy.txt"` |
| `DATA_SMALL_METHODS_VALID_FIXED` | `"../data/small/abstract_methods/valid_fixed.txt"` |
| `DATA_SMALL_METHODS_TEST_BUGGY`  | `"../data/small/abstract_methods/test_buggy.txt"`  |
| `DATA_SMALL_METHODS_TEST_FIXED`  | `"../data/small/abstract_methods/test_fixed.txt"`  |

### Paths to EditOperation data files

| Variable name                 | Value                                        |
| :---------------------------- | :------------------------------------------- |
| `DATA_SMALL_OPS_BASIC_TRAIN`  | `"../data/small/edit_ops/basic/train.txt"`   |
| `DATA_SMALL_OPS_BASIC_VALID`  | `"../data/small/edit_ops/basic/valid.txt"`   |
| `DATA_SMALL_OPS_STRICT_TRAIN` | `"../data/small/edit_ops/strict/train.txt"`  |
| `DATA_SMALL_OPS_STRICT_VALID` | `"../data/small/edit_ops/strict/valid.txt"`  |
| `DATA_SMALL_OPS_LOOSE_TRAIN`  | `"../data/small/edit_ops/loose/train.txt"`   |
| `DATA_SMALL_OPS_LOOSE_VALID`  | `"../data/small/edit_ops/loose/valid.txt"`   |

## Basic condensed EditOperations

In [None]:
# small methods, training split: buggy -> fixed, condensed with getCondensedBasic
makeEditOpsFile(
    abstractSourceFile=DATA_SMALL_METHODS_TRAIN_BUGGY,
    abstractTargetFile=DATA_SMALL_METHODS_TRAIN_FIXED,
    editOpsFile=DATA_SMALL_OPS_BASIC_TRAIN,
    condenser=getCondensedBasic
)

In [None]:
# small methods, validation split: buggy -> fixed, condensed with getCondensedBasic
makeEditOpsFile(
    abstractSourceFile=DATA_SMALL_METHODS_VALID_BUGGY,
    abstractTargetFile=DATA_SMALL_METHODS_VALID_FIXED,
    editOpsFile=DATA_SMALL_OPS_BASIC_VALID,
    condenser=getCondensedBasic
)

## Strictly condensed EditOperations

In [None]:
# small methods, training split: buggy -> fixed, condensed with getCondensedStrict
makeEditOpsFile(
    abstractSourceFile=DATA_SMALL_METHODS_TRAIN_BUGGY,
    abstractTargetFile=DATA_SMALL_METHODS_TRAIN_FIXED,
    editOpsFile=DATA_SMALL_OPS_STRICT_TRAIN,
    condenser=getCondensedStrict
)

In [None]:
# small methods, validation split: buggy -> fixed, condensed with getCondensedStrict
makeEditOpsFile(
    abstractSourceFile=DATA_SMALL_METHODS_VALID_BUGGY,
    abstractTargetFile=DATA_SMALL_METHODS_VALID_FIXED,
    editOpsFile=DATA_SMALL_OPS_STRICT_VALID,
    condenser=getCondensedStrict
)

## Loosely condensed EditOperations

In [None]:
# small methods, training split: buggy -> fixed, condensed with getCondensedLoose
makeEditOpsFile(
    abstractSourceFile=DATA_SMALL_METHODS_TRAIN_BUGGY,
    abstractTargetFile=DATA_SMALL_METHODS_TRAIN_FIXED,
    editOpsFile=DATA_SMALL_OPS_LOOSE_TRAIN,
    condenser=getCondensedLoose
)

In [None]:
# small methods, validation split: buggy -> fixed, condensed with getCondensedLoose
makeEditOpsFile(
    abstractSourceFile=DATA_SMALL_METHODS_VALID_BUGGY,
    abstractTargetFile=DATA_SMALL_METHODS_VALID_FIXED,
    editOpsFile=DATA_SMALL_OPS_LOOSE_VALID,
    condenser=getCondensedLoose
)

In [None]:
#hide
# ALL BELOW CELLS ARE UNIT TESTS

In [None]:
#hide
def validateEditOpsFile(
    abstractSourceFile: str,
    abstractTargetFile: str,
    editOpsFile: str
) -> None:
    """
    Checks the given `editOpsFile` against the abstract method files it describes: for every row, the
    CompoundOperations read from `editOpsFile` are applied to a copy of the corresponding AbstractMethod from
    `abstractSourceFile`, and the result must equal the AbstractMethod at the same position in `abstractTargetFile`.
    Raises an `AssertionError` if the row counts disagree, if any row of operations was read as `None`, or if any
    result does not match its target. Prints the number of validated rows on success.
    """

    # load both sets of abstract methods; they are compared pairwise, so the counts have to agree
    buggyMethods = readAbstractMethodsFromFile(abstractSourceFile)
    expectedMethods = readAbstractMethodsFromFile(abstractTargetFile)
    numMethods = len(buggyMethods)
    assert numMethods == len(expectedMethods)

    # load the operations; no row may be None (presumably how an unreadable row is reported) and there has to be
    # exactly one row of operations per method
    allOps = readCompoundOperationsFromFile(editOpsFile)
    assert not any(opList is None for opList in allOps)
    assert len(allOps) == numMethods

    # replay each row of operations on a copy of its source method, leaving the original intact for the report
    for buggy, expected, ops in zip(buggyMethods, expectedMethods, allOps):
        candidate = deepcopy(buggy)
        candidate.applyEditOperations(ops)
        if candidate != expected:
            report = (
                "Applying operations did not result in the correct AbstractMethod:\n"
                "source method: '{}'\n"
                "target method: '{}'\n"
                "operations:    {}\n"
                "result:        '{}'"
            ).format(buggy, expected, ops, candidate)
            raise AssertionError(report)

    print("Successfully validated {} rows!".format(numMethods))

In [None]:
#hide
# small methods, training split: check the basic-condensed operations file
validateEditOpsFile(
    abstractSourceFile=DATA_SMALL_METHODS_TRAIN_BUGGY,
    abstractTargetFile=DATA_SMALL_METHODS_TRAIN_FIXED,
    editOpsFile=DATA_SMALL_OPS_BASIC_TRAIN
)

Successfully validated 46680 rows!


In [None]:
#hide
# small methods, validation split: check the basic-condensed operations file
validateEditOpsFile(
    abstractSourceFile=DATA_SMALL_METHODS_VALID_BUGGY,
    abstractTargetFile=DATA_SMALL_METHODS_VALID_FIXED,
    editOpsFile=DATA_SMALL_OPS_BASIC_VALID
)

Successfully validated 5835 rows!


In [None]:
#hide
# small methods, training split: check the strictly condensed operations file
validateEditOpsFile(
    abstractSourceFile=DATA_SMALL_METHODS_TRAIN_BUGGY,
    abstractTargetFile=DATA_SMALL_METHODS_TRAIN_FIXED,
    editOpsFile=DATA_SMALL_OPS_STRICT_TRAIN
)

Successfully validated 46680 rows!


In [None]:
#hide
# small methods, validation split: check the strictly condensed operations file
validateEditOpsFile(
    abstractSourceFile=DATA_SMALL_METHODS_VALID_BUGGY,
    abstractTargetFile=DATA_SMALL_METHODS_VALID_FIXED,
    editOpsFile=DATA_SMALL_OPS_STRICT_VALID
)

Successfully validated 5835 rows!


In [None]:
#hide
# small methods, training split: check the loosely condensed operations file
validateEditOpsFile(
    abstractSourceFile=DATA_SMALL_METHODS_TRAIN_BUGGY,
    abstractTargetFile=DATA_SMALL_METHODS_TRAIN_FIXED,
    editOpsFile=DATA_SMALL_OPS_LOOSE_TRAIN
)

Successfully validated 46680 rows!


In [None]:
#hide
# small methods, validation split: check the loosely condensed operations file
validateEditOpsFile(
    abstractSourceFile=DATA_SMALL_METHODS_VALID_BUGGY,
    abstractTargetFile=DATA_SMALL_METHODS_VALID_FIXED,
    editOpsFile=DATA_SMALL_OPS_LOOSE_VALID
)

Successfully validated 5835 rows!
