In [None]:
import pickle
import sys
import logging
import numpy as np
import os
from utils import dir_hash, parse_json, gen_observations, preprocess
from mem_net import run_mem_net, test_mem_network

# Configure the root logger once. basicConfig() installs a stream handler on
# the root logger when it has none, and records from the module logger
# propagate to it, so the module logger must not carry a handler of its own:
# a second StreamHandler makes every message print twice (and re-running the
# notebook cell would stack up one more copy each time).
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# File-name template (formatted with the directory hash) and folder used by
# main() for the pickled parse cache.
cache_pickle = "{}.pkl"
cache_dir = ".cache-pythia"

In [None]:
def get_args(
        # DIRECTORY
        directory,

        # one-hot CNN layer
        CNN_APPEND=False,
        CNN_DIFFERENCE=False,
        CNN_PRODUCT=False,
        CNN_COS=False,
        # The one-hot CNN will use the full_vocab parameters
        # memory network
        MEM_VOCAB=0,
        MEM_TYPE='dmn_basic',
        MEM_BATCH=1,
        MEM_EPOCHS=5,

        # PARAMETERS
        # resampling
        RESAMPLING=False,
        NOVEL_RATIO=None,
        OVERSAMPLING=False,
        REPLACEMENT=False,
        SAVE_RESULTS=False,

        # save training data for experimentation and hyperparameter grid search
        SAVEEXPERIMENTDATA=False,
        EXPERIMENTDATAFILE='data/experimentdatafile.pkl',

        # vocabulary
        VOCAB_SIZE=10000,
        STEM=False,
        FULL_VOCAB_SIZE=10000,
        FULL_VOCAB_TYPE='character',
        FULL_CHAR_VOCAB="abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/|_@#$%^&*~`+-=<>()[]{}",
        FULL_VOCAB_STEM=False,
        SEED=41,

        USE_CACHE=False):
    """Build the configuration structures that describe one experiment run.

    The argument list should match experiments/experiments.py.

    Args:
        directory: Data directory; returned unchanged as the first element.
        CNN_APPEND, CNN_DIFFERENCE, CNN_PRODUCT, CNN_COS: One-hot CNN flags.
            Each truthy flag is stored in the ``cnn`` feature dict under
            ``'append'``, ``'difference'``, ``'product'`` or ``'cos'``.
        MEM_VOCAB, MEM_TYPE, MEM_BATCH, MEM_EPOCHS: Memory-network settings.
            Each truthy value is stored under ``'word_vector_size'``,
            ``'network'``, ``'batch_size'`` or ``'epochs'``.
        RESAMPLING, NOVEL_RATIO, OVERSAMPLING, REPLACEMENT: When RESAMPLING is
            truthy a ``'resampling'`` dict is added to the parameters.
        SAVE_RESULTS: Stored as ``'save_results'`` when truthy.
        SAVEEXPERIMENTDATA, EXPERIMENTDATAFILE: When SAVEEXPERIMENTDATA is
            truthy a ``'saveexperimentdata'`` dict holding the file path is
            added to the parameters.
        VOCAB_SIZE, STEM, FULL_VOCAB_SIZE, FULL_VOCAB_TYPE, FULL_CHAR_VOCAB,
        FULL_VOCAB_STEM: Vocabulary options, each stored only when truthy.
        SEED: Random seed. ``None`` falls back to 41; any other value,
            including 0, is kept as given.
        USE_CACHE: Always stored as ``'use_cache'``.

    Returns:
        tuple: ``(directory, features, algorithms, parameters)``.
        ``features['mem_net']`` and ``algorithms['mem_net']`` are the same
        dict object.
    """

    # get features

    cnn = dict()
    if CNN_APPEND: cnn['append'] = CNN_APPEND
    if CNN_DIFFERENCE: cnn['difference'] = CNN_DIFFERENCE
    if CNN_PRODUCT: cnn['product'] = CNN_PRODUCT
    if CNN_COS: cnn['cos'] = CNN_COS
    mem_net = dict()
    if MEM_VOCAB: mem_net['word_vector_size'] = MEM_VOCAB
    # if SEED: mem_net['seed'] = SEED
    if MEM_TYPE: mem_net['network'] = MEM_TYPE
    if MEM_BATCH: mem_net['batch_size'] = MEM_BATCH
    if MEM_EPOCHS: mem_net['epochs'] = MEM_EPOCHS

    # Use the same input params as word2vec

    features = dict()
    features['cnn'] = cnn
    # Warn only when CNN options were actually requested. The check used to be
    # len(features) > 0, which is always true right after the assignment
    # above, so the caution was printed on every call.
    if cnn:
        print("Caution!!  Only the memory network feature and algorithm will be ran as they have to run alone")
    features['mem_net'] = mem_net

    # get algorithms
    # Exactly one algorithm (the memory network) is always configured, so the
    # former "no features" / "zero or several algorithms" guards could never
    # trigger and have been dropped.
    algorithms = dict()
    algorithms['mem_net'] = mem_net

    logger.debug("Algorithms structure: {}".format(algorithms))

    # get parameters
    resampling = None

    if RESAMPLING:
        resampling = dict()
        if NOVEL_RATIO:
            resampling['novelToNotNovelRatio'] = NOVEL_RATIO
            logger.warning("NOVEL_RATIO specified but not supported")
        resampling['over'] = OVERSAMPLING
        resampling['replacement'] = REPLACEMENT

    saveexperimentdata = None
    if SAVEEXPERIMENTDATA:
        saveexperimentdata = dict()
        if EXPERIMENTDATAFILE: saveexperimentdata['experimentdatafile'] = EXPERIMENTDATAFILE

    parameters = dict()
    if RESAMPLING: parameters['resampling'] = resampling
    if SAVE_RESULTS: parameters['save_results'] = SAVE_RESULTS
    if SAVEEXPERIMENTDATA: parameters['saveexperimentdata'] = saveexperimentdata
    if VOCAB_SIZE: parameters['vocab'] = VOCAB_SIZE
    if STEM: parameters['stem'] = STEM
    # 0 is a legitimate seed, so only a missing (None) seed gets the default;
    # a plain truthiness test would silently turn SEED=0 into 41.
    parameters['seed'] = 41 if SEED is None else SEED
    if FULL_VOCAB_SIZE: parameters['full_vocab_size'] = FULL_VOCAB_SIZE
    if FULL_VOCAB_TYPE: parameters['full_vocab_type'] = FULL_VOCAB_TYPE
    if FULL_CHAR_VOCAB: parameters['full_char_vocab'] = FULL_CHAR_VOCAB
    if FULL_VOCAB_STEM: parameters['full_vocab_stem'] = FULL_VOCAB_STEM
    parameters['use_cache'] = USE_CACHE

    return directory, features, algorithms, parameters


In [None]:
def _load_parsed_data(directory, parameters):
    """Return the parsed corpus, going through the pickle cache when enabled."""
    if not parameters['use_cache']:
        return parse_json(directory, parameters)

    corpus_hash = dir_hash(directory)
    pickle_path = os.path.join(cache_dir, cache_pickle.format(corpus_hash))
    try:
        logger.debug("Trying to use cache")
        # NOTE(review): pickle.load can execute arbitrary code; the cache
        # directory must only ever contain files written by this program.
        with open(pickle_path, 'rb') as f:
            parsed_data = pickle.load(f)
        logger.debug("Using existing cache")
    except Exception:
        # A missing or unreadable cache is expected on the first run, so fall
        # back to parsing. Catching Exception rather than using a bare except
        # lets KeyboardInterrupt and SystemExit propagate.
        logger.debug("Parsing and writing to cache")
        parsed_data = parse_json(directory, parameters)
        os.makedirs(cache_dir, exist_ok=True)
        with open(pickle_path, 'wb') as f:
            pickle.dump(parsed_data, f)
    return parsed_data


def main(argv):
    """Control the over-arching implementation of the algorithms.

    Runs the pipeline: parse (optionally via the pickle cache), preprocess,
    generate training and testing observations, optionally save the
    experiment data, then train and test the memory network.

    Args:
        argv: The ``(directory, features, algorithms, parameters)`` tuple
            produced by ``get_args``.

    Returns:
        The performance results returned by ``test_mem_network``. When
        ``'save_results'`` is in ``parameters`` the ``'id'``,
        ``'predicted_label'`` and ``'novelty'`` entries are added to it.

    Raises:
        ValueError: If ``algorithms`` has no ``'mem_net'`` entry.
    """
    print('starting')
    directory, features, algorithms, parameters = argv

    # The seed is not consumed here; it stays available to the helpers
    # through `parameters`.

    # parsing
    print("parsing json data...", file=sys.stderr)
    parsed_data = _load_parsed_data(directory, parameters)
    clusters, order, data, test_clusters, test_order, test_data, corpusdict = parsed_data

    # preprocessing
    print("preprocessing...", file=sys.stderr)
    vocab, full_vocab, encoder_decoder, lda_model, tf_model, w2v_model = preprocess(
        features, parameters, corpusdict, data)

    # featurization
    # get_args() does not populate the hdf5 path keys, so indexing them
    # directly raised KeyError; a missing key is now passed on as None.
    # NOTE(review): assumes gen_observations accepts None for the hdf5 path -
    # confirm against utils.gen_observations.
    hdf5_path_train = parameters.get('hdf5_path_train')
    hdf5_path_test = parameters.get('hdf5_path_test')
    print("generating training data...", file=sys.stderr)
    train_data, train_target, train_ids = gen_observations(
        clusters, order, data, features, parameters, vocab, full_vocab,
        encoder_decoder, lda_model, tf_model, w2v_model, hdf5_path_train)
    print("generating testing data...", file=sys.stderr)
    test_data, test_target, test_ids = gen_observations(
        test_clusters, test_order, test_data, features, parameters, vocab,
        full_vocab, encoder_decoder, lda_model, tf_model, w2v_model,
        hdf5_path_test)

    # save training data for separate experimentation and hyperparameter optimization
    if 'saveexperimentdata' in parameters:
        lunchbox = {
            'directory': directory,
            'features': features,
            'algorithms': algorithms,
            'parameters': parameters,
            'train_data': train_data,
            'train_target': train_target,
            'test_data': test_data,
            'test_target': test_target,
        }
        # Context manager so the output file is flushed and closed
        # deterministically instead of being left to garbage collection.
        with open(parameters['saveexperimentdata']['experimentdatafile'], "wb") as f:
            pickle.dump(lunchbox, f)

    # modeling
    print("running algorithms...", file=sys.stderr)
    if 'mem_net' not in algorithms:
        # Without this guard the code below failed with UnboundLocalError on
        # perform_results, which hid the real cause.
        raise ValueError("No supported algorithm requested: expected a 'mem_net' entry in algorithms")
    mem_net_model, model_name = run_mem_net(train_data, test_data, **algorithms['mem_net'])
    predicted_labels, perform_results = test_mem_network(mem_net_model, model_name,
                                                         **algorithms['mem_net'])

    # results
    if "save_results" in parameters:
        perform_results.update({"id": test_ids})
        perform_results.update({"predicted_label": predicted_labels.tolist()})
        perform_results.update({"novelty": test_target})

    return perform_results


In [None]:
if __name__ == '__main__':
    # get_args() has a required `directory` parameter; calling it with no
    # arguments raised TypeError before anything ran, so the data directory is
    # taken from the first command-line argument instead.
    # NOTE(review): inside a notebook kernel sys.argv holds the kernel's own
    # arguments - there, call get_args("<data directory>") and main() directly.
    if len(sys.argv) < 2 or not os.path.isdir(sys.argv[1]):
        print("Usage: {} <data_directory>".format(sys.argv[0]), file=sys.stderr)
        sys.exit(2)
    args = get_args(sys.argv[1])
    print("Algorithm details and Results:", file=sys.stderr)
    print(main(args), file=sys.stdout)
    sys.exit(0)