In [18]:
"""
    This notebook builds a curriculum learning dataset.
    Idea:
        We have a silver dataset of recipes with variable lengths. Based on our assumption that shorter recipes are easier to learn from,
        we split the silver dataset into multiple chunks and progressively introduce them during training. We start with the shortest recipes
        and gradually introduce longer ones to ease learning.
"""


## imports
from mtrfg.utils import read_conllu_dataset_allennlp, write_text, make_dir, get_allennlp_in_conllu_format
import os
from math import floor


In [19]:
## input: the silver parser dataset, one CoNLL-U file per split
train_file, dev_file, test_file = (
    '/data/Ahmad/Silver/Parser/train.conllu',
    '/data/Ahmad/Silver/Parser/dev.conllu',
    '/data/Ahmad/Silver/Parser/test.conllu',
)

## output: directory where the curriculum learning chunks will be stored;
## make_dir is called on it before anything gets written
path_to_curriculum_data = '/data/Multitask_RFG/curriculum_learning_data/'
make_dir(path_to_curriculum_data)

In [20]:
## read every split with the same reader, in train / dev / test order
train_data, dev_data, test_data = (
    read_conllu_dataset_allennlp(conllu_file)
    for conllu_file in (train_file, dev_file, test_file)
)

In [21]:
## helper functions that we will need later

def get_curriculum_learning_dataset(input_data, data_splits = 10):
    """
        Split a dataset into `data_splits` chunks, ordered from the shortest recipes to the longest ones.

        input_data: List of allennlp Instances. The length of a recipe is taken as len(data_point['words'].tokens).
        data_splits: Number of curriculum learning chunks to build (if there are 100 input data points and data_splits is 5,
                     we get 5 distinct datasets of 20 recipes each). Must satisfy 1 <= data_splits <= len(input_data).

        Output: List of exactly `data_splits` datasets, each list element is 1 dataset: the recipes of that chunk, converted with
                get_allennlp_in_conllu_format and joined with two newline characters. List is in sorted order, starting elements
                have the smallest recipes and end elements have the largest recipes. When the number of recipes is not a multiple
                of data_splits, chunk sizes differ by at most one recipe (no extra leftover chunk is produced).

        Raises: ValueError if data_splits is smaller than 1 or larger than the number of recipes.
    """
    input_data = list(input_data) ## we need len() and random access below
    num_recipes = len(input_data)

    if not 1 <= data_splits <= num_recipes:
        raise ValueError(f'data_splits must be between 1 and the number of recipes ({num_recipes}), got {data_splits}')

    ## sort recipe indices by recipe length, smallest first; sorted() is stable, so recipes of equal length keep their input order
    recipe_lengths = [len(data_point['words'].tokens) for data_point in input_data]
    sorted_indices = sorted(range(num_recipes), key = recipe_lengths.__getitem__)

    ## let's get the datasets: chunk number `split` covers sorted positions [start, end). Computing the boundaries with integer
    ## arithmetic spreads any remainder over the chunks, so we always end up with exactly `data_splits` non-empty chunks
    output_datasets = []
    for split in range(data_splits):
        start = (split * num_recipes) // data_splits
        end = ((split + 1) * num_recipes) // data_splits
        chunk = [get_allennlp_in_conllu_format(input_data[idx]) for idx in sorted_indices[start:end]]
        output_datasets.append('\n\n'.join(chunk))

    return output_datasets

In [22]:
## define datasplits
data_splits = 10

## let's build the datasets: one list of chunks per split
train_datasets = get_curriculum_learning_dataset(train_data, data_splits = data_splits)
dev_datasets = get_curriculum_learning_dataset(dev_data, data_splits = data_splits)
test_datasets = get_curriculum_learning_dataset(test_data, data_splits = data_splits)

print(len(train_datasets), len(dev_datasets), len(test_datasets))

## let's save the datasets! We iterate over the chunks that were actually built instead of range(data_splits),
## so no chunk is silently left unsaved (or indexed out of range) if a list does not hold exactly data_splits chunks
for split_name, split_datasets in (('train', train_datasets), ('dev', dev_datasets), ('test', test_datasets)):
    for i, dataset in enumerate(split_datasets):
        write_text(os.path.join(path_to_curriculum_data, f'{split_name}_{i}.conllu'), dataset)


10 10 10
