# Clear all calculated datasets for full build

Clears datasets unless they are:
* only an input and not an output
* used as neither an input nor an output (isolated)
* an input or output to a Text Extraction or knowledge bank recipe

In [0]:
from dataiku import api_client

In [0]:
# Connect to the Dataiku instance and fetch the project handle that the
# following cells query for datasets and recipes.
# NOTE(review): presumably the default project is the one this notebook runs in — confirm.
client = api_client()
project = client.get_default_project()

In [0]:
def extract_key_values(dict_list, key):
    """
    Collect the distinct values stored under ``key`` across a list of dictionaries.

    Dictionaries that do not contain ``key`` are skipped.

    Parameters:
        dict_list (list): Dictionaries to scan.
        key (str): Key to look up in each dictionary.

    Returns:
        set: Every value found under ``key``, de-duplicated.
    """
    found = set()
    for entry in dict_list:
        if key in entry:
            found.add(entry[key])
    return found

In [0]:
# Set of the "name" values of every dataset the project lists.
dataset_listing = project.list_datasets()
all_datasets = extract_key_values(dataset_listing, "name")
all_datasets

In [0]:
# Fetch the project's recipes once; the cells below read each entry's
# 'inputs' and 'outputs' mappings.
all_recipes = project.list_recipes()

In [0]:
# Roles (keys of a recipe's 'inputs'/'outputs' mapping) whose items are collected.
# Also read by the cell that builds output_datasets.
keys_to_extract = ['main', 'output_dataset', 'input_folder', 'knowledge_bank']

# Refs of every item that any recipe consumes under one of the roles above.
input_datasets = set()

for recipe in all_recipes:
    recipe_inputs = recipe['inputs']
    print(recipe_inputs)  # show the raw inputs mapping for inspection
    for role in keys_to_extract:
        if role not in recipe_inputs:
            continue  # this recipe has nothing under that role
        input_datasets.update(item['ref'] for item in recipe_inputs[role]['items'])

input_datasets

In [0]:
# Refs of every item that any recipe produces under one of the roles listed
# in keys_to_extract.
output_datasets = set()

for recipe in all_recipes:
    recipe_outputs = recipe['outputs']
    print(recipe_outputs)  # show the raw outputs mapping for inspection
    for role in keys_to_extract:
        if role not in recipe_outputs:
            continue  # this recipe has nothing under that role
        output_datasets.update(item['ref'] for item in recipe_outputs[role]["items"])

output_datasets

In [0]:
# Refs that some recipe consumes but no recipe produces.
input_only_datasets = input_datasets.difference(output_datasets)
input_only_datasets

In [0]:
# Every project dataset except the input-only ones. Despite the name, datasets
# that are both an input and an output stay in this set.
non_input_datasets = all_datasets.difference(input_only_datasets)
non_input_datasets

In [0]:
# Project datasets that appear in neither the collected inputs nor the
# collected outputs.
isolated_datasets = all_datasets.difference(input_datasets, output_datasets)
isolated_datasets

In [0]:
# Remove the isolated datasets from the candidates. What remains is exactly the
# project datasets that are also present in output_datasets.
datasets_to_clear = non_input_datasets.difference(isolated_datasets)
datasets_to_clear

In [0]:
# Complement of the clear list within the project's datasets, shown for review.
datasets_to_leave_alone = all_datasets.difference(datasets_to_clear)
datasets_to_leave_alone

In [0]:
# # Clear each non-input dataset
# for dataset_name in datasets_to_clear:
#     if dataset_name in all_datasets:   # don't try to clear knowledge banks and stuff
#         dataset = project.get_dataset(dataset_name)
#         dataset.clear()  # Clears all data in the dataset
#         print(f"Cleared dataset: {dataset_name}")

# print("Finished clearing non-input datasets.")

In [0]:
# There is no API method to delete/clear/clean/empty knowledge banks
# knowledge_banks = project.list_knowledge_banks()

# # Iterate over each Knowledge Bank and clear its contents
# for kb in knowledge_banks:
#     kb_handle = project.get_knowledge_bank(kb['id'])
