# Generate automatic documentation for datasets, projects, and flow zones

Written by Tim Honker - Oct 2025

**Warning:** this notebook overwrites ALL of a dataset's descriptions (short description, long description, and every column description) whenever any one of them is empty.

This notebook will iterate through all projects on the local DSS node (or a user specified list) and:

1. Check each dataset to see if it has ALL of the following: short description, long description, and each column has a description. If ANY one of these is empty, then it will re-generate ALL of them using AI and automatically save them.
 
2. Fill in any empty Flow Zone descriptions. Only the long description is checked and generated; the short description is ignored, and zones that contain no dataset or recipe are skipped.

3. Fill in any empty Project descriptions. (Short descriptions are ignored).

This is code to prototype a future DSS Plugin that I'm working on with Abi Edwards.


Possible future features:

* add support for tagging entities (Projects, Datasets, Flow Zones) where the description was AI generated
* add support for a special tag that prevents any changes to an entity, so that projects, datasets, etc. can be opted out of AI-generated descriptions
* more difficult - add support for filling in only the empty fields of a dataset, leaving previously populated fields unchanged.

# Setup / Config

In [0]:
from json import JSONDecodeError, dumps
import sys
import traceback

import pandas as pd
import dataiku
from dataikuapi.utils import DataikuException
import dataikuapi

In [0]:
# Future plugin-level settings.

# Optional list of project keys to process. Leave it commented out to process
# every project on the local node: the setup cell assigns
# client.list_project_keys() to PROJECTS_LIST when the name is undefined.
# PROJECTS_LIST = ['SOL_MBA', 'HCP_TARGET_AGENTS', 'GOOGLE_DRIVE_AUTOMATIC_TRIGGER']

# Passed as save_description= to every generate_ai_description() call below.
# Set SAVE_DESCRIPTION to False for a dry run: descriptions are generated but not saved.
# Set to True to save the AI-generated descriptions.
# NOTE: a dry run still calls generate_ai_description(), so it still counts
# towards num_AI_services_used.
SAVE_DESCRIPTION = True

# Language to generate descriptions in (used for datasets, flow zones and projects).
# Supported languages are "dutch", "english", "french", "german", "portuguese", and "spanish".
LANGUAGE = 'english'

################################################################################
# Settings for generating descriptions for Projects
################################################################################
#    https://developer.dataiku.com/latest/api-reference/python/projects.html#dataikuapi.dss.project.DSSProject.generate_ai_description
# The purpose of the generated description.
# Supported purposes are "generic", "technical", "business_oriented", and "executive" (defaults to generic).
PROJECT_PURPOSE='generic'

# The length of the generated description.
# Supported lengths are "low", "medium", and "high" (defaults to medium).
PROJECT_LENGTH='medium'

################################################################################
# Settings for generating descriptions for Flow Zones
################################################################################
# Purpose and length of the generated flow zone descriptions.
# NOTE(review): presumably the same supported values as the project settings
# above - confirm against the API reference linked below.
#   https://developer.dataiku.com/latest/api-reference/python/flow.html#dataikuapi.dss.flow.DSSFlowZone.generate_ai_description
FLOWZONE_PURPOSE='generic'

FLOWZONE_LENGTH='medium'

In [0]:
# Handle on the DSS instance this notebook runs against.
client = dataiku.api_client()

# No explicit project selection was configured: process every project on the local node.
if 'PROJECTS_LIST' not in globals():
    PROJECTS_LIST = client.list_project_keys()

# Initialise the AI-services call counter only the first time this cell runs, so
# that re-running the notebook in the same kernel keeps accumulating the calls
# made so far (handy for tracking usage over a 24 hour period, as long as the
# kernel is not restarted).
if 'num_AI_services_used' not in globals():
    num_AI_services_used = 0

# Function definitions

In [0]:
def is_project_empty(project_handle):
    """Return True when the project contains neither datasets nor recipes.

    Intended as a pre-check before asking for an AI-generated project
    description: per the original author's note, that call fails with a
    JSONDecodeError on projects that have nothing in their flow.
    Flow zones are not inspected here.
    """
    dataset_count = len(project_handle.list_datasets())
    recipe_count = len(project_handle.list_recipes())

    return dataset_count == 0 and recipe_count == 0


def get_dataset_long_description(dataset_handle):
    """Return the 'description' entry of the dataset's metadata, or '' when the key is absent."""
    return dataset_handle.get_metadata().get('description', '')


def get_dataset_short_description(dataset_handle):
    """Return the 'shortDesc' entry of the dataset's raw settings, or '' when the key is absent."""
    raw_settings = dataset_handle.get_settings().get_raw()
    return raw_settings['shortDesc'] if 'shortDesc' in raw_settings else ''


def get_dataset_column_descriptions(dataset_handle):
    """Return one description string per column of the dataset's schema.

    A column whose "comment" entry is missing or None contributes an empty
    string, so callers can detect undocumented columns individually.
    Previously a single column without a "comment" key made the whole
    function return '', which callers iterating over the result interpreted
    as "no undocumented columns".

    Returns:
        A list of strings in schema order; an empty list when the schema has
        no "columns" entry.
    """
    schema = dataset_handle.get_schema()
    columns = schema.get('columns') or []
    return [column.get('comment') or '' for column in columns]


def dataset_has_full_documentation(project_handle, dataset_id):
    """
    Tell whether a dataset is completely documented, i.e. ALL of the following hold:
        1) its long description is not empty
        2) its short description is not empty
        3) every column description is non-empty and not whitespace-only

    Used to decide whether descriptions should be auto-generated for the dataset.
    """
    handle = project_handle.get_dataset(dataset_id)

    # The checks run in this order and stop at the first one that fails.
    if not get_dataset_long_description(handle) or not get_dataset_short_description(handle):
        return False

    return all(
        text and text.strip()
        for text in get_dataset_column_descriptions(handle)
    )


def is_project_description_empty(project_handle):
    """
    Return True when the project's metadata has no 'description' key or the
    stored description is empty; return False when a description exists.
    Used to decide whether a description should be auto-generated.
    """
    try:
        description = project_handle.get_metadata()['description']
    except KeyError:
        return True
    return not description


def pretty_print_dict(d):
    """Print *d* to stdout as JSON, indented by 4 spaces with keys sorted.

    The text is serialised with json.dumps and printed once. The previous
    version wrapped json.dump (which writes to the stream and returns None)
    in print(), so every call emitted a stray "None" after the JSON.
    """
    import json
    print(json.dumps(d, indent=4, sort_keys=True))


def flow_zone_has_description(flow_zone_handle):
    """
    Return True when the flow zone already has a non-empty long description.
    Used to decide whether a description should be auto-generated.

    Only the 'description' entry of the zone's raw settings is inspected: per
    the original author's note, the AI generation only creates the long
    description, not the short one.

    Returns False when the description is missing, None or empty, and also
    when the zone settings cannot be decoded (JSONDecodeError), in which case
    an [ERROR] line is printed.
    """
    try:
        flow_zone_settings = flow_zone_handle.get_settings().get_raw()
    except JSONDecodeError as exc:
        # The settings never loaded, so neither their name nor their content
        # can be reported; fall back to the handle's id when it exposes one.
        zone_id = getattr(flow_zone_handle, 'id', '<unknown>')
        print(f"[ERROR] Trying to get description for flow zone {zone_id}: {exc}")
        return False

    # A key that is present but set to None must count as "no description"
    # instead of crashing len().
    flow_zone_description = flow_zone_settings.get('description') or ''
    return len(flow_zone_description) > 0
    
    
def is_flowzone_empty(flowzone_handle):
    """Return True when the flow zone contains no dataset and no recipe.

    Intended as a pre-check before generate_ai_description is called on the
    zone: per the original author's note, there has to be at least one recipe
    or one dataset in order to explain a zone, and the call otherwise fails
    with a JSONDecodeError.

    Items are matched with isinstance, so subclasses of the dataset and recipe
    classes count as well. Other kinds of items do not make the zone
    "non-empty".
    """
    explainable_types = (
        dataiku.Dataset,
        dataikuapi.dss.recipe.DSSRecipe,
        dataikuapi.dss.dataset.DSSDataset,
    )
    return not any(
        isinstance(item, explainable_types)
        for item in flowzone_handle.items
    )


def read_first_dataset_row(project_key, dataset_name):
    """Return True when the first row of the dataset could be read.

    False means the dataset was either empty or reading it raised an
    exception. Useful for determining if autogenerated descriptions of a
    dataset are possible, before spending an AI-services call on it.

    Args:
        project_key: key of the project that owns the dataset.
        dataset_name: name of the dataset inside that project.

    Returns:
        True if a one-row dataframe could be loaded, False otherwise. The
        reason for a False result is printed.
    """
    try:
        dataset_handle = dataiku.Dataset(dataset_name, project_key=project_key)
        df = dataset_handle.get_dataframe(limit=1)
    except Exception as e:
        # Deliberately broad: this is a best-effort probe and any failure means
        # "not readable". The cause is reported instead of silently dropped so
        # that skipped datasets can be diagnosed.
        print(f"[WARN] Could not read {project_key} - {dataset_name}: {type(e).__name__}: {e}")
        return False

    # Check if the dataframe is empty (no rows)
    if df.empty:
        print("Dataset is empty.")
        return False

    return True


# def tag_object_as_auto_generated(object_handle):
#     """
#     x
#     """
#     if isinstance(object_handle, dataiku.Dataset):
        
#     elif isinstance(object_handle, dataiku.DSSProject):
#         project_tags = project.get_tags()
#         project_tags.append("AI_generated_description")
#         project.set_tags(project_tags)
# #     elif isinstance(object_handle, dataiku.Dataset):

        

        

#         dataset_settings = dataset.get_settings()
#         dataset_settings.tags.append("AI_generated_description")
#         dataset_settings.save()

# Datasets

In [0]:
# Walk every selected project and auto-document the datasets that still need it.
for project_key in PROJECTS_LIST:
    project_handle = client.get_project(project_key)

    for dataset in project_handle.list_datasets():
        dataset_id = dataset['name']
        dataset_handle = project_handle.get_dataset(dataset_id)

        # Guard 1: the dataset must exist.
        if not dataset_handle.exists():
            print(f"[SKIP] dataset does not exist:   {project_key} - {dataset_id}")
            continue

        # Guard 2: the dataset must have at least one schema column.
        if len(dataset['schema'].get('columns','')) == 0:
            print(f"[SKIP] dataset has empty schema: {project_key} - {dataset_id}")
            continue

        # Guard 3: nothing to do when every description field is already filled out.
        if dataset_has_full_documentation(project_handle, dataset_id):
            continue

        # Guard 4: the first row must be readable. VERY IMPORTANT, as it filters
        # out a lot of wasted AI Services calls.
        if not read_first_dataset_row(project_key, dataset_id):
            print(f"[SKIP] dataset could not be read: {project_key} - {dataset_id}")
            continue

        print(f"Auto-generating documentation for {project_key}'s dataset: {dataset_id} ...")

        # Count the call BEFORE making it, because generate_ai_description
        # often raises an exception.
        num_AI_services_used += 1
        try:
            # Blocking call (no Futures/JobID system): generates the descriptions
            # and saves them when SAVE_DESCRIPTION is True.
            _ = dataset_handle.generate_ai_description(language=LANGUAGE, save_description=SAVE_DESCRIPTION)
        except DataikuException as e:
            # Many different errors surface here, for example:
            # java.lang.IllegalArgumentException: Column not found in schema:
            print(f"[ERROR] Exception {e} when autofilling:         {project_key} - {dataset_id}")

# Flow Zones

In [0]:
# Iterate through the list of projects and fill in missing flow zone descriptions.
for project_key in PROJECTS_LIST:
    project_handle = client.get_project(project_key)
    flow_handle = project_handle.get_flow()

    # Iterate through each flow zone in a specific project
    for flow_zone_handle in flow_handle.list_zones():
        # Reset the per-zone state first, so that the log messages below never
        # reference an undefined name or values left over from the previous zone.
        flow_zone_settings = {}
        flow_zone_name = '<unknown>'
        try:
            # Get the settings and name of the flow zone BEFORE any message uses
            # them (the [SKIP] message previously read flow_zone_name before it
            # was assigned).
            flow_zone_settings = flow_zone_handle.get_settings().get_raw()
            flow_zone_name = flow_zone_settings.get('name','')

            # Ensure that the flow zone meets the requirements for AI-Gen descriptions
            # before attempting to have AI generate the description.
            if is_flowzone_empty(flow_zone_handle):
                print(f"[SKIP] Flow zone must have dataset or recipe in it to autogenerate description: {project_key} - {flow_zone_name}")
                continue

            # Only have AI write the description if there is not one there already
            if not flow_zone_has_description(flow_zone_handle):
                print(f"[CREATE] Generating flow zone documentation for {project_key} - {flow_zone_name}")
                # Counted before the call so that failed attempts are included.
                num_AI_services_used += 1
                flow_zone_handle.generate_ai_description(
                    language=LANGUAGE,
                    purpose=FLOWZONE_PURPOSE,
                    length=FLOWZONE_LENGTH,
                    save_description=SAVE_DESCRIPTION
                )
        except (DataikuException, JSONDecodeError):
            # Report the failing zone and move on to the next one.
            print(f"[ERROR] Creating flow zone description for {project_key} - {flow_zone_name}")
            pretty_print_dict(flow_zone_settings)

# Projects

In [0]:
# Iterate through the list of projects and fill in missing project descriptions.
for project_key in PROJECTS_LIST:
    try:
        project_handle = client.get_project(project_key)

        # Ensure that the project meets the requirements for creating AI generated descriptions
        if is_project_empty(project_handle):
            print(f"[SKIP] Project must have datasets or recipes in flow, can't create description: {project_key}")
            continue

        # Only generate descriptions if there is not one already:
        if is_project_description_empty(project_handle):
            print(f"Project {project_key} has an empty description, generating AI description for it.")
            # Counted before the call so that failed attempts are included.
            num_AI_services_used += 1

            # https://developer.dataiku.com/latest/api-reference/python/projects.html#dataikuapi.dss.project.DSSProject.generate_ai_description
            project_handle.generate_ai_description(
                language=LANGUAGE,
                purpose=PROJECT_PURPOSE,
                length=PROJECT_LENGTH,
                save_description=SAVE_DESCRIPTION
            )

    except JSONDecodeError:
        print(f"[JSONDecodeError] Creating project description for {project_key}")
    except DataikuException as e:
        # Same policy as the dataset and flow zone loops: a DSS-side failure on
        # one project is reported and must not abort the remaining projects.
        print(f"[ERROR] Exception {e} when creating project description for {project_key}")

In [0]:
# Display the number of AI Services calls attempted so far in this kernel session.
# The counter is incremented before each generate_ai_description() call, so calls
# that raised an exception are included in the total.
print(f"Successfully finished. Used {num_AI_services_used} calls to Dataiku's API Services which are limited to a total of 1000/day")