# Generate automatic documentation for datasets in a list of projects

Written by Tim Honker - Oct 2025

In [0]:
import dataiku
from dataikuapi.utils import DataikuException
from dataiku import pandasutils as pdu
import pandas as pd

In [0]:
def get_dataset_long_description(dataset_handle):
    """Return the dataset's long description, or '' when none is stored.

    The value is read from the ``description`` entry of the mapping
    returned by ``dataset_handle.get_metadata()``.
    """
    return dataset_handle.get_metadata().get('description', '')


def get_dataset_short_description(dataset_handle):
    """Return the dataset's short description, or '' when none is stored.

    The value is read from the ``shortDesc`` entry of the raw settings
    obtained through ``dataset_handle.get_settings().get_raw()``.
    """
    raw_settings = dataset_handle.get_settings().get_raw()
    return raw_settings.get('shortDesc', '')


def get_dataset_column_descriptions(dataset_handle):
    """Return the description of every column of the dataset, in schema order.

    Column descriptions are read from the ``comment`` entry of each item in
    the ``columns`` list of ``dataset_handle.get_schema()``.

    A column with no ``comment`` entry (or a null one) contributes an empty
    string. Looking the key up per column matters: indexing ``item["comment"]``
    directly raises ``KeyError`` on the first uncommented column, and treating
    that as "no descriptions at all" hides exactly the columns that still
    need documentation.

    Returns:
        A list with one entry per column. The list is empty when the schema
        has no ``columns`` entry.
    """
    schema = dataset_handle.get_schema()
    columns = schema.get("columns") or []
    return [column.get("comment") or "" for column in columns]


def dataset_has_full_documentation(project_handle, dataset_id):
    """Tell whether a dataset has all of its description fields filled in.

    The dataset is looked up with ``project_handle.get_dataset(dataset_id)``
    and counts as fully documented only when:

    * its long description is non-empty,
    * its short description is non-empty, and
    * every column description is non-empty after stripping whitespace.

    Returns ``True`` when all three hold, ``False`` otherwise.
    """
    handle = project_handle.get_dataset(dataset_id)

    # Each check returns as soon as it fails, so the remaining lookups are
    # skipped for a dataset already known to be incomplete.
    if not get_dataset_long_description(handle):
        return False

    if not get_dataset_short_description(handle):
        return False

    comments = get_dataset_column_descriptions(handle)

    # A column passes only when its comment is truthy and not pure whitespace.
    return all(comment and comment.strip() for comment in comments)

In [0]:
# API client used by this cell and by the later project-description cell.
client = dataiku.api_client()

# Running tally of AI-description requests issued; the later project cell
# keeps adding to the same counter.
num_AI_services_used = 0

PROJECTS_LIST = [
    'SOL_MBA',
    'HCP_TARGET_AGENTS',
    'GOOGLE_DRIVE_AUTOMATIC_TRIGGER',
    'SOL_BATCH_PERF_OPTIM',
    'FINAIADVISORSTARTER',
    'SOL_DEMAND_FORECAST',
    'CHURNPREDICTIONUSINGSLACKMESSAGES',
    'PMMOPTIMIZINGOMNICHANNELMARKETINGLLM',
    'DKU_TSHIRTS',
]

# For every dataset of every listed project that is missing a description
# field, ask DSS to generate and save an AI description.
for project_key in PROJECTS_LIST:
    project_handle = client.get_project(project_key)

    for dataset in project_handle.list_datasets():
        dataset_id = dataset['name']

        if dataset_has_full_documentation(project_handle, dataset_id):
            continue

        print(f"Auto-generating documentation for dataset: {dataset_id}")
        dataset_handle = project_handle.get_dataset(dataset_id)

        # Count the attempt before issuing it, so requests that end in an
        # error are still part of the tally reported at the end.
        num_AI_services_used += 1
        try:
            # this blocks execution, doesn't utilize Futures/JobID system
            dataset_handle.generate_ai_description(save_description=True)
        except DataikuException as exc:
            # Keep going with the remaining datasets, but surface the cause.
            print(f"[ERROR] Failed to update dataset {dataset_id} in {project_key}: {exc}")

print(f"Successfully finished. Used {num_AI_services_used} calls to Dataiku's API Services which are limited to a total of 1000/day")

In [0]:
# https://developer.dataiku.com/latest/api-reference/python/projects.html#dataikuapi.dss.project.DSSProject.generate_ai_description
# generate_ai_description(language='english', purpose='generic', length='medium', save_description=False)

# https://developer.dataiku.com/latest/api-reference/python/flow.html#dataikuapi.dss.flow.DSSFlowZone.generate_ai_description
# generate_ai_description(language='english', purpose='generic', length='medium', save_description=False)

In [0]:
def is_project_description_empty(project_handle):
    """Return True when the project has no description, False otherwise.

    The description is the ``description`` entry of
    ``project_handle.get_metadata()``. A missing entry and a falsy value
    (such as an empty string) both count as empty.
    """
    try:
        description = project_handle.get_metadata()['description']
    except KeyError:
        return True
    return not description

In [0]:
# Process every project key the client can list instead of a hand-picked set.
PROJECTS_LIST = client.list_project_keys()
# Hand-picked list used by the dataset cell, kept for reference:
# ['SOL_MBA', 'HCP_TARGET_AGENTS', 'GOOGLE_DRIVE_AUTOMATIC_TRIGGER', 'SOL_BATCH_PERF_OPTIM', 'FINAIADVISORSTARTER', 'SOL_DEMAND_FORECAST', 'CHURNPREDICTIONUSINGSLACKMESSAGES', 'PMMOPTIMIZINGOMNICHANNELMARKETINGLLM', 'DKU_TSHIRTS']

# Generate and save an AI description for each project whose description is
# empty; projects that already have one are left untouched.
for project_key in PROJECTS_LIST:
    project_handle = client.get_project(project_key)

    if not is_project_description_empty(project_handle):
        continue

    print(f"Project {project_key} had an empty description, generating AI description for it.")

    # Count the attempt before issuing it, so requests that end in an error
    # are still part of the tally reported at the end.
    num_AI_services_used += 1
    try:
        project_handle.generate_ai_description(language='english', purpose='generic', length='medium', save_description=True)
    except DataikuException as exc:
        # Same policy as the dataset loop: report the failure and move on,
        # so one failing project does not abort the remaining ones.
        print(f"[ERROR] Failed to update project {project_key}: {exc}")


print(f"Successfully finished. Used {num_AI_services_used} calls to Dataiku's API Services which are limited to a total of 1000/day")
# Description fields read in this notebook: 'shortDesc' (dataset settings)
# and 'description' (dataset and project metadata).