In [0]:
# Migrates every dataset of type S3 whose connection is not "dataiku-managed-storage" to the "dataiku-managed-storage" connection, keeping its path.

import dataiku
from dataiku import pandasutils as pdu
import pandas as pd

In [0]:
# Name of the connection that migrated S3 datasets are re-pointed to.
new_s3_connection = "dataiku-managed-storage"

In [0]:
# API client handle used by the cells below to look up projects.
client = dataiku.api_client()
# Keys of the projects to iterate over, as returned by the client.
project_keys = client.list_project_keys()

In [0]:
def get_all_dataset_metadata_for_project(project_handle):
    """Summarise the datasets of one project as a DataFrame.

    Parameters
    ----------
    project_handle
        Object exposing ``list_datasets()``, which must return an iterable of
        dict-like dataset descriptions (keys read here: ``type``, ``name`` and
        the nested ``params`` mapping).

    Returns
    -------
    pandas.DataFrame
        One row per dataset with the columns ``type``, ``connection``,
        ``name``, ``table``, ``catalog``, ``schema`` and ``path``, sorted by
        ``type``, ``connection`` and ``name``. Values missing from a dataset
        description are left as ``None``. An empty, column-less DataFrame is
        returned when the project has no datasets.
    """
    # Use the handle that was passed in. Reading a notebook-global `project`
    # here would make the result depend on hidden kernel state.
    datasets = project_handle.list_datasets()
    if len(datasets) == 0:
        return pd.DataFrame()

    param_fields = ('connection', 'table', 'catalog', 'schema', 'path')
    extracted_data = []
    for row in datasets:
        # `params` may be absent or explicitly None; treat both as "no params".
        params = row.get('params') or {}
        record = {'type': row.get('type'), 'name': row.get('name')}
        record.update({field: params.get(field) for field in param_fields})
        extracted_data.append(record)

    # Fix the column order explicitly so it does not depend on dict build order.
    columns = ['type', 'connection', 'name', 'table', 'catalog', 'schema', 'path']
    return pd.DataFrame(extracted_data, columns=columns).sort_values(
        by=['type', 'connection', 'name']
    )

    
def check_connection_exists(connection_name):
    """Tell whether `connection_name` is among the instance's connections.

    Performs a membership test against whatever
    ``dataiku.api_client().list_connections()`` returns and hands back the
    resulting boolean.
    """
    known_connections = dataiku.api_client().list_connections()
    return connection_name in known_connections

In [0]:
# Re-point every S3 dataset that is not yet on `new_s3_connection` to that
# connection, keeping each dataset's current path. The target connection name
# is taken from `new_s3_connection` everywhere, so the existence check, the
# filter and the update can never disagree. Instances on which the target
# connection does not exist are left untouched.
if check_connection_exists(new_s3_connection):
    for project_key in project_keys:
        project = client.get_project(project_key)
        df = get_all_dataset_metadata_for_project(project)

        if df.empty:
            print(f'Project {project_key} has no datasets')
            continue

        # S3 datasets whose connection differs from the target one.
        needs_migration = (df['type'] == 'S3') & (df['connection'] != new_s3_connection)
        dataset_names_to_migrate = df.loc[needs_migration, 'name'].unique().tolist()
        if not dataset_names_to_migrate:
            print(f"Project {project_key} has datasets, but none that need to be migrated")
            continue

        for dataset_name in dataset_names_to_migrate:
            settings = project.get_dataset(dataset_name).get_settings()
            current_path = settings.get_raw_params().get('path')
            if current_path is None:
                # Without a path there is nothing to carry over; skip this
                # dataset instead of aborting the whole run with a KeyError.
                print(f"Project {project_key} dataset {dataset_name} has no path configured, skipped")
                continue
            # Update to the new S3 connection, preserving the existing path.
            settings.set_connection_and_path(new_s3_connection, current_path)
            settings.save()
            print(f"Project {project_key} dataset {dataset_name} updated to use connection: {new_s3_connection}")
else:
    print(f'This instance is on-prem, not cloud and does not have access to {new_s3_connection}')

In [0]:
# Report: print the dataset metadata table of every project.
for project_key in project_keys:
    project = client.get_project(project_key)
    df = get_all_dataset_metadata_for_project(project)
    print(df)