In [0]:
# Name of the dataset that the final cell of this notebook writes the
# per-project report DataFrame into (created below if it does not exist yet).
dataset_name = "Dataiku_Hive_Project_Migration"

In [0]:
from datetime import datetime

import dataiku
import pandas as pd
import pprint

from io import StringIO
from IPython.display import FileLink

# Connect to the DSS instance.
# `client` is the API client used by every scanning cell below.
# `project` is the client's default project; the dataset-existence check
# looks up and creates the output dataset through it.
client = dataiku.api_client()
project = client.get_default_project()

In [0]:
# Ensure the output dataset exists in the default project.
#
# The default project is resolved here instead of being read from the shared
# `project` variable, so this cell does not depend on `project` still pointing
# at the default project when the cell is re-run (for example after a cell that
# iterates over other projects).
default_project = client.get_default_project()
existing_datasets = [ds["name"] for ds in default_project.list_datasets()]

if dataset_name in existing_datasets:
    print(f"✅ Dataset '{dataset_name}' already exists.")
else:
    print(f"⚠️ Dataset '{dataset_name}' not found. Creating it now...")
    # Managed CSV dataset stored on the "filesystem_managed" connection.
    builder = (
        default_project.new_managed_dataset(dataset_name)
        .with_store_into("filesystem_managed")
        .with_format("csv")
    )
    builder.create()

    print(f"✅ Dataset '{dataset_name}' has been created.")

In [0]:
# Count the Hive recipes of every project the client can list.
# Result: `df` with one row per project (project_key, hive_recipe_count).
project_hive_counts = []

projects = client.list_project_keys()
for project_key in projects:
    # Use a dedicated name for the scanned project so the notebook-level
    # `project` handle (the default project) is not overwritten by this loop.
    scanned_project = client.get_project(project_key)

    # Compare the recipe type case-insensitively, consistent with the detailed
    # scan further down in this notebook; an exact match on 'Hive' does not
    # count recipes whose type is spelled 'hive'.
    hive_count = sum(
        1
        for recipe in scanned_project.list_recipes()
        if (recipe.get('type') or '').lower() == 'hive'
    )

    project_hive_counts.append({'project_key': project_key, 'hive_recipe_count': hive_count})

# Explicit columns keep the frame's shape stable even when no project is listed.
df = pd.DataFrame(project_hive_counts, columns=['project_key', 'hive_recipe_count'])

In [0]:
# Build the migration report: one row per project with its Hive recipe count,
# the dataset and recipe types it uses, its owner and its last-modified date.
# Projects that raise during inspection are reported on stdout and skipped.
REPORT_COLUMNS = [
    'PROJECT_KEY',
    'NUMBER_OF_HIVE_RECIPES_USED_IN_THIS_PROJECT',
    'Dataset_types_used_in_project',
    'Owner',
    'DATE_PROJECT_WAS_LAST_MODIFIED',
    'Recipe_types_used_in_project',
]


def _join_types(type_names):
    """Return the distinct, non-empty type names as a sorted, comma-separated string.

    A sorted string is used instead of a Python set so the value written to the
    output dataset is stable from run to run and free of None entries.
    """
    return ', '.join(sorted({name for name in type_names if name}))


project_info = []

projects = client.list_project_keys()
for project_key in projects:
    try:
        # Dedicated name so the notebook-level `project` handle (the default
        # project) is not overwritten by this loop.
        scanned_project = client.get_project(project_key)

        # 'versionTag' may be missing or null in the summary. Falling back to an
        # empty dict yields 'unknown_date' instead of raising on None, which
        # would send the whole project to the except branch and drop it.
        version_tag = scanned_project.get_summary().get('versionTag') or {}
        last_modified_timestamp = version_tag.get('lastModifiedOn')
        if last_modified_timestamp:
            # The value is divided by 1000 before conversion (i.e. handled as
            # epoch milliseconds); fromtimestamp() renders it in local time.
            last_modified_date = datetime.fromtimestamp(
                last_modified_timestamp / 1000
            ).strftime('%Y-%m-%d %H:%M:%S')
        else:
            last_modified_date = 'unknown_date'

        # Fetch the recipe list once and derive both the Hive count and the
        # set of recipe types from it.
        recipe_type_names = [recipe.get('type') for recipe in scanned_project.list_recipes()]
        hive_count = sum(1 for name in recipe_type_names if (name or '').lower() == 'hive')

        dataset_type_names = [dataset.get('type') for dataset in scanned_project.list_datasets()]

        project_info.append({
            'PROJECT_KEY': project_key,
            'NUMBER_OF_HIVE_RECIPES_USED_IN_THIS_PROJECT': hive_count,
            'Dataset_types_used_in_project': _join_types(dataset_type_names),
            'Owner': scanned_project.get_permissions().get('owner', 'unknown owner'),
            'DATE_PROJECT_WAS_LAST_MODIFIED': last_modified_date,
            'Recipe_types_used_in_project': _join_types(recipe_type_names),
        })
    except Exception as e:
        # Best effort: keep scanning the remaining projects, but say which one failed.
        print(f"Error processing project {project_key}: {str(e)}")

# Explicit columns keep the frame's shape stable even when every project failed.
df = pd.DataFrame(project_info, columns=REPORT_COLUMNS)
df

In [0]:
# Plain-text rendering of the report DataFrame built in the previous cell.
print(df)

In [0]:
# Key of the default project; the write cell below passes it as `project_key`
# when opening the output dataset. Printed for a quick visual check.
pk = dataiku.default_project_key()
print(pk)


In [0]:
# Write the report DataFrame `df` into the dataset named `dataset_name` in
# project `pk`.
output_dataset = dataiku.Dataset(dataset_name, project_key=pk)
# NOTE(review): write_dataframe presumably expects the dataset schema to already
# match `df`; on a freshly created dataset the write_with_schema variant below
# may be required instead — confirm against the Dataiku Dataset API docs.
# output_dataset.write_with_schema(df)
output_dataset.write_dataframe(df)
# Alternative kept for reference (drops and recreates the dataset before writing):
#output_dataset.write_dataframe(df, drop_and_create=True)
