In [0]:
# Name of the DSS dataset this notebook works with: a later cell looks this
# name up in the default project's dataset list and creates the dataset when
# it is missing.
dataset_name = "Dataiku_Hive_Project_Migration"


In [0]:
from datetime import datetime

import dataiku
import pandas as pd
import pprint

from io import StringIO
from IPython.display import FileLink

# Obtain a DSS API client and a handle on the default project.
# NOTE(review): "default project" is whatever `get_default_project()` resolves
# to — presumably the project hosting this notebook; confirm.
client = dataiku.api_client()
project = client.get_default_project()

In [0]:
# Make sure the target dataset is present in the project before anything
# tries to write to it.
dataset_exists = any(
    listed["name"] == dataset_name for listed in project.list_datasets()
)

if not dataset_exists:
    print(f"⚠️ Dataset '{dataset_name}' not found. Creating it now...")

    # Settings for a CSV-formatted Filesystem dataset. The connection named
    # below must already be defined on the DSS instance; the storage path is
    # simply the dataset name.
    filesystem_params = {
        "connection": "filesystem_managed",
        "path": dataset_name,
        "format": "csv",
    }
    project.create_dataset(
        dataset_name=dataset_name,
        type="Filesystem",
        params=filesystem_params,
        formatType="csv",
    )

    print(f"✅ Dataset '{dataset_name}' has been created.")
else:
    print(f"✅ Dataset '{dataset_name}' already exists.")

In [0]:
# Build one record per project holding the number of Hive recipes it contains.
project_hive_counts = []

projects = client.list_project_keys()
for project_key in projects:
    # A dedicated loop variable is used so the notebook-level `project`
    # handle (the default project) is not overwritten; cells that rely on it
    # can then be re-run after this one.
    current_project = client.get_project(project_key)

    # The recipe type is compared case-insensitively, consistent with the
    # detailed per-project report later in this notebook.
    recipes = current_project.list_recipes()
    hive_count = sum(1 for recipe in recipes if recipe['type'].lower() == 'hive')

    project_hive_counts.append({'project_key': project_key, 'hive_recipe_count': hive_count})

# Explicit columns keep the frame well-formed (and sortable) even when the
# instance exposes no projects and the record list is empty.
df = pd.DataFrame(project_hive_counts, columns=['project_key', 'hive_recipe_count'])
df.sort_values(by="hive_recipe_count", ascending=False)

In [0]:


# Connect to the DSS instance
client = dataiku.api_client()

# Fixed column layout for the report. Passing it to the DataFrame constructor
# guarantees every column exists (filled with NaN where a row lacks it), even
# when every project failed or there are no projects at all, and keeps the
# column order independent of which project happens to be processed first.
report_columns = [
    'PROJECT_KEY',
    'NUMBER_OF_HIVE_RECIPES_USED_IN_THIS_PROJECT',
    'DATASET_TYPES',
    'PERMISSIONS',
    'DATE_PROJECT_WAS_LAST_MODIFIED',
]
# TODO: owner e-mail and date of the last job run are not collected yet.

# One record per project; projects that fail get a placeholder record.
project_info = []

projects = client.list_project_keys()
for project_key in projects:
    try:
        # Dedicated loop variable: leaves the notebook-level `project`
        # (default project) handle untouched.
        current_project = client.get_project(project_key)

        # 'versionTag' may be absent or None in the summary; fall back to an
        # empty dict so a missing tag yields 'unknown_date' instead of
        # raising and flagging the whole project as failed.
        version_tag = current_project.get_summary().get('versionTag') or {}
        last_modified_timestamp = version_tag.get('lastModifiedOn')
        if last_modified_timestamp:
            # Divided by 1000 before conversion — presumably the API reports
            # epoch milliseconds; the result is rendered in local time.
            last_modified_date = datetime.fromtimestamp(
                last_modified_timestamp / 1000
            ).strftime('%Y-%m-%d %H:%M:%S')
        else:
            last_modified_date = 'unknown_date'

        # Number of recipes whose type is 'hive' (case-insensitive).
        hive_count = sum(
            1 for recipe in current_project.list_recipes()
            if recipe['type'].lower() == 'hive'
        )

        # Distinct dataset types used anywhere in the project.
        dataset_types = {d.get('type') for d in current_project.list_datasets()}

        project_info.append({
            'PROJECT_KEY': project_key,
            'NUMBER_OF_HIVE_RECIPES_USED_IN_THIS_PROJECT': hive_count,
            'DATASET_TYPES': dataset_types,
            'PERMISSIONS': current_project.get_permissions(),
            'DATE_PROJECT_WAS_LAST_MODIFIED': last_modified_date,
        })
    except Exception as e:
        # Best effort: report the failure and keep a placeholder row so the
        # output still lists every project key.
        print(f"Error processing project {project_key}: {str(e)}")
        project_info.append({
            'PROJECT_KEY': project_key,
            'NUMBER_OF_HIVE_RECIPES_USED_IN_THIS_PROJECT': -1,  # Error indicator
            'DATE_PROJECT_WAS_LAST_MODIFIED': "unknown",
        })

# Create a Pandas DataFrame with the results
df = pd.DataFrame(project_info, columns=report_columns)
# Single-line textual rendering of the permissions structure (width=1000
# keeps pprint from wrapping it across lines).
df['PERMISSIONS_EXPANDED'] = df['PERMISSIONS'].apply(lambda d: pprint.pformat(d, width=1000))
df

In [0]:
# Plain-text dump of the report DataFrame currently bound to `df`.
print(df)

In [0]:
# import os
# os.getcwd()

# output_filename = "downloads/Dataiku_Hive_Project_Migration.csv"
# os.makedirs("downloads", exist_ok=True)
# df.to_csv(output_filename, index=False)

# FileLink(os.path.join(os.getcwd(),output_filename))

In [0]:
# Write the report to the dataset named in the configuration cell. Using
# `dataset_name` (rather than repeating the literal) keeps this cell pointed
# at the same dataset that the existence check / creation cell handles.
output_dataset = dataiku.Dataset(dataset_name)
# write_with_schema sets the dataset schema from the DataFrame and writes its rows.
output_dataset.write_with_schema(df)