# Workspace Estimator

To understand your current workload in Databricks, we need to gather some basic information that includes:
- Jobs, average monthly running frequency, average running time steps.
- Tasks within each job, notebooks of each step.
- Job Categories: Data Engineering, Machine Learning, or Streaming.
- Cluster information [Size, type]

\*All this information is generated using the Databricks Python API and stays in your account for your review. After that, if you agree, we will provide the steps to share that information with us.

## Requirements
1. Databricks Host (should begin with https://). <br> 
example: https://demo.cloud.databricks.com <br>
for more details visit [workspace details](https://docs.databricks.com/workspace/workspace-details.html)
2. Token. For more details, see [Databricks Authentication](https://docs.databricks.com/dev-tools/api/latest/authentication.html)

### Permissions
The workspace admin account requires the following permissions:
- Personal Access Tokens
- Workspace visibility Control
- Cluster Visibility Control
- Job Visibility Control
- DBFS File Browser
### Workflows/Jobs Requirements
To be able to make an estimation, each job needs to have:
- A configured schedule
- At least one successful execution in the last 60 days. 
- If running in a staging or production mirror, it must include
  - Same machine configuration
  - Complete dataset


In [0]:
!pip install tqdm

### API

In [0]:
from datetime import datetime
from datetime import timedelta
from tqdm.notebook import tqdm_notebook
from urllib.parse import urlencode
import glob
import json
import math
import os
import pandas as pd
import re
import requests
import shutil
import zipfile

class NoClusterEventsError(Exception):
    """Error carrying the payload of a failed response that mentions "does not exist".

    The value given at construction is stored on ``message`` and is also
    handed to ``Exception`` so that ``str(err)`` and ``err.args`` expose it.
    """

    def __init__(self, message=None):
        super().__init__(message)
        self.message = message



class Manager:
    """REST client that downloads API listings and stores them as JSON files.

    Attributes:
        url: Base URL with a single trailing ``/`` removed (or ``None``).
        token: Bearer token sent in the ``Authorization`` header.
        output: Folder that receives the JSON files and ``log.txt``.
        api_utl: ``Util`` instance used for the query and paging helpers.
        results_count: Number of records collected per ``name_output``.
    """

    # Class-level fallback kept for backward compatibility; __init__ gives
    # every instance its own dict so counts are not shared between clients.
    results_count = {}
    # Seconds to wait for a single HTTP request. Without a timeout a stalled
    # connection would block the whole collection forever. Override on the
    # class or the instance if an endpoint legitimately needs longer.
    request_timeout = 300

    def __init__(self, input_url=None, input_token=None, input_output="./output"):
        """Store the connection settings and make sure the output folder exists."""
        if input_url:
            input_url = input_url[:-1] if input_url.endswith('/') else input_url
        self.url = input_url
        self.token = input_token
        self.output = input_output
        self.api_utl = Util()
        self.results_count = {}
        # name_output -> {suffix: records}; lets results_count report the total
        # over every file written for one name_output (e.g. one file per cluster).
        self._counts_by_file = {}
        os.makedirs(self.output, exist_ok=True)

    def show_results(self, days):
        """Print the number of records collected for every ``name_output``."""
        print(f"The following information was collected by the Workspace Estimator for the last {days} days")
        for key, value in self.results_count.items():
            print(f"{key.upper().ljust(10)}:\t{value}")

    def generator(self):
        """Yield ``None`` forever; drives the paging loop and its progress bar."""
        while True:
            yield

    def get_and_save(self, path=None, name_output="default_output", array_field=None, suffix="", default_params=None,
                     use_paging=False, post=False, body=None, url_api="", cloud_provider="", pb=None, pb_message=None,
                     paging_pb=False, full_response=False):
        """Fetch every page of one endpoint and write the records to ``<name_output><suffix>.json``.

        Any exception raised while fetching or writing is appended to
        ``log.txt`` in the output folder instead of being propagated, so one
        failing endpoint does not stop the collection.

        Args:
            path: Endpoint path appended to ``url_api``.
            name_output: Base file name; also the response field holding the
                records when ``array_field`` is not given.
            array_field: Response field holding the records, if different
                from ``name_output``.
            suffix: Text appended to ``name_output`` in the file name.
            default_params: Query parameters sent with every page.
            use_paging: Add paging parameters from the second request on.
            post: Send a POST with ``body`` as JSON instead of a GET.
            body: JSON body for POST requests.
            url_api: Base URL of the API.
            cloud_provider: Forwarded to ``Util.get_full_json``.
            pb: Optional progress bar advanced by one when the call ends.
            pb_message: Description shown on ``pb`` while the call runs.
            paging_pb: Show an extra progress bar counting pages.
            full_response: Store the whole response object of every page.
        """
        next_page_token = ""
        offset = -1
        skip = 0
        counter = 0
        full_json = []
        if body is None:
            body = {}
        if default_params is None:
            default_params = {}
        if pb:
            pb.set_description(f"{pb_message}" if pb_message else f"Processing {path}")
        pb_paging = None
        try:
            new_params = default_params.copy()
            file_output = f"{name_output}{suffix}"
            gen = self.generator()
            if paging_pb:
                pb_paging = tqdm_notebook(gen, desc=f"Pages in {path}")
            for _ in gen:
                new_params = self.api_utl.get_params(counter, new_params, next_page_token, offset, use_paging, skip)
                response = self.get_response(body, new_params, path, post, url_api)
                json_data = self.api_utl.get_full_json(array_field, full_json, name_output, path, response,
                                                       full_response, cloud_provider=cloud_provider)
                paging = self.api_utl.get_paging(json_data)
                offset = self.api_utl.get_offset(json_data, offset)
                skip = paging.get('has_skip')
                has_more = self.api_utl.get_has_more(json_data, offset)
                next_page_token = self.api_utl.get_page_token(paging)
                counter += 1
                if pb_paging is not None:
                    pb_paging.update(1)
                if not has_more:
                    break
            Util.write_file_request_(self.output, file_output, full_json)
            Util.check_file_request_(self.output, file_output, full_json)
        except Exception as e:
            # Best effort by design: record the failure and keep going.
            local_vars = locals().copy()
            Util.write_log(self.output, e, local_vars)
        finally:
            # Close the page counter even when a request failed mid-way.
            if pb_paging is not None:
                pb_paging.close()

        if pb:
            pb.update(1)
        # Sum over every suffix so per-cluster / per-run calls report a total
        # instead of only the size of the last file; re-fetching the same file
        # replaces its own entry rather than double counting.
        counts_by_file = self._counts_by_file.setdefault(name_output, {})
        counts_by_file[suffix] = len(full_json)
        self.results_count[name_output] = sum(counts_by_file.values())

    def get_response(self, body, new_params, path, post, url):
        """Send one request and return the response object.

        Raises:
            NoClusterEventsError: The status is not 200 and the response
                content mentions "does not exist".
            Exception: Any other non-200 status.
        """
        query = self.api_utl.get_query(new_params)
        url_path = f"{url}/{path}" if path else url
        new_url = f"{url_path}?{query}" if query else url_path
        headers = {"Authorization": f"Bearer {self.token}"}
        if post:
            response = requests.post(new_url, headers=headers, json=body, timeout=self.request_timeout)
        else:
            response = requests.get(new_url, headers=headers, timeout=self.request_timeout)
        if response.status_code != 200:
            error = f"Failed connection - {response.content}"
            if "does not exist" in error:
                raise NoClusterEventsError(response.content)
            raise Exception(error)
        return response




class Mapping:
    """Pick the cluster ids and run ids to inspect from the JSON files in the output folder."""

    @staticmethod
    def _latest_successful(records, job_clusters_only=False):
        """Return the rows holding the most recent successful record of every ``run_name``.

        A record qualifies when ``end_time - start_time`` is positive and its
        ``result_state`` is ``'SUCCESS'`` (and, optionally, its
        ``cluster_source`` is ``'JOB'``).
        """
        frame = pd.DataFrame(records)
        frame['duration'] = frame['end_time'] - frame['start_time']
        keep = (frame['duration'] > 0) & (frame['result_state'] == 'SUCCESS')
        if job_clusters_only:
            keep = keep & (frame['cluster_source'] == 'JOB')
        # Copy so the 'rank' column is added to an independent frame, not to a slice.
        frame = frame[keep].copy()
        frame['rank'] = frame.groupby(['run_name'])['end_time'].rank(method='first', ascending=False)
        return frame[frame['rank'] == 1]

    @staticmethod
    def get_clusters_ids(output=None):
        """Return the cluster ids found through runs and through the cluster list, without duplicates."""
        results = []
        results.extend(Mapping.get_clusters_ids_from_runs(output))
        results.extend(Mapping.get_clusters_ids_from_clusters(output))
        # A cluster can be reported by both sources; keep the first occurrence
        # so its events are not downloaded (and its file rewritten) twice.
        return list(dict.fromkeys(results))

    @staticmethod
    def get_clusters_ids_from_runs(output=None):
        """Return the set of cluster ids used by the latest successful run of every run name."""
        if not output:
            return []
        runs_list = Mapping.get_runs(output)
        if not runs_list:
            return []
        latest = Mapping._latest_successful(runs_list).dropna(subset=['cluster_id'])
        return set(latest['cluster_id'].values)

    @staticmethod
    def get_clusters_ids_from_clusters(output=None):
        """Return every non-job cluster id plus the latest successful job cluster of every run name."""
        if not output:
            return []
        cluster_list = Mapping.get_clusters(output)
        if not cluster_list:
            return []
        new_clusters = []
        all_clusters = pd.DataFrame(cluster_list)
        ui_clusters = all_clusters[all_clusters['cluster_source'] != 'JOB']['cluster_id'].values
        new_clusters.extend(set(ui_clusters))
        latest = Mapping._latest_successful(cluster_list, job_clusters_only=True)
        latest = latest.dropna(subset=['cluster_id'])
        new_clusters.extend(set(latest['cluster_id'].values))
        return new_clusters

    @staticmethod
    def get_runs_ids(output=None):
        """Return the set of run ids of the latest successful run of every run name."""
        if not output:
            return []
        runs_list = Mapping.get_runs(output)
        if not runs_list:
            return []
        latest = Mapping._latest_successful(runs_list).dropna(subset=['run_id'])
        return set(latest['run_id'].values)

    @staticmethod
    def get_clusters(output):
        """Load every ``clusters*.json`` file in ``output`` into a list of flat records.

        NOTE(review): a record without an ``end_time`` key gets 0, so its
        duration is not positive and the job-cluster filter drops it —
        confirm the saved cluster objects really carry ``end_time``.
        """
        result = []
        for file_name in glob.glob(os.path.join(output, 'clusters*.json')):
            with open(file_name, 'r') as file:
                clusters = json.load(file)
            for cluster in clusters:
                if not cluster:
                    # Nothing to read from an empty / null entry.
                    continue
                tags = cluster.get("default_tags", None)
                job_id = tags.get("JobId", "NO_ID_FOUND") if tags else "NO_TAG_FOUND"
                run_name = tags.get("RunName", "NO_NAME_FOUND") if tags else "NO_TAG_FOUND"
                termination_reason = cluster.get("termination_reason")
                result.append({
                    "cluster_id": cluster.get("cluster_id", None),
                    "cluster_source": cluster.get("cluster_source", None),
                    "run_name": Util.get_clean_name(run_name),
                    "job_id": job_id,
                    "start_time": cluster.get("start_time", 0),
                    "end_time": cluster.get("end_time", 0),
                    "result_state": termination_reason.get("type") if termination_reason else None
                })
        return result

    @staticmethod
    def get_runs(output):
        """Load every ``run*.json`` file in ``output`` into one flat record per task.

        Runs without a ``tasks`` entry produce no record.
        """
        result = []
        for file_name in glob.glob(os.path.join(output, 'run*.json')):
            with open(file_name, 'r') as file:
                runs = json.load(file)
            for run in runs:
                if not run or "tasks" not in run:
                    continue
                run_name = run.get("run_name", None)
                # Strip the ADF suffix exactly once per run. Doing it per task,
                # and then again through get_clean_name, kept cutting 37 more
                # characters off the name and merged unrelated runs.
                clean_name = Util.get_clean_name(run_name) if run_name else run_name
                for task in run["tasks"]:
                    cluster_instance = task.get("cluster_instance", None)
                    if cluster_instance:
                        cluster_id = cluster_instance.get("cluster_id", None)
                    else:
                        cluster_id = task.get("existing_cluster_id", None)
                    state = task.get("state", None)
                    result.append({
                        "run_name": clean_name,
                        "run_id": run.get("run_id", None),
                        "start_time": run.get("start_time", None),
                        "end_time": run.get("end_time", None),
                        "cluster_instance": cluster_instance,
                        "cluster_id": cluster_id,
                        "result_state": state.get("result_state", None) if state else None
                    })
        return result



class Sizing(Manager):
    """Collects the workspace metadata files by calling the listing endpoints one after another."""

    def __init__(self, input_url, input_token=None, input_output="./output"):
        """Initialise the base client, then (re)store the token and output folder."""
        super().__init__(input_url, input_token=input_token, input_output=input_output)
        self.token = input_token
        self.output = input_output
        os.makedirs(self.output, exist_ok=True)

    def get_clusters_events(self, timestamp, pb=None):
        """Save the events of every cluster id returned by ``Mapping.get_clusters_ids``.

        One file named ``events_<cluster id>`` is requested per cluster;
        ``timestamp`` is sent as the ``start_time`` parameter.
        """
        events_path = "api/2.0/clusters/events"
        if pb:
            pb.set_description(f"Processing {events_path}")

        wanted_events = (
            "CREATING", "STARTING", "RESTARTING", "TERMINATING", "RUNNING", "RESIZING",
            "UPSIZE_COMPLETED", "EDITED")
        cluster_ids = Mapping.get_clusters_ids(self.output)
        with tqdm_notebook(range(len(cluster_ids)), desc="Fetching Cluster Events") as events_bar:
            for cluster_id in cluster_ids:
                self.get_and_save(
                    path=events_path,
                    name_output="events",
                    suffix=f"_{cluster_id}",
                    post=True,
                    default_params={"cluster_id": f"{cluster_id}", "limit": 250, "start_time": timestamp},
                    body={"event_types": wanted_events},
                    use_paging=True,
                    url_api=self.url,
                    pb=events_bar,
                    pb_message=f"Fetching {cluster_id} events",
                )
        if pb:
            pb.update(1)

    def get_runs_details(self, pb=None):
        """Save the full details of every run id returned by ``Mapping.get_runs_ids``.

        One file named ``runs_details_<run id>`` is requested per run.
        """
        runs_details_path = "api/2.1/jobs/runs/get"
        if pb:
            pb.set_description(f"Processing {runs_details_path}")

        run_ids = Mapping.get_runs_ids(self.output)
        with tqdm_notebook(range(len(run_ids)), desc="Fetching Runs Details") as details_bar:
            for run_id in run_ids:
                self.get_and_save(
                    path=runs_details_path,
                    name_output="runs_details",
                    suffix=f"_{run_id}",
                    default_params={"run_id": run_id},
                    url_api=self.url,
                    pb=details_bar,
                    pb_message=f"Fetching details of run:{run_id}",
                    full_response=True,
                )
        if pb:
            pb.update(1)

    def get_metadata(self, days=60):
        """Run the nine collection steps, looking back ``days`` days for runs and cluster events."""
        with tqdm_notebook(range(9), desc='Processing...') as pb:
            # Start of the look-back window, as epoch milliseconds.
            window_start = datetime.now() - timedelta(days=days)
            timestamp = int(window_start.timestamp() * 1000)
            # The seven plain listings, in the order they are requested.
            listings = (
                dict(path="api/2.0/clusters/list-node-types", name_output="node_types"),
                dict(path="api/2.0/clusters/list", name_output="clusters", use_paging=True),
                dict(path="api/2.0/jobs/list", name_output="jobs", use_paging=True),
                dict(path="api/2.1/jobs/runs/list", name_output="runs", use_paging=True,
                     default_params={"expand_tasks": "true", "start_time_from": timestamp},
                     paging_pb=True),
                dict(path="api/2.0/sql/warehouses", name_output="warehouses", use_paging=True),
                dict(path="api/2.0/pipelines", name_output="pipelines", array_field="statuses",
                     use_paging=True, default_params={"max_results": 100}),
                dict(path="api/2.0/sql/history/queries", name_output="queries", array_field="res",
                     use_paging=True),
            )
            for listing in listings:
                self.get_and_save(url_api=self.url, pb=pb, **listing)
            # These two read the files written above to decide what to fetch.
            self.get_clusters_events(timestamp, pb=pb)
            self.get_runs_details(pb=pb)



class UtilFile:
    """File, size and text-scrubbing helpers used when saving API responses."""

    # The TLD class is [A-Za-z]; a '|' inside a character class is a literal
    # pipe, not an alternation, so it does not belong there.
    email_pattern = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}')
    # Raw string: '\/' and '\.' are invalid escapes in a normal string literal.
    url_parameters_pattern = re.compile(r'^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:?\n]+)')
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    jwt_pattern = re.compile(r'[A-Za-z0-9_-]{4,}(?:\.[A-Za-z0-9_-]{4,}){2}')
    dbx_pattern = re.compile(r'https?://adb-\d{4,16}\.\d{0,2}|https?://dbc-.{4,12}-.{2,4}')

    @staticmethod
    def check_file_request_(output, name_output, json_data_check):
        """Split ``<name_output>.json`` into numbered parts when it is larger than 10 MB.

        Args:
            output: Folder containing the file.
            name_output: File name without the ``.json`` extension.
            json_data_check: The data that was written to the file; it is
                what gets sliced into ``<name_output>_01.json``, ``_02`` ...

        Returns:
            ``False`` when the file does not exist, ``True`` otherwise. After
            a split the original file is removed.
        """
        os.makedirs(output, exist_ok=True)
        file_path = os.path.join(output, f"{name_output}.json")
        if not os.path.exists(file_path):
            return False

        size_max_unit_value = "10 MB"
        array_units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
        current_size = UtilFile.convert_size_to_mb(os.path.getsize(file_path))
        if UtilFile.has_correct_size(current_size, size_max_unit_value, array_units):
            return True
        if not isinstance(json_data_check, list) or not json_data_check:
            # Only a non-empty list can be sliced into parts; keep the
            # oversized file rather than deleting it with nothing to replace it.
            return True

        size_value = int(current_size.split(' ')[0])
        size_max_value = int(size_max_unit_value.split(' ')[0])
        parts = math.ceil(size_value / size_max_value)
        count = UtilFile.get_count(json_data_check)
        size_part = math.ceil(count / parts)
        for i in range(parts):
            # Contiguous, non-overlapping slices: part i starts exactly where
            # part i-1 ended, so no record is skipped at a boundary.
            start = i * size_part
            if start >= count:
                break
            json_data_part = json_data_check[start:start + size_part]
            # the following line is to activate the cleanup of the json files.
            # json_data_part = UtilFile.filter_data(json_data_part, [])
            UtilFile.write_file_request_(output, f"{name_output}_{i + 1:02d}", json_data_part)
        os.remove(file_path)
        return True

    @staticmethod
    def convert_size_to_mb(size_bytes):
        """Return ``size_bytes`` as a string of whole megabytes, e.g. ``"12 MB"``."""
        if size_bytes == 0:
            return "0 MB"
        megabytes = int(round(size_bytes / (1024 ** 2), 0))
        return f"{megabytes} MB"

    @staticmethod
    def compress_folder_to_zip(output, output_zip_file, extension='zip'):
        """Zip every file under ``output`` into ``<output_zip_file>.<extension>``.

        Errors (including a missing source folder) are printed, not raised.
        """
        try:
            if not os.path.exists(output):
                raise Exception(f"The source folder '{output}' does not exist.")

            with zipfile.ZipFile(f"{output_zip_file}.{extension}", 'w', zipfile.ZIP_DEFLATED) as zipf:
                for root, _dirs, files in os.walk(output):
                    for file in files:
                        file_path = os.path.join(root, file)
                        # Store paths relative to the zipped folder.
                        zipf.write(file_path, os.path.relpath(file_path, output))

        except Exception as e:
            print(f"Error: {e}")

    @staticmethod
    def clean_output(output):
        """Delete the ``output`` folder and everything in it."""
        shutil.rmtree(output)

    @staticmethod
    def get_count(json_data_count):
        """Return ``len()`` of a list or dict, and 0 for anything else."""
        if isinstance(json_data_count, (list, dict)):
            return len(json_data_count)
        return 0

    @staticmethod
    def get_file_name(workspace):
        """Build ``<sanitised workspace>_<MMDD>`` from a free-text workspace name.

        NOTE(review): spaces are first turned into underscores, but the
        following substitution removes every character that is not a letter,
        a digit or a space, underscores included. The effective result is the
        name reduced to letters and digits; confirm whether the underscores
        were meant to be kept.
        """
        workspace = workspace.replace(" ", "_")
        workspace = re.sub(r'[^A-Za-z0-9 ]+', '', workspace)
        workspace = "Default_wkp" if workspace == "" else workspace
        date_part = datetime.now().strftime('%m%d')
        return f"{workspace}_{date_part}"

    @staticmethod
    def has_correct_size(size_bites_unit_value, size_max_unit_value, array_units):
        """Return True when a ``"<value> <unit>"`` size does not exceed the maximum.

        Units are compared by their position in ``array_units``; values are
        compared only when both sizes use the same unit.
        """
        size_value, size_unit = size_bites_unit_value.split(' ')[:2]
        size_max_value, size_max_unit = size_max_unit_value.split(' ')[:2]
        size_index = array_units.index(size_unit)
        size_max_index = array_units.index(size_max_unit)
        if size_index != size_max_index:
            return size_index < size_max_index
        return int(size_value) <= int(size_max_value)

    @staticmethod
    def read_config(config_path):
        """Load a JSON configuration file.

        Raises:
            ValueError: The file has no ``url`` or no ``token`` entry.
        """
        with open(config_path, 'r') as file:
            config_values = json.load(file)
        for required in ('url', 'token'):
            if required not in config_values:
                raise ValueError(f"Missing '{required}' in configuration file.")
        return config_values

    @staticmethod
    def write_log(output, e, local_vars):
        """Append an error, and usually the caller's local variables, to ``log.txt``.

        The variables are left out for ``NoClusterEventsError``.
        """
        log_path = os.path.join(output, "log.txt")
        with open(log_path, "a") as log_file:
            log_file.write("An error occurred:\n")
            log_file.write(f"{str(e)}\n")
            if not isinstance(e, NoClusterEventsError):
                variables_message = "Local Variables at the point of exception:\n"
                items = local_vars.items()
                log_file.write(variables_message)
                not_include = ['self', 'file', 'e', 'log_file']
                variables_info = "   ".join([f"{key}: {value}\n" for key, value in items if key not in not_include])
                log_file.write(f"   {variables_info}")

    @staticmethod
    def write_file_request_(output, name_output, json_data):
        """Serialise ``json_data`` to ``<output>/<name_output>.json``, creating the folder if needed."""
        data = json.dumps(json_data)
        os.makedirs(output, exist_ok=True)
        file_path = os.path.join(output, f"{name_output}.json")
        with open(file_path, "w") as file:
            file.write(data)

    @staticmethod
    def filter_data(json_data, keys):
        """Return a scrubbed copy of ``json_data`` with the entries named in ``keys`` dropped.

        Dicts and lists are walked recursively and every string value goes
        through ``clean_str``; any other top-level value is returned unchanged.
        """
        if isinstance(json_data, dict):
            return UtilFile.clean_dictionary(json_data, keys, {})
        if isinstance(json_data, list):
            return UtilFile.clean_list(json_data, keys, [])
        return json_data

    @staticmethod
    def clean_list(source, keys, new_list):
        """Return a scrubbed copy of the list ``source`` (``new_list`` when ``source`` is empty)."""
        if len(source) == 0:
            return new_list
        new_data = []
        for value in source:
            if isinstance(value, str):
                new_data.append(UtilFile.clean_str(value))
            elif isinstance(value, dict):
                new_data.append(UtilFile.clean_dictionary(value, keys, {}))
            elif isinstance(value, list):
                new_data.append(UtilFile.clean_list(value, keys, []))
            else:
                new_data.append(value)

        return new_data

    @staticmethod
    def clean_dictionary(source, keys, new_dict):
        """Fill ``new_dict`` with the scrubbed entries of ``source``, skipping the names in ``keys``."""
        if len(source) == 0:
            return new_dict
        for key, value in source.items():
            if len(keys) > 0 and key in keys:
                continue
            elif isinstance(value, str):
                new_dict[key] = UtilFile.clean_str(value)
            elif isinstance(value, dict):
                new_dict[key] = UtilFile.clean_dictionary(value, keys, {})
            elif isinstance(value, list):
                new_dict[key] = UtilFile.clean_list(value, keys, [])
            else:
                new_dict[key] = value

        return new_dict

    @staticmethod
    def clean_str(txt):
        """Apply the e-mail, workspace-URL, URL-parameter and token scrubbers, in that order."""
        clean_string = UtilFile.replace_emails(txt)
        clean_string = UtilFile.replace_adb_url(clean_string)
        clean_string = UtilFile.remove_url_parameters(clean_string)
        clean_string = UtilFile.remove_jwt(clean_string)
        return clean_string

    @staticmethod
    def replace_emails(text):
        """Replace every e-mail address with ``[EMAIL_REMOVED]``."""
        return UtilFile.email_pattern.sub('[EMAIL_REMOVED]', text)

    @staticmethod
    def remove_url_parameters(text):
        """Return the first capture of ``url_parameters_pattern``, or ``text`` when it does not match.

        NOTE(review): the scheme, user-info and ``www.`` parts of the pattern
        are all optional, so it also matches plain text and cuts it at the
        first ``:`` or ``?`` — confirm that is acceptable for non-URL strings.
        """
        matches = UtilFile.url_parameters_pattern.match(text)
        if matches:
            return matches.group(1)
        return text

    @staticmethod
    def remove_url(text):
        """Replace every URL with ``[URL_REMOVED]``."""
        return UtilFile.url_pattern.sub('[URL_REMOVED]', text)

    @staticmethod
    def replace_adb_url(text):
        """Replace workspace URL prefixes matched by ``dbx_pattern`` with a fixed placeholder."""
        return UtilFile.dbx_pattern.sub('https://adb-0000000000000.00', text)

    @staticmethod
    def remove_jwt(txt):
        """Replace every three-segment, dot-separated token with ``[TOKEN_REMOVED]``."""
        return UtilFile.jwt_pattern.sub('[TOKEN_REMOVED]', txt)


class Util(UtilFile):
    """Request, query-string and paging helpers layered on top of ``UtilFile``."""

    @staticmethod
    def get_url(input_url):
        """Return ``input_url`` without one trailing ``/``."""
        if input_url.endswith('/'):
            return input_url[:-1]
        return input_url

    @staticmethod
    def get_full_json(array_field, full_json, name_output, path, response, full_response, cloud_provider=""):
        """Append the records of one response to ``full_json`` and return the decoded body.

        The records are read from the ``array_field`` entry, or from the
        ``name_output`` entry when ``array_field`` is empty. With
        ``full_response`` the decoded body itself is stored as well.
        """
        payload = response.json()
        field_out = array_field or name_output
        if field_out in payload:
            records = payload[field_out]
            if not (path or cloud_provider == "azure"):
                # Without a path (and outside azure) the entry is read as a mapping.
                records = records.items()
            full_json.extend(records)
        if full_response:
            if isinstance(payload, list):
                full_json.extend(payload)
            else:
                full_json.append(payload)

        return payload

    @staticmethod
    def get_has_more(json_data, offset):
        """Return the first truthy "more pages" indicator, or the ``NextPageLink`` value."""
        has_more = json_data.get('has_more')
        if has_more:
            return has_more
        has_next_page = json_data.get('has_next_page')
        if has_next_page:
            return has_next_page
        if offset:
            return offset
        return json_data.get('NextPageLink')

    @staticmethod
    def get_page_token(paging):
        """Return the ``next_page_token`` entry of a paging summary."""
        return paging.get('next_page_token')

    @staticmethod
    def get_params(counter, new_params, next_page_token, offset, use_paging, skip):
        """Add the paging parameter for the next request to ``new_params`` and return it.

        Nothing is added for the first request (``counter`` 0) or when paging
        is off. A page token wins over ``$skip``, which wins over ``offset``.
        """
        if not (use_paging and counter > 0):
            return new_params
        if next_page_token:
            new_params['page_token'] = next_page_token
        elif skip:
            new_params['$skip'] = skip
        elif offset and offset >= 0:
            new_params['offset'] = offset
        return new_params

    @staticmethod
    def get_paging(json_data):
        """Collect the paging fields of a decoded response into one dict."""
        paging = {}
        paging['has_more'] = json_data.get('has_more')
        paging['next_page_token'] = json_data.get('next_page_token')
        paging['has_next_page'] = json_data.get('has_next_page')
        paging['has_skip'] = Util.get_skip(json_data)
        return paging

    @staticmethod
    def get_offset(json_data, offset):
        """Return ``next_page['offset']`` from the response, or ``None`` when there is no next page."""
        next_page = json_data.get('next_page')
        if next_page:
            return next_page['offset']
        return None

    @staticmethod
    def get_query(params):
        """URL-encode ``params``; an empty or missing mapping gives an empty string."""
        if params:
            return urlencode(params)
        return ""

    @staticmethod
    def get_skip(json_data):
        """Return the text following ``$skip=`` in ``NextPageLink``, or ``""`` without a link."""
        next_page_link = json_data.get('NextPageLink', None)
        if not next_page_link:
            return ""
        return next_page_link.split('$skip=')[1]

    @staticmethod
    def get_clean_name(run_name):
        """Drop the last 37 characters of names starting with ``ADF_``; return other names as-is."""
        if run_name.startswith("ADF_"):
            return run_name[:-37]
        return run_name

### Configure security settings.

In [0]:

# Notebook inputs: reset the widgets, then offer the look-back window
# (15 / 30 / 60 days, default 60) and a free-text workspace name.
dbutils.widgets.removeAll()
dbutils.widgets.dropdown("runs_for_last_days", "60", ["15", "30", "60"])
wkp_name_instructions = "please write workspace name"
dbutils.widgets.text("workspace_name", wkp_name_instructions)
# The following line tries to get the host from the current workspace; if that fails, set it manually to your desired host.
# host_name = "demo.azuredatabricks.net"
host_name = dbutils.notebook.entry_point.getDbutils().notebook().getContext().browserHostName().get()
# The URL is built with a trailing "/"; an empty host name leaves it unset and stops the notebook.
url = f"https://{host_name}/" if host_name != '' else None
if url is None:
    raise Exception("Please provide the workspace url available in your address bar in the variable 'url' i.e. 'https://demo.azuredatabricks.net/'")
# The following line tries to get the token of the current notebook; if that fails, set it manually as in the example below.
# We advise against using an explicit token. Please store it in a secret scope.
# token = dbutils.secrets.get(scope='my-secrets', key='workload-query')
token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
if token == '' or token is None:
    raise Exception("Please provide a token to query Databricks API. Please define it in the variable 'token'.")
workspace_name = dbutils.widgets.get('workspace_name')
# If the widget still shows the instruction text, fall back to the literal name 'workspace_name'.
workspace_name = 'workspace_name' if dbutils.widgets.get('workspace_name') == wkp_name_instructions else workspace_name
days = int(dbutils.widgets.get('runs_for_last_days'))

In [0]:
import shutil
import random
# Collect the metadata into a randomly named folder under /tmp, zip it and
# copy the archive to DBFS.
api_util = Util()
# Five hexadecimal digits used as the temporary folder name.
tmp_folder ='%05x' % random.randrange(16**5)
# The same locations are spelled twice: "file:///..." for dbutils.fs calls
# and plain paths for the Python file APIs.
tmp_driver = f'file:///tmp/{tmp_folder}/'
filename = api_util.get_file_name(workspace_name)
driver_zip_filename = f'/tmp/{filename}'
driver_to_zip = f'/tmp/{tmp_folder}/'
driver_folder = f'file:///tmp/{tmp_folder}'
zip_path = f'file:///tmp/{filename}.zip'

dbutils.fs.mkdirs(tmp_driver)
client = Sizing(url, token, driver_to_zip)
client.get_metadata(days)
client.show_results(days)
# make_archive appends ".zip" to driver_zip_filename, which gives the file zip_path points at.
shutil.make_archive(driver_zip_filename, 'zip', driver_to_zip)
zip_destination = 'dbfs:/FileStore/WAS_Tool/results'
dbutils.fs.mkdirs(zip_destination)
dbutils.fs.cp(zip_path, zip_destination)

In [0]:
from IPython.display import display as displayHTML, HTML
# Render a centred "DOWNLOAD ZIP" button linking to the archive under /files/WAS_Tool/results/.
html = f'<html><div  style="display:flex;justify-content: center;"><a href=/files/WAS_Tool/results/{filename}.zip><button style="background-color:#249edc;color: #fff;border:1px solid #249edc;cursor:pointer;border-radius:45px;font-weight:800;line-height:18px;padding: 8px 16px" type="button">DOWNLOAD ZIP</button></a></div></html>'
# Here displayHTML is IPython's display(), imported under that name on the line above.
displayHTML(HTML(html))
# Plain-text fallback; url already ends with "/", so the path is appended directly.
print(f"In case the download button was not being displayed, please click on the following link: {url}files/WAS_Tool/results/{filename}.zip")
