## Setup Notebook

### Objective
This notebook configures the Dataproc on Compute Engine (DPGCE) and Dataproc Serverless environments used for running benchmarks.

### Initialize Public GCS Bucket
This sets the name of the public GCS bucket from which datasets and utilities will be copied to your GCS bucket.

In [1]:
# Name of the public GCS bucket (without the gs:// prefix) that the session template files are copied from.
SOURCE_PUBLIC_GCS_BUCKET = "tpc-benchmarking-kit-bucket"

### Setup Requirements
Select the type of setup for benchmarking. By default, both DPGCE and Serverless environments are configured; set an entry to `False` to skip that environment.

In [2]:
# One switch per environment; the setup cells below check these flags and
# skip their step when the corresponding value is False.
dataproc_setup_preferences = dict(
    DPGCE_Standard=True,
    DPGCE_Premium=True,
    Serverless_Standard=True,
    Serverless_Premium=True,
    Serverless_Premium_with_NQE=True,
)

### Utility Functions

In [3]:
import subprocess
import yaml

# Returns the account that is currently active in the gcloud CLI
def get_gcloud_account():
    """Return the account currently configured in the gcloud CLI.

    Runs ``gcloud config get-value account`` and returns its stdout with
    surrounding whitespace removed.

    Returns:
        str | None: The account name, or None when gcloud is not installed,
        the command exits with a non-zero status, or no account is
        configured (the command produced no output).
    """
    try:
        result = subprocess.run(['gcloud', 'config', 'get-value', 'account'], capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running gcloud config get account: {e}")
        print(f"{e.stderr}")
        return None
    except FileNotFoundError:
        print("Error: gcloud command not found. Please ensure the Google Cloud CLI is installed and in your system's PATH.")
        return None

    account = result.stdout.strip()
    if not account:
        # An empty value would otherwise be passed on silently (e.g. as a blank
        # service account), so report it as a failure like the other error paths.
        print("Error: no active gcloud account is configured. Run 'gcloud config set account <ACCOUNT>' and retry.")
        return None
    return account

# Returns name of current project
def get_gcloud_project():
    """Return the project currently configured in the gcloud CLI.

    Runs ``gcloud config get project`` and returns its stdout with
    surrounding whitespace removed.

    Returns:
        str | None: The project name, or None when gcloud is not installed,
        the command exits with a non-zero status, or no project is
        configured (the command produced no output).
    """
    try:
        command = ["gcloud", "config", "get", "project"]
        process = subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error getting project name: {e}")
        print(f"Stdout: {e.stdout}")
        print(f"{e.stderr}")
        return None
    except FileNotFoundError:
        print("Error: gcloud command not found. Make sure the Google Cloud CLI is installed and in your system's PATH.")
        return None

    project_name = process.stdout.strip()
    if not project_name:
        # An empty value would otherwise be passed on silently to later
        # gcloud calls, so report it as a failure like the other error paths.
        print("Error: no gcloud project is configured. Run 'gcloud config set project <PROJECT_ID>' and retry.")
        return None
    return project_name
    
# Creates a Dataproc cluster
def create_cluster(cluster_configs: dict):
    """Create a Dataproc cluster with ``gcloud dataproc clusters create``.

    The command is built as an argument list and executed without a shell, so
    configuration values are passed to gcloud verbatim (no word splitting or
    shell interpretation). The command is echoed first, then gcloud's output
    is printed line by line while it runs.

    Args:
        cluster_configs: Cluster settings. Keys read: dp_cluster_name, region,
            tier, service_account, subnet, num_master, num_worker, master_vm,
            master_boot_disk_size, master_boot_disk_type, worker_vm,
            worker_boot_disk_type, worker_boot_disk_size, master_ssd_count,
            worker_ssd_count, master_local_ssd_interface,
            worker_local_ssd_interface, image_version.

    Returns:
        None. A non-zero gcloud exit code or a missing gcloud binary is
        reported with a printed message.

    Raises:
        KeyError: If a required key is missing from ``cluster_configs``.
    """
    import shlex  # only needed to echo the command in a copy-pasteable form

    # (flag, config key) pairs, in the order they are passed to gcloud.
    flag_to_key = [
        ("--region", "region"),
        ("--tier", "tier"),
        ("--service-account", "service_account"),
        ("--subnet", "subnet"),
        ("--num-masters", "num_master"),
        ("--num-workers", "num_worker"),
        ("--master-machine-type", "master_vm"),
        ("--master-boot-disk-size", "master_boot_disk_size"),
        ("--master-boot-disk-type", "master_boot_disk_type"),
        ("--worker-machine-type", "worker_vm"),
        ("--worker-boot-disk-type", "worker_boot_disk_type"),
        ("--worker-boot-disk-size", "worker_boot_disk_size"),
        ("--num-master-local-ssds", "master_ssd_count"),
        ("--num-worker-local-ssds", "worker_ssd_count"),
        ("--master-local-ssd-interface", "master_local_ssd_interface"),
        ("--worker-local-ssd-interface", "worker_local_ssd_interface"),
        ("--image-version", "image_version"),
    ]

    # NOTE(review): yarn.log-aggregation-enable is passed with the "hive:" prefix;
    # confirm whether "yarn:" was intended.
    properties = [
        "hive:yarn.log-aggregation-enable=true",
        "spark:spark.checkpoint.compress=true",
        "spark:spark.eventLog.compress=true",
        "spark:spark.eventLog.compression.codec=zstd",
        "spark:spark.eventLog.rolling.enabled=true",
        "spark:spark.io.compression.codec=zstd",
        "spark:spark.sql.parquet.compression.codec=zstd",
    ]

    command = ["gcloud", "dataproc", "clusters", "create", str(cluster_configs["dp_cluster_name"])]

    # Flags that take no value from cluster_configs sit between --tier and
    # --service-account, matching the original flag order.
    fixed_flags = ["--scopes", "cloud-platform", "--enable-component-gateway", "--no-address"]
    for flag, key in flag_to_key:
        if flag == "--service-account":
            command += fixed_flags
        command += [flag, str(cluster_configs[key])]

    for prop in properties:
        command += ["--properties", prop]
    command.append("--optional-components=JUPYTER")

    print(" ".join(shlex.quote(arg) for arg in command))

    try:
        # Merge stderr into stdout so progress and errors appear in order.
        with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as process:
            for line in process.stdout:
                print(line, end="")
            return_code = process.wait()
    except FileNotFoundError:
        print("Error: gcloud command not found. Please ensure the Google Cloud CLI is installed and in your system's PATH.")
        return None

    if return_code != 0:
        print(f"Error: cluster creation exited with status {return_code}")
    
# Creates a Dataproc Serverless session template
def create_template(yaml_file_path: str, tier: str, runtype: str, session_configs: dict):
    """Import a YAML file as a Dataproc Serverless session template.

    Runs ``gcloud beta dataproc session-templates import`` for the template
    named ``<tier>-<runtype>-runtime`` in the module-level ``PROJECT`` and
    prints the command's stdout followed by its stderr.

    Args:
        yaml_file_path: Path of the template YAML passed to ``--source``.
        tier: First component of the template name.
        runtype: Second component of the template name.
        session_configs: Session settings; only ``region`` is read here and
            is passed to ``--location``.
    """
    template_id = f"{tier}-{runtype}-runtime"
    location = f"{session_configs['region']}"

    gcloud_args = ["gcloud", "beta", "dataproc", "session-templates", "import", template_id]
    gcloud_args += ["--source", yaml_file_path]
    gcloud_args += ["--location", location]
    gcloud_args += ["--project", PROJECT]

    pipe = subprocess.PIPE
    importer = subprocess.Popen(gcloud_args, stdin=pipe, stdout=pipe, stderr=pipe, text=True)
    # "y" is sent on stdin, presumably to answer a confirmation prompt from gcloud.
    out_text, err_text = importer.communicate(input="y\n")

    print(out_text)
    print("Stderr:", err_text)
    
# Updates the default session template with the provided configurations
def update_session_configs_in_template(file_path : str, configs: dict = None):
    """Rewrite a session template YAML file in place with session settings.

    Loads the YAML at ``file_path``, overwrites the service account, subnet,
    idle TTL, TTL and runtime version fields, and writes the result back to
    the same path.

    Args:
        file_path: Path of the template YAML to update.
        configs: Settings to apply. Keys read: service_account, subnet,
            idleTTL, TTL, version. When omitted, the notebook-level
            ``session_configs`` dict is used, which keeps existing
            single-argument calls working.

    Raises:
        KeyError: If a required key is missing from the settings, or the
            template lacks the ``environmentConfig.executionConfig`` or
            ``runtimeConfig`` sections.
    """
    if configs is None:
        # Fall back to the global so callers that pass only the path behave as before.
        configs = session_configs

    with open(file_path, "r") as f:
        data = yaml.safe_load(f)

    execution_config = data['environmentConfig']['executionConfig']
    execution_config['serviceAccount'] = configs["service_account"]
    execution_config['subnetworkUri'] = configs["subnet"]
    execution_config['idleTtl'] = configs["idleTTL"]
    execution_config['ttl'] = configs["TTL"]
    data['runtimeConfig']['version'] = configs["version"]

    with open(file_path, "w") as f:
        yaml.dump(data, f)

In [4]:
# Look up the active gcloud project and account once; later cells reuse both values.
PROJECT = get_gcloud_project()
SERVICE_ACCOUNT = get_gcloud_account()

# Echo what was detected so it can be checked before any resources are created.
print(f"Project: {PROJECT}", f"Service Account: {SERVICE_ACCOUNT}", sep="\n")

Project: nishitkamdarargo
Service Account: sparkpoc-sa@nishitkamdarargo.iam.gserviceaccount.com


### DPGCE Setup

#### Cluster Configuration
A Dataproc cluster will be created using the default configurations below. 

You can customize these settings in the following code cell. 

In [None]:
# Cluster settings passed to create_cluster(); edit the values in the dict to customize.
##############################################################################################################################
# Values set in this cell:
#   dp_cluster_name                   "dpgce-benchmarking" (replaced by the Standard / Premium cells below)
#   region                            "us-east1"
#   master_vm / worker_vm             "n2d-standard-16"
#   num_master / num_worker           1 / 10   [For 10TB benchmarks, it is recommended to increase the worker count to 27]
#   master_ssd_count / worker_ssd_count               1 / 1
#   master_boot_disk_size / worker_boot_disk_size     500 / 500
#   master_boot_disk_type / worker_boot_disk_type     "pd-balanced"
#   master_local_ssd_interface / worker_local_ssd_interface   "NVME"
#   image_version                     "2.3-debian12" (other supported versions include 2.2-debian12 and 2.1-debian11)
#   subnet                            "sparkpoc-subnet"
#   service_account                   the account detected from the gcloud configuration
#   tier                              "standard"
#
# NOTE(review): the previous header listed c3d-standard-16-lssd as the default master/worker
# machine type, while the dict uses n2d-standard-16 — confirm which one is intended.
##############################################################################################################################

cluster_configs = {
    # Identity and placement
    "dp_cluster_name": "dpgce-benchmarking",
    "region": "us-east1",
    # Master node
    "master_vm": "n2d-standard-16",
    "num_master": 1,
    "master_ssd_count": 1,
    "master_boot_disk_size": 500,
    "master_local_ssd_interface": "NVME",
    "master_boot_disk_type": "pd-balanced",
    # Worker nodes
    "worker_vm": "n2d-standard-16",
    "num_worker": 10,
    "worker_ssd_count": 1,
    "worker_boot_disk_size": 500,
    "worker_local_ssd_interface": "NVME",
    "worker_boot_disk_type": "pd-balanced",
    # Image, networking and access
    "image_version": "2.3-debian12",
    "subnet": "sparkpoc-subnet",
    "service_account": SERVICE_ACCOUNT,
    "tier": "standard",
}

#### DPGCE Standard

In [None]:
if dataproc_setup_preferences["DPGCE_Standard"]:
    print("Setting up a DPGCE Standard cluster")
    # Build a per-cluster copy instead of editing cluster_configs in place, so
    # re-running this cell (or running cells out of order) cannot pick up or
    # leave behind overrides in the shared dict. The tier is set explicitly so
    # this cell always creates a standard-tier cluster.
    standard_cluster_configs = {
        **cluster_configs,
        "dp_cluster_name": "dpgce-benchmarking-standard",
        "tier": "standard",
    }
    create_cluster(standard_cluster_configs)
else:
    print("DPGCE Standard setup not required, skipping this step.")

#### DPGCE Premium with Lightning Engine

In [None]:
# DPGCE Premium with Lightning Engine is available only on image version 2.3-debian12
# (this cell pins the image to the 2.3.4-debian12 release).

if dataproc_setup_preferences["DPGCE_Premium"]:
    print("Setting up a DPGCE Premium cluster")
    # Build a per-cluster copy instead of editing cluster_configs in place, so
    # the premium tier and pinned image do not leak into the shared dict and
    # into any cell that is run (or re-run) afterwards.
    premium_cluster_configs = {
        **cluster_configs,
        "dp_cluster_name": "dpgce-benchmarking-premium-2",
        "tier": "premium",
        "image_version": "2.3.4-debian12",
    }
    create_cluster(premium_cluster_configs)
else:
    print("DPGCE Premium setup not required, skipping this step.")

### Dataproc Serverless Setup

#### Session Configuration
A Dataproc session template with the following specifications will be created.

In [7]:
# Settings written into each session template and used when importing it.
session_configs = {
    "region": "us-east1",
    "version": "2.2",
    "service_account": SERVICE_ACCOUNT,
    # Default subnet provisioned by Terraform. Update this value if using a different subnet.
    "subnet": "sparkpoc-subnet",
    "idleTTL": "28800s",
    "TTL": "86400s",
}

#### Serverless Standard

Create Session Template for Serverless Standard

**Important - After the template is created, locate the displayName in the output. This displayName will be the name of the kernel you use to run your Dataproc Serverless notebook.**

In [8]:
if dataproc_setup_preferences["Serverless_Standard"]:
    
    # Copy the session template file
    ! gsutil cp "gs://{SOURCE_PUBLIC_GCS_BUCKET}/session_templates/s8s-standard-spark-runtime-template.yaml" .

    # Update the template with the provided session configs
    update_session_configs_in_template('./s8s-standard-spark-runtime-template.yaml')
    
    # Create session template
    create_template('./s8s-standard-spark-runtime-template.yaml','standard','spark',session_configs)    
else:
    print("Serverless Standard setup not required, skipping this step.")

Copying gs://tpc-benchmarking-kit-bucket/session_templates/s8s-standard-spark-runtime-template.yaml...
/ [1 files][  957.0 B/  957.0 B]                                                
Operation completed over 1 objects/957.0 B.                                      
createTime: '2025-08-01T08:33:41.405630Z'
creator: sparkpoc-sa@nishitkamdarargo.iam.gserviceaccount.com
description: Run TPC benchmark on Serverless Standard
environmentConfig:
  executionConfig:
    idleTtl: 28800s
    serviceAccount: sparkpoc-sa@nishitkamdarargo.iam.gserviceaccount.com
    subnetworkUri: sparkpoc-subnet
    ttl: 86400s
jupyterSession:
  displayName: Standard for running TPC Benchmark
  kernel: PYTHON
labels:
  client: bigquery-jupyter-plugin
name: projects/nishitkamdarargo/locations/us-east1/sessionTemplates/standard-spark-runtime
runtimeConfig:
  properties:
    spark.dataproc.driver.compute.tier: standard
    spark.dataproc.driver.disk.size: 750g
    spark.dataproc.driver.disk.tier: standard
    spark.da

#### Serverless Premium

Create Session Template for Serverless Premium

**Important - After the template is created, locate the displayName in the output. This displayName will be the name of the kernel you use to run your Dataproc Serverless notebook.**

In [None]:
if dataproc_setup_preferences["Serverless_Premium"]:
    
    # Copy the session template file
    ! gsutil cp "gs://{SOURCE_PUBLIC_GCS_BUCKET}/session_templates/s8s-premium-spark-runtime-template.yaml" .

    # Update the template with the provided session configs
    update_session_configs_in_template('./s8s-premium-spark-runtime-template.yaml')
    
    # Create session template
    create_template('./s8s-premium-spark-runtime-template.yaml','premium','spark',session_configs)    
else:
    print("Serverless Premium setup not required, skipping this step.")

#### Serverless Premium with NQE

Create Session Template for Serverless Premium with Native Query Execution (NQE)

**Important - After the template is created, locate the displayName in the output. This displayName will be the name of the kernel you use to run your Dataproc Serverless notebook.**

In [None]:
if dataproc_setup_preferences["Serverless_Premium_with_NQE"]:
    
    # Copy the session template file
    ! gsutil cp "gs://{SOURCE_PUBLIC_GCS_BUCKET}/session_templates/s8s-premium-native-runtime-template.yaml" .

    # Update the template with the provided session configs
    update_session_configs_in_template('./s8s-premium-native-runtime-template.yaml')
    
    # Create session template    
    create_template('./s8s-premium-native-runtime-template.yaml','premium','native',session_configs)
else:
    print("Serverless Premium with NQE setup not required, skipping this step.")