# Raw Event Log Data To Organized

In [0]:
from datetime import datetime, timedelta
import json
import gzip
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.types import (
    ArrayType,
    StructType,
    IntegerType,
    StringType,
    StructField,
    MapType,
)
from functools import reduce
import re

## Run this cell to drop the tables and start over

In [0]:
# To drop all tables and start over from eventlog files
if False:
    spark.sql("drop table physical_plan_keys;")
    spark.sql("drop table queries;")
    spark.sql("drop table operations;")
    print("Dropped the tables")

### 1. Inputs
- start_time: Timestamp default: yesterday()
- end_time: Timestamp default: time_now()

In [0]:
dbutils.widgets.text(
    "start_time", "yyyy-mm-dd HH:MM:SS", "StartTime, Format: yyyy-mm-dd HH:MM:SS"
)
dbutils.widgets.text(
    "end_time", "yyyy-mm-dd HH:MM:SS", "EndTime, Format: yyyy-mm-dd HH:MM:SS"
)

# get variables
start_time = dbutils.widgets.get("start_time")
end_time = dbutils.widgets.get("end_time")

# Because of work flows
time_now = datetime.now()

# Handle start_time conversion
if start_time == "None":
    # 2 weeks prior
    start_time = time_now - timedelta(days=14)
else:
    try:
        start_time = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        raise ValueError('inputted start_time is not on the right format')

# Handle end_time conversion
if end_time == "None":
    end_time = time_now
else:
    try:
        end_time = datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        raise ValueError('inputted end_time is not on the right format')


assert start_time < end_time, "Start time needs to be before end time."

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-3025559219626018>:14[0m
[1;32m     11[0m end_time [38;5;241m=[39m dbutils[38;5;241m.[39mwidgets[38;5;241m.[39mget([38;5;124m"[39m[38;5;124mend_time[39m[38;5;124m"[39m)
[1;32m     13[0m [38;5;66;03m# Because of work flows[39;00m
[0;32m---> 14[0m time_now [38;5;241m=[39m datetime[38;5;241m.[39mnow()
[1;32m     16[0m [38;5;66;03m# Handle start_time conversion[39;00m
[1;32m     17[0m [38;5;28;01mif[39;00m start_time [38;5;241m==[39m [38;5;124m"[39m[38;5;124mNone[39m[38;5;124m"[39m:
[1;32m     18[0m     [38;5;66;03m# 2 weeks prior[39;00m

[0;31mNameError[0m: name 'datetime' is not defined

### 2. Load Data From Table To DataFrame

In [0]:
query = f"SELECT * FROM eventlog_raw WHERE lastModified BETWEEN '{str(start_time)}' AND '{str(end_time)}'"
raw_df = spark.sql(query)

### 3. Extract Relevant Information From Data  
- Could be done by creating a function, then applying it to every row, but that might be slow.  
- Apply function to the column instead? Map function?  
- Check out built in functions first because of better performance.
- We got some work to do here to find the optimal solution.  

**Note**
- Events from file in table seem to be divided by '\n', so can be used to split (there is also a \n at the end which results in '' which needs to be removed dont know if this is true for all files but needs to be checked)  

**Keywords:**  
Regex, Lambda function, Built in functions

##### 3.1 Restructure fileData to fit JSON Format 
  1. Remove all newlines
  2. Find all }{ which represents new event starting and add a comma to divide
  3. Add brackets to whole string
  4. Use from_json to get on json format and add Schema

In [0]:
# Define Schema for fileData (Events)
schema = ArrayType(
    StructType(
        [
            StructField("Event", StringType(), True),
            StructField("SparkContext Id", StringType(), True),
            StructField("Stage Info", StringType(), True),
            StructField("Task Info", StringType(), True),
            StructField("Stage ID", StringType(), True),
            StructField("Task End Reason", StringType(), True),
            StructField("Stage IDs", StringType(), True),
            StructField("Stage Attempt ID", StringType(), True),
            StructField("Completion Time", StringType(), True),
            StructField("time", StringType(), True),
            StructField("errorMessage", StringType(), True),
            StructField("Task Executor Metrics", StringType(), True),
            StructField("Timestamp", StringType(), True),
            StructField("executionId", StringType(), True),
            StructField("Job Result", StringType(), True),
            StructField("Stage Infos", StringType(), True),
            StructField("details", StringType(), True),
            StructField("Task Metrics", StringType(), True),
            StructField("physicalPlanDescription", StringType(), True),
            StructField("modifiedConfigs", StringType(), True),
            StructField("Submission Time", StringType(), True),
            StructField("rootExecutionId", StringType(), True),
            StructField("Spark Version", StringType(), True),
            StructField("Rollover Number", StringType(), True),
            StructField("sparkPlanInfo", StringType(), True),
            StructField("Job ID", StringType(), True),
            StructField("Task Type", StringType(), True),
            StructField("description", StringType(), True),
            StructField("Properties", StringType(), True),
            StructField("accumUpdates", StringType(), True),
        ]
    )
)

# Modify format to be a JSON-list of events
df = raw_df.withColumn("fileData", F.regexp_replace("fileData", "\n", ""))
df = df.withColumn("fileData", F.regexp_replace("fileData", "\}\{", "},{"))
df = df.withColumn("fileData", F.concat(F.lit("["), df.fileData, F.lit("]")))
df = df.withColumn("fileData", F.from_json("fileData", schema))

##### 3.2 Explode And Expand JSON Data To Multiple Columns

In [0]:
# Define column list
columns = [
    "filePath",
    "clusterID",
    "clusterInstanceID",
    "sparkContextID",
    "lastModified",
    "eventlogKey",
]
# Explode json data list to independent rows
df2 = df.select(*columns, F.explode(df.fileData).alias("eventData"))
# Expand eventData column to multiple columns
df3 = df2.select(*columns, F.col("eventData.*"))

##### 3.3 Filter Out Relevant Events Into Dataframes

In [0]:
# Filter on all events for SQLExecutionStart and select keys related to this
sql_start_df = df3.select(
    df3.Event.alias("event_start"),
    "sparkContextID",
    "clusterInstanceID",
    "executionId",
    "rootExecutionId",
    "Description",
    "Details",
    "physicalPlanDescription",
    "sparkPlanInfo",
    df3.time.alias("time_start"),
    "modifiedConfigs",
    "eventlogKey",
).filter(
    df3.Event == "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart"
)

sql_end_df = df3.select(
    df3.Event.alias("event_end"),
    "sparkContextID",
    "clusterInstanceID",
    "executionId",
    df3.time.alias("time_end"),
    "errorMessage",
).filter(df3.Event == "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd")

##### 3.4 Gather SQL Information

In [0]:
# Join star and end events for sql queries
sql_df = sql_start_df.join(
    sql_end_df,
    (sql_start_df.executionId == sql_end_df.executionId)
    & (sql_start_df.clusterInstanceID == sql_end_df.clusterInstanceID)
    & (sql_start_df.sparkContextID == sql_end_df.sparkContextID),
).drop(sql_end_df.executionId, sql_end_df.clusterInstanceID, sql_end_df.sparkContextID)
# above: drop sql_end columns to prevent duplicates in the resulting df

sql_df = sql_df.withColumn(
    "executionId", sql_df.executionId.cast("int")
)  # Cast id column to integer

sql_df = sql_df.withColumn(
    "time_start", sql_df.time_start.cast("double")
)  # Cast float
sql_df = sql_df.withColumn(
    "time_end", sql_df.time_end.cast("double")
)  # Cast float

sql_df = sql_df.sort(F.desc("executionId"))  # Sort by id ascending
# Calculate Duration of SQL Query
sql_df = sql_df.withColumn(
    "duration_ms", sql_df.time_end.cast("double") - sql_df.time_start.cast("double")
)

In [0]:
sql_df.show(10)

+--------------------+---------------+--------------------+--------------------+-----------------------+--------------------+-----------------+--------------------+-----------+--------------------+-------------------+--------------------+-----------+-----------------+------------+-----------+
|         event_start|rootExecutionId|         Description|             Details|physicalPlanDescription|       sparkPlanInfo|       time_start|     modifiedConfigs|eventlogKey|           event_end|     sparkContextID|   clusterInstanceID|executionId|         time_end|errorMessage|duration_ms|
+--------------------+---------------+--------------------+--------------------+-----------------------+--------------------+-----------------+--------------------+-----------+--------------------+-------------------+--------------------+-----------+-----------------+------------+-----------+
|org.apache.spark....|           1310|        display(df2)|org.apache.spark....|   == Physical Plan ...|{"nodeName":"C

##### 3.4.1 Create unique ids for each physical plan and store them in a table
* Create table for storage of unique physical plan keys (**physical_plan_keys**)
* Combine sparkContextID, clusterInstanceID and executionID to create a unique id for each physical plan
* Store these keys in the table

This id is later used on the single operations which gives us a way of seeing which operations belong to which physical plan.

In [0]:
%sql
CREATE TABLE IF NOT EXISTS physical_plan_keys (
  sparkContextID STRING,
  clusterInstanceID STRING,
  executionID INT,
  physicalPlanKey INT
);

In [0]:
# Add a new column to the dataframe that concatenates the 'sparkContextID', 'clusterInstanceID', and 'executionID' columns
# This column will be used to generate a hash key later
sql_df = sql_df.withColumn(
    "concatPhysicalPlanKey",
    F.concat("sparkContextID", "clusterInstanceID", "executionID"),
)

# Generate a hash key from the 'concatPhysicalPlanKey' column and add it as a new column to the dataframe
sql_df = sql_df.withColumn("physicalPlanKey", F.hash("concatPhysicalPlanKey")).drop(
    "concatPhysicalPlanKey"
)

sql_df = sql_df.dropDuplicates(['physicalPlanKey'])

# Query the existing physical plan keys in the 'physical_plan_keys' table
existing_keys = spark.sql("select physicalPlanKey from physical_plan_keys")

# Select the distinct values of 'sparkContextID', 'clusterInstanceID', 'executionID', and 'physicalPlanKey' columns
# from the dataframe
new_keys = sql_df.select(
    "sparkContextID", "clusterInstanceID", "executionID", "physicalPlanKey"
).distinct()

# Perform an anti-join on the 'physicalPlanKey' column between the new keys and the existing keys to keep only the keys
# that don't already exist in the 'physical_plan_keys' table
# This should not really be the case, but added as a failsafe if you are read the same physical plans multiple times
new_keys = new_keys.join(existing_keys, "physicalPlanKey", "anti")

# Write the new keys to the 'physical_plan_keys' table
print(f"Adding {new_keys.count()} new keys to the physicalPlanKey table")
new_keys.write.mode("append").saveAsTable("default.physical_plan_keys")

Adding 10210 new keys to the physicalPlanKey table


##### 3.5 Extract Information From Physical Plan  
###### Final Table  
- **execution_id** (same as Spark queryID)
- **operation** (type of operation, e.g. Filter, ..)
- **column_id** (unique global id of the column the operation is performed on)
- **timestamp**  
- **condition**  (e.g. greaterThan)
- **conditionValue** (e.g. 5)

In [0]:
physical_plan_df = sql_df.select(
    F.col("executionId"),
    F.col("time_start").alias("timestamp"),
    F.col("physicalPlanDescription"),
    F.col("physicalPlanKey"),
    "eventlogKey",
)

physical_plan_df.show(5)

+-----------+-----------------+-----------------------+---------------+-----------+
|executionId|        timestamp|physicalPlanDescription|physicalPlanKey|eventlogKey|
+-----------+-----------------+-----------------------+---------------+-----------+
|        976|1.682034474802E12|   == Physical Plan ...|    -2140848050|   85799799|
|          0| 1.68133340762E12|   == Physical Plan ...|    -2139115355| -371546082|
|         49|1.682505485759E12|   == Physical Plan ...|    -2137141090| -302499034|
|        302|1.682594864853E12|   == Physical Plan ...|    -2135497522| 1563319382|
|        385|1.682029605382E12|   == Physical Plan ...|    -2132409082|-1937385617|
+-----------+-----------------+-----------------------+---------------+-----------+
only showing top 5 rows



#### 3.5.0 Filter unrelevant physical plans
* Check for plans that include a FileScan opearator (i.e. Scan Parquet ...)
* May add more in the future as there could be different ways of storing data than parquet
* Also, not every query is reading from disk

In [0]:
physical_plan_filtered_df = physical_plan_df.filter(
    F.col("physicalPlanDescription").like("%Scan parquet%")
)

#### 3.5.1 Create Column Lookup Dictionary  
- A dictionary to bind columns to table and database

In [0]:
def create_column_lookup(physicalPlan):
    COLUMN_LOOKUP = {}

    # first part of this regex will capture the spark_catalog.database.column_name part of the filescan
    # second part of the regex will capture all the columns in the following line
    regex = r"\(\d+\)\s+Scan parquet\s+(\S+)\nOutput \[\d+\]: \[(.*?)\]"
    matches = re.findall(regex, physicalPlan)

    for m in matches:
        db_table = m[0]  # first capture group
        database, table = db_table.split(".")[-2:]

        columns = m[1]  # second capture group
        for c in columns.split(", "):
            COLUMN_LOOKUP[c] = {"database": database, "table": table}
    return COLUMN_LOOKUP


# User defined function to perform on dataframe
col_lookup_udf = F.udf(
    create_column_lookup,
    MapType(StringType(), MapType(StringType(), StringType()), False),
)

In [0]:
# Create column lookup column
physical_plan_lookup_df = physical_plan_filtered_df.withColumn(
    "columnLookup", col_lookup_udf("physicalPlanDescription")
)

## Example of a scan parquet where NO LOOKUP is built:

    (1) Scan parquet 
    Output [1]: [wdayEffect#24088]
    Batched: true
    Location: PreparedDeltaFileIndex [dbfs:/mnt/lake/process/raw/gassco/output/bookings_for_companies_with_rebooking/v1r0]
    ReadSchema: struct<wdayEffect:string>

In [0]:
# Filter out only sql queries which contains lookup dictionary
physical_plan_lookup_df = physical_plan_lookup_df.filter(F.size("columnLookup") > 0)

#### 3.5.2 Use Regex To Find Relevant Information
- `Filter`
- `DataFilter`  
- `PushedFilters`  
- `PartitionFilters`  

##### `NOTE` Still one problem have no idea how to make the operation_id the way we planed. As it is now it has same operation id for operations from same physical plan  
##### `NOTE` Also a lot of operation rows are missing database and table

In [0]:
def build_rows(
    columns,
    column_lookup,
    operation_name,
    execution_id,
    timestamp,
    physical_plan_key,
    eventlog_key,
    table=None,
    database=None,
):
    unique_columns = set(columns)
    rows = []
    for c in unique_columns:
        # If you are handling a pushed filter, the column lookup might not be available
        # this is when the table and database variables ARE provided
        if database is None:
            database = column_lookup.get(c, {}).get("database", "")
        if table is None:
            table = column_lookup.get(c, {}).get("table", "")

        rows.append(
            {
                "operationName": operation_name,
                "executionId": execution_id,
                "databaseName": database,
                "tableName": table,
                "columnName": c.split("#")[0],  # get the part before the #
                "timeGenerated": timestamp,
                "physicalPlanKey": physical_plan_key,
                "eventlogKey": eventlog_key,
            }
        )
    return rows

In [0]:
def plan_parser(row, name, regex, col_regex, group_index):
    """Performs parsing of physical plan in dataframe row

    Args:
        row[spark.row]: Row in dataframe
        name[string]: Name of operation
        regex[string]: Regex for parsing after operation
        col_regex[string]: Regex for parsing column (different for pushedFilters so needed as input)
        group_index[integer]: Index for group in regex to get data frin
    Returns:
        A list of dictionarys/rows for new dataframe
        row_lst: list[dict]

    """
    # Define inputs
    execution_id = row["executionId"]
    timestamp = row["timestamp"]
    physical_plan = row["physicalPlanDescription"]
    column_lookup = row["columnLookup"]
    physical_plan_key = row["physicalPlanKey"]
    eventlog_key = row["eventlogKey"]

    # List containing instances
    row_lst = []
    # Matches for provided regex and loop through each
    matches = re.findall(regex, physical_plan)
    for m in matches:
        # Find relevant condtion group
        cond_group = m[group_index]
        # Find columns in condition group
        columns = re.findall(col_regex, cond_group)

        if name == "PushedFilters":  # Missing internal id so needs a small tweak
            # Find for pushed group
            # first capture group in file scan is: somethinghere.database.table
            db_table = m[0]
            database, table = db_table.split(".")[-2:]

            rows = build_rows(
                columns,
                column_lookup,
                name,
                execution_id,
                timestamp,
                physical_plan_key,
                eventlog_key,
                table=table,
                database=database,
            )
        else:
            rows = build_rows(
                columns,
                column_lookup,
                name,
                execution_id,
                timestamp,
                physical_plan_key,
                eventlog_key,
            )
        row_lst.extend(rows)

    return row_lst


def multi_plan_parser(row, settings):

    row_instances = []

    for operation in settings:
        operation_rows = plan_parser(
            row,
            operation["operation_name"],
            operation["regex"],
            operation["col_regex"],
            operation["group_index"],
        )
        row_instances.extend(operation_rows)
    return row_instances

In [0]:
# unique id for each of the opeartions
latest_operation_id = 0

# update latest_operation_id if there exists opeartions from before
if spark.catalog.tableExists("operations"):
    prev_operation_id = spark.sql(
        "SELECT MAX(operationId) AS operationId FROM operations"
    )
    latest_operation_id = prev_operation_id.collect()[0]["operationId"]

print(f"Previouis Operation ID: {latest_operation_id}")

Previouis Operation ID: 777


In [0]:
settings = [
    {
        "operation_name": "Filter",
        "regex": "\(\d+\)\s+Filter(.*?\n)Input(.*?\n)Condition : (.*?)\n\n",
        "col_regex": "\w+#\d+",
        "group_index": -1,
    },
    {
        "operation_name": "PartitionFilters",
        "regex": "\(\d+\)\s+Scan parquet\s+(\S+)\nOutput \[\d+\]: \[(.*?)\]\n(.*?\n)Location:(.*?\n)?(PartitionFilters:\s\[(.*?)\]\n)",
        "col_regex": "\w+#\d+",
        "group_index": -1,
    },
    {
        "operation_name": "PushedFilters",
        "regex": "\(\d+\)\s+Scan parquet\s+(\S+)\nOutput \[\d+\]: \[(.*?)\]\n(.*?\n)Location:(.*?\n)?(PartitionFilters:\s\[(.*?)\]\n)?(PushedFilters:\s\[(.*?)\])?",
        "col_regex": r"(?<=\()\b(?:[^(),]+|\((?:[^(),]+|\((?:[^(),]+|\((?:[^(),]+|\([^()]*\))*\))*\))*)(?=,|\))",
        "group_index": -1,
    },
]


operation_rdd = physical_plan_lookup_df.rdd.map(
    lambda row: multi_plan_parser(row, settings)
)
operation_df = operation_rdd.flatMap(lambda l: l).toDF()
operation_df = operation_df.withColumn(
    "operationId", F.monotonically_increasing_id() + latest_operation_id
)

In [0]:
# Convert string to int so that it can be used for comparison at a later stage (rulebased notebook)

operation_df = operation_df.withColumn(
    "timeGenerated", F.col("timeGenerated").cast("bigint")
)

operation_df.printSchema()
operation_df.show()

root
 |-- columnName: string (nullable = true)
 |-- databaseName: string (nullable = true)
 |-- eventlogKey: long (nullable = true)
 |-- executionId: long (nullable = true)
 |-- operationName: string (nullable = true)
 |-- physicalPlanKey: long (nullable = true)
 |-- tableName: string (nullable = true)
 |-- timeGenerated: long (nullable = true)
 |-- operationId: long (nullable = false)

+--------------------+------------+-----------+-----------+-------------+---------------+------------+-------------+-----------+
|          columnName|databaseName|eventlogKey|executionId|operationName|physicalPlanKey|   tableName|timeGenerated|operationId|
+--------------------+------------+-----------+-----------+-------------+---------------+------------+-------------+-----------+
|           tableName|     default| 1049959353|        246|       Filter|    -2012069072|  operations|1682160382506|        777|
|        databaseName|     default| 1049959353|        246|       Filter|    -2012069072|  ope

### 4. Write to Delta Table

#####`NOTE` WE ARE SAVING FILES INTO A PATH ALREADY THAT IS NOT IN DELTA FORMAT; WE NEED TO CHOOSE NEW PATH FOR THE DELTA FORMAT OR DELETE EXISTING FILES THERE  
##### WE ALSO HAVE TO FIX DUPLICATE PROBLEM FOR BOTH NOTEBOOKS WHEN LOADING TO TABLE ... (MERGE) https://stackoverflow.com/questions/67920919/how-to-prevent-duplicate-entries-to-enter-to-delta-lake-of-azure-storage

##### 4.2 Spark Operations Table

In [0]:
operation_df.printSchema()

root
 |-- columnName: string (nullable = true)
 |-- databaseName: string (nullable = true)
 |-- eventlogKey: long (nullable = true)
 |-- executionId: long (nullable = true)
 |-- operationName: string (nullable = true)
 |-- physicalPlanKey: long (nullable = true)
 |-- tableName: string (nullable = true)
 |-- timeGenerated: long (nullable = true)
 |-- operationId: long (nullable = false)



In [0]:
%sql
CREATE TABLE IF NOT EXISTS operations (
  columnName STRING,
  databaseName STRING,
  executionId LONG,
  operationName STRING,
  physicalPlanKey LONG,
  eventlogKey LONG,
  tableName STRING,
  timeGenerated LONG,
  operationId LONG
)

In [0]:
# Load eventlogKeys from existing data
existing_keys = spark.sql("select eventlogKey from operations")

print(f"Size of new operations before filtering {operation_df.count()}")

# Perform anti-join on eventlogKeys to make sure the new data isnt a duplicate of what is already stored in the table
operation_df = operation_df.join(existing_keys, "eventlogKey", "anti")

print(f"Size of new operations after filtering {operation_df.count()}")

# Load data to delta table
delta_table_path = "dbfs:/mnt/lake/"
operation_df.write.format("delta").mode("append").saveAsTable("operations")

Size of new operations before filtering 7267
Size of new operations after filtering 7267


##### 4.3 Queries Table

Create a queries-table that can be joined with the operations table to get more information with each opeartion.

In [0]:
sql_df.printSchema()

root
 |-- event_start: string (nullable = true)
 |-- rootExecutionId: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Details: string (nullable = true)
 |-- physicalPlanDescription: string (nullable = true)
 |-- sparkPlanInfo: string (nullable = true)
 |-- time_start: double (nullable = true)
 |-- modifiedConfigs: string (nullable = true)
 |-- eventlogKey: integer (nullable = true)
 |-- event_end: string (nullable = true)
 |-- sparkContextID: string (nullable = true)
 |-- clusterInstanceID: string (nullable = true)
 |-- executionId: integer (nullable = true)
 |-- time_end: double (nullable = true)
 |-- errorMessage: string (nullable = true)
 |-- duration_ms: double (nullable = true)
 |-- physicalPlanKey: integer (nullable = false)



In [0]:
%sql
CREATE TABLE IF NOT EXISTS queries (
  event_start STRING,
  rootExecutionId STRING,
  Description STRING,
  Details STRING,
  physicalPlanDescription STRING,
  sparkPlanInfo STRING,
  time_start DOUBLE,
  modifiedConfigs STRING,
  eventlogKey INT,
  event_end STRING,
  sparkContextID STRING,
  clusterInstanceID STRING,
  executionId INT,
  time_end DOUBLE,
  errorMessage STRING,
  duration_ms DOUBLE,
  physicalPlanKey INT
);

In [0]:
existing_keys = spark.sql("select physicalPlanKey from queries")

print(f"Size queries in dataframe {sql_df.count()}")

# Perform anti-join on eventlogKeys to make sure the new data isnt a duplicate of what is already stored in the table
filtered_sql_df = sql_df.join(existing_keys, "physicalPlanKey", "anti")

print(f"Size of new queries after filtering {sql_df.count()}")

# Load data to delta table
delta_table_path = "dbfs:/mnt/lake/"
filtered_sql_df.write.format("delta").option("mergeSchema", "true").mode("append").saveAsTable(
    "queries"
)

Size queries in dataframe 10210
Size of new queries after filtering 10210


In [0]:
%sql
SELECT
  *
FROM
  operations

columnName,databaseName,executionId,operationName,physicalPlanKey,eventlogKey,tableName,timeGenerated,operationId
lastModified,default,568,Filter,-2113573252,848484182,eventlog_raw,1681726704046,42949673737
clusterInstanceID,default,568,Filter,-2113573252,848484182,eventlog_raw,1681726704046,42949673738
sparkContextID,default,568,Filter,-2113573252,848484182,eventlog_raw,1681726704046,42949673739
eventData,,568,Filter,-2113573252,848484182,,1681726704046,42949673740
clusterInstanceID,default,568,Filter,-2113573252,848484182,eventlog_raw,1681726704046,42949673741
sparkContextID,default,568,Filter,-2113573252,848484182,eventlog_raw,1681726704046,42949673742
lastModified,default,568,Filter,-2113573252,848484182,eventlog_raw,1681726704046,42949673743
eventData,,568,Filter,-2113573252,848484182,,1681726704046,42949673744
sparkContextID,default,568,PushedFilters,-2113573252,848484182,eventlog_raw,1681726704046,42949673745
lastModified,default,568,PushedFilters,-2113573252,848484182,eventlog_raw,1681726704046,42949673746


In [0]:
%sql
SELECT
  *
FROM
  queries

event_start,rootExecutionId,Description,Details,physicalPlanDescription,sparkPlanInfo,time_start,modifiedConfigs,eventlogKey,event_end,sparkContextID,clusterInstanceID,executionId,time_end,errorMessage,duration_ms,physicalPlanKey
org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart,184,use catalog `hive_metastore`; describe database...,org.apache.spark.sql.Dataset.collectResult(Dataset.scala:3416) com.databricks.backend.daemon.driver.OutputAggregator$.withOutputAggregation0(OutputAggregator.scala:267) com.databricks.backend.daemon.driver.OutputAggregator$.withOutputAggregation(OutputAggregator.scala:101) com.databricks.backend.daemon.driver.SQLDriverLocal.executeSql(SQLDriverLocal.scala:115) com.databricks.backend.daemon.driver.SQLDriverLocal.repl(SQLDriverLocal.scala:145) com.databricks.backend.daemon.driver.DriverLocal.$anonfun$execute$23(DriverLocal.scala:729) com.databricks.unity.EmptyHandle$.runWith(UCSHandle.scala:124) com.databricks.backend.daemon.driver.DriverLocal.$anonfun$execute$20(DriverLocal.scala:712) com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:398) scala.util.DynamicVariable.withValue(DynamicVariable.scala:62) com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:147) com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:396) com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:393) com.databricks.backend.daemon.driver.DriverLocal.withAttributionContext(DriverLocal.scala:62) com.databricks.logging.UsageLogging.withAttributionTags(UsageLogging.scala:441) com.databricks.logging.UsageLogging.withAttributionTags$(UsageLogging.scala:426) com.databricks.backend.daemon.driver.DriverLocal.withAttributionTags(DriverLocal.scala:62) com.databricks.backend.daemon.driver.DriverLocal.execute(DriverLocal.scala:689) com.databricks.backend.daemon.driver.DriverWrapper.$anonfun$tryExecutingCommand$1(DriverWrapper.scala:622) scala.util.Try$.apply(Try.scala:213),"== Physical Plan == CollectLimit (3) +- CommandResult (1)  +- DescribeNamespace (2) (1) CommandResult Output [2]: [database_description_item#1373, database_description_value#1374] Arguments: [database_description_item#1373, database_description_value#1374] (2) DescribeNamespace Arguments: [database_description_item#1373, database_description_value#1374], com.databricks.sql.managedcatalog.UnityCatalogV2Proxy@5b8f3995, [default], true (3) CollectLimit Input [2]: [database_description_item#1373, database_description_value#1374] Arguments: 10001","{""nodeName"":""CollectLimit"",""simpleString"":""CollectLimit 10001"",""children"":[{""nodeName"":""CommandResult"",""simpleString"":""CommandResult [database_description_item#1373, database_description_value#1374]"",""children"":[],""metadata"":{},""metrics"":[{""name"":""number of output rows"",""accumulatorId"":2442,""metricType"":""sum"",""experimental"":false}],""explainId"":null}],""metadata"":{},""metrics"":[{""name"":""shuffle records written"",""accumulatorId"":2440,""metricType"":""sum"",""experimental"":false},{""name"":""records read"",""accumulatorId"":2438,""metricType"":""sum"",""experimental"":false},{""name"":""local bytes read"",""accumulatorId"":2436,""metricType"":""size"",""experimental"":false},{""name"":""fetch wait time"",""accumulatorId"":2437,""metricType"":""timing"",""experimental"":false},{""name"":""remote bytes read"",""accumulatorId"":2434,""metricType"":""size"",""experimental"":false},{""name"":""local blocks read"",""accumulatorId"":2433,""metricType"":""sum"",""experimental"":false},{""name"":""remote blocks read"",""accumulatorId"":2432,""metricType"":""sum"",""experimental"":false},{""name"":""remote bytes read to disk"",""accumulatorId"":2435,""metricType"":""size"",""experimental"":false},{""name"":""shuffle bytes written"",""accumulatorId"":2439,""metricType"":""size"",""experimental"":false}],""explainId"":null}",1679485119956.0,"{""spark.r.sql.derby.temp.dir"":""/tmp/RtmpPe0TrA""}",2042027793.0,org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd,1.7791291964349135e+18,1220-124459-5lnjmtzx_10_254_10_44,184.0,1679485119958.0,,2.0,-2145443157.0
org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart,228,"# spark.conf.set(""spark.databricks.pyspark.opti...",org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:4050) sun.reflect.GeneratedMethodAccessor427.invoke(Unknown Source) sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) java.lang.reflect.Method.invoke(Method.java:498) py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380) py4j.Gateway.invoke(Gateway.java:306) py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) py4j.commands.CallCommand.execute(CallCommand.java:79) py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:195) py4j.ClientServerConnection.run(ClientServerConnection.java:115) java.lang.Thread.run(Thread.java:750),"== Physical Plan == CollectLimit (4) +- * Generate (3)  +- * Filter (2)  +- * Scan XmlRelation(com.databricks.spark.xml.DefaultSource$$Lambda$6729/2137464541@20386798,Some(mnt/lake/ingest/gassco/bookings_for_companies/v0r1preview/window/2022/6/16/default_partition/ff5dee29-ebdd-4e6c-aa77-d1febe7309d8.xml),Map(rowtag -> return, roottag -> S:Envelope, path -> mnt/lake/ingest/gassco/bookings_for_companies/v0r1preview/window/2022/6/16/default_partition/ff5dee29-ebdd-4e6c-aa77-d1febe7309d8.xml),null) (1) (1) Scan XmlRelation(com.databricks.spark.xml.DefaultSource$$Lambda$6729/2137464541@20386798,Some(mnt/lake/ingest/gassco/bookings_for_companies/v0r1preview/window/2022/6/16/default_partition/ff5dee29-ebdd-4e6c-aa77-d1febe7309d8.xml),Map(rowtag -> return, roottag -> S:Envelope, path -> mnt/lake/ingest/gassco/bookings_for_companies/v0r1preview/window/2022/6/16/default_partition/ff5dee29-ebdd-4e6c-aa77-d1febe7309d8.xml),null) [codegen id : 1] Output [2]: [bookings#9848, status#9849L] PushedFilters: [IsNotNull(bookings)] ReadSchema: struct>,status:bigint> (2) Filter [codegen id : 1] Input [2]: [bookings#9848, status#9849L] Condition : ((size(bookings#9848, true) > 0) AND isnotnull(bookings#9848)) (3) Generate [codegen id : 1] Input [2]: [bookings#9848, status#9849L] Arguments: explode(bookings#9848), [status#9849L], false, [col#9852] (4) CollectLimit Input [2]: [status#9849L, col#9852] Arguments: 1","{""nodeName"":""CollectLimit"",""simpleString"":""CollectLimit 1"",""children"":[{""nodeName"":""WholeStageCodegen (1)"",""simpleString"":""WholeStageCodegen"",""children"":[{""nodeName"":""Generate"",""simpleString"":""Generate explode(bookings#9848), [status#9849L], false, [col#9852]"",""children"":[{""nodeName"":""Filter"",""simpleString"":""Filter ((size(bookings#9848, true) > 0) AND isnotnull(bookings#9848))"",""children"":[{""nodeName"":""Scan XmlRelation(com.databricks.spark.xml.DefaultSource$$Lambda$6729/2137464541@20386798,Some(mnt/lake/ingest/gassco/bookings_for_companies/v0r1preview/window/2022/6/16/default_partition/ff5dee29-ebdd-4e6c-aa77-d1febe7309d8.xml),Map(rowtag -> return, roottag -> S:Envelope, path -> mnt/lake/ingest/gassco/bookings_for_companies/v0r1preview/window/2022/6/16/default_partition/ff5dee29-ebdd-4e6c-aa77-d1febe7309d8.xml),null) "",""simpleString"":""Scan XmlRelation(com.databricks.spark.xml.DefaultSource$$Lambda$6729/2137464541@20386798,Some(mnt/lake/ingest/gassco/bookings_for_companies/v0r1preview/window/2022/6/16/default_partition/ff5dee29-ebdd-4e6c-aa77-d1febe7309d8.xml),Map(rowtag -> return, roottag -> S:Envelope, path -> mnt/lake/ingest/gassco/bookings_for_companies/v0r1preview/window/2022/6/16/default_partition/ff5dee29-ebdd-4e6c-aa77-d1febe7309d8.xml),null) [bookings#9848,status#9849L] PushedFilters: [IsNotNull(bookings)], ReadSchema: struct1.679991891218E12{""spark.r.sql.derby.temp.dir"":""/tmp/RtmpKZltiy"",""spark.databricks.pyspark.optimizeWrite.enabled"":""true"",""spark.sql.sources.commitMetadataCheck"":""false"",""spark.sql.parquet.mergeSchema"":""false"",""mapreduce.fileoutputcommitter.algorithm.version"":""2"",""spark.sql.parquet.writeLegacyFormat"":""true""}-1228857796org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd85853421804830186921220-124459-5lnjmtzx_10_254_10_352281.679991891925E12707.0-2141969732",,,,,,,,,,,
org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart,167,id = 969769e6-1577-4ad1-b0be-1d4e843e1bda runId = 4c9451bb-1c70-4629-998b-f32895e9fb02 batch = 0,org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:255) sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) java.lang.reflect.Method.invoke(Method.java:498) py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380) py4j.Gateway.invoke(Gateway.java:306) py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) py4j.commands.CallCommand.execute(CallCommand.java:79) py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:195) py4j.ClientServerConnection.run(ClientServerConnection.java:115) java.lang.Thread.run(Thread.java:750),"== Physical Plan == * Project (5) +- StreamingDeduplicate (4)  +- Exchange (3)  +- * ColumnarToRow (2)  +- Scan parquet (1) (1) Scan parquet Output [17]: [status#7965L, companyId#7966L, firmBookedWday#7967, firmPointCapacity#7968, fromDate#7969, intrBookedWday#7970, intrPointCapacity#7971, ownBookedFirmWday#7972, ownBookedIntrWday#7973, ownBookedWday#7974, ownNomination#7975, pointId#7976L, sumBookedWday#7977, sumNomination#7978, toDate#7979, etl_created_at#7980, _rescued_data#7981] Batched: true Location: CloudFilesSourceFileIndex [dbfs:/mnt/lake/process/conformed/gassco/output/point_overview/v0r1preview/full] ReadSchema: struct (2) ColumnarToRow [codegen id : 1] Input [17]: [status#7965L, companyId#7966L, firmBookedWday#7967, firmPointCapacity#7968, fromDate#7969, intrBookedWday#7970, intrPointCapacity#7971, ownBookedFirmWday#7972, ownBookedIntrWday#7973, ownBookedWday#7974, ownNomination#7975, pointId#7976L, sumBookedWday#7977, sumNomination#7978, toDate#7979, etl_created_at#7980, _rescued_data#7981] (3) Exchange Input [17]: [status#7965L, companyId#7966L, firmBookedWday#7967, firmPointCapacity#7968, fromDate#7969, intrBookedWday#7970, intrPointCapacity#7971, ownBookedFirmWday#7972, ownBookedIntrWday#7973, ownBookedWday#7974, ownNomination#7975, pointId#7976L, sumBookedWday#7977, sumNomination#7978, toDate#7979, etl_created_at#7980, _rescued_data#7981] Arguments: hashpartitioning(companyId#7966L, pointId#7976L, fromDate#7969, toDate#7979, 200), ENSURE_REQUIREMENTS, [plan_id=3972] (4) StreamingDeduplicate Input [17]: [status#7965L, companyId#7966L, firmBookedWday#7967, firmPointCapacity#7968, fromDate#7969, intrBookedWday#7970, intrPointCapacity#7971, ownBookedFirmWday#7972, ownBookedIntrWday#7973, ownBookedWday#7974, ownNomination#7975, pointId#7976L, sumBookedWday#7977, sumNomination#7978, toDate#7979, etl_created_at#7980, _rescued_data#7981] Arguments: [companyId#7966L, pointId#7976L, fromDate#7969, toDate#7979], state info [ checkpoint = , runId = f0ffe6d7-f505-4954-acff-f80cc5f819ca, opId = 0, ver = 0, numPartitions = 200], 0 (5) Project [codegen id : 2] Output [18]: [status#7965L, companyId#7966L, firmBookedWday#7967, firmPointCapacity#7968, fromDate#7969, intrBookedWday#7970, intrPointCapacity#7971, ownBookedFirmWday#7972, ownBookedIntrWday#7973, ownBookedWday#7974, ownNomination#7975, pointId#7976L, sumBookedWday#7977, sumNomination#7978, toDate#7979, etl_created_at#7980, _rescued_data#7981, hash(status#7965L, companyId#7966L, firmBookedWday#7967, firmPointCapacity#7968, fromDate#7969, intrBookedWday#7970, intrPointCapacity#7971, ownBookedFirmWday#7972, ownBookedIntrWday#7973, ownBookedWday#7974, ownNomination#7975, pointId#7976L, sumBookedWday#7977, sumNomination#7978, toDate#7979, etl_created_at#7980, _rescued_data#7981, 42) AS column_hash#7748] Input [17]: [status#7965L, companyId#7966L, firmBookedWday#7967, firmPointCapacity#7968, fromDate#7969, intrBookedWday#7970, intrPointCapacity#7971, ownBookedFirmWday#7972, ownBookedIntrWday#7973, ownBookedWday#7974, ownNomination#7975, pointId#7976L, sumBookedWday#7977, sumNomination#7978, toDate#7979, etl_created_at#7980, _rescued_data#7981]","{""nodeName"":""WholeStageCodegen (2)"",""simpleString"":""WholeStageCodegen"",""children"":[{""nodeName"":""Project"",""simpleString"":""Project [status#7965L, companyId#7966L, firmBookedWday#7967, firmPointCapacity#7968, fromDate#7969, intrBookedWday#7970, intrPointCapacity#7971, ownBookedFirmWday#7972, ownBookedIntrWday#7973, ownBookedWday#7974, ownNomination#7975, pointId#7976L, sumBookedWday#7977, sumNomination#7978, toDate#7979, etl_created_at#7980, _rescued_data#7981, hash(status#7965L, companyId#7966L, firmBookedWday#7967, firmPointCapacity#7968, fromDate#7969, intrBookedWday#7970, intrPointCapacity#7971, ownBookedFirmWday#7972, ownBookedIntrWday#7973, ownBookedWday#7974, ownNomination#7975, pointId#7976L, sumBookedWday#7977, sumNomination#7978, toDate#7979, etl_created_at#7980, _rescued_data#7981, 42) AS column_hash#7748]"",""children"":[{""nodeName"":""InputAdapter"",""simpleString"":""InputAdapter"",""children"":[{""nodeName"":""StreamingDeduplicate"",""simpleString"":""StreamingDeduplicate [companyId#7966L, pointId#7976L, fromDate#7969, toDate#7979], state info [ checkpoint = dbfs:/mnt/lake/process/raw/gassco/work/point_overview/v2r0/_checkpoint/state, runId = 4c9451bb-1c70-4629-998b-f32895e9fb02, opId = 0, ver = 0, numPartitions = 200], 0"",""children"":[{""nodeName"":""Exchange"",""simpleString"":""Exchange hashpartitioning(companyId#7966L, pointId#7976L, fromDate#7969, toDate#7979, 200), ENSURE_REQUIREMENTS, [plan_id=3938]"",""children"":[{""nodeName"":""WholeStageCodegen (1)"",""simpleString"":""WholeStageCodegen"",""children"":[{""nodeName"":""ColumnarToRow"",""simpleString"":""ColumnarToRow"",""children"":[{""nodeName"":""InputAdapter"",""simpleString"":""InputAdapter"",""children"":[{""nodeName"":""Scan parquet "",""simpleString"":""FileScan parquet [status#7965L,companyId#7966L,firmBookedWday#7967,firmPointCapacity#7968,fromDate#7969,intrBookedWday#7970,intrPointCapacity#7971,ownBookedFirmWday#7972,ownBookedIntrWday#7973,ownBookedWday#7974,ownNomination#7975,pointId#7976L,sumBookedWday#7977,sumNomination#7978,toDate#7979,etl_created_at#7980,_rescued_data#7981] Batched: true, DataFilters: [], Format: Parquet, Location: CloudFilesSourceFileIndex(1 paths)[dbfs:/mnt/lake/process/conformed/gassco/output/point_overview/..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct"",""Format"":""Parquet"",""Batched"":""true"",""PartitionFilters"":""[]"",""PushedFilters"":""[]"",""DataFilters"":""[]""},""metrics"":[{""name"":""file sorting by size time"",""accumulatorId"":6189,""metricType"":""timing"",""experimental"":false},{""name"":""total number of parquet row groups"",""accumulatorId"":6176,""metricType"":""sum"",""experimental"":false},{""name"":""cloud storage response size"",""accumulatorId"":6179,""metricType"":""size"",""experimental"":false},{""name"":""number of files read"",""accumulatorId"":6184,""metricType"":""sum"",""experimental"":false},{""name"":""filesystem read data size"",""accumulatorId"":6170,""metricType"":""size"",""experimental"":false},{""name"":""scan time"",""accumulatorId"":6171,""metricType"":""timing"",""experimental"":false},{""name"":""cloud storage retry duration"",""accumulatorId"":6183,""metricType"":""timing"",""experimental"":false},{""name"":""estimated repeated reads high size"",""accumulatorId"":6175,""metricType"":""size"",""experimental"":false},{""name"":""filesystem read data size (sampled)"",""accumulatorId"":6169,""metricType"":""size"",""experimental"":false},{""name"":""filesystem read time (sampled)"",""accumulatorId"":6168,""metricType"":""timing"",""experimental"":false},{""name"":""max partition size chosen"",""accumulatorId"":6191,""metricType"":""size"",""experimental"":false},{""name"":""missing files"",""accumulatorId"":6172,""metricType"":""sum"",""experimental"":false},{""name"":""cloud storage request size"",""accumulatorId"":6178,""metricType"":""size"",""experimental"":false},{""name"":""metadata time"",""accumulatorId"":6185,""metricType"":""timing"",""experimental"":false},{""name"":""number of parquet row groups read"",""accumulatorId"":6177,""metricType"":""sum"",""experimental"":false},{""name"":""size of files read"",""accumulatorId"":6186,""metricType"":""size"",""experimental"":false},{""name"":""cloud storage retry count"",""accumulatorId"":6182,""metricType"":""count"",""experimental"":false},{""name"":""cloud storage request count"",""accumulatorId"":6180,""metricType"":""count"",""experimental"":false},{""name"":""average total splits sizes distribution per node"",""accumulatorId"":6187,""metricType"":""size"",""experimental"":false},{""name"":""number of output rows"",""accumulatorId"":6167,""metricType"":""sum"",""experimental"":false},{""name"":""the proration factor used"",""accumulatorId"":6190,""metricType"":""sum"",""experimental"":false},{""name"":""estimated repeated reads low size"",""accumulatorId"":6174,""metricType"":""size"",""experimental"":false},{""name"":""corrupt files"",""accumulatorId"":6173,""metricType"":""sum"",""experimental"":false},{""name"":""cluster's current parallelism "",""accumulatorId"":6192,""metricType"":""sum"",""experimental"":false},{""name"":""relative skew in total splits sizes distribution"",""accumulatorId"":6188,""metricType"":""size"",""experimental"":false},{""name"":""cloud storage request duration"",""accumulatorId"":6181,""metricType"":""timing"",""experimental"":false}],""explainId"":null}],""metadata"":{},""metrics"":[],""explainId"":null}],""metadata"":{},""metrics"":[{""name"":""number of output rows"",""accumulatorId"":6165,""metricType"":""sum"",""experimental"":false},{""name"":""number of input batches"",""accumulatorId"":6166,""metricType"":""sum"",""experimental"":false}],""explainId"":null}],""metadata"":{},""metrics"":[{""name"":""duration"",""accumulatorId"":6164,""metricType"":""timing"",""experimental"":false}],""explainId"":null}],""metadata"":{},""metrics"":[{""name"":""shuffle records written"",""accumulatorId"":6162,""metricType"":""sum"",""experimental"":false},{""name"":""records read"",""accumulatorId"":6160,""metricType"":""sum"",""experimental"":false},{""name"":""local bytes read"",""accumulatorId"":6158,""metricType"":""size"",""experimental"":false},{""name"":""fetch wait time"",""accumulatorId"":6159,""metricType"":""timing"",""experimental"":false},{""name"":""remote bytes read"",""accumulatorId"":6156,""metricType"":""size"",""experimental"":false},{""name"":""local blocks read"",""accumulatorId"":6155,""metricType"":""sum"",""experimental"":false},{""name"":""remote blocks read"",""accumulatorId"":6154,""metricType"":""sum"",""experimental"":false},{""name"":""data size"",""accumulatorId"":6152,""metricType"":""size"",""experimental"":false},{""name"":""number of partitions"",""accumulatorId"":6153,""metricType"":""sum"",""experimental"":false},{""name"":""remote bytes read to disk"",""accumulatorId"":6157,""metricType"":""size"",""experimental"":false},{""name"":""shuffle bytes written"",""accumulatorId"":6161,""metricType"":""size"",""experimental"":false}],""explainId"":null}],""metadata"":{},""metrics"":[{""name"":""number of shuffle partitions"",""accumulatorId"":6147,""metricType"":""sum"",""experimental"":false},{""name"":""number of removed state rows"",""accumulatorId"":6143,""metricType"":""sum"",""experimental"":false},{""name"":""number of duplicates dropped"",""accumulatorId"":6137,""metricType"":""sum"",""experimental"":false},{""name"":""number of total state rows"",""accumulatorId"":6140,""metricType"":""sum"",""experimental"":false},{""name"":""number of state store instances"",""accumulatorId"":6148,""metricType"":""sum"",""experimental"":false},{""name"":""memory used by state"",""accumulatorId"":6146,""metricType"":""size"",""experimental"":false},{""name"":""count of cache hit on states cache in provider"",""accumulatorId"":6150,""metricType"":""sum"",""experimental"":false},{""name"":""number of output rows"",""accumulatorId"":6138,""metricType"":""sum"",""experimental"":false},{""name"":""estimated size of state only on current version"",""accumulatorId"":6149,""metricType"":""size"",""experimental"":false},{""name"":""number of rows which are dropped by watermark"",""accumulatorId"":6139,""metricType"":""sum"",""experimental"":false},{""name"":""count of cache miss on states cache in provider"",""accumulatorId"":6151,""metricType"":""sum"",""experimental"":false},{""name"":""time to commit changes"",""accumulatorId"":6145,""metricType"":""timing"",""experimental"":false},{""name"":""time to remove"",""accumulatorId"":6144,""metricType"":""timing"",""experimental"":false},{""name"":""number of updated state rows"",""accumulatorId"":6141,""metricType"":""sum"",""experimental"":false},{""name"":""time to update"",""accumulatorId"":6142,""metricType"":""timing"",""experimental"":false}],""explainId"":null}],""metadata"":{},""metrics"":[],""explainId"":null}],""metadata"":{},""metrics"":[],""explainId"":null}],""metadata"":{},""metrics"":[{""name"":""duration"",""accumulatorId"":6136,""metricType"":""timing"",""experimental"":false}],""explainId"":null}",1679498560374.0,"{""spark.r.sql.derby.temp.dir"":""/tmp/RtmpdUb1ij"",""spark.databricks.sql.optimizer.forceAggregateShuffleAllExpressions.enabled"":""false"",""spark.sql.adaptive.enabled"":""false"",""spark.sql.requireAllClusterKeysForDistribution"":""false"",""spark.sql.cbo.enabled"":""false""}",1249497844.0,org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd,1.3831541825546675e+18,1220-124459-5lnjmtzx_10_254_10_35,167.0,1679498607498.0,,47124.0,-2137596936.0
org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart,495,-- this is a system generated query from notebo...,org.apache.spark.sql.Dataset.collectResult(Dataset.scala:3416) com.databricks.backend.daemon.driver.OutputAggregator$.withOutputAggregation0(OutputAggregator.scala:267) com.databricks.backend.daemon.driver.OutputAggregator$.withOutputAggregation(OutputAggregator.scala:101) com.databricks.backend.daemon.driver.SQLDriverLocal.executeSql(SQLDriverLocal.scala:115) com.databricks.backend.daemon.driver.SQLDriverLocal.repl(SQLDriverLocal.scala:145) com.databricks.backend.daemon.driver.DriverLocal.$anonfun$execute$23(DriverLocal.scala:729) com.databricks.unity.EmptyHandle$.runWith(UCSHandle.scala:124) com.databricks.backend.daemon.driver.DriverLocal.$anonfun$execute$20(DriverLocal.scala:712) com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:398) scala.util.DynamicVariable.withValue(DynamicVariable.scala:62) com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:147) com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:396) com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:393) com.databricks.backend.daemon.driver.DriverLocal.withAttributionContext(DriverLocal.scala:62) com.databricks.logging.UsageLogging.withAttributionTags(UsageLogging.scala:441) com.databricks.logging.UsageLogging.withAttributionTags$(UsageLogging.scala:426) com.databricks.backend.daemon.driver.DriverLocal.withAttributionTags(DriverLocal.scala:62) com.databricks.backend.daemon.driver.DriverLocal.execute(DriverLocal.scala:689) com.databricks.backend.daemon.driver.DriverWrapper.$anonfun$tryExecutingCommand$1(DriverWrapper.scala:622) scala.util.Try$.apply(Try.scala:213),"== Physical Plan == CollectLimit (3) +- CommandResult (1)  +- ShowNamespaces (2) (1) CommandResult Output [1]: [databaseName#17592] Arguments: [databaseName#17592] (2) ShowNamespaces Output [1]: [databaseName#17592] Arguments: [databaseName#17592], com.databricks.sql.managedcatalog.UnityCatalogV2Proxy@a5ae1f9 (3) CollectLimit Input [1]: [databaseName#17592] Arguments: 1000001","{""nodeName"":""CollectLimit"",""simpleString"":""CollectLimit 1000001"",""children"":[{""nodeName"":""CommandResult"",""simpleString"":""CommandResult [databaseName#17592]"",""children"":[],""metadata"":{},""metrics"":[{""name"":""number of output rows"",""accumulatorId"":23411,""metricType"":""sum"",""experimental"":false}],""explainId"":null}],""metadata"":{},""metrics"":[{""name"":""shuffle records written"",""accumulatorId"":23409,""metricType"":""sum"",""experimental"":false},{""name"":""records read"",""accumulatorId"":23407,""metricType"":""sum"",""experimental"":false},{""name"":""local bytes read"",""accumulatorId"":23405,""metricType"":""size"",""experimental"":false},{""name"":""fetch wait time"",""accumulatorId"":23406,""metricType"":""timing"",""experimental"":false},{""name"":""remote bytes read"",""accumulatorId"":23403,""metricType"":""size"",""experimental"":false},{""name"":""local blocks read"",""accumulatorId"":23402,""metricType"":""sum"",""experimental"":false},{""name"":""remote blocks read"",""accumulatorId"":23401,""metricType"":""sum"",""experimental"":false},{""name"":""remote bytes read to disk"",""accumulatorId"":23404,""metricType"":""size"",""experimental"":false},{""name"":""shuffle bytes written"",""accumulatorId"":23408,""metricType"":""size"",""experimental"":false}],""explainId"":null}",1680006732089.0,"{""spark.r.sql.derby.temp.dir"":""/tmp/RtmpKZltiy""}",375232879.0,org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd,8.585342180483018e+18,1220-124459-5lnjmtzx_10_254_10_35,495.0,1680006732090.0,,1.0,-2134689599.0
org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart,0,sql at DriverLocal.scala:279,org.apache.spark.sql.SQLContext.sql(SQLContext.scala:695) com.databricks.backend.daemon.driver.DriverLocal.$anonfun$new$6(DriverLocal.scala:279) org.apache.spark.SafeAddJarOrFile$.safe(SafeAddJarOrFile.scala:31) com.databricks.backend.daemon.driver.DriverLocal.$anonfun$new$5(DriverLocal.scala:279) com.databricks.sql.acl.CheckPermissions$.trusted(CheckPermissions.scala:1798) com.databricks.backend.daemon.driver.DriverLocal.$anonfun$new$4(DriverLocal.scala:278) com.databricks.unity.EmptyHandle$.runWith(UCSHandle.scala:124) com.databricks.backend.daemon.driver.DriverLocal.$anonfun$new$3(DriverLocal.scala:271) scala.util.Using$.resource(Using.scala:269) com.databricks.backend.daemon.driver.DriverLocal.$anonfun$new$2(DriverLocal.scala:270) scala.collection.Iterator.foreach(Iterator.scala:943) scala.collection.Iterator.foreach$(Iterator.scala:943) scala.collection.AbstractIterator.foreach(Iterator.scala:1431) scala.collection.IterableLike.foreach(IterableLike.scala:74) scala.collection.IterableLike.foreach$(IterableLike.scala:73) scala.collection.AbstractIterable.foreach(Iterable.scala:56) com.databricks.backend.daemon.driver.DriverLocal.(DriverLocal.scala:257) com.databricks.backend.daemon.driver.SQLDriverLocal.(SQLDriverLocal.scala:24) com.databricks.backend.daemon.driver.SQLDriverWrapper.instantiateDriver(DriverWrapper.scala:839) com.databricks.backend.daemon.driver.DriverWrapper.setupRepl(DriverWrapper.scala:342),== Physical Plan == Execute AddJarsCommand (1)  +- AddJarsCommand (2) (1) Execute AddJarsCommand Output: [] (2) AddJarsCommand Arguments: [/local_disk0/tmp/addedFile1376998798014864251com_databricks_spark_xml_2_12_0_12_0-940be.jar],"{""nodeName"":""Execute AddJarsCommand"",""simpleString"":""Execute AddJarsCommand"",""children"":[],""metadata"":{},""metrics"":[],""explainId"":null}",1680253141190.0,"{""spark.r.sql.derby.temp.dir"":""/tmp/RtmprwWgXa""}",-412877831.0,org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd,7.954194231928627e+18,1220-124459-5lnjmtzx_10_254_10_11,0.0,1680253144132.0,,2942.0,-2126828031.0
org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart,365,sql at DriverLocal.scala:279,org.apache.spark.sql.SQLContext.sql(SQLContext.scala:695) com.databricks.backend.daemon.driver.DriverLocal.$anonfun$new$6(DriverLocal.scala:279) org.apache.spark.SafeAddJarOrFile$.safe(SafeAddJarOrFile.scala:31) com.databricks.backend.daemon.driver.DriverLocal.$anonfun$new$5(DriverLocal.scala:279) com.databricks.sql.acl.CheckPermissions$.trusted(CheckPermissions.scala:1798) com.databricks.backend.daemon.driver.DriverLocal.$anonfun$new$4(DriverLocal.scala:278) com.databricks.unity.EmptyHandle$.runWith(UCSHandle.scala:124) com.databricks.backend.daemon.driver.DriverLocal.$anonfun$new$3(DriverLocal.scala:271) scala.util.Using$.resource(Using.scala:269) com.databricks.backend.daemon.driver.DriverLocal.$anonfun$new$2(DriverLocal.scala:270) scala.collection.Iterator.foreach(Iterator.scala:943) scala.collection.Iterator.foreach$(Iterator.scala:943) scala.collection.AbstractIterator.foreach(Iterator.scala:1431) scala.collection.IterableLike.foreach(IterableLike.scala:74) scala.collection.IterableLike.foreach$(IterableLike.scala:73) scala.collection.AbstractIterable.foreach(Iterable.scala:56) com.databricks.backend.daemon.driver.DriverLocal.(DriverLocal.scala:257) com.databricks.backend.daemon.driver.PythonDriverLocalBase.(PythonDriverLocalBase.scala:168) com.databricks.backend.daemon.driver.JupyterDriverLocal.(JupyterDriverLocal.scala:381) com.databricks.backend.daemon.driver.PythonDriverWrapper.instantiateDriver(DriverWrapper.scala:723),== Physical Plan == Execute AddJarsCommand (1)  +- AddJarsCommand (2) (1) Execute AddJarsCommand Output: [] (2) AddJarsCommand Arguments: [/local_disk0/tmp/addedFile5834123427018254087com_databricks_spark_xml_2_12_0_12_0-940be.jar],"{""nodeName"":""Execute AddJarsCommand"",""simpleString"":""Execute AddJarsCommand"",""children"":[],""metadata"":{},""metrics"":[],""explainId"":null}",1679499681516.0,"{""spark.r.sql.derby.temp.dir"":""/tmp/RtmpdUb1ij""}",-678364129.0,org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd,1.3831541825546675e+18,1220-124459-5lnjmtzx_10_254_10_35,365.0,1679499681517.0,,1.0,-2121057437.0
org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart,292,"df = raw_df.withColumn('fileData', F.regexp_rep...",org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:4050) sun.reflect.GeneratedMethodAccessor574.invoke(Unknown Source) sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) java.lang.reflect.Method.invoke(Method.java:498) py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380) py4j.Gateway.invoke(Gateway.java:306) py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) py4j.commands.CallCommand.execute(CallCommand.java:79) py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:195) py4j.ClientServerConnection.run(ClientServerConnection.java:115) java.lang.Thread.run(Thread.java:750),"== Physical Plan == CollectLimit (6) +- * Project (5)  +- Project (4)  +- * Filter (3)  +- * ColumnarToRow (2)  +- Scan parquet spark_catalog.default.eventlog_raw (1) (1) Scan parquet spark_catalog.default.eventlog_raw Output [3]: [filePath#853, lastModified#857, fileData#858] Batched: true Location: PreparedDeltaFileIndex [dbfs:/user/hive/warehouse/eventlog_raw] PushedFilters: [IsNotNull(lastModified), GreaterThanOrEqual(lastModified,2023-03-22 15:30:00.0), LessThanOrEqual(lastModified,2023-03-22 17:30:00.0)] ReadSchema: struct (2) ColumnarToRow [codegen id : 1] Input [3]: [filePath#853, lastModified#857, fileData#858] (3) Filter [codegen id : 1] Input [3]: [filePath#853, lastModified#857, fileData#858] Condition : ((isnotnull(lastModified#857) AND (lastModified#857 >= 2023-03-22 15:30:00)) AND (lastModified#857 <= 2023-03-22 17:30:00)) (4) Project Output [2]: [filePath#853, from_json(StructField(Event,StringType,true), StructField(SparkContext Id,StringType,true), StructField(Stage Info,StringType,true), StructField(Task Info,StringType,true), StructField(Stage ID,StringType,true), StructField(Task End Reason,StringType,true), StructField(Stage IDs,StringType,true), StructField(Stage Attempt ID,StringType,true), StructField(Completion Time,StringType,true), StructField(time,StringType,true), StructField(errorMessage,StringType,true), StructField(Task Executor Metrics,StringType,true), StructField(Timestamp,StringType,true), StructField(executionId,StringType,true), StructField(Job Result,StringType,true), StructField(Stage Infos,StringType,true), StructField(details,StringType,true), StructField(Task Metrics,StringType,true), StructField(physicalPlanDescription,StringType,true), StructField(modifiedConfigs,StringType,true), StructField(Submission Time,StringType,true), StructField(rootExecutionId,StringType,true), StructField(Spark Version,StringType,true), StructField(Rollover Number,StringType,true), ... 8 more fields) AS fileData#4986] Input [3]: [filePath#853, lastModified#857, fileData#858] (5) Project [codegen id : 2] Output [31]: [filePath#853, fileData#4986.Event AS Event#4993, fileData#4986.SparkContext Id AS SparkContext Id#4994, fileData#4986.Stage Info AS Stage Info#4995, fileData#4986.Task Info AS Task Info#4996, fileData#4986.Stage ID AS Stage ID#4997, fileData#4986.Task End Reason AS Task End Reason#4998, fileData#4986.Stage IDs AS Stage IDs#4999, fileData#4986.Stage Attempt ID AS Stage Attempt ID#5000, fileData#4986.Completion Time AS Completion Time#5001, fileData#4986.time AS time#5002, fileData#4986.errorMessage AS errorMessage#5003, fileData#4986.Task Executor Metrics AS Task Executor Metrics#5004, fileData#4986.Timestamp AS Timestamp#5005, fileData#4986.executionId AS executionId#5006, fileData#4986.Job Result AS Job Result#5007, fileData#4986.Stage Infos AS Stage Infos#5008, fileData#4986.details AS details#5009, fileData#4986.Task Metrics AS Task Metrics#5010, fileData#4986.physicalPlanDescription AS physicalPlanDescription#5011, fileData#4986.modifiedConfigs AS modifiedConfigs#5012, fileData#4986.Submission Time AS Submission Time#5013, fileData#4986.rootExecutionId AS rootExecutionId#5014, fileData#4986.Spark Version AS Spark Version#5015, fileData#4986.Rollover Number AS Rollover Number#5016, fileData#4986.sparkPlanInfo AS sparkPlanInfo#5017, fileData#4986.Job ID AS Job ID#5018, fileData#4986.Task Type AS Task Type#5019, fileData#4986.description AS description#5020, fileData#4986.Properties AS Properties#5021, fileData#4986.accumUpdates AS accumUpdates#5022] Input [2]: [filePath#853, fileData#4986] (6) CollectLimit Input [31]: [filePath#853, Event#4993, SparkContext Id#4994, Stage Info#4995, Task Info#4996, Stage ID#4997, Task End Reason#4998, Stage IDs#4999, Stage Attempt ID#5000, Completion Time#5001, time#5002, errorMessage#5003, Task Executor Metrics#5004, Timestamp#5005, executionId#5006, Job Result#5007, Stage Infos#5008, details#5009, Task Metrics#5010, physicalPlanDescription#5011, modifiedConfigs#5012, Submission Time#5013, rootExecutionId#5014, Spark Version#5015, Rollover Number#5016, sparkPlanInfo#5017, Job ID#5018, Task Type#5019, description#5020, Properties#5021, accumUpdates#5022] Arguments: 1","{""nodeName"":""CollectLimit"",""simpleString"":""CollectLimit 1"",""children"":[{""nodeName"":""WholeStageCodegen (2)"",""simpleString"":""WholeStageCodegen"",""children"":[{""nodeName"":""Project"",""simpleString"":""Project [filePath#853, fileData#4986.Event AS Event#4993, fileData#4986.SparkContext Id AS SparkContext Id#4994, fileData#4986.Stage Info AS Stage Info#4995, fileData#4986.Task Info AS Task Info#4996, fileData#4986.Stage ID AS Stage ID#4997, fileData#4986.Task End Reason AS Task End Reason#4998, fileData#4986.Stage IDs AS Stage IDs#4999, fileData#4986.Stage Attempt ID AS Stage Attempt ID#5000, fileData#4986.Completion Time AS Completion Time#5001, fileData#4986.time AS time#5002, fileData#4986.errorMessage AS errorMessage#5003, fileData#4986.Task Executor Metrics AS Task Executor Metrics#5004, fileData#4986.Timestamp AS Timestamp#5005, fileData#4986.executionId AS executionId#5006, fileData#4986.Job Result AS Job Result#5007, fileData#4986.Stage Infos AS Stage Infos#5008, fileData#4986.details AS details#5009, fileData#4986.Task Metrics AS Task Metrics#5010, fileData#4986.physicalPlanDescription AS physicalPlanDescription#5011, fileData#4986.modifiedConfigs AS modifiedConfigs#5012, fileData#4986.Submission Time AS Submission Time#5013, fileData#4986.rootExecutionId AS rootExecutionId#5014, fileData#4986.Spark Version AS Spark Version#5015, ... 7 more fields]"",""children"":[{""nodeName"":""InputAdapter"",""simpleString"":""InputAdapter"",""children"":[{""nodeName"":""Project"",""simpleString"":""Project [filePath#853, from_json(StructField(Event,StringType,true), StructField(SparkContext Id,StringType,true), StructField(Stage Info,StringType,true), StructField(Task Info,StringType,true), StructField(Stage ID,StringType,true), StructField(Task End Reason,StringType,true), StructField(Stage IDs,StringType,true), StructField(Stage Attempt ID,StringType,true), StructField(Completion Time,StringType,true), StructField(time,StringType,true), StructField(errorMessage,StringType,true), StructField(Task Executor Metrics,StringType,true), StructField(Timestamp,StringType,true), StructField(executionId,StringType,true), StructField(Job Result,StringType,true), StructField(Stage Infos,StringType,true), StructField(details,StringType,true), StructField(Task Metrics,StringType,true), StructField(physicalPlanDescription,StringType,true), StructField(modifiedConfigs,StringType,true), StructField(Submission Time,StringType,true), StructField(rootExecutionId,StringType,true), StructField(Spark Version,StringType,true), StructField(Rollover Number,StringType,true), ... 8 more fields) AS fileData#4986]"",""children"":[{""nodeName"":""WholeStageCodegen (1)"",""simpleString"":""WholeStageCodegen"",""children"":[{""nodeName"":""Filter"",""simpleString"":""Filter ((isnotnull(lastModified#857) AND (lastModified#857 >= 2023-03-22 15:30:00)) AND (lastModified#857 <= 2023-03-22 17:30:00))"",""children"":[{""nodeName"":""ColumnarToRow"",""simpleString"":""ColumnarToRow"",""children"":[{""nodeName"":""InputAdapter"",""simpleString"":""InputAdapter"",""children"":[{""nodeName"":""Scan parquet spark_catalog.default.eventlog_raw"",""simpleString"":""FileScan parquet spark_catalog.default.eventlog_raw[filePath#853,lastModified#857,fileData#858] Batched: true, DataFilters: [isnotnull(lastModified#857), (lastModified#857 >= 2023-03-22 15:30:00), (lastModified#857 <= 202..., Format: Parquet, Location: PreparedDeltaFileIndex(1 paths)[dbfs:/user/hive/warehouse/eventlog_raw], PartitionFilters: [], PushedFilters: [IsNotNull(lastModified), GreaterThanOrEqual(lastModified,2023-03-22 15:30:00.0), LessThanOrEqual..., ReadSchema: struct"",""children"":[],""metadata"":{""Location"":""PreparedDeltaFileIndex(1 paths)[dbfs:/user/hive/warehouse/eventlog_raw]"",""ReadSchema"":""struct"",""Format"":""Parquet"",""Batched"":""true"",""PartitionFilters"":""[]"",""PushedFilters"":""[IsNotNull(lastModified), GreaterThanOrEqual(lastModified,2023-03-22 15:30:00.0), LessThanOrEqual(lastModified,2023-03-22 17:30:00.0)]"",""DataFilters"":""[isnotnull(lastModified#857), (lastModified#857 >= 2023-03-22 15:30:00), (lastModified#857 <= 2023-03-22 17:30:00)]""},""metrics"":[{""name"":""file sorting by size time"",""accumulatorId"":10569,""metricType"":""timing"",""experimental"":false},{""name"":""total number of parquet row groups"",""accumulatorId"":10556,""metricType"":""sum"",""experimental"":false},{""name"":""cloud storage response size"",""accumulatorId"":10559,""metricType"":""size"",""experimental"":false},{""name"":""number of files read"",""accumulatorId"":10564,""metricType"":""sum"",""experimental"":false},{""name"":""filesystem read data size"",""accumulatorId"":10550,""metricType"":""size"",""experimental"":false},{""name"":""scan time"",""accumulatorId"":10551,""metricType"":""timing"",""experimental"":false},{""name"":""cloud storage retry duration"",""accumulatorId"":10563,""metricType"":""timing"",""experimental"":false},{""name"":""estimated repeated reads high size"",""accumulatorId"":10555,""metricType"":""size"",""experimental"":false},{""name"":""filesystem read data size (sampled)"",""accumulatorId"":10549,""metricType"":""size"",""experimental"":false},{""name"":""number of files pruned"",""accumulatorId"":10570,""metricType"":""sum"",""experimental"":false},{""name"":""filesystem read time (sampled)"",""accumulatorId"":10548,""metricType"":""timing"",""experimental"":false},{""name"":""max partition size chosen"",""accumulatorId"":10573,""metricType"":""size"",""experimental"":false},{""name"":""missing files"",""accumulatorId"":10552,""metricType"":""sum"",""experimental"":false},{""name"":""cloud storage request size"",""accumulatorId"":10558,""metricType"":""size"",""experimental"":false},{""name"":""metadata time"",""accumulatorId"":10565,""metricType"":""timing"",""experimental"":false},{""name"":""number of parquet row groups read"",""accumulatorId"":10557,""metricType"":""sum"",""experimental"":false},{""name"":""size of files read"",""accumulatorId"":10566,""metricType"":""size"",""experimental"":false},{""name"":""cloud storage retry count"",""accumulatorId"":10562,""metricType"":""count"",""experimental"":false},{""name"":""number of bytes pruned"",""accumulatorId"":10571,""metricType"":""size"",""experimental"":false},{""name"":""cloud storage request count"",""accumulatorId"":10560,""metricType"":""count"",""experimental"":false},{""name"":""average total splits sizes distribution per node"",""accumulatorId"":10567,""metricType"":""size"",""experimental"":false},{""name"":""number of output rows"",""accumulatorId"":10547,""metricType"":""sum"",""experimental"":false},{""name"":""the proration factor used"",""accumulatorId"":10572,""metricType"":""sum"",""experimental"":false},{""name"":""estimated repeated reads low size"",""accumulatorId"":10554,""metricType"":""size"",""experimental"":false},{""name"":""corrupt files"",""accumulatorId"":10553,""metricType"":""sum"",""experimental"":false},{""name"":""cluster's current parallelism "",""accumulatorId"":10574,""metricType"":""sum"",""experimental"":false},{""name"":""relative skew in total splits sizes distribution"",""accumulatorId"":10568,""metricType"":""size"",""experimental"":false},{""name"":""cloud storage request duration"",""accumulatorId"":10561,""metricType"":""timing"",""experimental"":false}],""explainId"":null}],""metadata"":{},""metrics"":[],""explainId"":null}],""metadata"":{},""metrics"":[{""name"":""number of output rows"",""accumulatorId"":10545,""metricType"":""sum"",""experimental"":false},{""name"":""number of input batches"",""accumulatorId"":10546,""metricType"":""sum"",""experimental"":false}],""explainId"":null}],""metadata"":{},""metrics"":[{""name"":""number of output rows"",""accumulatorId"":10544,""metricType"":""sum"",""experimental"":false}],""explainId"":null}],""metadata"":{},""metrics"":[{""name"":""duration"",""accumulatorId"":10543,""metricType"":""timing"",""experimental"":false}],""explainId"":null}],""metadata"":{},""metrics"":[],""explainId"":null}],""metadata"":{},""metrics"":[],""explainId"":null}],""metadata"":{},""metrics"":[],""explainId"":null}],""metadata"":{},""metrics"":[{""name"":""duration"",""accumulatorId"":10542,""metricType"":""timing"",""experimental"":false}],""explainId"":null}],""metadata"":{},""metrics"":[{""name"":""shuffle records written"",""accumulatorId"":10540,""metricType"":""sum"",""experimental"":false},{""name"":""records read"",""accumulatorId"":10538,""metricType"":""sum"",""experimental"":false},{""name"":""local bytes read"",""accumulatorId"":10536,""metricType"":""size"",""experimental"":false},{""name"":""fetch wait time"",""accumulatorId"":10537,""metricType"":""timing"",""experimental"":false},{""name"":""remote bytes read"",""accumulatorId"":10534,""metricType"":""size"",""experimental"":false},{""name"":""local blocks read"",""accumulatorId"":10533,""metricType"":""sum"",""experimental"":false},{""name"":""remote blocks read"",""accumulatorId"":10532,""metricType"":""sum"",""experimental"":false},{""name"":""remote bytes read to disk"",""accumulatorId"":10535,""metricType"":""size"",""experimental"":false},{""name"":""shuffle bytes written"",""accumulatorId"":10539,""metricType"":""size"",""experimental"":false}],""explainId"":null}",1679663276137.0,"{""spark.r.sql.derby.temp.dir"":""/tmp/Rtmp1MTIIm""}",941263863.0,org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd,7.28805286556769e+18,1220-124459-5lnjmtzx_10_254_10_41,292.0,1679663277356.0,,1219.0,-2119378548.0
org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart,369,df_phys.show(),org.apache.spark.sql.Dataset.collectResult(Dataset.scala:3416) com.databricks.sql.transaction.tahoe.stats.skipping.DataSkippingReaderV2SnapshotReconstruction.$anonfun$deduplicateAndFilterRemovedLocally$2(DataSkippingReaderV2SnapshotReconstruction.scala:105) com.databricks.sql.util.ThreadLocalTagger.withTag(QueryTagger.scala:62) com.databricks.sql.util.ThreadLocalTagger.withTag$(QueryTagger.scala:59) com.databricks.sql.util.QueryTagger$.withTag(QueryTagger.scala:123) com.databricks.sql.transaction.tahoe.metering.DeltaLogging.$anonfun$recordFrameProfileWithDmqTag$1(DeltaLogging.scala:250) com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:80) com.databricks.sql.transaction.tahoe.metering.DeltaLogging.recordFrameProfileWithDmqTag(DeltaLogging.scala:250) com.databricks.sql.transaction.tahoe.metering.DeltaLogging.recordFrameProfileWithDmqTag$(DeltaLogging.scala:249) com.databricks.sql.transaction.tahoe.stats.skipping.DataSkippingReaderV2SnapshotReconstruction.recordFrameProfileWithDmqTag(DataSkippingReaderV2SnapshotReconstruction.scala:41) com.databricks.sql.transaction.tahoe.stats.skipping.DataSkippingReaderV2SnapshotReconstruction.deduplicateAndFilterRemovedLocally(DataSkippingReaderV2SnapshotReconstruction.scala:90) com.databricks.sql.transaction.tahoe.stats.DataSkippingReaderV2.$anonfun$collectScan$1(DataSkippingReaderV2.scala:346) com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:80) com.databricks.sql.transaction.tahoe.stats.DataSkippingReaderV2.collectScan(DataSkippingReaderV2.scala:342) com.databricks.sql.transaction.tahoe.stats.DataSkippingReaderV2.$anonfun$getDataSkippedFiles$1(DataSkippingReaderV2.scala:295) com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:80) com.databricks.sql.transaction.tahoe.stats.DataSkippingReaderV2.getDataSkippedFiles(DataSkippingReaderV2.scala:284) com.databricks.sql.transaction.tahoe.stats.DataSkippingReaderV2.getDataSkippedFiles$(DataSkippingReaderV2.scala:280) com.databricks.sql.transaction.tahoe.SnapshotEdge.getDataSkippedFiles(SnapshotEdge.scala:63) com.databricks.sql.transaction.tahoe.stats.DataSkippingReaderBase.$anonfun$filesForScan$12(DataSkippingReader.scala:1055),"== Physical Plan == LocalTableScan (1) (1) LocalTableScan Output [7]: [stats_parsed.numRecords#8486L, version#7638L, path#7631, size#7633L, modificationTime#7634L, tags#7635, stats#466] Arguments: [stats_parsed.numRecords#8486L, version#7638L, path#7631, size#7633L, modificationTime#7634L, tags#7635, stats#466]","{""nodeName"":""LocalTableScan"",""simpleString"":""LocalTableScan [stats_parsed.numRecords#8486L, version#7638L, path#7631, size#7633L, modificationTime#7634L, tags#7635, stats#466]"",""children"":[],""metadata"":{},""metrics"":[{""name"":""number of output rows"",""accumulatorId"":11549,""metricType"":""sum"",""experimental"":false}],""explainId"":null}",1679925972032.0,"{""spark.r.sql.derby.temp.dir"":""/tmp/RtmpIEEtn5""}",-440138456.0,org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd,3.5031241944576794e+18,1220-124459-5lnjmtzx_10_254_10_7,371.0,1679925972063.0,,31.0,-2117930586.0
org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart,233,"from pyspark.sql.types import ArrayType, Struct...",org.apache.spark.sql.Dataset.collectResult(Dataset.scala:3416) com.databricks.sql.transaction.tahoe.stats.skipping.DataSkippingReaderV2SnapshotReconstruction.$anonfun$deduplicateAndFilterRemovedLocally$2(DataSkippingReaderV2SnapshotReconstruction.scala:105) com.databricks.sql.util.ThreadLocalTagger.withTag(QueryTagger.scala:62) com.databricks.sql.util.ThreadLocalTagger.withTag$(QueryTagger.scala:59) com.databricks.sql.util.QueryTagger$.withTag(QueryTagger.scala:123) com.databricks.sql.transaction.tahoe.metering.DeltaLogging.$anonfun$recordFrameProfileWithDmqTag$1(DeltaLogging.scala:250) com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:80) com.databricks.sql.transaction.tahoe.metering.DeltaLogging.recordFrameProfileWithDmqTag(DeltaLogging.scala:250) com.databricks.sql.transaction.tahoe.metering.DeltaLogging.recordFrameProfileWithDmqTag$(DeltaLogging.scala:249) com.databricks.sql.transaction.tahoe.stats.skipping.DataSkippingReaderV2SnapshotReconstruction.recordFrameProfileWithDmqTag(DataSkippingReaderV2SnapshotReconstruction.scala:41) com.databricks.sql.transaction.tahoe.stats.skipping.DataSkippingReaderV2SnapshotReconstruction.deduplicateAndFilterRemovedLocally(DataSkippingReaderV2SnapshotReconstruction.scala:90) com.databricks.sql.transaction.tahoe.stats.DataSkippingReaderV2.$anonfun$collectScan$1(DataSkippingReaderV2.scala:346) com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:80) com.databricks.sql.transaction.tahoe.stats.DataSkippingReaderV2.collectScan(DataSkippingReaderV2.scala:342) com.databricks.sql.transaction.tahoe.stats.DataSkippingReaderV2.$anonfun$getDataSkippedFiles$1(DataSkippingReaderV2.scala:295) com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:80) com.databricks.sql.transaction.tahoe.stats.DataSkippingReaderV2.getDataSkippedFiles(DataSkippingReaderV2.scala:284) com.databricks.sql.transaction.tahoe.stats.DataSkippingReaderV2.getDataSkippedFiles$(DataSkippingReaderV2.scala:280) com.databricks.sql.transaction.tahoe.SnapshotEdge.getDataSkippedFiles(SnapshotEdge.scala:63) com.databricks.sql.transaction.tahoe.stats.DataSkippingReaderBase.$anonfun$filesForScan$12(DataSkippingReader.scala:1055),"== Physical Plan == LocalTableScan (1) (1) LocalTableScan Output [7]: [stats_parsed.numRecords#3814L, version#171L, path#164, size#166L, modificationTime#167L, tags#168, stats#181] Arguments: [stats_parsed.numRecords#3814L, version#171L, path#164, size#166L, modificationTime#167L, tags#168, stats#181]","{""nodeName"":""LocalTableScan"",""simpleString"":""LocalTableScan [stats_parsed.numRecords#3814L, version#171L, path#164, size#166L, modificationTime#167L, tags#168, stats#181]"",""children"":[],""metadata"":{},""metrics"":[{""name"":""number of output rows"",""accumulatorId"":7678,""metricType"":""sum"",""experimental"":false}],""explainId"":null}",1679661044275.0,"{""spark.r.sql.derby.temp.dir"":""/tmp/Rtmp1MTIIm""}",-1295519634.0,org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd,7.28805286556769e+18,1220-124459-5lnjmtzx_10_254_10_41,234.0,1679661044278.0,,3.0,-2110406178.0
org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart,135,use catalog `hive_metastore`; describe detail `...,org.apache.spark.sql.SQLContext.sql(SQLContext.scala:695) com.databricks.backend.daemon.driver.SQLDriverLocal.$anonfun$executeSql$1(SQLDriverLocal.scala:91) scala.collection.immutable.List.map(List.scala:297) com.databricks.backend.daemon.driver.SQLDriverLocal.executeSql(SQLDriverLocal.scala:37) com.databricks.backend.daemon.driver.SQLDriverLocal.repl(SQLDriverLocal.scala:145) com.databricks.backend.daemon.driver.DriverLocal.$anonfun$execute$23(DriverLocal.scala:729) com.databricks.unity.EmptyHandle$.runWith(UCSHandle.scala:124) com.databricks.backend.daemon.driver.DriverLocal.$anonfun$execute$20(DriverLocal.scala:712) com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:398) scala.util.DynamicVariable.withValue(DynamicVariable.scala:62) com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:147) com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:396) com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:393) com.databricks.backend.daemon.driver.DriverLocal.withAttributionContext(DriverLocal.scala:62) com.databricks.logging.UsageLogging.withAttributionTags(UsageLogging.scala:441) com.databricks.logging.UsageLogging.withAttributionTags$(UsageLogging.scala:426) com.databricks.backend.daemon.driver.DriverLocal.withAttributionTags(DriverLocal.scala:62) com.databricks.backend.daemon.driver.DriverLocal.execute(DriverLocal.scala:689) com.databricks.backend.daemon.driver.DriverWrapper.$anonfun$tryExecutingCommand$1(DriverWrapper.scala:622) scala.util.Try$.apply(Try.scala:213),"== Physical Plan == Execute DescribeDeltaDetailCommand (1)  +- DescribeDeltaDetailCommand (2) (1) Execute DescribeDeltaDetailCommand Output [13]: [format#2731, id#2732, name#2733, description#2734, location#2735, createdAt#2736, lastModified#2737, partitionColumns#2738, numFiles#2739L, sizeInBytes#2740L, properties#2741, minReaderVersion#2742, minWriterVersion#2743] (2) DescribeDeltaDetailCommand Arguments: `default`.`eventlog_raw`","{""nodeName"":""Execute DescribeDeltaDetailCommand"",""simpleString"":""Execute DescribeDeltaDetailCommand"",""children"":[],""metadata"":{},""metrics"":[],""explainId"":null}",1680253498060.0,"{""spark.r.sql.derby.temp.dir"":""/tmp/RtmprwWgXa""}",-435030037.0,org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd,7.954194231928627e+18,1220-124459-5lnjmtzx_10_254_10_11,135.0,1680253498204.0,,144.0,-2108954658.0


In [0]:
%sql
SELECT
  *
FROM
  physical_plan_keys

sparkContextID,clusterInstanceID,executionID,physicalPlanKey
2861951072945299053,1220-124459-5lnjmtzx_10_254_10_9,353,-2146934606
2861951072945299053,1220-124459-5lnjmtzx_10_254_10_9,606,-2146784117
2751788431714625779,1220-124459-5lnjmtzx_10_254_10_12,1287,-2146209763
7643291139468147288,1220-124459-5lnjmtzx_10_254_10_8,158,-2145286336
3952163646167870121,1220-124459-5lnjmtzx_10_254_10_22,516,-2145055373
7033256278058855708,1220-124459-5lnjmtzx_10_254_10_13,31,-2145019305
5831612348755309262,1220-124459-5lnjmtzx_10_254_10_5,16,-2144949261
2296910109933074401,1220-124459-5lnjmtzx_10_254_10_8,11,-2143362574
919268754167096745,1220-124459-5lnjmtzx_10_254_10_33,393,-2142958139
2751788431714625779,1220-124459-5lnjmtzx_10_254_10_12,826,-2142320414
