# Event Log To Table

In [0]:
from pathlib import Path
from datetime import datetime, timedelta
import re
import gzip
import pyspark.sql.functions as F
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    TimestampType,
)

### 1. Inputs  
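- start_time: Timestamp in the format `%Y-%m-%d %H:%M:%S`; default: two weeks before the current time
- end_time: Timestamp in the same format; default: the current time
- path_to_cluster_logs: Root path of the cluster logs; default: `/dbfs/cluster-logs`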

In [0]:
# Notebook parameters. Both timestamps are free-text widgets; the literal
# string "None" (or a blank value) selects the default for that bound.
dbutils.widgets.text("start_time", "None", "StartTime, Format: %Y-%m-%d %H:%M:%S")
dbutils.widgets.text("end_time", "None", "EndTime, Format: %Y-%m-%d %H:%M:%S")
dbutils.widgets.text(
    "path_to_cluster_logs", "/dbfs/cluster-logs", "Path to cluster logs"
)


def _parse_time_widget(raw_value, widget_name, default):
    """Convert the text of a timestamp widget into a ``datetime``.

    Args:
        raw_value: Text read from the widget.
        widget_name: Name of the widget; only used in the error message.
        default: Value returned when the widget is unset ("None" or blank).

    Returns:
        ``default`` when the widget is unset, otherwise the parsed datetime.

    Raises:
        ValueError: If the text does not match ``%Y-%m-%d %H:%M:%S``.
    """
    cleaned = raw_value.strip()
    if cleaned in ("", "None"):
        return default
    try:
        return datetime.strptime(cleaned, "%Y-%m-%d %H:%M:%S")
    except ValueError as err:
        raise ValueError(f"inputted {widget_name} is not on the right format") from err


# get variables
start_time = dbutils.widgets.get("start_time")
end_time = dbutils.widgets.get("end_time")
path_to_cluster_logs = dbutils.widgets.get("path_to_cluster_logs")

# Captured once so that both default bounds are derived from the same instant
time_now = datetime.now()

# start_time defaults to two weeks before now, end_time defaults to now
start_time = _parse_time_widget(start_time, "start_time", time_now - timedelta(days=14))
end_time = _parse_time_widget(end_time, "end_time", time_now)


assert start_time < end_time, "Start time needs to be before end time."

### 2. Load Data From File

##### 2.1 Extract Metadata From Files

In [0]:
# Find every entry whose name starts with "eventlog" and that lies somewhere
# below a directory named "eventlog" under the log root. glob() returns a
# generator, so pathlist can only be iterated once per run of this cell.
pathlist = Path(path_to_cluster_logs).glob("**/eventlog/**/eventlog*")


def find_timestamp(file_name):
    """Parse the timestamp embedded in a gzipped event-log file name.

    Args:
        file_name: Bare file name, expected to look like
            ``eventlog-YYYY-MM-DD--HH-MM.gz``.

    Returns:
        The ``datetime`` encoded in the name, or ``None`` when the name does
        not follow that pattern.
    """
    found = re.match(r"^eventlog-(\d{4}-\d{2}-\d{2}--\d{2}-\d{2})\.gz$", file_name)
    if not found:
        return None
    return datetime.strptime(found.group(1), "%Y-%m-%d--%H-%M")


# Collect one metadata tuple per event-log file:
# (path_object, cluster_id, cluster_instance_id, spark_context_id,
#  last_modification_timestamp, bool_file_is_zipped)
file_metadata = []
for path in pathlist:
    is_zipped = path.suffix == ".gz"

    # Expected tail of every path:
    #   <cluster_id>/eventlog/<cluster_instance_id>/<spark_context_id>/<file>
    # Indexing from the end keeps the positions valid for any depth of
    # path_to_cluster_logs, not only the two-level default "/dbfs/cluster-logs".
    parts = path.parts
    if len(parts) < 5:
        raise ValueError(f"Unexpected event-log location: {path}")
    cluster_id = parts[-5]
    cluster_instance_id = parts[-3]
    spark_context_id = parts[-2]

    # Files named eventlog-<timestamp>.gz carry their timestamp in the name.
    # For any other name (including the plain "eventlog" file) fall back to
    # the file's modification time, so dt is never None and the time-window
    # comparison further down cannot fail on it.
    dt = find_timestamp(path.name)
    if dt is None:
        dt = datetime.fromtimestamp(path.stat().st_mtime)

    file_metadata.append(
        (path, cluster_id, cluster_instance_id, spark_context_id, dt, is_zipped)
    )

##### 2.2 Filter On Input Variables

In [0]:
# Display the effective time window (after defaults have been applied)
start_time, end_time

Out[4]: (datetime.datetime(2023, 6, 1, 9, 0), datetime.datetime(2023, 6, 1, 12, 0))

In [0]:
# Keep only the files whose timestamp (tuple position 4) lies strictly
# between start_time and end_time; both bounds are exclusive.
relevant_files = list(
    filter(lambda meta: start_time < meta[4] < end_time, file_metadata)
)
print(f"Number of files in time interval: {len(relevant_files)}")

Number of files in time interval: 3


In [0]:
# Show the timestamp of every selected file, in selection order
print(len(relevant_files), "relevant files")
[file_meta[4] for file_meta in relevant_files]

Out[6]: [datetime.datetime(2023, 6, 1, 10, 5, 16),
 datetime.datetime(2023, 6, 1, 9, 45),
 datetime.datetime(2023, 6, 1, 10, 0)]

##### 2.3 Load Data From Files

In [0]:
# Read every selected file fully into memory, producing one row per file:
# (file path, cluster id, cluster instance id, spark context id,
#  last-modified timestamp, file content as text)
data = []
for file_meta in relevant_files:
    (
        path_object,
        cluster_id,
        cluster_instance_id,
        spark_context_id,
        last_modification_timestamp,
        is_zipped,
    ) = file_meta

    file_path = str(path_object)

    # Both variants are decoded as UTF-8. Without an explicit encoding the
    # plain-text branch would fall back to the platform default, which is not
    # guaranteed to match the UTF-8 used for the gzipped files.
    if is_zipped:
        with gzip.open(file_path, "rb") as zipped_file:
            file_data = zipped_file.read().decode("UTF-8")
    else:
        with open(file_path, "r", encoding="utf-8") as plain_file:
            file_data = plain_file.read()

    data.append(
        (
            file_path,
            cluster_id,
            cluster_instance_id,
            spark_context_id,
            last_modification_timestamp,
            file_data,
        )
    )

##### 2.4 Handle large eventlogs
If an eventlog is larger than 268_435_456 bytes, it violates spark.rpc.message.maxSize.
We therefore split eventlogs above 200_000_000 bytes in half, to make sure the data is in a manageable format.

In [0]:
def split_eventlog(eventlog):
    """Split one event-log row into two rows holding half of the lines each.

    Args:
        eventlog: Sequence describing one event log. Its last element is the
            newline-separated log text; all other elements are copied
            unchanged into both halves.

    Returns:
        A ``(first_half, last_half)`` pair of lists shaped like ``eventlog``
        whose last elements hold the first and the second half of the lines.
        The split is made by line count, so the halves are not necessarily
        equal in size.

    Raises:
        ValueError: If the text contains fewer than two non-empty lines and
            therefore cannot be split.
    """
    events = eventlog[-1].split("\n")

    # Count only non-empty lines: a single event followed by a trailing
    # newline would otherwise be "split" into the full event plus an empty
    # string, which makes no progress.
    if sum(1 for event in events if event) < 2:
        raise ValueError(
            "One of the eventlogs you are trying to split is larger than the "
            "allowed size but has fewer than 2 events, so it cannot be split further"
        )

    midpoint = len(events) // 2
    shared_fields = list(eventlog[:-1])

    first_half = shared_fields + ["\n".join(events[:midpoint])]
    last_half = shared_fields + ["\n".join(events[midpoint:])]

    return first_half, last_half

In [0]:
# will abort if a file above this threshold is found
max_size = 6.4 * 10**9  # 6.4 GB

# max size of 200MB per eventlog
max_desired_size = 0.2 * 10**9  # 0.2 GB

# Sizes are measured with len() on the text of each row, i.e. in characters
sizes = [len(d[-1]) for d in data]
if any(s > max_size for s in sizes):
    raise Exception(f"Found eventlog-file > {max_size/10**9}GB. Aborting")

n = 5
# Each round splits every oversized row once. Assuming a split roughly
# halves the size:
# n=1 allows for files of 400MB to pass
# n=2 allows for files of 800MB to pass
# n=3 allows for files of 1.6GB to pass
# n=4 allows for files of 3.2GB to pass
# n=5 allows for files of 6.4GB to pass
for _ in range(n):
    # Stop early once every row fits; further rounds would only copy the list
    if all(len(d[-1]) <= max_desired_size for d in data):
        break

    splitted_data = []
    for d in data:
        if len(d[-1]) > max_desired_size:
            splitted_data.extend(split_eventlog(d))
        else:
            splitted_data.append(d)

    data = splitted_data

# split_eventlog divides by line count, not by size, so a row can still be
# too large after n rounds. Report it instead of passing it on silently.
still_oversized = sum(1 for d in data if len(d[-1]) > max_desired_size)
if still_oversized:
    print(
        f"Warning: {still_oversized} eventlog chunk(s) still exceed "
        f"{max_desired_size} characters after {n} rounds of splitting"
    )

### 3. Write Data To Delta Table

In [0]:
# Schema of the rows collected above; every column is nullable
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in [
            ("filePath", StringType()),
            ("clusterID", StringType()),
            ("clusterInstanceID", StringType()),
            ("sparkContextID", StringType()),
            ("lastModified", TimestampType()),
            ("fileData", StringType()),
        ]
    ]
)

# Turn the in-memory rows into a Spark DataFrame and show its layout
new_eventlogs = spark.createDataFrame(data, schema=schema)
new_eventlogs.printSchema()

#### Create storage table if it doesn't exist already

In [0]:
%sql
-- Storage table for the raw event logs: the six columns of the DataFrame
-- built in the previous cell plus eventlogKey, the integer hash that is used
-- further down to detect rows that are already stored.
CREATE TABLE IF NOT EXISTS eventlog_raw (
  filePath STRING,
  clusterID STRING,
  clusterInstanceID STRING,
  sparkContextID STRING,
  lastModified TIMESTAMP,
  fileData STRING,
  eventlogKey INT
)

#### Create a unique id for each of the eventlogs
* combine the cluster instance ID, the Spark context ID and the last-modified timestamp, and use Spark's hash function to create a hash
* this key is used in the following cells to prevent the same eventlog from being stored twice

In [0]:
# Derive eventlogKey by hashing the concatenation of the three identifying
# columns. Chunks produced by splitting one file share these three values
# and therefore share the same key.
new_eventlogs = new_eventlogs.withColumn(
    "eventlogKey",
    F.hash(
        F.concat(
            F.col("clusterInstanceID"),
            F.col("sparkContextID"),
            F.col("lastModified"),
        )
    ),
)

In [0]:
# Keys that are already present in the storage table
existing_keys = spark.sql("select eventlogKey from eventlog_raw")

# Anti-join on eventlogKey: keep only the rows whose key is not stored yet,
# so data that is already in the table is not written a second time
new_eventlogs = new_eventlogs.join(existing_keys, on="eventlogKey", how="anti")

# The count is taken after the splitting step, so one large eventlog file
# contributes several rows to this number
print(f"Adding {new_eventlogs.count()} new eventlogs to the table")

In [0]:
# Append the new rows to the eventlog_raw table in Delta format.
# delta_table_path is assigned here but not used by the write below.
delta_table_path = "dbfs:/mnt/lake/"
(
    new_eventlogs.write
    .format("delta")
    .mode("append")
    .saveAsTable("eventlog_raw")
)

In [0]:
%sql
-- Total number of rows now stored in eventlog_raw
SELECT
  COUNT(*)
FROM
  eventlog_raw;