In [1]:
import findspark

findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

dataset_root_path = "D:\\Documents\\Programming\\Python-Projects\\Clusterdata_2019_e\\"
spark = SparkSession.builder.appName('Failure Prediction on Google Borg Cluster Traces').master('local[*]').getOrCreate()
SparkContext.setSystemProperty('spark.executor.memory', '2g')
SparkContext.setSystemProperty('spark.driver.memory', '2g')

collection_events_df = spark.read.parquet(dataset_root_path + "collection_events-*.parquet.gz")
instance_events_df = spark.read.parquet(dataset_root_path + "instance_events-*.parquet.gz")

In [2]:
collection_events_df.count()

5244061

In [3]:
instance_events_df.count()

1523447380

In [2]:
# drop unused columns
collection_events_df = collection_events_df.drop(*['missing_type', 'alloc_collection_id', 'user', 'collection_name', 'collection_logical_name', 'start_after_collection_ids', 'max_per_machine', 'max_per_switch', 'vertical_scaling', 'scheduler'])
instance_events_df = instance_events_df.drop(*['missing_type', 'alloc_collection_id', 'instance_index', 'machine_id', 'alloc_instance_index', 'constraint'])
instance_events_df = instance_events_df.na.drop(subset=["resource_request"])

# filter only the Jobs from the collections and the Tasks from the instances
collection_events_df = collection_events_df.filter(collection_events_df.collection_type == 0)
instance_events_df = instance_events_df.filter(instance_events_df.collection_type == 0)

# drop columns previously used for filter
collection_events_df = collection_events_df.drop("collection_type")
instance_events_df = instance_events_df.drop("collection_type")

# filter the jobs in the timeframe (between 0 and MAXINT)
collection_events_df = collection_events_df.filter((collection_events_df.time != 0) & (collection_events_df.time != (2 ^ 63 - 1)))

# filter the job ids out of the timeframe and collect to list
task_ids_to_remove_list = collection_events_df.filter((collection_events_df.time == 0) | (collection_events_df.time == (2 ^ 63 - 1))).rdd.map(lambda x: x.collection_id).collect()

# filter the tasks in the timeframe (between 0 and MAXINT), that are not part of the jobs that are outside the timeframe
instance_events_df = instance_events_df.filter((instance_events_df.collection_id.isin(task_ids_to_remove_list) == False) & (instance_events_df.time != 0) & (instance_events_df.time != (2 ^ 63 - 1)))

# remove time column after it was ensured that the records are in the timeframe
collection_events_df = collection_events_df.drop("time")
instance_events_df = instance_events_df.drop("time")

In [3]:
collection_events_df.printSchema()
instance_events_df.printSchema()

root
 |-- type: long (nullable = true)
 |-- collection_id: long (nullable = true)
 |-- scheduling_class: long (nullable = true)
 |-- priority: long (nullable = true)
 |-- parent_collection_id: long (nullable = true)

root
 |-- type: long (nullable = true)
 |-- collection_id: long (nullable = true)
 |-- scheduling_class: long (nullable = true)
 |-- priority: long (nullable = true)
 |-- resource_request: struct (nullable = true)
 |    |-- cpus: double (nullable = true)
 |    |-- memory: double (nullable = true)



In [4]:
collection_events_df.count()

5029695

In [5]:
instance_events_df.count()

1517629860

In [6]:
# extract max cpu and max memory request for each collection_id, from the tasks
max_resource_request_df = instance_events_df.groupBy('collection_id').agg(
    F.max(F.col('resource_request.cpus')).alias('cpus'),
    F.max(F.col('resource_request.memory')).alias('memory'))

# add the maximum cpu/memory request to the jobs
collection_events_df = collection_events_df.join(max_resource_request_df, on='collection_id')
collection_events_df.printSchema()

max_resource_request_df = max_resource_request_df.unpersist()

root
 |-- collection_id: long (nullable = true)
 |-- type: long (nullable = true)
 |-- scheduling_class: long (nullable = true)
 |-- priority: long (nullable = true)
 |-- parent_collection_id: long (nullable = true)
 |-- cpus: double (nullable = true)
 |-- memory: double (nullable = true)



In [7]:
# take the cpu and memory requests out of the struct
instance_events_df = instance_events_df.withColumn('cpus', instance_events_df["resource_request.cpus"]).withColumn('memory', instance_events_df["resource_request.memory"])
instance_events_df = instance_events_df.drop("resource_request")

In [8]:
allowedEventTypes = [4, 5, 6, 7, 8]
failureEventTypes = [4, 5, 7, 8]

collection_events_df = collection_events_df.filter(collection_events_df.type.isin(allowedEventTypes))
collection_events_df = collection_events_df.withColumn("event_success", F.when(collection_events_df.type.isin(failureEventTypes), 0).otherwise(1))
collection_events_df = collection_events_df.drop(*['type'])
collection_events_df.show(n=1)

instance_events_df = instance_events_df.filter(instance_events_df.type.isin(allowedEventTypes))
instance_events_df = instance_events_df.withColumn("event_success", F.when(instance_events_df.type.isin(failureEventTypes), 0).otherwise(1))
instance_events_df = instance_events_df.drop(*['type'])
instance_events_df.show(n=1)

+-------------+----------------+--------+--------------------+-----------------+---------------+-------------+
|collection_id|scheduling_class|priority|parent_collection_id|             cpus|         memory|event_success|
+-------------+----------------+--------+--------------------+-----------------+---------------+-------------+
|  35288547244|               2|     360|                null|0.024749755859375|0.0072021484375|            0|
+-------------+----------------+--------+--------------------+-----------------+---------------+-------------+
only showing top 1 row

+-------------+----------------+--------+----+------------------+-------------+
|collection_id|scheduling_class|priority|cpus|            memory|event_success|
+-------------+----------------+--------+----+------------------+-------------+
| 237301563343|               2|     210| 0.0|1.9073486328125E-4|            0|
+-------------+----------------+--------+----+------------------+-------------+
only showing top 1 ro

In [9]:
collection_events_df.write.parquet("./intermediary_data/jobs_with_type.parquet")
instance_events_df.write.parquet("./intermediary_data/tasks_with_type.parquet")