In [1]:
import findspark

findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession

# Build (or reuse) the local Spark session for this notebook.
# The memory limits are passed to the builder because properties set through
# SparkContext.setSystemProperty after getOrCreate() are not picked up by the
# context that is already running.
# NOTE(review): if a session already exists in this kernel, getOrCreate()
# returns it and these memory settings are not re-applied; restart the kernel
# to change them.
spark = (
    SparkSession.builder
    .appName('Failure Prediction on Google Borg Cluster Traces')
    .master('local[*]')
    .config('spark.executor.memory', '2g')
    .config('spark.driver.memory', '2g')
    .getOrCreate()
)

# Load the intermediate datasets: the list of collection ids to remove, and the
# job and task tables that are filtered against it further down.
job_ids_to_remove_df = spark.read.parquet("./intermediary_data/job_ids_to_remove.parquet")
jobs_with_type_df = spark.read.parquet("./intermediary_data/jobs_with_type.parquet")
tasks_with_type_df = spark.read.parquet("./intermediary_data/tasks_with_type.parquet")

In [2]:
# Number of rows in the removal list loaded from job_ids_to_remove.parquet.
job_ids_to_remove_df.count()

2465

In [3]:
# Print the column names and types of the removal list.
job_ids_to_remove_df.printSchema()

root
 |-- collection_id: long (nullable = true)



In [4]:
# Row count of the jobs table as loaded, before any rows are removed.
jobs_with_type_df.count()

1259254

In [5]:
# Row count of the tasks table as loaded, before any rows are removed.
tasks_with_type_df.count()

360176108

In [6]:
# Keep only the rows whose collection_id does NOT appear in the removal list
# (left anti join against job_ids_to_remove_df), for both jobs and tasks.
jobs_with_type_df = jobs_with_type_df.join(
    job_ids_to_remove_df,
    on='collection_id',
    how='left_anti',
)
tasks_with_type_df = tasks_with_type_df.join(
    job_ids_to_remove_df,
    on='collection_id',
    how='left_anti',
)

In [7]:
# Row count of the jobs table after the anti join against the removal list.
jobs_with_type_df.count()

1256789

In [8]:
# Row count of the tasks table after the anti join against the removal list.
tasks_with_type_df.count()

359717767

After filtering out the tasks that died because of a parent job failure, 2,465 out of 1,259,254 jobs and 458,341 out of 360,176,108 tasks were removed.

In [9]:
# The id columns were only needed for the join above; remove them so that only
# the remaining columns are carried forward.
jobs_with_type_df = jobs_with_type_df.drop('collection_id', 'parent_collection_id')
tasks_with_type_df = tasks_with_type_df.drop('collection_id')

# Show the resulting column layout of both tables.
jobs_with_type_df.printSchema()
tasks_with_type_df.printSchema()

root
 |-- scheduling_class: long (nullable = true)
 |-- priority: long (nullable = true)
 |-- cpus: double (nullable = true)
 |-- memory: double (nullable = true)
 |-- event_success: integer (nullable = true)

root
 |-- scheduling_class: long (nullable = true)
 |-- priority: long (nullable = true)
 |-- cpus: double (nullable = true)
 |-- memory: double (nullable = true)
 |-- event_success: integer (nullable = true)



In [10]:
# Count the job rows whose event_success column equals 1.
(
    jobs_with_type_df
    .filter(jobs_with_type_df['event_success'] == 1)
    .count()
)

531742

In [11]:
# Count the task rows whose event_success column equals 1.
(
    tasks_with_type_df
    .filter(tasks_with_type_df['event_success'] == 1)
    .count()
)

18643465

In [17]:
# Persist the filtered jobs and tasks tables.
# mode("overwrite") replaces any output left by a previous run; with the
# default save mode the write raises when the target path already exists,
# which makes this cell fail on re-execution.
jobs_with_type_df.write.mode("overwrite").parquet("./result_data/jobs_with_type.parquet")
tasks_with_type_df.write.mode("overwrite").parquet("./result_data/tasks_with_type.parquet")

In [12]:
# Split the jobs into training and test sets with 0.75 / 0.25 weights, using a
# fixed seed for the random split.
jobs_train_df, jobs_test_df = jobs_with_type_df.randomSplit([0.75, 0.25], seed=13)

# mode("overwrite") replaces output from a previous run; the default save mode
# raises when the target path already exists, breaking re-execution.
jobs_train_df.write.mode("overwrite").parquet("../machine-learning/training_data/jobs_data_unbalanced.parquet")
jobs_test_df.write.mode("overwrite").parquet("../machine-learning/test_data/jobs_data.parquet")

In [13]:
# Split the tasks into training and test sets with 0.75 / 0.25 weights, using a
# fixed seed for the random split.
tasks_train_df, tasks_test_df = tasks_with_type_df.randomSplit([0.75, 0.25], seed=13)

# Reduce each split to a single partition before writing.
# NOTE(review): coalesce(1) can also pull the upstream computation into a
# single task; if the write is too slow, repartition(1) may be worth trying.
tasks_train_df = tasks_train_df.coalesce(1)
tasks_test_df = tasks_test_df.coalesce(1)

# mode("overwrite") replaces output from a previous run; the default save mode
# raises when the target path already exists, breaking re-execution.
tasks_train_df.write.mode("overwrite").parquet("../machine-learning/training_data/tasks_data_unbalanced.parquet")
tasks_test_df.write.mode("overwrite").parquet("../machine-learning/test_data/tasks_data.parquet")