In [1]:
import findspark

findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession

dataset_root_path = "D:\\Documents\\Programming\\Python-Projects\\Clusterdata_2019_e\\"
spark = SparkSession.builder.appName('Failure Prediction on Google Borg Cluster Traces').master('local[*]').getOrCreate()
SparkContext.setSystemProperty('spark.executor.memory', '3g')
SparkContext.setSystemProperty('spark.driver.memory', '3g')

jobs_train_df = spark.read.parquet("./training_data/jobs_data.parquet")
jobs_test_df = spark.read.parquet("./test_data/jobs_data.parquet")

tasks_train_df = spark.read.parquet("./training_data/tasks_data.parquet")
tasks_test_df = spark.read.parquet("./test_data/tasks_data.parquet")

In [2]:
jobs_train_df.printSchema()
jobs_test_df.printSchema()
tasks_train_df.printSchema()
tasks_test_df.printSchema()

root
 |-- scheduling_class: long (nullable = true)
 |-- priority: long (nullable = true)
 |-- cpus: double (nullable = true)
 |-- memory: double (nullable = true)
 |-- event_success: long (nullable = true)

root
 |-- scheduling_class: long (nullable = true)
 |-- priority: long (nullable = true)
 |-- cpus: double (nullable = true)
 |-- memory: double (nullable = true)
 |-- event_success: integer (nullable = true)

root
 |-- scheduling_class: long (nullable = true)
 |-- priority: long (nullable = true)
 |-- cpus: double (nullable = true)
 |-- memory: double (nullable = true)
 |-- event_success: integer (nullable = true)

root
 |-- scheduling_class: long (nullable = true)
 |-- priority: long (nullable = true)
 |-- cpus: double (nullable = true)
 |-- memory: double (nullable = true)
 |-- event_success: integer (nullable = true)



In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LogisticRegression

# TODO: check if pipeline stages are actually needed
stages = []

assemblerInputs = ["scheduling_class", "priority", "cpus", "memory"]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

pipeline = Pipeline(stages=stages)


jobs_train_data = pipeline.fit(jobs_train_df).transform(jobs_train_df)
jobs_test_data = pipeline.fit(jobs_test_df).transform(jobs_test_df)

print('Job train dataset count:', jobs_train_data.count())
print('Job test dataset count:', jobs_test_data.count())


tasks_train_data = pipeline.fit(tasks_train_df).transform(tasks_train_df)
tasks_test_data = pipeline.fit(tasks_test_df).transform(tasks_test_df)

print('Task train dataset count:', tasks_train_data.count())
print('Task test dataset count:', tasks_test_data.count())

decision_tree = DecisionTreeClassifier(labelCol="event_success")
random_forest = RandomForestClassifier(labelCol='event_success')
gradient_boosting = GBTClassifier(labelCol='event_success')
logistic_regression = LogisticRegression(labelCol='event_success')

Job train dataset count: 1087936
Job test dataset count: 313905
Task train dataset count: 27976115
Task test dataset count: 89928090


# Jobs Failure Prediction

## Decision Tree

In [6]:
dt_model_jobs = decision_tree.fit(jobs_train_data)
dt_prediction_jobs = dt_model_jobs.transform(jobs_test_data)

In [7]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="accuracy")
print("Accuracy:", evaluator.evaluate(dt_prediction_jobs))

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="f1")
print("F1 score:", evaluator.evaluate(dt_prediction_jobs))

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="recallByLabel")
print("Recall:", evaluator.evaluate(dt_prediction_jobs))


# Evaluate feature importance
feature_importance = dt_model_jobs.featureImportances.toArray()

for i, column in enumerate(assembler.getInputCols()):
    print(f"Feature '{column}': {feature_importance[i]:.2f}")

Accuracy: 0.821560663257992
F1 score: 0.8223131325062677
Recall: 0.7430237631089193
Feature 'scheduling_class': 0.50
Feature 'priority': 0.44
Feature 'cpus': 0.01
Feature 'memory': 0.04


## Random Forest

In [8]:
rf_model_jobs = random_forest.fit(jobs_train_data)
rf_prediction_jobs = dt_model_jobs.transform(jobs_test_data)

In [9]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="accuracy")
print("Accuracy:", evaluator.evaluate(rf_prediction_jobs))

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="f1")
print("F1 score:", evaluator.evaluate(rf_prediction_jobs))

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="recallByLabel")
print("Recall:", evaluator.evaluate(rf_prediction_jobs))


# Evaluate feature importance
feature_importance = rf_model_jobs.featureImportances.toArray()

# Show feature importance
for i, column in enumerate(assembler.getInputCols()):
    print(f"Feature '{column}': {feature_importance[i]:.2f}")

Accuracy: 0.821560663257992
F1 score: 0.8223131325062677
Recall: 0.7430237631089193
Feature 'scheduling_class': 0.56
Feature 'priority': 0.26
Feature 'cpus': 0.07
Feature 'memory': 0.11


## Gradient Boosting

In [10]:
gb_model_jobs = gradient_boosting.fit(jobs_train_data)
gb_prediction_jobs = gb_model_jobs.transform(jobs_test_data)

In [12]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="accuracy")
print("Accuracy:", evaluator.evaluate(gb_prediction_jobs))

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="f1")
print("F1 score:", evaluator.evaluate(gb_prediction_jobs))

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="recallByLabel")
print("Recall:", evaluator.evaluate(gb_prediction_jobs))


# Evaluate feature importance
feature_importance = gb_model_jobs.featureImportances.toArray()

for i, column in enumerate(assembler.getInputCols()):
    print(f"Feature '{column}': {feature_importance[i]:.2f}")

Accuracy: 0.8352527038435196
F1 score: 0.8359601469413815
Recall: 0.7557088342657072
Feature 'scheduling_class': 0.42
Feature 'priority': 0.37
Feature 'cpus': 0.10
Feature 'memory': 0.11


## Logistic Regression

In [13]:
lr_model_jobs = logistic_regression.fit(jobs_train_data)
lr_prediction_jobs = lr_model_jobs.transform(jobs_test_data)

In [14]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="accuracy")
print("Accuracy:", evaluator.evaluate(lr_prediction_jobs))

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="f1")
print("F1 score:", evaluator.evaluate(lr_prediction_jobs))

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="recallByLabel")
print("Recall:", evaluator.evaluate(lr_prediction_jobs))

Accuracy: 0.623366942227744
F1 score: 0.6253513097971465
Recall: 0.5847889595149078


# Tasks Failure Prediction

## Decision Tree

In [16]:
dt_model_tasks = decision_tree.fit(tasks_train_data)
dt_prediction_tasks = dt_model_tasks.transform(tasks_test_data)

In [17]:
dt_model_tasks.save('./task_models_TML/decision_tree')

In [18]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="accuracy")
print("Accuracy:", evaluator.evaluate(dt_prediction_tasks))

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="f1")
print("F1 score:", evaluator.evaluate(dt_prediction_tasks))

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="recallByLabel")
print("Recall:", evaluator.evaluate(dt_prediction_tasks))


# Evaluate feature importance
feature_importance = dt_model_tasks.featureImportances.toArray()

for i, column in enumerate(assembler.getInputCols()):
    print(f"Feature '{column}': {feature_importance[i]:.2f}")

Accuracy: 0.9870222863623591
F1 score: 0.9876461588608803
Recall: 0.9869271382743618
Feature 'scheduling_class': 0.33
Feature 'priority': 0.55
Feature 'cpus': 0.08
Feature 'memory': 0.04


## Random Forest

In [13]:
rf_model_tasks = random_forest.fit(tasks_train_data)
rf_prediction_tasks = rf_model_tasks.transform(tasks_test_data)

In [14]:
rf_model_tasks.save('./task_models_TML/random_forest')

In [15]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="accuracy")
print("Accuracy:", evaluator.evaluate(rf_prediction_tasks))

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="f1")
print("F1 score:", evaluator.evaluate(rf_prediction_tasks))

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="recallByLabel")
print("Recall:", evaluator.evaluate(rf_prediction_tasks))


# Evaluate feature importance
feature_importance = rf_model_tasks.featureImportances.toArray()

# Show feature importance
for i, column in enumerate(assembler.getInputCols()):
    print(f"Feature '{column}': {feature_importance[i]:.2f}")

Accuracy: 0.9872281063680992
F1 score: 0.9878044591466593
Recall: 0.9874611452082195
Feature 'scheduling_class': 0.36
Feature 'priority': 0.44
Feature 'cpus': 0.09
Feature 'memory': 0.12


## Gradient Boosting

In [16]:
gb_model_tasks = gradient_boosting.fit(tasks_train_data)
gb_prediction_tasks = gb_model_tasks.transform(tasks_test_data)

In [17]:
gb_model_tasks.save('./task_models_TML/gradient_boosting')

In [18]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="accuracy")
print("Accuracy:", evaluator.evaluate(gb_prediction_tasks))

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="f1")
print("F1 score:", evaluator.evaluate(gb_prediction_tasks))

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="recallByLabel")
print("Recall:", evaluator.evaluate(gb_prediction_tasks))


# Evaluate feature importance
feature_importance = gb_model_tasks.featureImportances.toArray()

# Show feature importance
for i, column in enumerate(assembler.getInputCols()):
    print(f"Feature '{column}': {feature_importance[i]:.2f}")

Accuracy: 0.9893820829509444
F1 score: 0.9898181903124817
Recall: 0.9891720642087517
Feature 'scheduling_class': 0.32
Feature 'priority': 0.56
Feature 'cpus': 0.06
Feature 'memory': 0.07


## Logistic Regression

In [19]:
lr_model_tasks = logistic_regression.fit(tasks_train_data)
lr_prediction_tasks = lr_model_tasks.transform(tasks_test_data)

In [20]:
lr_model_tasks.save('./task_models_TML/logistic_regression')

In [21]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="accuracy")
print("Accuracy:", evaluator.evaluate(lr_prediction_tasks))

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="f1")
print("F1 score:", evaluator.evaluate(lr_prediction_tasks))

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="recallByLabel")
print("Recall:", evaluator.evaluate(lr_prediction_tasks))

Accuracy: 0.9066080020158329
F1 score: 0.9256697714562917
Recall: 0.904947786034295
