In [1]:
import findspark

findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession
import numpy as np

dataset_root_path = "D:\\Documents\\Programming\\Python-Projects\\Clusterdata_2019_e\\"
spark = SparkSession.builder.appName('Failure Prediction on Google Borg Cluster Traces').master('local[*]').getOrCreate()
SparkContext.setSystemProperty('spark.executor.memory', '3g')
SparkContext.setSystemProperty('spark.driver.memory', '3g')

jobs_train_df = spark.read.parquet("./training_data/jobs_data.parquet")
jobs_test_df = spark.read.parquet("./test_data/jobs_data.parquet")

tasks_train_df = spark.read.parquet("./training_data/tasks_data.parquet")
tasks_test_df = spark.read.parquet("./test_data/tasks_data.parquet")

In [2]:
jobs_train_df.printSchema()
jobs_test_df.printSchema()
tasks_train_df.printSchema()
tasks_test_df.printSchema()

root
 |-- scheduling_class: long (nullable = true)
 |-- priority: long (nullable = true)
 |-- cpus: double (nullable = true)
 |-- memory: double (nullable = true)
 |-- event_success: long (nullable = true)

root
 |-- scheduling_class: long (nullable = true)
 |-- priority: long (nullable = true)
 |-- cpus: double (nullable = true)
 |-- memory: double (nullable = true)
 |-- event_success: integer (nullable = true)

root
 |-- scheduling_class: long (nullable = true)
 |-- priority: long (nullable = true)
 |-- cpus: double (nullable = true)
 |-- memory: double (nullable = true)
 |-- event_success: integer (nullable = true)

root
 |-- scheduling_class: long (nullable = true)
 |-- priority: long (nullable = true)
 |-- cpus: double (nullable = true)
 |-- memory: double (nullable = true)
 |-- event_success: integer (nullable = true)



In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, OneHotEncoder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LogisticRegression

one_hot_encoder = OneHotEncoder(inputCol="scheduling_class", outputCol="scheduling_class_encoded")
assemblerInputs = ["scheduling_class_encoded", "priority", "cpus", "memory"]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

pipeline = Pipeline(stages=[one_hot_encoder, assembler])

jobs_train_data = pipeline.fit(jobs_train_df).transform(jobs_train_df)
jobs_test_data = pipeline.fit(jobs_test_df).transform(jobs_test_df)

print('Job train dataset count:', jobs_train_data.count())
print('Job test dataset count:', jobs_test_data.count())

tasks_train_data = pipeline.fit(tasks_train_df).transform(tasks_train_df)
tasks_test_data = pipeline.fit(tasks_test_df).transform(tasks_test_df)

print('Task train dataset count:', tasks_train_data.count())
print('Task test dataset count:', tasks_test_data.count())

decision_tree = DecisionTreeClassifier(labelCol="event_success")
random_forest = RandomForestClassifier(labelCol='event_success')
gradient_boosting = GBTClassifier(labelCol='event_success')
logistic_regression = LogisticRegression(labelCol='event_success')

evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction")
roc_evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="event_success")

Job train dataset count: 1087936
Job test dataset count: 313905
Task train dataset count: 27976115
Task test dataset count: 89928090


# Jobs Failure Prediction

## Decision Tree

In [3]:
dt_model_jobs = decision_tree.fit(jobs_train_data)
dt_prediction_jobs = dt_model_jobs.transform(jobs_test_data)

In [4]:
# Evaluate the model
print('Model Evaluation')
print('Accuracy:', evaluator.evaluate(dt_prediction_jobs, {evaluator.metricName: 'accuracy'}))
print('Recall:', evaluator.evaluate(dt_prediction_jobs, {evaluator.metricName: 'recallByLabel'}))
print('F1 score:', evaluator.evaluate(dt_prediction_jobs, {evaluator.metricName: 'f1'}))
print('ROC curve:', roc_evaluator.evaluate(dt_prediction_jobs, {evaluator.metricName: "areaUnderROC"}))

# Evaluate feature importance
print('\nFeature Importance')
feature_importance = dt_model_jobs.featureImportances.toArray()

for i, column in enumerate(assembler.getInputCols()):
    print(f'Feature \'{column}\': {feature_importance[i]:.2f}')

Model Evaluation
Accuracy: 0.8015227536993677
Recall: 0.7287427034609204
F1 score: 0.8024157329760173
ROC curve: 0.8147425140029069

Feature Importance
Feature 'scheduling_class_encoded': 0.17
Feature 'priority': 0.22
Feature 'cpus': 0.18
Feature 'memory': 0.38


## Random Forest

In [5]:
rf_model_jobs = random_forest.fit(jobs_train_data)
rf_prediction_jobs = rf_model_jobs.transform(jobs_test_data)

In [6]:
# Evaluate the model
print('Model Evaluation')
print('Accuracy:', evaluator.evaluate(rf_prediction_jobs, {evaluator.metricName: 'accuracy'}))
print('Recall:', evaluator.evaluate(rf_prediction_jobs, {evaluator.metricName: 'recallByLabel'}))
print('F1 score:', evaluator.evaluate(rf_prediction_jobs, {evaluator.metricName: 'f1'}))
print('ROC curve:', roc_evaluator.evaluate(rf_prediction_jobs, {evaluator.metricName: "areaUnderROC"}))

# Evaluate feature importance
print('\nFeature Importance')
feature_importance = rf_model_jobs.featureImportances.toArray()

for i, column in enumerate(assembler.getInputCols()):
    print(f'Feature \'{column}\': {feature_importance[i]:.2f}')

Model Evaluation
Accuracy: 0.8139819372103024
Recall: 0.7780637180457148
F1 score: 0.8150798327374832
ROC curve: 0.8205061185804742

Feature Importance
Feature 'scheduling_class_encoded': 0.16
Feature 'priority': 0.09
Feature 'cpus': 0.20
Feature 'memory': 0.22


## Gradient Boosting

In [7]:
gb_model_jobs = gradient_boosting.fit(jobs_train_data)
gb_prediction_jobs = gb_model_jobs.transform(jobs_test_data)

In [8]:
# Evaluate the model
print('Model Evaluation')
print('Accuracy:', evaluator.evaluate(gb_prediction_jobs, {evaluator.metricName: 'accuracy'}))
print('Recall:', evaluator.evaluate(gb_prediction_jobs, {evaluator.metricName: 'recallByLabel'}))
print('F1 score:', evaluator.evaluate(gb_prediction_jobs, {evaluator.metricName: 'f1'}))
print('ROC curve:', roc_evaluator.evaluate(gb_prediction_jobs, {evaluator.metricName: "areaUnderROC"}))

# Evaluate feature importance
print('\nFeature Importance')
feature_importance = gb_model_jobs.featureImportances.toArray()

for i, column in enumerate(assembler.getInputCols()):
    print(f'Feature \'{column}\': {feature_importance[i]:.2f}')

Model Evaluation
Accuracy: 0.8397158375941766
Recall: 0.7645502791599247
F1 score: 0.8404732798466041
ROC curve: 0.8533689013434725

Feature Importance
Feature 'scheduling_class_encoded': 0.09
Feature 'priority': 0.14
Feature 'cpus': 0.07
Feature 'memory': 0.44


## Logistic Regression

In [3]:
lr_model_jobs = logistic_regression.fit(jobs_train_data)
lr_prediction_jobs = lr_model_jobs.transform(jobs_test_data)

In [29]:
# Evaluate the model
print('Model Evaluation')
print('Accuracy:', evaluator.evaluate(lr_prediction_jobs, {evaluator.metricName: 'accuracy'}))
print('Recall:', evaluator.evaluate(lr_prediction_jobs, {evaluator.metricName: 'recallByLabel'}))
print('F1 score:', evaluator.evaluate(lr_prediction_jobs, {evaluator.metricName: 'f1'}))
print('ROC curve:', roc_evaluator.evaluate(lr_prediction_jobs, {evaluator.metricName: "areaUnderROC"}))

# Evaluate feature importance
print('\nFeature Importance')
coefficients = lr_model_jobs.coefficients.toArray()[:-2] # remove the additional 2 coefficients

# The coefficients are given as logs, so the exponential must be computed, and the values normalized
odds_ratios = np.exp(coefficients)
normalized_odds = odds_ratios / np.sum(odds_ratios)

for i, column in enumerate(assembler.getInputCols()):
    print(f'Feature \'{column}\': {normalized_odds[i]:.2f}')

Model Evaluation
Accuracy: 0.750446791226645
Recall: 0.6766825529188918
F1 score: 0.751379808707238
ROC curve: 0.7638453193426163

Feature Importance
Feature 'scheduling_class_encoded': 0.29
Feature 'priority': 0.13
Feature 'cpus': 0.59
Feature 'memory': 0.00


# Tasks Failure Prediction

## Decision Tree

In [3]:
dt_model_tasks = decision_tree.fit(tasks_train_data)
dt_prediction_tasks = dt_model_tasks.transform(tasks_test_data)

In [12]:
dt_model_tasks.save('./task_models_TML/decision_tree')

In [13]:
# Evaluate the model
print('Model Evaluation')
print('Accuracy:', evaluator.evaluate(dt_prediction_tasks, {evaluator.metricName: 'accuracy'}))
print('Recall:', evaluator.evaluate(dt_prediction_tasks, {evaluator.metricName: 'recallByLabel'}))
print('F1 score:', evaluator.evaluate(dt_prediction_tasks, {evaluator.metricName: 'f1'}))
print('ROC curve:', roc_evaluator.evaluate(dt_prediction_tasks, {evaluator.metricName: "areaUnderROC"}))

# Evaluate feature importance
print('\nFeature Importance')
feature_importance = dt_model_tasks.featureImportances.toArray()

for i, column in enumerate(assembler.getInputCols()):
    print(f'Feature \'{column}\': {feature_importance[i]:.2f}')

Model Evaluation
Accuracy: 0.9879837879354493
Recall: 0.9880643422534124
F1 score: 0.9885088142283688
ROC curve: 0.9872863757309096

Feature Importance
Feature 'scheduling_class_encoded': 0.04
Feature 'priority': 0.35
Feature 'cpus': 0.00
Feature 'memory': 0.49


In [4]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# TODO: explore with other parameters too
dt_param_grid = ParamGridBuilder().addGrid(dt_model_tasks.maxDepth, [5, 10, 15]).addGrid(dt_model_tasks.maxBins, [64, 128]).build()

dt_cv_evaluator = MulticlassClassificationEvaluator(labelCol="event_success", predictionCol="prediction", metricName="accuracy")
dt_cv = CrossValidator(estimator=decision_tree, estimatorParamMaps=dt_param_grid, evaluator=dt_cv_evaluator)

dt_cv_model = dt_cv.fit(tasks_train_data)
dt_cv_predictions = dt_cv_model.transform(tasks_test_data)

dt_cv_auc = evaluator.evaluate(dt_cv_predictions)
print("Decision Tree cross validation result: ", dt_cv_auc)

Decision Tree cross validation result:  0.9932381227693627


## Random Forest

In [14]:
rf_model_tasks = random_forest.fit(tasks_train_data)
rf_prediction_tasks = rf_model_tasks.transform(tasks_test_data)

In [15]:
rf_model_tasks.save('./task_models_TML/random_forest')

In [16]:
# Evaluate the model
print('Model Evaluation')
print('Accuracy:', evaluator.evaluate(rf_prediction_tasks, {evaluator.metricName: 'accuracy'}))
print('Recall:', evaluator.evaluate(rf_prediction_tasks, {evaluator.metricName: 'recallByLabel'}))
print('F1 score:', evaluator.evaluate(rf_prediction_tasks, {evaluator.metricName: 'f1'}))
print('ROC curve:', roc_evaluator.evaluate(rf_prediction_tasks, {evaluator.metricName: "areaUnderROC"}))

# Evaluate feature importance
print('\nFeature Importance')
feature_importance = rf_model_tasks.featureImportances.toArray()

for i, column in enumerate(assembler.getInputCols()):
    print(f'Feature \'{column}\': {feature_importance[i]:.2f}')

Model Evaluation
Accuracy: 0.9868253845934012
Recall: 0.9866989750868808
F1 score: 0.98746995436015
ROC curve: 0.9879197956050814

Feature Importance
Feature 'scheduling_class_encoded': 0.17
Feature 'priority': 0.22
Feature 'cpus': 0.01
Feature 'memory': 0.38


## Gradient Boosting

In [17]:
gb_model_tasks = gradient_boosting.fit(tasks_train_data)
gb_prediction_tasks = gb_model_tasks.transform(tasks_test_data)

In [18]:
gb_model_tasks.save('./task_models_TML/gradient_boosting')

In [19]:
# Evaluate the model
print('Model Evaluation')
print('Accuracy:', evaluator.evaluate(gb_prediction_tasks, {evaluator.metricName: 'accuracy'}))
print('Recall:', evaluator.evaluate(gb_prediction_tasks, {evaluator.metricName: 'recallByLabel'}))
print('F1 score:', evaluator.evaluate(gb_prediction_tasks, {evaluator.metricName: 'f1'}))
print('ROC curve:', roc_evaluator.evaluate(gb_prediction_tasks, {evaluator.metricName: "areaUnderROC"}))

# Evaluate feature importance
print('\nFeature Importance')
feature_importance = gb_model_tasks.featureImportances.toArray()

for i, column in enumerate(assembler.getInputCols()):
    print(f'Feature \'{column}\': {feature_importance[i]:.2f}')

Model Evaluation
Accuracy: 0.9869224955183636
Recall: 0.9865438367847023
F1 score: 0.9875807085190534
ROC curve: 0.9902007955386307

Feature Importance
Feature 'scheduling_class_encoded': 0.03
Feature 'priority': 0.26
Feature 'cpus': 0.00
Feature 'memory': 0.39


## Logistic Regression

In [5]:
lr_model_tasks = logistic_regression.fit(tasks_train_data)
lr_prediction_tasks = lr_model_tasks.transform(tasks_test_data)

In [21]:
lr_model_tasks.save('./task_models_TML/logistic_regression')

In [31]:
# Evaluate the model
print('Model Evaluation')
print('Accuracy:', evaluator.evaluate(lr_prediction_tasks, {evaluator.metricName: 'accuracy'}))
print('Recall:', evaluator.evaluate(lr_prediction_tasks, {evaluator.metricName: 'recallByLabel'}))
print('F1 score:', evaluator.evaluate(lr_prediction_tasks, {evaluator.metricName: 'f1'}))
print('ROC curve:', roc_evaluator.evaluate(lr_prediction_tasks, {evaluator.metricName: "areaUnderROC"}))

# Evaluate feature importance
print('\nFeature Importance')
coefficients = lr_model_tasks.coefficients.toArray()[:-2] # remove the additional 2 coefficients

# The coefficients are given as logs, so the exponential must be computed, and the values normalized
odds_ratios = np.exp(coefficients)
normalized_odds = odds_ratios / np.sum(odds_ratios)

for i, column in enumerate(assembler.getInputCols()):
    print(f'Feature \'{column}\': {normalized_odds[i]:.2f}')

Model Evaluation
Accuracy: 0.9393074622178677
Recall: 0.9375199222077988
F1 score: 0.9492245262607429
ROC curve: 0.9547833826979623

Feature Importance
Feature 'scheduling_class_encoded': 0.48
Feature 'priority': 0.00
Feature 'cpus': 0.00
Feature 'memory': 0.52
