In [27]:
import copy
import datetime
import os
import random
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType

# for creating pipelines and model
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, VectorIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# sklearn
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [6]:
def init_spark(app_name="Predictive Maintenance",
               executor_memory="70g",
               driver_memory="50g",
               offheap_size="30g"):
    """Create (or reuse) the SparkSession used by this notebook.

    Calling ``init_spark()`` with no arguments applies the same application
    name and memory settings the notebook has always used; the parameters
    exist so the session can be sized for a smaller machine without editing
    the function body.

    Parameters
    ----------
    app_name : str
        Value passed to ``SparkSession.builder.appName``.
    executor_memory : str
        Value for ``spark.executor.memory`` (e.g. ``"70g"``).
    driver_memory : str
        Value for ``spark.driver.memory`` (e.g. ``"50g"``).
    offheap_size : str
        Value for ``spark.memory.offHeap.size``; off-heap memory is always
        enabled via ``spark.memory.offHeap.enabled``.

    Returns
    -------
    SparkSession
        The session returned by ``SparkSession.builder.getOrCreate()``.
    """
    settings = {
        "spark.executor.memory": executor_memory,
        "spark.driver.memory": driver_memory,
        "spark.memory.offHeap.enabled": "true",
        "spark.memory.offHeap.size": offheap_size,
    }

    builder = SparkSession.builder.appName(app_name)
    for key, value in settings.items():
        builder = builder.config(key, value)

    # NOTE(review): getOrCreate() hands back an already-running session if
    # one exists; restart the kernel when changing the memory settings.
    return builder.getOrCreate()


spark = init_spark()

# Data Split

In [29]:
# Load the machine metadata CSV (columns: machineID, model, age).
# Forward slashes avoid the invalid "\d" / "\m" escape sequences of a
# backslash literal and match the path style used for the parquet file.
filename = "../data/machines.csv"
machines = spark.read.csv(filename, sep=',', header=True)

print(machines.count())
machines.show()

100
+---------+------+---+
|machineID| model|age|
+---------+------+---+
|        1|model3| 18|
|        2|model4|  7|
|        3|model3|  8|
|        4|model3|  7|
|        5|model3|  2|
|        6|model3|  7|
|        7|model3| 20|
|        8|model3| 16|
|        9|model4|  7|
|       10|model3| 10|
|       11|model2|  6|
|       12|model3|  9|
|       13|model1| 15|
|       14|model3|  1|
|       15|model3| 14|
|       16|model1|  3|
|       17|model1| 14|
|       18|model3| 15|
|       19|model3| 17|
|       20|model2| 16|
+---------+------+---+
only showing top 20 rows



In [23]:
# Load the labeled feature table and make `age` numeric so it can be used
# as a model input alongside the other double-typed features.
feat_data = spark.read.parquet('../data/labeled_features.parquet')
feat_data = feat_data.withColumn("age", feat_data.age.cast(DoubleType()))

print(feat_data.count())
# highly imbalanced data
# show() prints the table itself and returns None, so it must not be wrapped in print().
feat_data.groupby('label_e').count().show()
# limit(10) already caps the preview at ten rows; display it as a pandas frame.
feat_data.limit(10).toPandas()

73142
+-------+-----+
|label_e|count|
+-------+-----+
|    0.0|67470|
|    1.0| 1337|
|    4.0| 1473|
|    3.0| 1000|
|    2.0| 1862|
+-------+-----+

None


Unnamed: 0,dt_truncated,volt_rollingmean_12,rotate_rollingmean_12,pressure_rollingmean_12,vibration_rollingmean_12,volt_rollingmean_24,rotate_rollingmean_24,pressure_rollingmean_24,vibration_rollingmean_24,volt_rollingmean_36,...,error5sum_rollingmean_24,comp1sum,comp2sum,comp3sum,comp4sum,model,age,model_encoded,failure,label_e
0,2016-01-01 07:00:00,172.083928,453.576897,101.30311,40.62741,169.230878,451.007306,100.487259,40.839262,167.339602,...,0.0,579.0,534.0,474.0,459.0,model4,3.0,"(0.0, 1.0, 0.0)",0.0,0.0
1,2015-12-31 19:00:00,168.173348,453.181951,99.527531,40.981132,165.787189,449.842118,100.598808,41.791947,166.190766,...,0.0,578.0,533.0,473.0,458.0,model4,3.0,"(0.0, 1.0, 0.0)",0.0,0.0
2,2015-12-31 07:00:00,163.40103,446.502286,101.670084,42.602762,165.199475,445.038344,101.074817,41.722713,168.995817,...,0.0,578.0,533.0,473.0,458.0,model4,3.0,"(0.0, 1.0, 0.0)",0.0,0.0
3,2015-12-30 19:00:00,166.997919,443.574402,100.47955,40.842664,171.793211,450.456864,100.955598,40.418503,172.419415,...,0.0,577.0,532.0,472.0,457.0,model4,3.0,"(0.0, 1.0, 0.0)",0.0,0.0
4,2015-12-30 07:00:00,176.588502,457.339327,101.431647,39.994342,175.130162,466.483595,99.468624,40.920312,174.914358,...,0.0,577.0,532.0,472.0,457.0,model4,3.0,"(0.0, 1.0, 0.0)",0.0,0.0
5,2015-12-29 19:00:00,173.671822,475.627863,97.505602,41.846282,174.077285,464.681249,99.852795,41.520096,173.561514,...,0.0,576.0,531.0,471.0,456.0,model4,3.0,"(0.0, 1.0, 0.0)",0.0,0.0
6,2015-12-29 07:00:00,174.482749,453.734636,102.199989,41.193909,173.50636,450.268077,101.890206,39.760938,173.318924,...,0.0,576.0,531.0,471.0,456.0,model4,3.0,"(0.0, 1.0, 0.0)",0.0,0.0
7,2015-12-28 19:00:00,172.529971,446.801518,101.580424,38.327966,172.737011,443.114257,99.159233,38.946824,171.416655,...,0.0,575.0,530.0,470.0,455.0,model4,3.0,"(0.0, 1.0, 0.0)",0.0,0.0
8,2015-12-28 07:00:00,172.944052,439.426996,96.738042,39.565681,170.859997,444.430674,99.034004,39.874349,172.38422,...,0.0,575.0,530.0,470.0,455.0,model4,3.0,"(0.0, 1.0, 0.0)",0.0,0.0
9,2015-12-27 19:00:00,168.775941,449.434352,101.329965,40.183016,172.104304,444.571586,98.413556,39.550401,170.807337,...,0.0,574.0,529.0,469.0,454.0,model4,3.0,"(0.0, 1.0, 0.0)",0.0,0.0


In [24]:
# Columns that are not model inputs: the target, the row keys, and the
# 'failure' / 'model_encoded' / 'model' columns.
label_var = ['label_e']
key_cols = ['machineID', 'dt_truncated']
remove_cols = label_var + key_cols + ['failure', 'model_encoded', 'model']

# Keep every remaining column, preserving the order of feat_data.columns.
_excluded = set(remove_cols)
input_features = []
for _name in feat_data.columns:
    if _name not in _excluded:
        input_features.append(_name)

# Display the final feature list
input_features

['volt_rollingmean_12',
 'rotate_rollingmean_12',
 'pressure_rollingmean_12',
 'vibration_rollingmean_12',
 'volt_rollingmean_24',
 'rotate_rollingmean_24',
 'pressure_rollingmean_24',
 'vibration_rollingmean_24',
 'volt_rollingmean_36',
 'vibration_rollingmean_36',
 'rotate_rollingmean_36',
 'pressure_rollingmean_36',
 'volt_rollingstd_12',
 'rotate_rollingstd_12',
 'pressure_rollingstd_12',
 'vibration_rollingstd_12',
 'volt_rollingstd_24',
 'rotate_rollingstd_24',
 'pressure_rollingstd_24',
 'vibration_rollingstd_24',
 'volt_rollingstd_36',
 'rotate_rollingstd_36',
 'pressure_rollingstd_36',
 'vibration_rollingstd_36',
 'error1sum_rollingmean_24',
 'error2sum_rollingmean_24',
 'error3sum_rollingmean_24',
 'error4sum_rollingmean_24',
 'error5sum_rollingmean_24',
 'comp1sum',
 'comp2sum',
 'comp3sum',
 'comp4sum',
 'age']

In [25]:
# Pack the input columns into a single 'features' vector and keep only the
# keys, the label and that vector.
# NOTE: this rebinds feat_data, so re-running the cell requires reloading
# feat_data first (the raw feature columns are dropped by the select).
va = VectorAssembler(inputCols=input_features, outputCol='features')
_assembled = va.transform(feat_data)
feat_data = _assembled.select('machineID', 'dt_truncated', 'label_e', 'features')

# maxCategories=10: vector slots with more than 10 distinct values are
# treated as continuous rather than categorical.
_vector_indexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=10,
)
featureIndexer = _vector_indexer.fit(feat_data)

# Fitted on the full dataset so that every label value gets an index.
_label_indexer = StringIndexer(inputCol="label_e", outputCol="indexedLabel")
labelIndexer = _label_indexer.fit(feat_data)

# Time-based split: rows before split_date train, rows on/after it test.
split_date = "2015-10-30"
_before_split = feat_data.dt_truncated < split_date
_from_split = feat_data.dt_truncated >= split_date
training = feat_data.filter(_before_split)
testing = feat_data.filter(_from_split)

# Row counts: training first, then testing.
for _subset in (training, testing):
    print(_subset.count())

60434
12708


In [None]:
# Down sample majority class, do we really need this?
# sampleBy returns a stratified sample without replacement based on the
# fraction given for each stratum. A stratum missing from `fractions` is
# sampled at 0, so every failure class (1.0-4.0) is listed explicitly to be
# kept in full while the majority class 0.0 is thinned.
# The label column in this dataset is 'label_e' (there is no 'label' column).
downsample_fractions = {0.0: 0.135, 1.0: 1.0, 2.0: 1.0, 3.0: 1.0, 4.0: 1.0}
train_downsampled = training.sampleBy('label_e', fractions=downsample_fractions, seed=123).cache()
train_downsampled.groupby('label_e').count().show()

testing.groupby('label_e').count().show()

In [None]:
# Cache results
# cache datasets in memory
train_downsampled.cache()
testing.cache()

# check the number of machines in training and testing data
# (the key column in this dataset is 'machineID'; there is no 'deviceid' column)
print(train_downsampled.select('machineID').distinct().count())
print(testing.select('machineID').distinct().count())

# Gradient-Boosted Trees (GBT)

In [26]:
# GBTClassifier currently only supports binary classification.
# NOTE(review): the label_e counts printed when the data was loaded show five
# classes (0.0-4.0) — confirm how the label should be binarized before
# fitting a GBT model on it.
# Inspect the column names and types of the training split.
training.dtypes

[('machineID', 'string'),
 ('dt_truncated', 'timestamp'),
 ('label_e', 'double'),
 ('features', 'vector')]

In [None]:
%%time

# Train a GBT model.
gbt = GBTClassifier(labelCol='label_e', featuresCol='features', maxDepth=10, minInstancesPerNode=5, maxIter=50)

# Chain indexers and GBT in a Pipeline
pipeline_gbt = Pipeline(stages=[labelIndexer, featureIndexer, gbt])

# Train model. This also runs the indexers.
model_gbt = pipeline_gbt.fit(training)

# save model
datestamp = unicode(datetime.datetime.now()).replace(' ','').replace(':','_');
gbt_fileName = '../checkpoints/GradientBoostedTree_' + datestamp;
gbtDirfilename = modelDir + gbt_fileName;
model_gbt.save(gbtDirfilename)

# Make predictions.
predictions_gbt = model_gbt.transform(testing)

# XGBoost (eXtreme Gradient Boosting)

In [2]:
!wget https://repo1.maven.org/maven2/com/nvidia/xgboost4j_3.0/1.0.0-0.1.0/xgboost4j_3.0-1.0.0-0.1.0.jar --no-check-certificate
!wget https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.0.0-0.1.0/xgboost4j-spark_3.0-1.0.0-0.1.0.jar --no-check-certificate


--2021-03-26 19:59:07--  https://repo1.maven.org/maven2/com/nvidia/xgboost4j_3.0/1.0.0-0.1.0/xgboost4j_3.0-1.0.0-0.1.0.jar
Resolving repo1.maven.org (repo1.maven.org)... 151.101.124.209
Connecting to repo1.maven.org (repo1.maven.org)|151.101.124.209|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 231556205 (221M) [application/java-archive]
Saving to: 'xgboost4j_3.0-1.0.0-0.1.0.jar'

     0K .......... .......... .......... .......... ..........  0% 2.61M 85s
    50K .......... .......... .......... .......... ..........  0% 3.14M 78s
   100K .......... .......... .......... .......... ..........  0% 2.78M 78s
   150K .......... .......... .......... .......... ..........  0% 3.09M 76s
   200K .......... .......... .......... .......... ..........  0% 3.00M 76s
   250K .......... .......... .......... .......... ..........  0% 2.86M 76s
   300K .......... .......... .......... .......... ..........  0% 2.98M 76

 15100K .......... .......... .......... .......... ..........  6% 3.09M 76s
 15150K .......... .......... .......... .......... ..........  6% 2.83M 76s
 15200K .......... .......... .......... .......... ..........  6% 3.42M 76s
 15250K .......... .......... .......... .......... ..........  6% 3.05M 76s
 15300K .......... .......... .......... .......... ..........  6% 3.12M 76s
 15350K .......... .......... .......... .......... ..........  6% 2.42M 76s
 15400K .......... .......... .......... .......... ..........  6% 2.98M 76s
 15450K .......... .......... .......... .......... ..........  6% 3.40M 76s
 15500K .......... .......... .......... .......... ..........  6% 2.90M 76s
 15550K .......... .......... .......... .......... ..........  6% 1.51M 76s
 15600K .......... .......... .......... .......... ..........  6% 26.2M 76s
 15650K .......... .......... .......... .......... ..........  6% 4.07M 76s
 15700K .......... .......... .......... .......... ..........  6% 3.17M 76s

149500K .......... .......... .......... .......... .......... 66% 3.22M 28s
149550K .......... .......... .......... .......... .......... 66% 2.71M 28s
149600K .......... .......... .......... .......... .......... 66% 3.62M 28s
149650K .......... .......... .......... .......... .......... 66% 2.73M 28s
149700K .......... .......... .......... .......... .......... 66% 3.58M 28s
149750K .......... .......... .......... .......... .......... 66% 2.36M 28s
149800K .......... .......... .......... .......... .......... 66% 3.09M 28s
149850K .......... .......... .......... .......... .......... 66% 3.08M 28s
149900K .......... .......... .......... .......... .......... 66% 3.20M 28s
149950K .......... .......... .......... .......... .......... 66% 3.13M 28s
150000K .......... .......... .......... .......... .......... 66% 2.80M 28s
150050K .......... .......... .......... .......... .......... 66% 3.46M 28s
150100K .......... .......... .......... .......... .......... 66% 3.15M 28s

--2021-03-26 20:00:32--  https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.0.0-0.1.0/xgboost4j-spark_3.0-1.0.0-0.1.0.jar
Resolving repo1.maven.org (repo1.maven.org)... 151.101.124.209
Connecting to repo1.maven.org (repo1.maven.org)|151.101.124.209|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 2040779 (1.9M) [application/java-archive]
Saving to: 'xgboost4j-spark_3.0-1.0.0-0.1.0.jar'

     0K .......... .......... .......... .......... ..........  2% 2.98M 1s
    50K .......... .......... .......... .......... ..........  5% 2.98M 1s
   100K .......... .......... .......... .......... ..........  7% 2.89M 1s
   150K .......... .......... .......... .......... .......... 10% 2.95M 1s
   200K .......... .......... .......... .......... .......... 12% 3.02M 1s
   250K .......... .......... .......... .......... .......... 15% 2.92M 1s
   300K .......... .......... .......... .......... .......... 17