1. Data Analysis of Telemetry Data
2. Feature Engineering

In [0]:
#Importing all the libraries
import pyspark.sql.functions as F
from pyspark.sql.functions import col, lit
from pyspark.sql.functions import col, unix_timestamp, round
from pyspark.sql.window import Window

In [0]:
# Read the telemetry CSV, treating the first row as a header and letting
# Spark infer each column's type.
telemetry = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true')
    .load('/FileStore/tables/telemetry.csv')
)

display(telemetry)

datetime,machineID,volt,rotate,pressure,vibration
2015-01-01 06:00:00,1,151.919998705647,530.813577555042,101.788175260076,49.6040134898504
2015-01-01 07:00:00,1,174.522001096471,535.523532319384,113.256009499254,41.5159054753218
2015-01-01 08:00:00,1,146.912821646066,456.080746005808,107.786964633461,42.0996936545816
2015-01-01 09:00:00,1,179.530560852404,503.469990485512,108.283817221771,37.8477274946112
2015-01-01 10:00:00,1,180.544276621327,371.600611295334,107.55330679883,41.4678800376109
2015-01-01 11:00:00,1,141.41175703074,530.857266087542,87.6140012779218,44.9858461978707
2015-01-01 12:00:00,1,184.083821743344,450.2275288129,87.6973797069792,30.8312627133489
2015-01-01 13:00:00,1,166.632618417563,486.466837788584,108.067733800301,50.3800539242367
2015-01-01 14:00:00,1,159.892748369181,488.968697483274,102.131884360457,43.661296546187
2015-01-01 15:00:00,1,176.686811672085,508.202759433056,90.9511892146129,43.039695633682


In [0]:
# Preview how the data would look with missing values filled in.
# NOTE: neither result is assigned back, so `telemetry` itself is unchanged.

# A string fill value only touches string columns; other columns are ignored.
telemetry.fillna('unknown').show()

# A numeric fill value only touches numeric columns.
telemetry.fillna(0).show()

In [0]:
# Replace the 'datetime' column with its timestamp-typed equivalent
datetime_ts = F.col('datetime').cast('timestamp')
telemetry = telemetry.withColumn('datetime', datetime_ts)
telemetry.show()

In [0]:
# Total number of rows, then per-column summary statistics
n_rows = telemetry.count()
print(n_rows)
display(telemetry.summary())

summary,machineID,volt,rotate,pressure,vibration
count,8761000.0,8761000.0,8761000.0,8761000.0,8761000.0
mean,500.5,170.7600315391495,446.6013637342127,100.83714717366897,40.34960671614738
stddev,288.67500673220314,15.501526728387756,52.61506763136102,10.9967139684778,5.3370521845496
min,1.0,87.7808478966142,102.17298474357,48.6222988089569,14.3571880559077
25%,250.0,160.295086523296,412.33741383005,93.5192785328517,36.7635647362332
50%,500.0,170.581760114328,447.586299475947,100.418131169443,40.211542645054
75%,750.0,181.004697201631,482.109392303265,107.528183967248,43.7446129066388
max,1000.0,272.681768924129,700.199821201254,195.271111141582,79.107487038337


In [0]:
# Show each sensor reading alongside its timestamp, one table per sensor
for sensor in ("rotate", "pressure", "vibration", "volt"):
    telemetry.select("datetime", sensor).display()

datetime,rotate
2015-01-01T06:00:00.000+0000,530.813577555042
2015-01-01T07:00:00.000+0000,535.523532319384
2015-01-01T08:00:00.000+0000,456.080746005808
2015-01-01T09:00:00.000+0000,503.469990485512
2015-01-01T10:00:00.000+0000,371.600611295334
2015-01-01T11:00:00.000+0000,530.857266087542
2015-01-01T12:00:00.000+0000,450.2275288129
2015-01-01T13:00:00.000+0000,486.466837788584
2015-01-01T14:00:00.000+0000,488.968697483274
2015-01-01T15:00:00.000+0000,508.202759433056


datetime,pressure
2015-01-01T06:00:00.000+0000,101.788175260076
2015-01-01T07:00:00.000+0000,113.256009499254
2015-01-01T08:00:00.000+0000,107.786964633461
2015-01-01T09:00:00.000+0000,108.283817221771
2015-01-01T10:00:00.000+0000,107.55330679883
2015-01-01T11:00:00.000+0000,87.6140012779218
2015-01-01T12:00:00.000+0000,87.6973797069792
2015-01-01T13:00:00.000+0000,108.067733800301
2015-01-01T14:00:00.000+0000,102.131884360457
2015-01-01T15:00:00.000+0000,90.9511892146129


datetime,vibration
2015-01-01T06:00:00.000+0000,49.6040134898504
2015-01-01T07:00:00.000+0000,41.5159054753218
2015-01-01T08:00:00.000+0000,42.0996936545816
2015-01-01T09:00:00.000+0000,37.8477274946112
2015-01-01T10:00:00.000+0000,41.4678800376109
2015-01-01T11:00:00.000+0000,44.9858461978707
2015-01-01T12:00:00.000+0000,30.8312627133489
2015-01-01T13:00:00.000+0000,50.3800539242367
2015-01-01T14:00:00.000+0000,43.661296546187
2015-01-01T15:00:00.000+0000,43.039695633682


datetime,volt
2015-01-01T06:00:00.000+0000,151.919998705647
2015-01-01T07:00:00.000+0000,174.522001096471
2015-01-01T08:00:00.000+0000,146.912821646066
2015-01-01T09:00:00.000+0000,179.530560852404
2015-01-01T10:00:00.000+0000,180.544276621327
2015-01-01T11:00:00.000+0000,141.41175703074
2015-01-01T12:00:00.000+0000,184.083821743344
2015-01-01T13:00:00.000+0000,166.632618417563
2015-01-01T14:00:00.000+0000,159.892748369181
2015-01-01T15:00:00.000+0000,176.686811672085


Feature Engineering for Telemetry Data

Because the dataset is timestamped, computing lag features is helpful.

So what is a lag feature?

A lag feature is a fancy name for a variable that contains data from prior time steps. If we have time-series data, we can convert it into rows: every row contains data about one observation and includes the previous occurrences of that observation.

A common way to build lag features is to pick a window size, for example a number of hours.

Once the window size is chosen, compute rolling aggregate measures over that window, because the sensor values change over time.

Here, we are taking the lag window size to be 12 hours and 24 hours.

In [0]:
# Working frame that will accumulate the rolling-aggregate columns
tel_agg = telemetry

# Sensor columns for which rolling features are computed
int_feat = ['volt', 'rotate', 'pressure', 'vibration']

# Bucket width in seconds: 12 hours * 3600 seconds per hour
time_val = 12 * 3600

# Column expression (not yet attached to any frame) that snaps "datetime" to the
# nearest multiple of `time_val` seconds: convert to epoch seconds, divide by the
# bucket width, round to a whole number of buckets, scale back, cast to timestamp.
# Note this rounds to the nearest bucket boundary; it does not truncate.
epoch_seconds = F.unix_timestamp(F.col("datetime"))
bucket_index = F.round(epoch_seconds / time_val)
df_time = (bucket_index * time_val).cast("timestamp")

In [0]:
# Rolling window lengths, expressed as a number of rows (the current row plus
# the lag_n - 1 rows before it for the same machine, ordered by datetime).
# NOTE(review): this equals 12 h / 24 h only if each machine has exactly one
# row per hour with no gaps — confirm against the data.
lags = [12, 24]

# (column-name infix, aggregate function) pairs applied to every sensor
rolling_stats = [('_rollingmean_', F.avg), ('_rollingstd_', F.stddev)]

for lag_n in lags:
    wSpec = (
        Window.partitionBy('machineID')
        .orderBy('datetime')
        .rowsBetween(1 - lag_n, Window.currentRow)
    )
    for col_name in int_feat:
        for infix, agg_fn in rolling_stats:
            # e.g. volt_rollingmean_12, volt_rollingstd_12, ...
            tel_agg = tel_agg.withColumn(
                col_name + infix + str(lag_n),
                agg_fn(col(col_name)).over(wSpec),
            )

display(tel_agg)

datetime,machineID,volt,rotate,pressure,vibration,volt_rollingmean_12,volt_rollingstd_12,rotate_rollingmean_12,rotate_rollingstd_12,pressure_rollingmean_12,pressure_rollingstd_12,vibration_rollingmean_12,vibration_rollingstd_12,volt_rollingmean_24,volt_rollingstd_24,rotate_rollingmean_24,rotate_rollingstd_24,pressure_rollingmean_24,pressure_rollingstd_24,vibration_rollingmean_24,vibration_rollingstd_24
2015-01-01T06:00:00.000+0000,31,170.703867612729,488.773760538982,88.6211494027848,35.8242209642812,170.703867612729,,488.773760538982,,88.6211494027848,,35.8242209642812,,170.703867612729,,488.773760538982,,88.6211494027848,,35.8242209642812,
2015-01-01T07:00:00.000+0000,31,187.415324587796,433.878270722005,100.696381768879,45.4953038672765,179.0595961002625,11.8167845505771,461.3260156304936,38.81697310614149,94.6587655858319,8.538478690468487,40.65976241577885,6.838488302125255,179.0595961002625,11.8167845505771,461.3260156304936,38.81697310614149,94.6587655858319,8.538478690468487,40.65976241577885,6.838488302125255
2015-01-01T08:00:00.000+0000,31,178.312598498125,440.713005198229,97.1335365068298,42.907496994735,178.81059689955,8.366851281291261,454.455012153072,29.91673055201172,95.48368922616451,6.204377987432032,41.4090072754309,5.006652045858293,178.81059689955,8.366851281291261,454.455012153072,29.91673055201172,95.48368922616451,6.204377987432032,41.4090072754309,5.006652045858293
2015-01-01T09:00:00.000+0000,31,167.657212505414,524.303787621787,96.8129449390367,41.3195188107834,176.022250801016,8.818671258736728,471.9172060202508,42.61908848431872,95.81600315438256,5.109266189467745,41.38663515926903,4.088159143819004,176.022250801016,8.818671258736728,471.9172060202508,42.61908848431872,95.81600315438256,5.109266189467745,41.38663515926903,4.088159143819004
2015-01-01T10:00:00.000+0000,31,168.777360988058,441.749794958639,93.649201153096,53.6495014222728,174.57327283842443,8.296047719102903,465.8837238079285,39.29764071108714,95.38264275412526,4.529619954111447,43.83920841186978,6.527661272290131,174.57327283842443,8.296047719102903,465.8837238079285,39.29764071108714,95.38264275412526,4.529619954111447,43.83920841186978,6.527661272290131
2015-01-01T11:00:00.000+0000,31,151.484765840703,518.011103057586,105.34310182543,42.4901291419228,170.72518833880417,11.996084865630577,474.571620349538,41.089182643031805,97.0427192660094,5.740129762888973,43.61436186687862,5.864437326919942,170.72518833880417,11.996084865630577,474.571620349538,41.089182643031805,97.0427192660094,5.740129762888973,43.61436186687862,5.864437326919942
2015-01-01T12:00:00.000+0000,31,208.391650514777,451.54372039343,115.482625521725,33.4652959437061,176.1061115068003,17.961126052264028,471.2819203558083,38.50570075847954,99.67699158825448,8.719708104053456,42.1644953064254,6.585930377427299,176.1061115068003,17.961126052264028,471.2819203558083,38.50570075847954,99.67699158825448,8.719708104053456,42.1644953064254,6.585930377427299
2015-01-01T13:00:00.000+0000,31,174.749970943667,421.163836039051,92.4822050253058,44.1575805960393,175.93659393640863,16.63568248402687,465.0171598162137,39.81022575750271,98.77764326788588,8.46416134563812,42.413630967627135,6.137969812895721,175.93659393640863,16.63568248402687,465.0171598162137,39.81022575750271,98.77764326788588,8.46416134563812,42.413630967627135,6.137969812895721
2015-01-01T14:00:00.000+0000,31,162.594155897933,468.973529373218,123.70432852743,37.1732386743588,174.45410082102245,16.18433695831236,465.4567564336586,37.262400918084126,101.5472749633908,11.477129930129674,41.83136515726399,6.001386549063261,174.45410082102245,16.18433695831236,465.4567564336586,37.262400918084126,101.5472749633908,11.477129930129674,41.83136515726399,6.001386549063261
2015-01-01T15:00:00.000+0000,31,166.595375177386,568.518146473385,99.7085796902869,42.2230441361141,173.66822825665878,15.459789110640536,475.7628954376312,47.92050973883597,101.3634054360804,10.836352460026124,41.870533055149,5.6595170118860185,173.66822825665878,15.459789110640536,475.7628954376312,47.92050973883597,101.3634054360804,10.836352460026124,41.870533055149,5.6595170118860185


In [0]:
# Collapse the per-row rolling features into one row per (machineID, df_time)
# bucket. Raw sensor columns are dropped and nulls (e.g. the undefined rolling
# std of a single-row window) are replaced with 0 before aggregating.
sensor_names = ('volt', 'rotate', 'pressure', 'vibration')
window_sizes = (12, 24)

# Rolling-mean columns are averaged within each bucket.
mean_exprs = [
    F.mean(f'{sensor}_rollingmean_{size}').alias(f'{sensor}_rollingmean_{size}')
    for size in window_sizes
    for sensor in sensor_names
]

# Rolling-std columns are summarised with the standard deviation of their
# values within each bucket.
# NOTE(review): this is the std-dev *of* the rolling std-devs, not their mean —
# confirm that is the intended feature.
std_exprs = [
    F.stddev(f'{sensor}_rollingstd_{size}').alias(f'{sensor}_rollingstd_{size}')
    for size in window_sizes
    for sensor in sensor_names
]

tel_bucketed = (
    tel_agg.withColumn("df_time", df_time)
    .drop(*sensor_names)
    .fillna(0)
)
tel_feat = tel_bucketed.groupBy("machineID", "df_time").agg(*mean_exprs, *std_exprs)

display(tel_feat)

machineID,df_time,volt_rollingmean_12,rotate_rollingmean_12,pressure_rollingmean_12,vibration_rollingmean_12,volt_rollingmean_24,rotate_rollingmean_24,pressure_rollingmean_24,vibration_rollingmean_24,volt_rollingstd_12,rotate_rollingstd_12,pressure_rollingstd_12,vibration_rollingstd_12,volt_rollingstd_24,rotate_rollingstd_24,pressure_rollingstd_24,vibration_rollingstd_24
53,2015-02-02T00:00:00.000+0000,165.67396206489252,446.5689552792148,98.1978159816598,38.19262407771616,168.64746730772072,452.84584534103647,97.76488318760364,39.37645164962586,1.8371458954801052,5.0952302099210405,0.9588721228085948,0.7232581261914476,0.6849693816036906,1.2593770687530697,0.5268910581762881,0.3788814485341503
108,2015-01-26T00:00:00.000+0000,168.5877303165531,435.29889606326566,98.65678506907813,39.837964896930494,169.46780641156678,432.916343055228,98.56659738817764,39.84184618416017,0.765180830740079,5.032403859478181,1.3029115354283374,0.3464720470810551,0.9523239926503388,3.3693522331984505,0.7888996365732895,0.0942204445349933
148,2015-01-31T12:00:00.000+0000,168.6175705117557,447.49055637120665,100.14067210035633,40.34170510449623,172.13892837007987,455.4895684018909,99.77624711207552,38.71070116108127,0.8940105437295798,6.114088829085324,2.1899788500139428,0.3770719001747341,0.7783447795332863,3.4568736596933705,0.5283087380924376,0.0797732249982073
155,2015-01-17T12:00:00.000+0000,167.80285014300026,413.6781780142583,140.9071106797972,41.92992159150683,169.5883975315993,428.5023840648253,146.67579231838974,41.14016706905558,1.5173876161372333,6.462436770011659,2.181511933893924,0.39443393364282,0.7153569526124902,5.70737056629651,1.4457600098565846,0.1426157926959174
243,2015-01-13T00:00:00.000+0000,169.5876951645235,383.9576083491995,98.57636833480592,41.24439049978698,170.76716103441854,381.8095105757628,99.16095494320592,40.63478621725898,2.06123266351348,3.2434150749676127,0.8547078996804338,0.4931406619603448,1.8271433810771625,2.2323010222404047,0.3438340142592576,0.4065052150026872
251,2015-01-09T00:00:00.000+0000,171.38574887437403,437.9876506515856,97.54336256074592,39.897985579367386,169.59855033264756,446.9508112150005,99.13814978737402,39.41755281535797,1.6947043155132306,6.4824094620367605,0.796096928602542,0.548434405626187,0.5073413560630265,2.497655073799552,0.7561631509770353,0.3683618665333102
255,2015-01-02T12:00:00.000+0000,173.1303170582797,455.0155444800764,99.80231202072368,42.43199713172303,173.29901331528825,457.2619873743566,99.500465070489,41.3161840610966,1.6069855887120177,5.563114486990273,0.7900816560977746,0.6492602967609106,0.4313632471019502,1.334719626828482,0.5317006760001156,0.3195763845421623
471,2015-01-03T00:00:00.000+0000,168.63464316628654,448.388455483082,101.310575681363,41.78480469794555,170.79920363853856,458.53777768745937,100.86218595592376,41.30405602775813,0.657294769909752,5.199826371028019,1.936369568585852,0.722728897894122,0.4456366700770764,1.8178772181839185,0.6355238995237151,0.3951222404263737
481,2015-01-10T00:00:00.000+0000,174.6057642697284,457.1862370197318,101.76899993172356,38.88834610520931,172.61103837379974,450.1699018743832,101.72330020655976,39.31066270634247,3.103173642324069,2.0681499049182426,0.7974434001738427,0.4090718720554174,0.6132670806184809,1.4626253086881102,0.5262563775115318,0.3948135893120589
540,2015-01-13T12:00:00.000+0000,171.68636726140232,463.5089561787179,99.72133089761684,40.78386567090468,170.3866949317497,448.33902339511405,100.80328189730108,40.04656193158751,0.9770530658030532,7.520047769785819,1.3189776758561156,0.7821578804986661,0.608524461979163,8.517758417134704,0.6706873061281404,0.5935222599037933
