## Step 1: Data Ingestion

In [2]:
import os
import sys
import copy
import time
import random
import pyspark

from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc, size, max, abs, split, col

import matplotlib.pyplot as plt
import numpy as np



In [3]:
def init_spark():
    """Build (or fetch, via ``getOrCreate``) the SparkSession for this notebook.

    Returns:
        The SparkSession registered under the application name
        "Predictive Maintenance".
    """
    builder = SparkSession.builder.appName("Predictive Maintenance")
    # NOTE(review): this key/value looks like a template placeholder; it is
    # kept unchanged so the session configuration stays exactly as before.
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()


# Session handle used by the data-loading cells.
spark = init_spark()

In [6]:
# Assemble the path from its components instead of a backslash literal:
# "\d" and "\m" are not valid string escapes (Python warns about them), and a
# hard-coded "\" separator only resolves on Windows.
filename = os.path.join("..", "data", "machines.csv")
# No schema or inferSchema is passed, so every column is read as a string.
machines = spark.read.csv(filename, sep=',', header=True)

# Preview the first rows of the machine metadata.
machines.show()

+---------+------+---+
|machineID| model|age|
+---------+------+---+
|        1|model3| 18|
|        2|model4|  7|
|        3|model3|  8|
|        4|model3|  7|
|        5|model3|  2|
|        6|model3|  7|
|        7|model3| 20|
|        8|model3| 16|
|        9|model4|  7|
|       10|model3| 10|
|       11|model2|  6|
|       12|model3|  9|
|       13|model1| 15|
|       14|model3|  1|
|       15|model3| 14|
|       16|model1|  3|
|       17|model1| 14|
|       18|model3| 15|
|       19|model3| 17|
|       20|model2| 16|
+---------+------+---+
only showing top 20 rows



In [9]:
# Assemble the path from its components instead of a backslash literal:
# "\d" and "\e" are not valid string escapes (Python warns about them), and a
# hard-coded "\" separator only resolves on Windows.
filename = os.path.join("..", "data", "errors.csv")
# No schema or inferSchema is passed, so every column (including datetime)
# is read as a string.
errors = spark.read.csv(filename, sep=',', header=True)

# Row count first, then a preview of the error log.
print(errors.count())
errors.show()

3919
+--------------------+---------+-------+
|            datetime|machineID|errorID|
+--------------------+---------+-------+
| 1/3/2015 7:00:00 AM|        1| error1|
| 1/3/2015 8:00:00 PM|        1| error3|
| 1/4/2015 6:00:00 AM|        1| error5|
|1/10/2015 3:00:00 PM|        1| error4|
|1/22/2015 10:00:0...|        1| error4|
|1/25/2015 3:00:00 PM|        1| error4|
|1/27/2015 4:00:00 AM|        1| error1|
|3/3/2015 10:00:00 PM|        1| error2|
| 3/5/2015 6:00:00 AM|        1| error1|
|3/20/2015 6:00:00 PM|        1| error1|
|3/26/2015 1:00:00 AM|        1| error2|
|3/31/2015 11:00:0...|        1| error1|
|4/19/2015 6:00:00 AM|        1| error2|
|4/19/2015 6:00:00 AM|        1| error3|
|4/29/2015 7:00:00 PM|        1| error4|
|5/4/2015 11:00:00 PM|        1| error2|
|5/12/2015 9:00:00 AM|        1| error1|
|5/21/2015 7:00:00 AM|        1| error4|
|5/24/2015 2:00:00 AM|        1| error3|
|5/25/2015 5:00:00 AM|        1| error1|
+--------------------+---------+-------+
only showin

In [16]:
# Assemble the path from its components instead of a backslash literal: the
# original had to escape "\\t" by hand to avoid a tab character while still
# relying on the invalid "\d" escape, and a "\" separator only resolves on
# Windows.
filename = os.path.join("..", "data", "telemetry.csv")
# No schema or inferSchema is passed, so every column (including the sensor
# readings) is read as a string.
telemetry = spark.read.csv(filename, sep=',', header=True)

# Row count first, then a preview of the telemetry readings.
print(telemetry.count())
telemetry.show()

876100
+--------------------+---------+----------------+----------------+----------------+----------------+
|            datetime|machineID|            volt|          rotate|        pressure|       vibration|
+--------------------+---------+----------------+----------------+----------------+----------------+
| 1/1/2015 6:00:00 AM|        1|176.217853015625|418.504078221616|113.077935462083|45.0876857639276|
| 1/1/2015 7:00:00 AM|        1| 162.87922289706|402.747489565395|95.4605253823187|43.4139726834815|
| 1/1/2015 8:00:00 AM|        1|170.989902405567|527.349825452291|75.2379048586662|34.1788471214451|
| 1/1/2015 9:00:00 AM|        1|162.462833264092|346.149335043074|109.248561276504|41.1221440884256|
|1/1/2015 10:00:00 AM|        1| 157.61002119306|435.376873016938|111.886648210168|25.9905109982024|
|1/1/2015 11:00:00 AM|        1|172.504839196295|430.323362106675|95.9270416939636|35.6550173268837|
|1/1/2015 12:00:00 PM|        1|156.556030606329|499.071623068962|111.755684290096|4