# Creation of the Spark Session and Context

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import pyspark.sql.functions as sf
from pyspark.sql import SparkSession
from pyspark import SparkFiles

# Use the same Python interpreter for the Spark workers and the driver.
os.environ.update({
    "PYSPARK_PYTHON": "/opt/miniconda3/bin/python",
    "PYSPARK_DRIVER_PYTHON": "/opt/miniconda3/bin/python",
})

# Build (or reuse) the session against the standalone master.
# Scheduler mode is FAIR and jobs are tagged with the `user_b` pool;
# dynamic allocation and the external shuffle service are both switched off.
# NOTE(review): no `spark.scheduler.allocation.file` is set here, so the
# `user_b` pool is not declared anywhere visible in this notebook — Spark then
# creates it with default settings (see the FairSchedulableBuilder warning
# logged when the first job runs). Confirm whether that is intended.
spark = (
    SparkSession.builder
    .appName("User_B_Session")
    .master("spark://10.67.22.135:7077")
    .config("spark.scheduler.mode", "FAIR")
    .config("spark.dynamicAllocation.enabled", "false")
    .config("spark.shuffle.service.enabled", "false")
    .config("spark.scheduler.pool", "user_b")
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/27 15:47:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Dataset loading and partitioning

In [3]:
Npartition = 16  # number of partitions passed to df.repartition(); tuning this value is one of the project goals

In [2]:
# Read the CSV (first line is the header) from the shared mount and
# redistribute it over `Npartition` partitions.
df = (
    spark.read
    .option("header", True)
    .csv("file:///mnt/shared/dataset.csv")
    .repartition(Npartition)
)

25/07/27 15:47:41 WARN FairSchedulableBuilder: A job was submitted with scheduler pool user_b, which has not been configured. This can happen when the file that pools are read from isn't set, or when that file doesn't contain user_b. Created user_b with default configuration (schedulingMode: FIFO, minShare: 0, weight: 1)
                                                                                

In [6]:
# Optional sanity checks, left disabled: row count per device and current partition count.
#df.groupBy("hwid").count().show()
#df.rdd.getNumPartitions()

In [7]:
# Inspect the schema as loaded: without an explicit schema every column comes in as a string.
df.printSchema()

root
 |-- when: string (nullable = true)
 |-- hwid: string (nullable = true)
 |-- metric: string (nullable = true)
 |-- value: string (nullable = true)



# Preprocessing

In [19]:
from pyspark.sql.functions import col, from_unixtime, first, desc, asc

In [16]:
# `when` is read from the CSV as a string; convert it to a long in place
# (re-running this cell is harmless, the cast is a no-op on a long column).
df = df.withColumn("when", df["when"].cast("long"))

In [12]:
# Distinct hardware ids present in the dataset.
# Note: despite its name, `hwid_list` is a Spark DataFrame with a single `hwid` column, not a Python list.
hwid_list = df.select("hwid").distinct()
hwid_list.show()



+------+
|  hwid|
+------+
|SW-088|
|SW-106|
|SW-065|
|SW-115|
+------+



                                                                                

In [33]:
# Device under analysis — edit this value to study a different hardware id.
hardware = "SW-106"
# Keep only the rows that belong to the chosen device.
df_hard = df.where(df["hwid"] == hardware)

In [34]:
# Check the schema of the filtered frame (`when` should now be a long).
df_hard.printSchema()

root
 |-- when: long (nullable = true)
 |-- hwid: string (nullable = true)
 |-- metric: string (nullable = true)
 |-- value: string (nullable = true)



In [35]:
# Reshape the selected device's readings from long to wide format: one row per
# `when` value and one column per distinct `metric`, each holding the first
# `value` found for that (when, metric) pair. A human-readable `timestamp`
# column is added and the result is ordered by `when`.
#
# The frame is rebuilt from `df` and `hardware` instead of from `df_hard`
# itself, so this cell is idempotent: pivoting `df_hard` in place fails when
# the cell is run a second time, because the `metric` and `value` columns no
# longer exist after the first pivot.
df_hard = (
    df.filter(col("hwid") == hardware)
    .groupBy("when")
    .pivot("metric")
    .agg(first("value"))
    # `when` is divided by 1000 before conversion, i.e. it is treated as epoch milliseconds.
    .withColumn("timestamp", from_unixtime(col("when") / 1000))
    .orderBy("when")
)

                                                                                

In [36]:
# Schema after the pivot: `when` plus one string column per metric.
df_hard.printSchema()

root
 |-- when: long (nullable = true)
 |-- A5: string (nullable = true)
 |-- A9: string (nullable = true)
 |-- ComError: string (nullable = true)
 |-- P1: string (nullable = true)
 |-- P10: string (nullable = true)
 |-- P15: string (nullable = true)
 |-- P16: string (nullable = true)
 |-- P17: string (nullable = true)
 |-- P18: string (nullable = true)
 |-- P2: string (nullable = true)
 |-- P5: string (nullable = true)
 |-- P6: string (nullable = true)
 |-- P7: string (nullable = true)
 |-- P8: string (nullable = true)
 |-- P9: string (nullable = true)
 |-- S1: string (nullable = true)
 |-- S10: string (nullable = true)
 |-- S100: string (nullable = true)
 |-- S101: string (nullable = true)
 |-- S102: string (nullable = true)
 |-- S106: string (nullable = true)
 |-- S107: string (nullable = true)
 |-- S108: string (nullable = true)
 |-- S109: string (nullable = true)
 |-- S11: string (nullable = true)
 |-- S110: string (nullable = true)
 |-- S112: string (nullable = true)
 |-- S113: s

In [37]:
# Columns shown in the previews below: the readable timestamp plus four of the pivoted metric columns.
selected_cols = ["timestamp", "S117", "S118", "S169", "S170"]

In [38]:
# Preview the five earliest rows of the selected columns.
(
    df_hard
    .select(*selected_cols)
    .orderBy("timestamp", ascending=True)
    .show(5)
)



+-------------------+----+----+----+----+
|          timestamp|S117|S118|S169|S170|
+-------------------+----+----+----+----+
|2020-10-01 00:00:22|   0|   1|NULL|NULL|
|2020-10-01 00:00:25|NULL|NULL|   0|   1|
|2020-10-01 00:00:52|   0|   1|NULL|NULL|
|2020-10-01 00:00:55|NULL|NULL|   0|   1|
|2020-10-01 00:01:22|   0|   1|NULL|NULL|
+-------------------+----+----+----+----+
only showing top 5 rows



                                                                                

In [39]:
# Preview the five latest rows, printed in chronological order:
# take the top five by descending timestamp, then sort those ascending.
(
    df_hard
    .select(*selected_cols)
    .orderBy("timestamp", ascending=False)
    .limit(5)
    .orderBy("timestamp", ascending=True)
    .show()
)



+-------------------+----+----+----+----+
|          timestamp|S117|S118|S169|S170|
+-------------------+----+----+----+----+
|2021-03-27 07:27:35|NULL|NULL|   0|   0|
|2021-03-27 07:28:02|   1|   0|NULL|NULL|
|2021-03-27 07:28:05|NULL|NULL|   0|   0|
|2021-03-27 07:28:33|   0|   0|NULL|NULL|
|2021-03-27 07:28:36|NULL|NULL|   0|   0|
+-------------------+----+----+----+----+



                                                                                

# Anomaly Detection

# Predictive Maintenance

# *** Remember to close the Spark session ***

In [None]:
# Stop the Spark session once the analysis is finished.
spark.stop()