In [1]:
%load_ext autoreload
%autoreload 2

# Standard libraries
import os
import numpy as np
import pandas as pd
from math import ceil
import time
import matplotlib.pyplot as plt
from IPython.display import clear_output


#Import all functions (improves readability)
from Functions import *

# PySpark core
from pyspark import SparkFiles
from pyspark.sql import SparkSession, DataFrame, Window
from pyspark.sql.types import IntegerType

# PySpark functions
from pyspark.sql.functions import (
    coalesce,
    col, lit, expr, when, count, sum as spark_sum, abs as spark_abs,
    round as spark_round, min as spark_min, max as spark_max, avg as spark_avg,
    first, last, lag, row_number, desc, asc,
    explode, sequence, from_unixtime, to_date, unix_timestamp,
    window, min_by, mode, concat, monotonically_increasing_id
)

In [2]:
# Benchmark sweep: for every (cores, partitions) pair, start a fresh Spark
# session, load the CSV, preprocess one hardware's data and record how long
# the load and the preprocessing steps took.

# Wider sweep grid, kept for reference (currently disabled):
#nCores = np.arange(1, 17)
#nPartitions = np.arange(2, 33, 2)
nCores = [10, 12]
nPartitions = [8, 10]
# Passed to FillGaps as `interval`.
# NOTE(review): presumably seconds, since `when` is converted to seconds below - confirm.
frequency = 60

#Focus only on 1 hardware (conventional)
# Loop-invariant, so it is defined once instead of on every iteration.
hardware = "SW-106"

# Maps (core, partitions) -> {'LoadCSV': seconds, 'Preprocessing': seconds}
OptimizationResults = {}


for core in nCores:
    for partitions in nPartitions:

        # Keep only the log of the configuration currently running
        clear_output(wait=True)

        TimeResults = {}

        #Create spark session
        print(f'Creating Spark session for {(core, partitions)}')
        spark = CreateSparkSession(core)

        # try/finally guarantees the session is stopped even if a step fails;
        # otherwise a failed run would leave a live session behind.
        try:
            print('Reading the CSV...')
            # perf_counter is monotonic, so elapsed times are not affected by
            # system clock adjustments (unlike time.time()).
            startTime = time.perf_counter()
            df = spark.read.option("header", True).option("inferSchema", True).csv("file:///mnt/shared/dataset.csv")
            endTime = time.perf_counter()

            TimeResults['LoadCSV'] = endTime - startTime
            print('Load CSV time: ', np.round(endTime - startTime, 3))

            #Define the number of partitions
            df = df.repartition(partitions)

            #Convert milliseconds into seconds
            df = df.withColumn("when", spark_round(col("when") / 1000).cast(IntegerType()))

            # One row per timestamp, one column per metric, for the selected hardware
            print('Filtering hardware...')
            df_hw = df  .filter(col("hwid") == hardware)\
                        .groupBy("when")\
                        .pivot("metric")\
                        .agg(first("value"))\
                        .withColumn("time", from_unixtime(col("when")))\
                        .orderBy("when")

            #Momentarily persist
            print('Persist the filtered dataframe...')
            df_hw = df_hw.persist()
            #Trigger the persist through an action
            df_hw.count()

            # Fill sensor gaps and build blocks of independent measurement
            # NOTE(review): Spark transformations are lazy, so this timer only
            # captures work that FillGaps/BuildBlocks trigger themselves; the
            # final count() below is outside the timed region - confirm this
            # is intended.
            print('Starting preprocessing...')
            startTime = time.perf_counter()
            df_grid = FillGaps(df_hw, interval=frequency, modality="auto")
            df_final = BuildBlocks(df_grid, max_interval = 240)
            endTime = time.perf_counter()

            TimeResults['Preprocessing'] = endTime - startTime

            print('Preprocessing time: ', np.round(endTime - startTime, 3))

            #Persist and trigger the persist operation
            print('Starting final persist...')
            df_final = df_final.persist()
            df_final.count()

            # Only completed runs are recorded
            OptimizationResults[(core, partitions)] = TimeResults
        finally:
            # Always stop the session before the next configuration starts
            spark.stop()

        


#------------- ANOMALY DETECTION NOW


Creating Spark session for (12, 10)
Read the CSV


                                                                                

Load CSV time:  18.0 4
Filtering hardware...


                                                                                

Persist the filtered dataframe


                                                                                

Starting preprocessing...
Preprocessing time:  28.0 4
Starting final persist


                                                                                

In [3]:
# Show the collected timings: keys are (core, partitions) tuples, values are
# dicts holding the 'LoadCSV' and 'Preprocessing' elapsed times in seconds.
OptimizationResults

{(10, 8): {'LoadCSV': 20.984825611114502, 'Preprocessing': 34.580507040023804},
 (10, 10): {'LoadCSV': 19.71999764442444, 'Preprocessing': 40.66104340553284},
 (12, 8): {'LoadCSV': 18.766189575195312, 'Preprocessing': 28.407060623168945},
 (12, 10): {'LoadCSV': 18.23853588104248, 'Preprocessing': 28.279947996139526}}