<div style="border-top: 3px solid #0a6d91; padding: 15px; display: flex; align-items: center; justify-content: space-between;">

  <!-- Left text -->
  <div style="flex: 1; padding-right: 20px;">
    <h2 style="display: inline-block; padding: 5px 10px; border-radius: 3px;">
      MAPD (mod.B) Final Project
    </h2>
    <h3>Anomaly detection and <br>Predictive maintenance for
industrial devices</h3>
  </div>

  <!-- Right images -->
  <div style="flex: 0 0 auto; display: flex; align-items: center; gap: 20px;">
    <img src="https://th.bing.com/th/id/R.f158dd00f7e0e326ff081cf1acb39901?rik=tfJW%2frH3keCJ%2fg&riu=http%3a%2f%2fboostlab.dfa.unipd.it%2fimg%2flogo_pod.png&ehk=Th6GDiUuQTgD%2faBhIK7JUi15%2bG%2f35LzMJV9PFEPd9rg%3d&risl=&pid=ImgRaw&r=0" alt="PoD" width="250"/>
    <img src="https://www.unidformazione.com/wp-content/uploads/2018/04/unipd-universita-di-padova-1024x463.png" alt="UNIPD" width = "350" />
  </div>

</div>
<div style="border-bottom: 3px solid #0a6d91">
    <p><strong>Authors</strong></p>
    <ul>
      <li>Boscolo Marco (2157559)</li>
      <li>La Rovere Francesco (2164968)</li>
      <li>Montagner Nicolò (2165809)</li>
      <li>Sabatini Raffaele (2165739)</li>
    </ul>
</div>

# Cluster parameters optimization

We wish to study how our code behaves under parallelization. To do so, we apply all the transformations to the dataset and time each stage of the processing, so that different configurations can be compared, mainly the number of cores and the number of partitions used in the process. Since processing the whole dataset is quite a heavy task, we decided to restrict the analysis to a single hardware unit: SW-088 was the hardware of choice. All the functions used in the following code are found in the file **Functions.py**.

In [27]:
%load_ext autoreload
%autoreload 2

# Standard libraries
import os
import numpy as np
import pandas as pd
from math import ceil
import time
import matplotlib.pyplot as plt
from IPython.display import clear_output
import gc
from pyspark import StorageLevel
import seaborn as sns




# Import all functions (improves readability)
from Functions import *

# PySpark core
from pyspark import SparkFiles
from pyspark.sql import SparkSession, DataFrame, Window
from pyspark.sql.types import IntegerType

# PySpark functions
from pyspark.sql.functions import (
    coalesce,
    col, lit, expr, when, count, sum as spark_sum, abs as spark_abs,
    round as spark_round, min as spark_min, max as spark_max, avg as spark_avg,
    first, last, lag, row_number, desc, asc,
    explode, sequence, from_unixtime, to_date, unix_timestamp,
    window, min_by, mode, concat, monotonically_increasing_id
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# ---------------------------------------------------------------------------
# Benchmark configuration: names and parameter grids read by the sweep loop
# that follows in this cell.
# ---------------------------------------------------------------------------

# Metric names handled as sensor columns. The loop below passes this list to
# BuildBlocks, FillNull (together with `engines`) and UsefulSensors.
sensors = ['P1', 'P10', 'P15', 'P16', 'P17', 'P18', 'P2', 'P5', 'P6', 'P7', 'P8', 'P9', 'S1', 'S10', 'S100', 'S101', 'S102', 'S106', 
           'S107', 'S108', 'S109', 'S11', 'S110', 'S112', 'S113', 'S114', 'S115', 'S117', 'S118', 'S122', 'S123', 'S124', 'S125', 'S126', 
           'S127', 'S128', 'S129', 'S130', 'S137', 'S138', 'S140', 'S143', 'S147', 'S15', 'S151', 'S154', 'S157', 'S158', 'S159', 'S16', 
           'S163', 'S164', 'S165', 'S166', 'S167', 'S169', 'S17', 'S170', 'S171', 'S172', 'S173', 'S174', 'S175', 'S176', 'S178', 'S179', 
           'S180', 'S181', 'S183', 'S19', 'S2', 'S201', 'S202', 'S203', 'S204', 'S205', 'S206', 'S25', 'S3', 'S33', 'S34', 'S35', 'S37', 
           'S39', 'S40', 'S41', 'S42', 'S43', 'S45', 'S46', 'S47', 'S49', 'S5', 'S50', 'S53', 'S54', 'S55', 'S56', 'S57', 'S6', 'S63', 
           'S64', 'S69', 'S7', 'S70', 'S71', 'S72', 'S73', 'S8', 'S80', 'S81', 'S83', 'S86', 'S9', 'S90', 'S94', 'S97', 'SA1', 'SA10', 
           'SA11', 'SA12', 'SA2', 'SA3', 'SA4', 'SA5', 'SA6', 'SA7', 'SA8', 'SA9', 'SW']
# Alarm metric names. Not referenced by name in the loop below, which spells
# out "A5"/"A9" literally when calling extract_alarms.
alarms = ['A5', 'A9', 'ComError']
# Subset of `sensors` handed to detect_anomalies as its `sensors` argument and
# used as part of the join key between the block and anomaly dataframes.
engines = ["S117", "S118", "S169", "S170"]
# Hardware IDs iterated over in the correlation stage of the loop below.
list_hwid = ['SW-106', 'SW-065', 'SW-115', 'SW-088']



# Passed to CreateGrid as `interval` (presumably seconds -- TODO confirm in Functions.py).
frequency = 60
# Parameter grid: every (core, partition) pair is benchmarked once per repetition.
nCores = [2, 4, 6, 8, 12, 14, 16]
nPartitions = [2, 8, 16, 32, 64, 256]
# Not used by the benchmark loop visible in this cell.
nExecutors = [2, 4, 6, 8] 
# Third positional argument of CreateSparkSession (presumably the executor
# memory setting -- TODO confirm in Functions.py).
MEMexec = "2200m" 

# Hardware unit the benchmark is restricted to. It is used both by the filter
# applied before the pivot and by the filter applied before the predictive stage.
TARGET_HWID = 'SW-088'


def _record_stage(results, key, label, elapsed):
    """Store the duration of one pipeline stage and report it.

    Parameters
    ----------
    results : dict
        Per-configuration timing dictionary; updated in place.
    key : str
        Name under which the duration is stored (becomes a column of the
        pickled results table).
    label : str
        Human-readable stage name used in the printed message.
    elapsed : float
        Stage duration in seconds.
    """
    results[key] = elapsed
    print(f'{label} time: ', np.round(elapsed, 2), ' seconds')


# The whole sweep is repeated once per entry of `loops`; each repetition writes
# its own pickle so the runs can be compared/averaged afterwards.
loops = [1, 2, 3]
for loop in loops:
    # (core, partition) -> {stage name -> seconds} for the current repetition.
    OptimizationResults = {}
    for core in nCores:
        for partition in nPartitions:

            clear_output(wait=True)

            TimeResults = {}

            # A fresh Spark session is created for every configuration.
            print(f'Creating Spark session for {(core, partition)}')
            spark = CreateSparkSession(core, partition, MEMexec, log = False)

            # try/finally guarantees the session is stopped even when a stage
            # raises, so a failed configuration cannot leak its session into
            # the next one.
            try:
                #----------------------CREATING DATAFRAME --------------------------

                print('Reading the CSV...')
                # perf_counter() is a monotonic, high-resolution clock, so the
                # measured intervals are not affected by system clock changes.
                startTime = time.perf_counter()
                df = (spark.read
                      .option("header", True)
                      .option("inferSchema", True)
                      .csv("file:///mnt/shared/dataset.csv")
                      .repartition(partition, col("hwid")))

                # Convert milliseconds into seconds
                df = df.withColumn("when", spark_round(col("when") / 1000).cast(IntegerType()))
                # count() is an action: it forces the lazy read to execute so
                # that it is included in the measured interval.
                df.count()
                _record_stage(TimeResults, 'LoadCSV', 'Load CSV', time.perf_counter() - startTime)

                #----------------------PREPROCESSING PIPELINE--------------------------

                print('Pivot dataset...')

                startTime = time.perf_counter()
                df = df.filter(col('hwid') == TARGET_HWID)
                # One row per (hwid, when), one column per metric.
                df_all_hw = (df.groupBy("hwid", "when")
                             .pivot("metric")
                             .agg(first("value"))
                             .withColumn("time", from_unixtime(col("when")))
                             .orderBy("hwid", "when"))

                print('Persist the dataframe...')
                df_all_hw = df_all_hw.persist()
                df_all_hw.count()
                _record_stage(TimeResults, 'Pivot', 'Pivot', time.perf_counter() - startTime)

                # Fill sensor gaps and build blocks of independent measurement
                print('Starting preprocessing...')

                startTime = time.perf_counter()
                # Create grid, homogeneous data
                df_grid = CreateGrid(df_all_hw, interval=frequency)

                # Build independent blocks
                df_blocks = BuildBlocks(df_grid, max_interval = 1800, sensors = sensors )

                # Fill the NULL values
                df_blocks = FillNull(df_blocks, sensors + engines, max_gap=240)

                df_blocks = df_blocks.persist()
                df_blocks.count()
                _record_stage(TimeResults, 'Preprocessing', 'Preprocessing', time.perf_counter() - startTime)

                # The pivoted frame is no longer needed once the blocks are cached.
                df_all_hw.unpersist()

                #----------------------ANOMALY DETECTION--------------------------

                # Not timed: runs before the anomaly-detection timer starts.
                useless_sensors, useful_sensors = UsefulSensors(df_blocks, sensors)

                print('Starting Anomaly detection...')
                startTime = time.perf_counter()

                df_anomalies = detect_anomalies(df = df_blocks, time_separator = 60*40, threshold = 8,
                                                sensors = engines, partition = partition).persist()
                df_anomalies.count()
                _record_stage(TimeResults, 'AnomalyDetection', 'Anomaly detection', time.perf_counter() - startTime)

                #------------------------CORRELATIONS--------------------------
                print('Starting computing Correlations...')

                startTime = time.perf_counter()

                joined_df = df_blocks.join(df_anomalies, on =["hwid", 'when', *engines, 'BlockID'], how='left').persist()
                joined_df.count()

                # NOTE(review): the data was filtered to TARGET_HWID before the
                # pivot, so the other hardware IDs in list_hwid select no rows
                # here. Kept unchanged so the measured timings stay comparable
                # with earlier runs -- confirm whether this is intended.
                for i in list_hwid:
                    filter_hw = joined_df.filter(col('hwid') == i)
                    anomaly_corr = correlations(filter_hw, useful_sensors, 'flag_anomaly')

                _record_stage(TimeResults, 'Correlations', 'Correlation', time.perf_counter() - startTime)

                #------------------------PREDICTIVE MAINTEINANCE--------------------------

                print('Starting Predictive Mainteinance...')

                # First timed part: alarm extraction.
                startTime1 = time.perf_counter()
                df_alarms = extract_alarms(df_blocks.select("when","A5","A9"), columns=["A5", "A9"], bits=[6, 7, 8] ).persist()
                df_alarms.count()
                endTime1 = time.perf_counter()

                # Not used in this cell; kept in case later cells rely on it -- TODO confirm and remove.
                list_df_final = {}

                # Built between the two timed parts, exactly as before.
                df_final = joined_df.join(df_alarms.select('when', 'overheating') , on=['when'], how='left' )

                # Second timed part: alarm correlations and predictive dataframe.
                startTime2 = time.perf_counter()

                # Computing correlations
                alarm_corr = correlations(df_final, useful_sensors, 'overheating')

                df_final = df_final.filter(col("hwid") == TARGET_HWID)

                # Computing the predictive dataframe on overheating signals
                target = "overheating"
                df_final088 = add_predictive(df_final, target, window_before_heating=30, debug=False,
                                             join=True, partition = partition )
                df_final088.count()

                endTime2 = time.perf_counter()

                _record_stage(TimeResults, 'Predictive Maintenance', 'Predictive Maintenance',
                              (endTime1 - startTime1) + (endTime2 - startTime2))

                #-----------------------------------Saving the results

                OptimizationResults[(core, partition)] = TimeResults

                # Flatten to one row per configuration. Distinct names are used
                # here so the enclosing loop variables (`core`, `partition`)
                # are never rebound.
                data = [
                    {'Resources': resources, **stage_times}
                    for resources, stage_times in OptimizationResults.items()
                ]

                # Rewritten after every configuration so partial results
                # survive an interrupted sweep.
                df_alt = pd.DataFrame(data)
                df_alt.to_pickle(f'SC_4HW_{loop}.pkl')

                #-----------------------------------Clean up the memory

                # Best effort: a cleanup failure is reported but must not abort the sweep.
                try:
                    df_blocks.unpersist()
                    df_anomalies.unpersist()
                    df_alarms.unpersist()
                    joined_df.unpersist()

                    # Garbage collection
                    gc.collect()

                    # Clear Spark cache
                    spark.catalog.clearCache()

                except Exception as cleanup_error:
                    print(f"Cleanup error: {cleanup_error}")

            finally:
                spark.stop()

            # Pause before the next configuration creates its session.
            time.sleep(5)