This Jupyter notebook runs the complete anomaly detection pipeline for the compressor insights use case.

Of the imports below, main_util_script does all the data cleaning and transformation and returns a dataframe and a list. To run the util script, the pressure ranges of the racks must be supplied as input. The returned dataframe can be split for different uses: the columns needed for the compressor quadrants are selected and written to file, while ml_df refers to the data used as input for anomaly detection.

Training_script trains the model and saves the scaling parameters used, the trained model, and the prediction results to files unique to the rack that calls the script. The prediction script is run instead if a result.csv file already exists for the rack.

The threshold script records the dates that were flagged as anomalies, as well as the alarm parameter, which is either true or false. The alarm parameter is set to true when 14 consecutive days have been flagged as anomalies.

In [None]:
import os.path
import pandas as pd
import matplotlib.pyplot as plt
from pressure_range_script import pressure_range_dict
from main_util_script import aggregate_dataframe
from training_script import model_train
from threshold_script import threshold
from prediction import run_predict

In [None]:
# Location and name of the raw input CSV.
path = 'C:/Users/U378246/Documents/ECS_data_analysis_notebooks/Data/'
filename = 'wm67.csv'

# Delimiter used when composing the per-rack output file names.
seperator = '_'

# Let pandas display up to 2000 rows and 2000 columns before truncating.
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 2000)

In [None]:
# header=None makes pandas read the first line as data, so the column names
# are supplied explicitly here.
colnames = [
    'SiteID', 'SiteName', 'AssetID', 'AssetName', 'PointName',
    'DataValue', 'Timetag', 'Units', 'PropertyName',
]
df = pd.read_csv(path + filename, header=None, names=colnames)

# The first distinct SiteID in the file identifies the site; it is the key
# used to look up that site's pressure range.
site_ids = df['SiteID'].unique()
site_id = site_ids[0]
pressure_range = pressure_range_dict.get(site_id)

In [None]:
# Build the aggregator from the raw frame, the site id and the site's
# pressure range (passed as suc_pres_range).
data_agg = aggregate_dataframe(df,site_id, suc_pres_range=pressure_range)
# aggregate_data() returns two values: the frame the later cells filter and
# the rack names the per-rack loop iterates over.
data,rack_names = data_agg.aggregate_data()
# Preview the first rows of the aggregated frame.
data.head()

In [None]:
def appendDFToCSV_void(df, csvFilePath, sep=","):
    """Append ``df`` (index included) to the CSV file at ``csvFilePath``.

    The header row is written only when the target holds no data yet, i.e.
    when the file does not exist or exists with a size of zero bytes.
    Otherwise only the data rows are appended.

    Args:
        df: DataFrame to write.
        csvFilePath: Path of the CSV file to create or extend.
        sep: Field delimiter passed through to ``DataFrame.to_csv``.

    Returns:
        None.
    """
    # A zero-byte file needs the header just as much as a missing one;
    # without it the rows would be appended to a header-less CSV.
    needs_header = (
        not os.path.isfile(csvFilePath)
        or os.path.getsize(csvFilePath) == 0
    )
    df.to_csv(csvFilePath, mode='a', index=True, sep=sep, header=needs_header)

In [None]:
# Select the columns whose names match the quadrant-related patterns and tag
# every row with the site id. assign() returns a new frame, so the SiteID
# column is added without writing into a slice of `data` (which is what
# triggers pandas' SettingWithCopyWarning).
quadrants_df = data.filter(
    regex='Runtime|Switch|% Capacity|Neutral|Quadrant'
).assign(SiteID=site_id)

# Rows are appended to quadrants.csv in the current working directory.
quadrant_filename = 'quadrants.csv'
appendDFToCSV_void(quadrants_df, quadrant_filename)

In [None]:
# Input for anomaly detection: every column of `data` except those whose
# names match the quadrant-only patterns.
quadrant_only = data.filter(regex='% Capacity|Neutral|Quadrant').columns
kept_columns = data.columns.drop(list(quadrant_only))
ml_df = data[kept_columns]

In [None]:
def get_racks(df, rack_var):
    """Return the columns of ``df`` that belong to one rack.

    The rack is identified by the last character of ``rack_var``,
    upper-cased. A column is kept when its name ends with that character or
    contains that character followed by whitespace. Matching is
    case-sensitive.

    Args:
        df: DataFrame with string column names.
        rack_var: Rack name; only its last character is used.

    Returns:
        DataFrame holding the matching columns of ``df`` (all rows).
    """
    import re  # function-scope import keeps this notebook cell self-contained

    # Escape the character so that a regex metacharacter is matched literally
    # instead of raising re.error or changing the pattern's meaning.
    rack_char = re.escape(rack_var[-1:].upper())
    # Raw string: '\s' in a normal string literal is an invalid escape
    # sequence and produces a warning on current Python versions.
    pattern = r'{0}$|{0}\s[0-9]*$|{0}\s'.format(rack_char)
    return df.loc[:, df.columns.str.contains(pattern)]

In [None]:
# One slot per rack: racks[i] holds the rack's input columns, result[i] the
# rack's prediction results.
racks = [0]*len(rack_names)
result = [0]*len(rack_names)

for i, j in enumerate(rack_names):
    # j keeps the rack name as delivered; the list entry is rewritten with
    # underscores so it can be used inside file names.
    rack_names[i] = rack_names[i].replace(" ", "_")

    # All per-rack files share the prefix "<site_id>_<rack_name>".
    file_prefix = str(site_id) + seperator + rack_names[i]
    alarm_file = file_prefix + '_flags.csv'
    result_file = file_prefix + '_results.csv'
    racks[i] = get_racks(ml_df, j)

    if os.path.isfile(result_file):
        # A results file already exists for this rack: run prediction with
        # the rack's saved scaler and model files instead of retraining.
        scaler_file = file_prefix + '_std_scaler.bin'
        model_file = file_prefix + '_model.bin'
        run_predict(racks[i], scaler_file, model_file, result_file)

        # Keep only the last row for each Timetag and write the cleaned
        # results back to the same file.
        prev_result = pd.read_csv(result_file, index_col='Timetag')
        prev_result = prev_result[~prev_result.index.duplicated(keep='last')]
        prev_result.to_csv(result_file)

        # Drop repeated (Timetag, alarm) rows from the alarm file before the
        # threshold step runs on the refreshed predictions.
        prev_alarm = pd.read_csv(alarm_file)
        prev_alarm = prev_alarm.drop_duplicates(subset=['Timetag', 'alarm'])
        prev_alarm.to_csv(alarm_file, index=False)
        threshold(prev_result['isolation_forest_pred'], alarm_file)

        # Expose the results to the cells below. Without this the slot keeps
        # its placeholder 0 and the result[0] plot/filter cells fail.
        result[i] = prev_result

    else:
        # No results file yet: train a model for this rack, store its
        # results and run the threshold step on them.
        train_instance = model_train(racks[i], rack_names[i], site_id)
        result[i] = train_instance.train_model()
        appendDFToCSV_void(result[i], result_file)
        threshold(result[i]['isolation_forest_pred'], alarm_file)

In [None]:
def gen_plot(df, variable='cluster', style='*'):
    """Plot, per distinct value, every column whose name contains ``variable``.

    For each column of ``df`` whose lower-cased name contains ``variable``,
    one series is drawn per unique value in that column, labelled
    "<column> <value>", on a single 15x10 figure.

    Args:
        df: DataFrame with string column names.
        variable: Substring looked for in the lower-cased column names.
        style: Accepted but not used; points are always drawn with '.'.

    Returns:
        The return value of ``plt.show()``.
    """
    _fig, axes = plt.subplots(figsize=(15, 10))
    for column in df.columns:
        if variable not in column.lower():
            continue
        series = df[column]
        for value in series.unique():
            matching_rows = series[series == value]
            axes.plot(matching_rows, '.', label=(column + ' ' + str(value)))
            axes.legend()

    return plt.show()

In [None]:
# Plot the columns of the first rack's results whose lower-cased name contains 'isol'.
gen_plot(result[0], 'isol')

In [None]:
# Show the rows of the first rack's results where isolation_forest_pred is -1
# (presumably the anomaly label -- confirm against training_script).
result[0][result[0]['isolation_forest_pred'] == -1]