# Underperformance detection

In this notebook, we explore a self-supervised approach for the underperformance use-case. Given a time series of turbine measurements, including power output and dependent parameters such as wind speed and rotor speed, we i) distinguish between periods of underperformance and periods of optimal performance using an unsupervised method, and ii) use those periods to train a classifier, which can then be tested on new data.

To evaluate our approach, we compare our output with the ground-truth information, where we assume that we have underperformance if and only if the static yaw angle (which is provided) is non-zero.

In [None]:
import os, sys
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import seaborn as sns
module_path = './'
if module_path not in sys.path:
    sys.path.append(module_path)
from modules.preprocessing import *
from modules.io import *
from glob import glob
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from tqdm.notebook import tqdm
import seaborn
import pickle
from copy import deepcopy
from timeit import default_timer as timer
from sklearn.ensemble import RandomForestRegressor as RFRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.pipeline import Pipeline
import lightgbm as lgb
plt.style.use('ggplot')
from sklearn.linear_model import QuantileRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import time

# Test the performance of the underperformance method

## Import Dask

In [None]:
from dask.distributed import Client
from dask.distributed import wait
from dask.distributed import as_completed
import dask
from dask import delayed
# Select dask's "synchronous" scheduler as the default for dask computations.
# NOTE(review): the benchmark below submits its work through the distributed
# Client via client.submit, so this setting presumably does not affect those
# tasks -- confirm whether it is still needed.
dask.config.set(scheduler='synchronous')


## Start Dask

In [None]:
# Number of dask workers to start; the parallel benchmark also uses this value
# to decide how many batches each window of streams is split into.
num_workers = 30

# One thread per worker, matching the num_threads=1 used for every predict call.
client = Client(
    n_workers=num_workers,
    threads_per_worker=1,
)

# Last expression of the cell: show the client summary in the notebook output.
client

## Read filenames of datasets (streams)

In [None]:
# Collect the stream CSV files in a deterministic (sorted) order and keep only
# the first 100 of them for a quick test run.
stream_pattern = os.path.join('/data/data1/synthetic_yaw_data', 'testing_*.csv')
filenames = sorted(glob(stream_pattern))[:100]


## Read datasets (streams) with associated variables

In [None]:
# Load every stream and keep only the feature columns, as a plain array per stream.
target = 'active power'
feats = ['active power', 'wind speed',  'rotor speed']

# Columns every stream is expected to provide once the 'cor. ' prefix is stripped;
# selecting them raises a KeyError early if a file is missing one.
cols = ['wind speed', 'pitch angle', 'rotor speed', 'active power',
        'nacelle direction', 'wind direction']

test_data = []
for filename in filenames:
    df = (
        load_df(filename)
        .set_index('timestamp')
        .dropna(axis=1, how='all')  # drop columns that contain no values at all
    )
    # Literal (non-regex) removal of the 'cor. ' prefix from the column names.
    df.columns = df.columns.str.replace('cor. ', '', regex=False)
    df = df[cols]

    # Only the model features are kept, in the order given by `feats`.
    test_data.append(df[feats].values)

## Read pretrained model

In [None]:
# Load the pretrained underperformance classifier from disk.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# load model files that come from a trusted source.
file_path = '/home/ipsarros/pretrained_models/underperformance_classifier2.pickle'
with open(file_path, mode='rb') as model_file:
    loaded_model = pickle.load(model_file)

## Sequential code

In [None]:
%%time
#sequential


window = 1440
i = 0
running_time = []
total_running_time = 0.0


while i< len(test_data[0])-window:
    
    start = time.time()
    for arr in test_data:
        result = loaded_model.predict(arr[i:i+window],num_threads=1)
    end = time.time()
    
    running_time_temp = end - start
    running_time.append(running_time_temp)
    total_running_time = total_running_time + running_time_temp  
    i = i + window
    

print ("total_running_time = ", total_running_time )

## Parallel Batch Code

In [None]:
def underperformance_predict(loaded_model,batch_data):
    """Run the model on every window of a batch.

    Parameters
    ----------
    loaded_model : object
        Model exposing ``predict(data, num_threads=...)``.
    batch_data : iterable
        Windows to score; each one is passed to ``predict`` unchanged.

    Returns
    -------
    list
        One prediction result per window, in the order the windows were given.
    """
    # Single-threaded prediction for every window in the batch.
    return [loaded_model.predict(window_data, num_threads=1) for window_data in batch_data]

In [None]:
def parallel_batch_processing_futures(loaded_model, batch_data):
    """Score all batches in parallel on the dask cluster and time the run.

    One task per batch is submitted through the module-level ``client``; each
    task runs ``underperformance_predict(loaded_model, batch)``.

    Parameters
    ----------
    loaded_model : object
        Model passed to every task.
    batch_data : iterable
        Batches to process; each batch is an iterable of windows.

    Returns
    -------
    float
        Seconds elapsed between submitting the first task and all tasks
        having completed.

    Raises
    ------
    RuntimeError
        If at least one task finished with an error. The first worker
        exception is chained as the cause.
    """
    # perf_counter is a monotonic clock intended for measuring intervals.
    start = time.perf_counter()
    futures = [
        client.submit(underperformance_predict, loaded_model, batch)
        for batch in batch_data
    ]
    wait(futures, return_when="ALL_COMPLETED")
    running_time = time.perf_counter() - start

    # wait() only blocks until the futures are done; it does not re-raise worker
    # exceptions. Check explicitly so a failed batch is not reported as a
    # (misleadingly fast) successful timing. Done after the clock is stopped so
    # the measurement itself is unchanged.
    failed = [future for future in futures if future.status == "error"]
    if failed:
        raise RuntimeError(
            f"{len(failed)} of {len(futures)} prediction batches failed on the workers"
        ) from failed[0].exception()

    return running_time
    

In [None]:
%%time


window = 1440
i = 0

batch_data_size = len(test_data)//num_workers
batch_data = []
batch_data_all = []
counter = 0
num_worker = 0

running_time = []
total_running_time = 0.0

while i< len(test_data[0])-window:
    for arr in test_data:
        if (counter < batch_data_size):
            batch_data.append(arr[i:i+window])
            counter = counter + 1
        elif num_worker == num_workers - 1:
            batch_data.append(arr[i:i+window])
        else:
            counter = 0
            batch_data_all.append(batch_data)
            
            batch_data = []
            batch_data.append(arr[i:i+window])
            counter = counter + 1
            num_worker = num_worker + 1
    
    batch_data_all.append(batch_data)
    batch_data = []
        
    num_worker = 0
    counter = 0
    
    running_time_temp = parallel_batch_processing_futures(loaded_model, batch_data_all)
    running_time.append(running_time_temp)
    total_running_time = total_running_time + running_time_temp
    
    batch_data_all = []

    i = i + window

    
if i < len(test_data[0]):
    for arr in test_data:
        if (counter < batch_data_size):
            batch_data.append(arr[i:i+window])
            counter = counter + 1
        elif num_worker == num_workers - 1:
            batch_data.append(arr[i:i+window])
        else:
            counter = 0
            batch_data_all.append(batch_data)
            
            batch_data = []
            batch_data.append(arr[i:i+window])
            counter = counter + 1
            num_worker = num_worker + 1
    
    batch_data_all.append(batch_data_all)
    batch_data = []
    
    num_worker = 0
    counter = 0
    
    running_time_temp = parallel_batch_processing_futures(loaded_model, batch_data_all)
    running_time.append(running_time_temp)
    total_running_time = total_running_time + running_time_temp
    
    batch_data_all = []


print ("total_running_time = ", total_running_time )

In [None]:
# Recorded benchmark results. Kept as comments so that this cell is valid
# Python and the notebook still runs under "Restart Kernel -> Run All".
#
# n_streams = 20000, window_size =  1440 :
#
#     sequential:
#         total_running_time =  10964.847057819366
#         CPU times: user 3h 2min 42s, sys: 11 ms, total: 3h 2min 42s
#         Wall time: 3h 2min 44s
#
#     threads = 8 :
#         total_running_time =  1282.675329208374
#         CPU times: user 2min 21s, sys: 1min 5s, total: 3min 27s
#         Wall time: 21min 22s
#
#     threads = 16:
#         total_running_time =  723.8945622444153
#         CPU times: user 2min 40s, sys: 1min 7s, total: 3min 47s
#         Wall time: 12min 4s
#
#     threads = 30:
#         total_running_time =  472.08199548721313
#         CPU times: user 3min 5s, sys: 50.3 s, total: 3min 55s
#         Wall time: 7min 52s
#
#
# threads = 30, window_size = 1440:
#
#     n_streams = 20000 :
#         total_running_time =  472.08199548721313
#         CPU times: user 3min 5s, sys: 50.3 s, total: 3min 55s
#         Wall time: 7min 52s
#
#     n_streams = 40000 :
#         total_running_time =  868.0069897174835
#         CPU times: user 17min 34s, sys: 2min 30s, total: 20min 4s
#         Wall time: 14min 28s
#
#     n_streams = 60000 :
#         total_running_time =  1290.3016259670258
#         CPU times: user 19min 49s, sys: 3min 45s, total: 23min 34s
#         Wall time: 21min 31s
#
#     n_streams = 80000 :
#         total_running_time =  1705.2955605983734
#         CPU times: user 20min 59s, sys: 4min 52s, total: 25min 51s
#         Wall time: 28min 26s
#
#
# threads = 30, n_streams = 80000 :
#
#     window_size = 1440 :
#         total_running_time =  1705.2955605983734
#         CPU times: user 20min 59s, sys: 4min 52s, total: 25min 51s
#         Wall time: 28min 26s
#
#     window_size = 2880 :
#         total_running_time =  1656.778636932373
#         CPU times: user 14min 4s, sys: 4min 28s, total: 18min 33s
#         Wall time: 27min 37s
#
#     window_size = 4320 :
#         total_running_time =  1449.8256077766418
#         CPU times: user 9min 59s, sys: 3min 50s, total: 13min 50s
#         Wall time: 24min 10s
#
#     window_size =  5760 :
#         total_running_time =  1602.882644891739
#         CPU times: user 9min 41s, sys: 4min 13s, total: 13min 55s
#         Wall time: 26min 43s
#
#     window_size = 7200 :
#         total_running_time =  1603.0916907787323
#         CPU times: user 8min 46s, sys: 4min 18s, total: 13min 4s
#         Wall time: 26min 43s

## Close Dask

In [None]:
# Close the dask client that was created in the "Start Dask" cell.
client.close()