# Underperformance detection

In this notebook, we explore a self-supervised approach to the underperformance use case. Given a time series of turbine measurements comprising the power output and dependent parameters such as wind speed and rotor speed, we i) distinguish periods of underperformance from periods of optimal performance using an unsupervised method, and ii) use those periods to train a classifier, which can then be tested on new data.

To evaluate our approach, we compare our output with the ground-truth information, where we assume that we have underperformance if and only if the static yaw angle (which is provided) is non-zero.

In [None]:
import os, sys
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import seaborn as sns
module_path = './'
if module_path not in sys.path:
    sys.path.append(module_path)
from modules.preprocessing import *
from modules.io import *
from glob import glob
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from tqdm.notebook import tqdm
import seaborn
import pickle
from copy import deepcopy
from timeit import default_timer as timer
from sklearn.ensemble import RandomForestRegressor as RFRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.pipeline import Pipeline
import lightgbm as lgb
plt.style.use('ggplot')
from sklearn.linear_model import QuantileRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import time

# Test the performance of Underperformance detection for big data

## Import Dask

In [None]:
from dask.distributed import Client
from dask.distributed import wait
import dask
from dask import delayed
# Set Dask's global `scheduler` configuration value to 'synchronous'.
# NOTE(review): a distributed Client is created in the "Start Dask" section and
# is presumably what actually executes the submitted tasks — confirm whether
# this setting is still needed.
dask.config.set(scheduler='synchronous')

## Start Dask

In [None]:
# Cluster layout used for the benchmark: `num_workers` workers with one thread
# each. `num_workers` is also read further down when the streams are split
# into batches.
# Alternative layout: a single worker running `num_workers` threads, i.e.
# Client(n_workers=1, threads_per_worker=num_workers).
num_workers = 30
client = Client(threads_per_worker=1, n_workers=num_workers)

# Last expression of the cell, so the notebook displays the client summary.
client

## Read and create files

In [None]:
# Columns selected from the CSV to build the template stream.
feats = ['active power', 'wind speed',  'rotor speed']

# Benchmark parameters.
n_streams = 600000   # copies of the template stream generated per iteration
window_size = 1000   # number of CSV rows read for the template stream
n_iter = 30          # number of benchmark iterations

# NOTE(review): the earlier remark "must: n_windows < len(filenames)" refers to
# names that are not defined in the visible part of this notebook — confirm
# whether that constraint still applies.

# Template stream: the first `window_size` rows of the testing set, restricted
# to the `feats` columns and converted to a NumPy array.
template_frame = pd.read_csv(
    '/data/data1/synthetic_yaw_data/testing_set0.csv',
    nrows=window_size,
)
temp_stream = template_frame[feats].values


## Read pretrained model

In [None]:
# Load the pretrained underperformance classifier from disk.
# NOTE: unpickling executes code stored in the file — only load pickles that
# come from a trusted source.
file_path = '/home/ipsarros/pretrained_models/underperformance_classifier2.pickle'

with open(file_path, 'rb') as model_file:
    loaded_model = pickle.loads(model_file.read())

## Sequential code

In [None]:
# Sequential baseline: every stream is scored one after the other in this
# process, and only the time spent inside predict() is accumulated.
total_running_time = 0
for iteration in range(n_iter):
    # Fresh synthetic data: n_streams copies of the template stream, each
    # perturbed with Gaussian noise (mean 0, standard deviation 0.1).
    test_data = np.tile([temp_stream], (n_streams, 1, 1))
    window_len, n_feats = test_data.shape[1], test_data.shape[2]
    for stream_idx in range(len(test_data)):
        # One draw per stream, laid out as (feature, sample) so the random
        # numbers are consumed feature by feature, then transposed to match
        # the (sample, feature) layout of the stream.
        noise = np.random.normal(0, 0.1, (n_feats, window_len))
        test_data[stream_idx] = test_data[stream_idx] + noise.T

    # Score every stream and add up the individual prediction times.
    for stream in test_data:
        started = time.time()
        result = loaded_model.predict(stream, num_threads=1)
        total_running_time = total_running_time + (time.time() - started)


print("total_running_time = ", total_running_time)

## Parallel Batch code 

In [None]:
def underperformance_predict(loaded_model, batch_data):
    """Run the model on every element of a batch and discard the predictions.

    Parameters
    ----------
    loaded_model : object
        Model exposing ``predict(data, num_threads=...)``.
    batch_data : iterable
        Items passed one at a time to ``loaded_model.predict``.

    Returns
    -------
    None
        The predictions are not collected; the function is used only to
        execute the prediction work.
    """
    for window in batch_data:
        # num_threads=1 keeps each predict call single-threaded.
        loaded_model.predict(window, num_threads=1)

In [None]:
def parallel_batch_processing_futures(loaded_model, batch_data):
    """Run one prediction task per batch on the Dask cluster and time it.

    Every element of ``batch_data`` is submitted through the module-level
    ``client`` as a call to ``underperformance_predict(loaded_model, batch)``.

    Parameters
    ----------
    loaded_model : object
        Model handed unchanged to ``underperformance_predict``.
    batch_data : iterable
        Batches; one task is submitted per element.

    Returns
    -------
    float
        Seconds elapsed from just before the first submission until all
        submitted tasks have finished.

    Raises
    ------
    Exception
        Whatever exception a task raised on a worker. ``wait`` returns
        normally for failed tasks, so without this check a crashed task
        would be reported as a valid timing.
    """
    start = time.time()
    futures = [
        client.submit(underperformance_predict, loaded_model, batch)
        for batch in batch_data
    ]
    wait(futures, return_when="ALL_COMPLETED")
    running_time = time.time() - start

    # Deliberately outside the timed region: gather() re-raises the exception
    # of any failed task, so failures are not silently ignored.
    client.gather(futures)

    return running_time

In [None]:
%%time

# Parallel benchmark: each iteration generates fresh noisy streams, splits them
# into at most `num_workers` batches and times one Dask task per batch.
total_running_time = 0
for iteration in range(n_iter):
    # Fresh synthetic data: n_streams copies of the template stream, each
    # perturbed with Gaussian noise (mean 0, standard deviation 0.1).
    test_data = np.tile([temp_stream], (n_streams, 1, 1))
    window_len, n_feats = test_data.shape[1], test_data.shape[2]
    for stream_idx in range(len(test_data)):
        # One draw per stream, laid out as (feature, sample) and transposed to
        # match the (sample, feature) layout of the stream.
        noise = np.random.normal(0, 0.1, (n_feats, window_len))
        test_data[stream_idx] = test_data[stream_idx] + noise.T

    # Split the streams into `num_workers` contiguous, near-equal batches.
    # When the stream count is a multiple of `num_workers` (as with the
    # configured values) every batch holds len(test_data) // num_workers
    # streams. Each batch stays a list of 2-D arrays so the payload sent to
    # the workers keeps the same form as before. Empty batches (fewer streams
    # than workers) are dropped instead of being submitted as no-op tasks.
    batch_data_all = [
        list(chunk)
        for chunk in np.array_split(test_data, num_workers)
        if len(chunk) > 0
    ]

    # Time the distributed prediction of all batches for this iteration.
    running_time_temp = parallel_batch_processing_futures(loaded_model, batch_data_all)
    total_running_time = total_running_time + running_time_temp

    # Drop the batch list so its references do not outlive the iteration.
    batch_data_all = []


print("total_running_time = ", total_running_time)

In [None]:
## Experiments

In [None]:
# Experiments: recorded outputs (total_running_time and the %%time report)
#
# n_streams = 600000, window_size = 1000, n_iterations = 10, threads = 30:
#
#     total_running_time =  3953.3994550704956
#     CPU times: user 33min 29s, sys: 11min 31s, total: 45min
#     Wall time: 1h 15min 21s
#
#
# n_streams = 600000, window_size = 1000, n_iterations = 20, threads = 30:
#
#     total_running_time =  7809.9085302352905
#     CPU times: user 1h 4min 23s, sys: 23min 32s, total: 1h 27min 55s
#     Wall time: 2h 28min 39s
#
#
# n_streams = 600000, window_size = 1000, n_iterations = 30, threads = 30:
#
#     total_running_time =  11975.88755440712
#     CPU times: user 1h 33min, sys: 37min 8s, total: 2h 10min 9s
#     Wall time: 3h 47min 21s


## Close Dask

In [None]:
# Close the Dask client created in the "Start Dask" section.
client.close()