In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from datetime import datetime, timedelta, timezone

In [2]:
# Load the raw semicolon-separated CSV exports for 2016 and 2017:
# failure logs, met-mast measurements and turbine signals.
failure_2016 = pd.read_csv('./data/init/failures-2016.csv',sep=';')
failure_2017 = pd.read_csv('./data/init/failures-2017.csv',sep=';')
metmast_2016 = pd.read_csv('./data/init/metmast-2016.csv',sep=';')
metmast_2017 = pd.read_csv('./data/init/metmast-2017.csv',sep=';')
signals_2016 = pd.read_csv('./data/init/signals-2016.csv', sep=';')
signals_2017= pd.read_csv('./data/init/signals-2017.csv', sep=';')



# 1. Cleaning Signal data

Combine the signals from both years and aggregate the time-series data to one record per turbine per day.

In [3]:
def signal_preprocess(signals):
    """Return a copy of ``signals`` indexed by its parsed ``Timestamp`` column.

    Parameters
    ----------
    signals : pd.DataFrame
        Frame with a ``Timestamp`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pd.DataFrame
        New frame whose index is the parsed ``Timestamp``. The input frame is
        left unmodified.
    """
    # assign() builds a new frame, so the caller's raw column is not overwritten.
    parsed = signals.assign(Timestamp=pd.to_datetime(signals['Timestamp']))
    return parsed.set_index('Timestamp')


In [4]:
# Stack both years row-wise, then parse and index by timestamp in one step.
signals = signal_preprocess(pd.concat([signals_2016, signals_2017], axis=0))

Aggregate the time series dataframe into a daily data frame

In [5]:
def aggregate_signals(signals, freq='D', min_count=1):
    """Sum each turbine's signals into fixed-frequency bins (daily by default).

    Parameters
    ----------
    signals : pd.DataFrame
        Frame with a datetime index named ``Timestamp`` and a ``Turbine_ID``
        column.
    freq : str, default 'D'
        Resampling frequency passed to ``resample``.
    min_count : int, default 1
        Minimum number of valid readings a bin needs to produce a sum. With
        the pandas default of 0 an empty bin sums to 0, which hides gaps in
        the data; with 1 such bins are NaN and can be flagged downstream.
        Pass 0 to get the zero-filled sums.

    Returns
    -------
    pd.DataFrame
        One row per turbine and bin, with ``Timestamp`` and ``Turbine_ID``
        available as columns. The index holds the turbine identifier.
    """
    agg_signals = signals.groupby('Turbine_ID').resample(freq).sum(min_count=min_count)
    # The aggregated frame cannot be relied on to hold a usable Turbine_ID
    # column, so rebuild it from the group level of the index.
    agg_signals['Turbine_ID'] = agg_signals.index.get_level_values('Turbine_ID')
    agg_signals = agg_signals.reset_index('Timestamp')
    return agg_signals


In [6]:
# Aggregate, then discard the turbine-id index in favour of a plain positional one.
agg_signals = aggregate_signals(signals).reset_index(drop=True)

In [10]:
# Display the aggregated signals as a visual sanity check.
agg_signals

Unnamed: 0,Timestamp,Turbine_ID,Gen_RPM_Max,Gen_RPM_Min,Gen_RPM_Avg,Gen_RPM_Std,Gen_Bear_Temp_Avg,Gen_Phase1_Temp_Avg,Gen_Phase2_Temp_Avg,Gen_Phase3_Temp_Avg,...,Grd_Prod_PsbleInd_Avg,Grd_Prod_PsbleInd_Max,Grd_Prod_PsbleInd_Min,Grd_Prod_PsbleInd_Std,Grd_Prod_PsbleCap_Avg,Grd_Prod_PsbleCap_Max,Grd_Prod_PsbleCap_Min,Grd_Prod_PsbleCap_Std,Gen_Bear2_Temp_Avg,Nac_Direction_Avg
0,2016-01-01 00:00:00+00:00,T01,181289.2,133914.9,158499.3,13332.7,5667.0,8128,8126,8049,...,-92929.7,-68241.0,-109983.9,9823.7,92920.8,109983.9,67537.9,9877.8,5554,32453.4
1,2016-01-02 00:00:00+00:00,T01,100793.6,71082.2,86946.4,8378.4,4907.0,6556,6525,6458,...,-45063.8,-35598.1,-53448.7,4432.3,44758.1,53448.7,33839.1,4969.9,4752,23121.9
2,2016-01-03 00:00:00+00:00,T01,133818.3,108905.3,124205.1,5775.7,6162.0,7969,7768,7627,...,-67850.8,-54531.9,-73000.0,5227.6,63328.1,73000.0,42621.4,8344.0,5503,35495.5
3,2016-01-04 00:00:00+00:00,T01,246412.7,215913.9,234245.0,5730.7,9402.0,14207,13658,13447,...,-133426.6,-100994.2,-143994.0,13072.6,119419.5,143390.4,77022.7,19260.8,8811,38177.5
4,2016-01-05 00:00:00+00:00,T01,240840.6,206798.4,225681.7,7157.1,8107.0,13162,12695,12515,...,-129142.2,-116437.2,-142871.2,6964.7,119370.2,140043.4,101280.0,8765.6,7097,45004.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2919,2017-12-27 00:00:00+00:00,T11,243986.1,205969.3,228579.7,8038.6,9281.0,12396,12417,12362,...,-120201.6,-97406.6,-143258.6,13087.6,107426.6,140996.6,76593.5,15766.5,8355,40865.5
2920,2017-12-28 00:00:00+00:00,T11,237997.3,192192.0,220250.6,10931.2,8367.0,10300,10401,10322,...,-135333.5,-109675.3,-143738.0,8928.4,130039.6,143738.0,94313.4,12493.0,6940,39614.2
2921,2017-12-29 00:00:00+00:00,T11,188502.9,146124.7,165452.9,13330.9,5920.0,7934,8103,8036,...,-85117.9,-37053.6,-122976.5,20889.7,85117.9,122976.5,37053.6,20889.7,5539,43819.8
2922,2017-12-30 00:00:00+00:00,T11,65884.7,51194.5,57913.6,4789.3,4327.0,5416,5475,5453,...,-12643.7,-1886.9,-30372.6,6473.7,12643.7,30372.6,1886.9,6473.7,4220,22927.5


Prepare the signals for each turbine

In [7]:
# Distinct turbine identifiers present in the aggregated signals.
turbine_names = agg_signals["Turbine_ID"].unique()
def create_df_for_each_turbine(signals: pd.DataFrame) -> list[pd.DataFrame]:
    """Split ``signals`` into one chronologically sorted frame per turbine.

    Parameters
    ----------
    signals : pd.DataFrame
        Frame with ``Turbine_ID`` and ``Timestamp`` columns.

    Returns
    -------
    list[pd.DataFrame]
        One frame per distinct ``Turbine_ID``, in order of first appearance.
        Each frame is sorted by ``Timestamp`` and carries the original row
        labels in an ``index`` column (added by ``reset_index()``).
    """
    turbine_dfs = []

    # Derive both the turbine list and the row mask from the argument itself,
    # so the function no longer depends on notebook-level globals.
    for turbine in signals["Turbine_ID"].unique():
        turbine_df = (
            signals[signals["Turbine_ID"] == turbine]
            .sort_values("Timestamp")
            .reset_index()
        )
        turbine_dfs.append(turbine_df)

    return turbine_dfs

# Keep the per-turbine frames as a list: the low-variance check below runs on
# each turbine separately, and the frames are concatenated again afterwards.
turbine_dfs = create_df_for_each_turbine(agg_signals)


Drop the columns with low variance

In [8]:
def get_signals_with_low_variance(df: pd.DataFrame, threshold=0) -> list:
    """List the numeric columns of ``df`` rejected by a variance threshold.

    Only columns with one of the listed integer/float dtypes are examined. A
    ``VarianceThreshold`` selector is fitted on them and the names of the
    columns it does *not* keep are returned.
    """
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_part = df.select_dtypes(include=numeric_dtypes)

    variance_filter = VarianceThreshold(threshold=threshold)
    variance_filter.fit(numeric_part)

    # get_support() marks the columns that are kept; negate it to get the rejected ones.
    rejected = np.logical_not(np.array(variance_filter.get_support()))
    return numeric_part.columns[rejected].tolist()

In [9]:
# Collect every column that is low-variance for at least one turbine.
cols_to_drop = set()
for df in turbine_dfs:
    cols_to_drop.update(get_signals_with_low_variance(df))

In [10]:
# Re-assemble the per-turbine frames and remove the low-variance columns.
signals_clean = pd.concat(turbine_dfs, axis=0).drop(columns=list(cols_to_drop))

In [11]:
# Flag every row that has at least one missing value *before* filling, so the
# information survives the imputation below.
signals_clean['missing_values'] = signals_clean.isnull().any(axis=1).astype(int)
# Fill each gap with the next valid observation, then forward-fill whatever is
# still missing at the tail. bfill()/ffill() replace the deprecated
# fillna(method=...) spelling.
signals_clean = signals_clean.bfill().ffill()

  signals_clean=signals_clean.fillna(method='bfill')


In [12]:
# Display the cleaned signals, including the new missing_values flag.
signals_clean

Unnamed: 0,index,Timestamp,Turbine_ID,Gen_RPM_Max,Gen_RPM_Min,Gen_RPM_Avg,Gen_RPM_Std,Gen_Bear_Temp_Avg,Gen_Phase1_Temp_Avg,Gen_Phase2_Temp_Avg,...,Grd_Prod_PsbleInd_Max,Grd_Prod_PsbleInd_Min,Grd_Prod_PsbleInd_Std,Grd_Prod_PsbleCap_Avg,Grd_Prod_PsbleCap_Max,Grd_Prod_PsbleCap_Min,Grd_Prod_PsbleCap_Std,Gen_Bear2_Temp_Avg,Nac_Direction_Avg,missing_values
0,0,2016-01-01 00:00:00+00:00,T01,181289.2,133914.9,158499.3,13332.7,5667.0,8128,8126,...,-68241.0,-109983.9,9823.7,92920.8,109983.9,67537.9,9877.8,5554,32453.4,0
1,1,2016-01-02 00:00:00+00:00,T01,100793.6,71082.2,86946.4,8378.4,4907.0,6556,6525,...,-35598.1,-53448.7,4432.3,44758.1,53448.7,33839.1,4969.9,4752,23121.9,0
2,2,2016-01-03 00:00:00+00:00,T01,133818.3,108905.3,124205.1,5775.7,6162.0,7969,7768,...,-54531.9,-73000.0,5227.6,63328.1,73000.0,42621.4,8344.0,5503,35495.5,0
3,3,2016-01-04 00:00:00+00:00,T01,246412.7,215913.9,234245.0,5730.7,9402.0,14207,13658,...,-100994.2,-143994.0,13072.6,119419.5,143390.4,77022.7,19260.8,8811,38177.5,0
4,4,2016-01-05 00:00:00+00:00,T01,240840.6,206798.4,225681.7,7157.1,8107.0,13162,12695,...,-116437.2,-142871.2,6964.7,119370.2,140043.4,101280.0,8765.6,7097,45004.3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,2919,2017-12-27 00:00:00+00:00,T11,243986.1,205969.3,228579.7,8038.6,9281.0,12396,12417,...,-97406.6,-143258.6,13087.6,107426.6,140996.6,76593.5,15766.5,8355,40865.5,0
727,2920,2017-12-28 00:00:00+00:00,T11,237997.3,192192.0,220250.6,10931.2,8367.0,10300,10401,...,-109675.3,-143738.0,8928.4,130039.6,143738.0,94313.4,12493.0,6940,39614.2,0
728,2921,2017-12-29 00:00:00+00:00,T11,188502.9,146124.7,165452.9,13330.9,5920.0,7934,8103,...,-37053.6,-122976.5,20889.7,85117.9,122976.5,37053.6,20889.7,5539,43819.8,0
729,2922,2017-12-30 00:00:00+00:00,T11,65884.7,51194.5,57913.6,4789.3,4327.0,5416,5475,...,-1886.9,-30372.6,6473.7,12643.7,30372.6,1886.9,6473.7,4220,22927.5,0


# 2. Clean Metmast data

Combine the metmast data for 2016 and 2017, then aggregate it into daily time slots.

In [13]:
# Stack both years of met-mast readings and index them by parsed timestamp.
metmast = pd.concat([metmast_2016, metmast_2017], axis=0)
metmast['Timestamp'] = pd.to_datetime(metmast['Timestamp'])
metmast = metmast.set_index('Timestamp')
# min_count=1 leaves a day without any valid reading as NaN instead of 0, so
# that the missing-value flag computed later can actually detect the gap.
agg_metmast = metmast.resample('D').sum(min_count=1)

In [14]:
# Remove low-variance columns first, then the second wind-direction sensor columns.
cols_to_drop = get_signals_with_low_variance(agg_metmast)
winddirection2_cols = ["Min_Winddirection2", "Max_Winddirection2", "Avg_Winddirection2", "Var_Winddirection2"]
metmast_clean = (
    agg_metmast
    .drop(columns=cols_to_drop)
    .drop(columns=winddirection2_cols)
)


In [15]:
# Flag days with at least one missing met-mast value before filling them.
metmast_clean['metamast_missing_values'] = metmast_clean.isnull().any(axis=1).astype(int)
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
metmast_clean = metmast_clean.bfill()
# Turn the Timestamp index back into a column so it can serve as a merge key.
metmast_clean = metmast_clean.reset_index()


  metmast_clean=metmast_clean.fillna(method='bfill')


# Merge metmast data and signals data

In [16]:
def merge_signals_metmast(signals: pd.DataFrame, metmast: pd.DataFrame) -> pd.DataFrame:
    """Left-join met-mast data onto the signals and keep only complete rows.

    Parameters
    ----------
    signals : pd.DataFrame
        Signals with ``Timestamp`` and ``missing_values`` columns. An
        ``index`` helper column, if present, is removed.
    metmast : pd.DataFrame
        Met-mast data with ``Timestamp`` and ``metamast_missing_values``
        columns.

    Returns
    -------
    pd.DataFrame
        Rows whose two flags are both 0, without the flag columns. Signal
        rows with no met-mast match have a NaN flag and are dropped too.
    """
    merged_df = pd.merge(
        signals.reset_index(drop=True),
        metmast.reset_index(drop=True),
        on="Timestamp",
        how="left",
    )
    is_complete = (merged_df["missing_values"] == 0) & (merged_df["metamast_missing_values"] == 0)
    # Non-inplace drops return new frames, avoiding mutation of a filtered slice.
    # The "index" helper column only exists when the input was built with
    # reset_index(), so its absence is tolerated.
    return (
        merged_df[is_complete]
        .drop(columns=["missing_values", "metamast_missing_values"])
        .drop(columns=["index"], errors="ignore")
    )


In [17]:
# Join the cleaned signals with the cleaned met-mast data on Timestamp.
merged_df = merge_signals_metmast(signals_clean, metmast_clean)

In [18]:
# Persist the merged feature table (row index is not written).
merged_df.to_csv('./data/merged_signals_metmast.csv', index=False)

# 3. Preprocess Failure data

In [19]:
# Stack both years of failure logs and truncate each failure time to its day.
failures = pd.concat([failure_2016, failure_2017], axis=0)
failures['Timestamp'] = pd.to_datetime(failures['Timestamp']).dt.floor('d')


In [20]:
# Summarise columns, dtypes and non-null counts of the combined failure log.
failures.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28 entries, 0 to 11
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   Turbine_ID  28 non-null     object             
 1   Component   28 non-null     object             
 2   Timestamp   28 non-null     datetime64[ns, UTC]
 3   Remarks     28 non-null     object             
dtypes: datetime64[ns, UTC](1), object(3)
memory usage: 1.1+ KB


In [21]:

# Display the combined failure log.
failures

Unnamed: 0,Turbine_ID,Component,Timestamp,Remarks
0,T01,GEARBOX,2016-07-18 00:00:00+00:00,Gearbox pump damaged
1,T06,GENERATOR,2016-07-11 00:00:00+00:00,Generator replaced
2,T06,GENERATOR,2016-07-24 00:00:00+00:00,Generator temperature sensor failure
3,T06,GENERATOR,2016-09-04 00:00:00+00:00,High temperature generator error
4,T06,GENERATOR,2016-10-27 00:00:00+00:00,Generator replaced
5,T06,GENERATOR,2016-10-02 00:00:00+00:00,Refrigeration system and temperature sensors i...
6,T06,HYDRAULIC_GROUP,2016-04-04 00:00:00+00:00,Error in pitch regulation
7,T07,GENERATOR_BEARING,2016-04-30 00:00:00+00:00,High temperature in generator bearing (replace...
8,T07,TRANSFORMER,2016-07-10 00:00:00+00:00,High temperature transformer
9,T07,TRANSFORMER,2016-08-23 00:00:00+00:00,High temperature transformer. Transformer refr...


In [22]:
days_lookback = 60
def create_failure_list(failures: pd.DataFrame, days_lookback: int, value_function, target_name: str = "Target") -> pd.DataFrame:
    """Expand every failure into one labelled row per look-back day.

    For each failure, rows are produced for the failure day itself and the
    ``days_lookback - 1`` days before it. The label of the row ``k`` days
    before the failure is ``value_function(k, days_lookback)``.

    Parameters
    ----------
    failures : pd.DataFrame
        Frame with ``Turbine_ID``, ``Timestamp`` and ``Component`` columns.
    days_lookback : int
        Number of daily rows generated per failure.
    value_function : callable
        Called as ``value_function(day_offset, days_lookback)``.
    target_name : str
        Name of the label column in the result.

    Returns
    -------
    pd.DataFrame
        Columns ``Turbine_ID``, ``Timestamp`` (ISO-8601 string with a UTC
        offset), ``components`` and ``target_name``.
    """
    rows = []
    per_failure = zip(failures["Turbine_ID"], failures["Component"], failures["Timestamp"])

    for raw_turbine_id, component, failed_at in per_failure:
        # Truncate to midnight of the failure day.
        day_start = failed_at.replace(hour=0, minute=0, second=0, microsecond=0)

        for offset in range(days_lookback):
            # Step back `offset` days and stamp the result as UTC.
            stamp = (day_start - timedelta(days=offset)).replace(tzinfo=timezone.utc)
            rows.append([
                str(raw_turbine_id),
                stamp.isoformat(),
                component,
                value_function(offset, days_lookback),
            ])

    return pd.DataFrame(rows, columns=["Turbine_ID", "Timestamp", "components", target_name])

In [32]:
reg_target_name = "RUL (Target)"
class_target_name = "Failure (Target)"


def regression_function(i, j):
    """Regression label: the day offset ``i`` itself (``j`` is ignored)."""
    return i


def classif_function(i, j):
    """Classification label: 1 for every look-back day (arguments ignored)."""
    return 1


# Build the regression and classification label tables over the same window.
failure_df_reg = create_failure_list(failures, days_lookback, regression_function, reg_target_name)
failure_df_class = create_failure_list(failures, days_lookback, classif_function, class_target_name)


In [33]:
# Display the regression label table.
failure_df_reg

Unnamed: 0,Turbine_ID,Timestamp,components,RUL (Target)
0,T01,2016-07-18T00:00:00+00:00,GEARBOX,0
1,T01,2016-07-17T00:00:00+00:00,GEARBOX,1
2,T01,2016-07-16T00:00:00+00:00,GEARBOX,2
3,T01,2016-07-15T00:00:00+00:00,GEARBOX,3
4,T01,2016-07-14T00:00:00+00:00,GEARBOX,4
...,...,...,...,...
1675,T11,2017-07-19T00:00:00+00:00,HYDRAULIC_GROUP,55
1676,T11,2017-07-18T00:00:00+00:00,HYDRAULIC_GROUP,56
1677,T11,2017-07-17T00:00:00+00:00,HYDRAULIC_GROUP,57
1678,T11,2017-07-16T00:00:00+00:00,HYDRAULIC_GROUP,58


In [34]:
# Persist the regression labels (row index is not written).
failure_df_reg.to_csv('./data/failures_df.csv', index=False)

In [35]:
# Parse the Timestamp column of both label tables into datetimes.
for label_frame in (failure_df_reg, failure_df_class):
    label_frame['Timestamp'] = pd.to_datetime(label_frame['Timestamp'])

In [36]:
# Look-back windows of failures on the same turbine can overlap, so a
# (Turbine_ID, Timestamp) pair may occur several times in the label tables.
# Merging those as-is multiplies feature rows and attaches conflicting labels.
# Keep one label per key: the smallest RUL (nearest failure) for regression.
merge_keys = ["Turbine_ID", "Timestamp"]
failure_reg_unique = (
    failure_df_reg
    .sort_values(reg_target_name)
    .drop_duplicates(subset=merge_keys)
)
failure_class_unique = failure_df_class.drop_duplicates(subset=merge_keys)

# validate="many_to_one" makes pandas raise if the label keys are not unique.
labeled_df_temp = pd.merge(
    merged_df.reset_index(drop=True),
    failure_reg_unique,
    on=merge_keys,
    how="left",
    validate="many_to_one",
)
labeled_df = pd.merge(
    labeled_df_temp,
    failure_class_unique,
    on=merge_keys,
    how="left",
    validate="many_to_one",
)

In [37]:
# Days outside every look-back window get the window length as their RUL;
# using days_lookback keeps this in sync with the window used to build labels.
labeled_df[reg_target_name] = labeled_df[reg_target_name].fillna(days_lookback).astype(int)
# Days outside every look-back window are the negative class.
labeled_df[class_target_name] = labeled_df[class_target_name].fillna(0).astype(int)

In [38]:
# Drop the component columns first: rows that differ only in those columns
# would otherwise survive de-duplication and end up as exact duplicates.
labeled_df = (
    labeled_df
    .drop(columns=["components_x", "components_y"])
    .drop_duplicates()
)

In [39]:
# Persist the final labelled data set (row index is not written).
labeled_df.to_csv('./data/labeled_data.csv', index=False)