In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from datetime import datetime, timedelta, timezone

In [2]:
failure_2016 = pd.read_csv('./data/init/failures-2016.csv',sep=';')
failure_2017 = pd.read_csv('./data/init/failures-2017.csv',sep=';')
metmast_2016 = pd.read_csv('./data/init/metmast-2016.csv',sep=';')
metmast_2017 = pd.read_csv('./data/init/metmast-2017.csv',sep=';')
signals_2016 = pd.read_csv('./data/init/signals-2016.csv', sep=';')
signals_2017= pd.read_csv('./data/init/signals-2017.csv', sep=';')



# 1. Cleaning Signal data

Combining signals from both years and aggregating the time series data into recurring once a day.

In [40]:
#combine signals for both the years
signals = pd.concat([signals_2016, signals_2017], axis=0)

Aggregate the time series dataframe into a daily data frame

In [41]:
signals['Timestamp'] = pd.to_datetime(signals['Timestamp'])
signals=signals.set_index('Timestamp')

In [42]:
agg_signals=signals.groupby('Turbine_ID').resample('D').sum()
agg_signals['Turbine_ID'] = agg_signals.index.get_level_values('Turbine_ID')  
agg_signals=agg_signals.reset_index('Timestamp')


In [43]:
agg_signals=agg_signals.set_index('Timestamp')

In [44]:
agg_signals

Unnamed: 0_level_0,Turbine_ID,Gen_RPM_Max,Gen_RPM_Min,Gen_RPM_Avg,Gen_RPM_Std,Gen_Bear_Temp_Avg,Gen_Phase1_Temp_Avg,Gen_Phase2_Temp_Avg,Gen_Phase3_Temp_Avg,Hyd_Oil_Temp_Avg,...,Grd_Prod_PsbleInd_Avg,Grd_Prod_PsbleInd_Max,Grd_Prod_PsbleInd_Min,Grd_Prod_PsbleInd_Std,Grd_Prod_PsbleCap_Avg,Grd_Prod_PsbleCap_Max,Grd_Prod_PsbleCap_Min,Grd_Prod_PsbleCap_Std,Gen_Bear2_Temp_Avg,Nac_Direction_Avg
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 00:00:00+00:00,T01,181289.2,133914.9,158499.3,13332.7,5667.0,8128,8126,8049,4242,...,-92929.7,-68241.0,-109983.9,9823.7,92920.8,109983.9,67537.9,9877.8,5554,32453.4
2016-01-02 00:00:00+00:00,T01,100793.6,71082.2,86946.4,8378.4,4907.0,6556,6525,6458,4265,...,-45063.8,-35598.1,-53448.7,4432.3,44758.1,53448.7,33839.1,4969.9,4752,23121.9
2016-01-03 00:00:00+00:00,T01,133818.3,108905.3,124205.1,5775.7,6162.0,7969,7768,7627,4102,...,-67850.8,-54531.9,-73000.0,5227.6,63328.1,73000.0,42621.4,8344.0,5503,35495.5
2016-01-04 00:00:00+00:00,T01,246412.7,215913.9,234245.0,5730.7,9402.0,14207,13658,13447,5204,...,-133426.6,-100994.2,-143994.0,13072.6,119419.5,143390.4,77022.7,19260.8,8811,38177.5
2016-01-05 00:00:00+00:00,T01,240840.6,206798.4,225681.7,7157.1,8107.0,13162,12695,12515,5338,...,-129142.2,-116437.2,-142871.2,6964.7,119370.2,140043.4,101280.0,8765.6,7097,45004.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-12-27 00:00:00+00:00,T11,243986.1,205969.3,228579.7,8038.6,9281.0,12396,12417,12362,5492,...,-120201.6,-97406.6,-143258.6,13087.6,107426.6,140996.6,76593.5,15766.5,8355,40865.5
2017-12-28 00:00:00+00:00,T11,237997.3,192192.0,220250.6,10931.2,8367.0,10300,10401,10322,4340,...,-135333.5,-109675.3,-143738.0,8928.4,130039.6,143738.0,94313.4,12493.0,6940,39614.2
2017-12-29 00:00:00+00:00,T11,188502.9,146124.7,165452.9,13330.9,5920.0,7934,8103,8036,4159,...,-85117.9,-37053.6,-122976.5,20889.7,85117.9,122976.5,37053.6,20889.7,5539,43819.8
2017-12-30 00:00:00+00:00,T11,65884.7,51194.5,57913.6,4789.3,4327.0,5416,5475,5453,4089,...,-12643.7,-1886.9,-30372.6,6473.7,12643.7,30372.6,1886.9,6473.7,4220,22927.5


Prepare signal for each turbine

In [45]:
turbine_names = agg_signals["Turbine_ID"].unique()
def create_df_for_each_turbine(signals: pd.DataFrame) -> list[pd.DataFrame]:
    turbine_dfs = list()

    for turbine in turbine_names:
        test = agg_signals["Turbine_ID"]
        turbine_df = signals[agg_signals["Turbine_ID"] == turbine]
        turbine_df = turbine_df.sort_values("Timestamp")
        turbine_df = turbine_df.reset_index()
        
        turbine_dfs.append(turbine_df)

    return turbine_dfs

turbine_dfs = create_df_for_each_turbine(agg_signals)
#convert list to dataframe
#turbine_signals = pd.concat(turbine_dfs, axis=0)


Drop the columns with low variance

In [46]:
def get_signals_with_low_variance(df: pd.DataFrame, threshold=0) -> list:
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    cont_data = df.select_dtypes(include=numerics)
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(cont_data)
    inverted_list = ~np.array(selector.get_support())
    return cont_data.columns[inverted_list].tolist()

In [47]:
cols_to_drop = set()
for i, df in enumerate(turbine_dfs):
    cols_to_drop.update(get_signals_with_low_variance(df))

In [48]:
signals_clean=pd.concat(turbine_dfs, axis=0)
signals_clean=signals_clean.drop(list(cols_to_drop), axis=1)

In [49]:
signals_clean

Unnamed: 0,Timestamp,Turbine_ID,Gen_RPM_Max,Gen_RPM_Min,Gen_RPM_Avg,Gen_RPM_Std,Gen_Bear_Temp_Avg,Gen_Phase1_Temp_Avg,Gen_Phase2_Temp_Avg,Gen_Phase3_Temp_Avg,...,Grd_Prod_PsbleInd_Avg,Grd_Prod_PsbleInd_Max,Grd_Prod_PsbleInd_Min,Grd_Prod_PsbleInd_Std,Grd_Prod_PsbleCap_Avg,Grd_Prod_PsbleCap_Max,Grd_Prod_PsbleCap_Min,Grd_Prod_PsbleCap_Std,Gen_Bear2_Temp_Avg,Nac_Direction_Avg
0,2016-01-01 00:00:00+00:00,T01,181289.2,133914.9,158499.3,13332.7,5667.0,8128,8126,8049,...,-92929.7,-68241.0,-109983.9,9823.7,92920.8,109983.9,67537.9,9877.8,5554,32453.4
1,2016-01-02 00:00:00+00:00,T01,100793.6,71082.2,86946.4,8378.4,4907.0,6556,6525,6458,...,-45063.8,-35598.1,-53448.7,4432.3,44758.1,53448.7,33839.1,4969.9,4752,23121.9
2,2016-01-03 00:00:00+00:00,T01,133818.3,108905.3,124205.1,5775.7,6162.0,7969,7768,7627,...,-67850.8,-54531.9,-73000.0,5227.6,63328.1,73000.0,42621.4,8344.0,5503,35495.5
3,2016-01-04 00:00:00+00:00,T01,246412.7,215913.9,234245.0,5730.7,9402.0,14207,13658,13447,...,-133426.6,-100994.2,-143994.0,13072.6,119419.5,143390.4,77022.7,19260.8,8811,38177.5
4,2016-01-05 00:00:00+00:00,T01,240840.6,206798.4,225681.7,7157.1,8107.0,13162,12695,12515,...,-129142.2,-116437.2,-142871.2,6964.7,119370.2,140043.4,101280.0,8765.6,7097,45004.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,2017-12-27 00:00:00+00:00,T11,243986.1,205969.3,228579.7,8038.6,9281.0,12396,12417,12362,...,-120201.6,-97406.6,-143258.6,13087.6,107426.6,140996.6,76593.5,15766.5,8355,40865.5
727,2017-12-28 00:00:00+00:00,T11,237997.3,192192.0,220250.6,10931.2,8367.0,10300,10401,10322,...,-135333.5,-109675.3,-143738.0,8928.4,130039.6,143738.0,94313.4,12493.0,6940,39614.2
728,2017-12-29 00:00:00+00:00,T11,188502.9,146124.7,165452.9,13330.9,5920.0,7934,8103,8036,...,-85117.9,-37053.6,-122976.5,20889.7,85117.9,122976.5,37053.6,20889.7,5539,43819.8
729,2017-12-30 00:00:00+00:00,T11,65884.7,51194.5,57913.6,4789.3,4327.0,5416,5475,5453,...,-12643.7,-1886.9,-30372.6,6473.7,12643.7,30372.6,1886.9,6473.7,4220,22927.5


# 2. Clean Metacast data

Combine metamast data for both 2016 and 2017.

In [50]:
metamast = pd.concat([metmast_2016, metmast_2017], axis=0)
metamast

Unnamed: 0,Timestamp,Min_Windspeed1,Max_Windspeed1,Avg_Windspeed1,Var_Windspeed1,Min_Windspeed2,Max_Windspeed2,Avg_Windspeed2,Var_Windspeed2,Min_Winddirection2,...,Anemometer1_CorrOffset,Anemometer2_Freq,Anemometer2_Offset,Anemometer2_CorrGain,Anemometer2_CorrOffset,DistanceAirPress,AirRessureSensorZeroOffset,Anemometer1_Avg_Freq,Anemometer2_Avg_Freq,Pressure_Avg_Freq
0,2016-06-24T13:50:00+00:00,0.9,5.4,3.2,0.64,1.5,5.2,3.2,0.64,236.0,...,0,0.0499,0.24,1,0,0,600,60,60,410
1,2016-06-24T14:20:00+00:00,0.4,4.9,2.6,0.96,0.5,5.0,2.6,0.96,236.0,...,0,0.0499,0.24,1,0,0,600,49,49,410
2,2016-11-28T12:10:00+00:00,0.3,1.5,0.9,0.06,0.3,1.5,0.9,0.06,236.0,...,0,0.0499,0.24,1,0,0,600,15,14,406
3,2016-06-25T06:30:00+00:00,1.0,1.9,1.2,0.03,1.0,1.9,1.3,0.02,236.0,...,0,0.0499,0.24,1,0,0,600,21,22,407
4,2016-12-23T11:50:00+00:00,2.6,9.6,5.6,1.84,3.3,8.2,5.4,0.77,236.0,...,0,0.0499,0.24,1,0,0,600,109,105,427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34826,2017-06-05T11:40:00+00:00,2.6,5.9,4.4,0.33,2.9,6.1,4.4,0.33,236.0,...,0,0.0499,0.24,1,0,0,600,84,85,411
34827,2017-05-11T00:20:00+00:00,7.8,16.0,11.8,2.05,8.0,16.3,11.9,2.12,236.0,...,0,0.0499,0.24,1,0,0,600,233,235,397
34828,2017-07-08T04:50:00+00:00,3.5,7.6,6.1,0.77,3.3,7.8,6.2,0.81,236.0,...,0,0.0499,0.24,1,0,0,600,120,122,409
34829,2017-05-10T23:40:00+00:00,9.0,14.2,11.2,1.01,9.1,14.4,11.3,1.01,236.0,...,0,0.0499,0.24,1,0,0,600,221,224,397


Aggregate the metamast data into daily time slots

In [51]:
metamast['Timestamp'] = pd.to_datetime(metamast['Timestamp'])
metamast=metamast.set_index('Timestamp')
metamast=metamast.resample('D').sum()

In [52]:
metamast = metamast.sort_values("Timestamp")
metmast = metamast.reset_index()

In [63]:
metmast.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 41 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   Timestamp                   731 non-null    datetime64[ns, UTC]
 1   Min_Windspeed1              731 non-null    float64            
 2   Max_Windspeed1              731 non-null    float64            
 3   Avg_Windspeed1              731 non-null    float64            
 4   Var_Windspeed1              731 non-null    float64            
 5   Min_Windspeed2              731 non-null    float64            
 6   Max_Windspeed2              731 non-null    float64            
 7   Avg_Windspeed2              731 non-null    float64            
 8   Var_Windspeed2              731 non-null    float64            
 9   Min_Winddirection2          731 non-null    float64            
 10  Max_Winddirection2          731 non-null    int64             

In [64]:
cols_to_drop = get_signals_with_low_variance(metamast)  
metmast_clean = metamast.drop(cols_to_drop, axis=1)
metmast_clean = metmast_clean.drop(["Min_Winddirection2", "Max_Winddirection2", "Avg_Winddirection2", "Var_Winddirection2"], axis=1)
metmast_clean=metmast_clean.reset_index('Timestamp')

In [66]:
metmast_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   Timestamp                   731 non-null    datetime64[ns, UTC]
 1   Min_Windspeed1              731 non-null    float64            
 2   Max_Windspeed1              731 non-null    float64            
 3   Avg_Windspeed1              731 non-null    float64            
 4   Var_Windspeed1              731 non-null    float64            
 5   Min_Windspeed2              731 non-null    float64            
 6   Max_Windspeed2              731 non-null    float64            
 7   Avg_Windspeed2              731 non-null    float64            
 8   Var_Windspeed2              731 non-null    float64            
 9   Min_AmbientTemp             731 non-null    float64            
 10  Max_AmbientTemp             731 non-null    int64             

# 3. Failure Data cleaning

In [24]:
failures=pd.concat([failure_2016, failure_2017], axis=0)

In [25]:
failures.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28 entries, 0 to 11
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Turbine_ID  28 non-null     object
 1   Component   28 non-null     object
 2   Timestamp   28 non-null     object
 3   Remarks     28 non-null     object
dtypes: object(4)
memory usage: 1.1+ KB


In [59]:
signals_clean.index

Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       721, 722, 723, 724, 725, 726, 727, 728, 729, 730],
      dtype='int64', length=2924)

In [67]:
merged_df = pd.merge(signals_clean.reset_index(drop=True), metmast_clean.reset_index(drop=True), on="Timestamp", how="left")

In [117]:
merged_df

Unnamed: 0,Timestamp,Turbine_ID,Gen_RPM_Max,Gen_RPM_Min,Gen_RPM_Avg,Gen_RPM_Std,Gen_Bear_Temp_Avg,Gen_Phase1_Temp_Avg,Gen_Phase2_Temp_Avg,Gen_Phase3_Temp_Avg,...,Anemometer1_Freq,Anemometer1_Offset,Anemometer1_CorrGain,Anemometer2_Freq,Anemometer2_Offset,Anemometer2_CorrGain,AirRessureSensorZeroOffset,Anemometer1_Avg_Freq,Anemometer2_Avg_Freq,Pressure_Avg_Freq
0,2016-01-01 00:00:00+00:00,T01,181289.2,133914.9,158499.3,13332.7,5667.0,8128,8126,8049,...,7.1856,34.56,144,7.1856,34.56,144,86400,16889,17014,59950
1,2016-01-02 00:00:00+00:00,T01,100793.6,71082.2,86946.4,8378.4,4907.0,6556,6525,6458,...,7.1856,34.56,144,7.1856,34.56,144,86400,11386,11376,59953
2,2016-01-03 00:00:00+00:00,T01,133818.3,108905.3,124205.1,5775.7,6162.0,7969,7768,7627,...,7.1856,34.56,144,7.1856,34.56,144,86400,15764,15811,59603
3,2016-01-04 00:00:00+00:00,T01,246412.7,215913.9,234245.0,5730.7,9402.0,14207,13658,13447,...,7.1856,34.56,144,7.1856,34.56,144,86400,28377,28462,58864
4,2016-01-05 00:00:00+00:00,T01,240840.6,206798.4,225681.7,7157.1,8107.0,13162,12695,12515,...,7.1856,34.56,144,7.1856,34.56,144,86400,28373,28164,58591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2919,2017-12-27 00:00:00+00:00,T11,243986.1,205969.3,228579.7,8038.6,9281.0,12396,12417,12362,...,7.1856,34.56,144,7.1856,34.56,144,86400,28645,28811,59293
2920,2017-12-28 00:00:00+00:00,T11,237997.3,192192.0,220250.6,10931.2,8367.0,10300,10401,10322,...,7.1856,34.56,144,7.1856,34.56,144,86400,23033,23268,59498
2921,2017-12-29 00:00:00+00:00,T11,188502.9,146124.7,165452.9,13330.9,5920.0,7934,8103,8036,...,7.1856,34.56,144,7.1856,34.56,144,86400,14460,14378,60189
2922,2017-12-30 00:00:00+00:00,T11,65884.7,51194.5,57913.6,4789.3,4327.0,5416,5475,5453,...,7.1856,34.56,144,7.1856,34.56,144,86400,7487,7472,60339


In [118]:
merged_df = merged_df.dropna()

In [119]:
merged_df

Unnamed: 0,Timestamp,Turbine_ID,Gen_RPM_Max,Gen_RPM_Min,Gen_RPM_Avg,Gen_RPM_Std,Gen_Bear_Temp_Avg,Gen_Phase1_Temp_Avg,Gen_Phase2_Temp_Avg,Gen_Phase3_Temp_Avg,...,Anemometer1_Freq,Anemometer1_Offset,Anemometer1_CorrGain,Anemometer2_Freq,Anemometer2_Offset,Anemometer2_CorrGain,AirRessureSensorZeroOffset,Anemometer1_Avg_Freq,Anemometer2_Avg_Freq,Pressure_Avg_Freq
0,2016-01-01 00:00:00+00:00,T01,181289.2,133914.9,158499.3,13332.7,5667.0,8128,8126,8049,...,7.1856,34.56,144,7.1856,34.56,144,86400,16889,17014,59950
1,2016-01-02 00:00:00+00:00,T01,100793.6,71082.2,86946.4,8378.4,4907.0,6556,6525,6458,...,7.1856,34.56,144,7.1856,34.56,144,86400,11386,11376,59953
2,2016-01-03 00:00:00+00:00,T01,133818.3,108905.3,124205.1,5775.7,6162.0,7969,7768,7627,...,7.1856,34.56,144,7.1856,34.56,144,86400,15764,15811,59603
3,2016-01-04 00:00:00+00:00,T01,246412.7,215913.9,234245.0,5730.7,9402.0,14207,13658,13447,...,7.1856,34.56,144,7.1856,34.56,144,86400,28377,28462,58864
4,2016-01-05 00:00:00+00:00,T01,240840.6,206798.4,225681.7,7157.1,8107.0,13162,12695,12515,...,7.1856,34.56,144,7.1856,34.56,144,86400,28373,28164,58591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2919,2017-12-27 00:00:00+00:00,T11,243986.1,205969.3,228579.7,8038.6,9281.0,12396,12417,12362,...,7.1856,34.56,144,7.1856,34.56,144,86400,28645,28811,59293
2920,2017-12-28 00:00:00+00:00,T11,237997.3,192192.0,220250.6,10931.2,8367.0,10300,10401,10322,...,7.1856,34.56,144,7.1856,34.56,144,86400,23033,23268,59498
2921,2017-12-29 00:00:00+00:00,T11,188502.9,146124.7,165452.9,13330.9,5920.0,7934,8103,8036,...,7.1856,34.56,144,7.1856,34.56,144,86400,14460,14378,60189
2922,2017-12-30 00:00:00+00:00,T11,65884.7,51194.5,57913.6,4789.3,4327.0,5416,5475,5453,...,7.1856,34.56,144,7.1856,34.56,144,86400,7487,7472,60339


In [120]:
#set hours min and seconds to 0

failures['Timestamp'] = pd.to_datetime(failures['Timestamp'])
#failures['Timestamp'] = failures['Timestamp'].dt.floor('d')

In [121]:
failures

Unnamed: 0,Turbine_ID,Component,Timestamp,Remarks
0,T01,GEARBOX,2016-07-18 02:10:00+00:00,Gearbox pump damaged
1,T06,GENERATOR,2016-07-11 19:48:00+00:00,Generator replaced
2,T06,GENERATOR,2016-07-24 17:01:00+00:00,Generator temperature sensor failure
3,T06,GENERATOR,2016-09-04 08:08:00+00:00,High temperature generator error
4,T06,GENERATOR,2016-10-27 16:26:00+00:00,Generator replaced
5,T06,GENERATOR,2016-10-02 17:08:00+00:00,Refrigeration system and temperature sensors i...
6,T06,HYDRAULIC_GROUP,2016-04-04 18:53:00+00:00,Error in pitch regulation
7,T07,GENERATOR_BEARING,2016-04-30 12:40:00+00:00,High temperature in generator bearing (replace...
8,T07,TRANSFORMER,2016-07-10 03:46:00+00:00,High temperature transformer
9,T07,TRANSFORMER,2016-08-23 02:21:00+00:00,High temperature transformer. Transformer refr...


In [122]:
days_lookback = 60
def create_failure_list(failures: pd.DataFrame, days_lookback: int, target_name: str = "Target") -> pd.DataFrame:
    failure_list = []

    for i in range(len(failures)):
        turbine_id = str(failures.iloc[i]["Turbine_ID"])
        failure_datetime = failures.iloc[i]["Timestamp"]
        components = failures.iloc[i]["Component"]
        rounded_datetime = failure_datetime.replace(hour=0, minute=0, second=0, microsecond=0)  # Round to the start of the day

        for j in range(days_lookback):
            delta = timedelta(days=j)
            new_datetime = rounded_datetime - delta
            datetime_formated = new_datetime.replace(tzinfo=timezone.utc)
            failure_list.append([turbine_id, datetime_formated.isoformat(), components, 1])
    
    failure_df = pd.DataFrame(failure_list, columns=["Turbine_ID", "Timestamp", "component",target_name])
    return failure_df


In [123]:
failure_df_class = create_failure_list(failures, days_lookback, "Failure (Target)")

In [126]:
failure_df_class.to_csv('./data/failures_df.csv', index=False)

In [None]:
reg_target_name = "RUL (Target)"
class_target_name = "Failure (Target)"

regression_function = lambda i, j: i / j
classif_function = lambda i, j: 1

failure_df_reg = create_failure_list(days_lookback, regression_function, reg_target_name)
failure_df_class = create_failure_list(days_lookback, classif_function, class_target_name)

In [112]:
#failure_df_reg['Timestamp'] = pd.to_datetime(failure_df_reg['Timestamp'])
failure_df_class['Timestamp'] = pd.to_datetime(failure_df_class['Timestamp'])

In [114]:
#labeled_df_temp = pd.merge(merged_df.reset_index(drop=True), failure_df_reg.reset_index(drop=True), on=["Turbine_ID", "Timestamp"], how="left"); 
labeled_df = pd.merge(merged_df.reset_index(drop=True), failure_df_class.reset_index(drop=True), on=["Turbine_ID", "Timestamp"], how="left"); 

In [115]:
#labeled_df[reg_target_name].fillna(1.0, inplace = True)
labeled_df[class_target_name].fillna(0, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  labeled_df[class_target_name].fillna(0, inplace = True)


In [127]:
labeled_df.drop_duplicates(inplace=True)

In [129]:
labeled_df.to_csv('./data/labeled_data.csv', index=False)