In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from datetime import datetime, timedelta, timezone

In [4]:
failure_2016 = pd.read_csv('./data/init/failures-2016.csv',sep=';')
failure_2017 = pd.read_csv('./data/init/failures-2017.csv',sep=';')
metmast_2016 = pd.read_csv('./data/init/metmast-2016.csv',sep=';')
metmast_2017 = pd.read_csv('./data/init/metmast-2017.csv',sep=';')
signals_2016 = pd.read_csv('./data/init/signals-2016.csv', sep=';')
signals_2017= pd.read_csv('./data/init/signals-2017.csv', sep=';')

In [5]:
def signal_preprocess(signals):
    
    signals['Timestamp'] = pd.to_datetime(signals['Timestamp'])
    signals=signals.set_index('Timestamp')
    return signals

In [6]:
signals=pd.concat([signals_2016, signals_2017], axis=0)
signals = signal_preprocess(signals)

In [7]:
def get_signals_with_low_variance(df: pd.DataFrame, threshold=0) -> list:
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    cont_data = df.select_dtypes(include=numerics)
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(cont_data)
    inverted_list = ~np.array(selector.get_support())
    return cont_data.columns[inverted_list].tolist()

In [8]:
cols_to_drop = set()
for i, df in enumerate([signals]):
    cols_to_drop.update(get_signals_with_low_variance(df))

In [9]:
signals_clean=signals.drop(list(cols_to_drop), axis=1)

In [10]:
signals_clean.reset_index(inplace=True)

In [11]:
metmast = pd.concat([metmast_2016, metmast_2017], axis=0)
metmast['Timestamp'] = pd.to_datetime(metmast['Timestamp'])
metmast=metmast.set_index('Timestamp')

In [12]:
cols_to_drop = get_signals_with_low_variance(metmast)  
metmast_clean = metmast.drop(cols_to_drop, axis=1)
metmast_clean = metmast_clean.drop(["Min_Winddirection2", "Max_Winddirection2", "Avg_Winddirection2", "Var_Winddirection2"], axis=1)


In [13]:
metmast_clean['metamast_missing_values'] = metmast_clean.isnull().any(axis=1).astype(int)
metmast_clean=metmast_clean.fillna(method='bfill')
metmast_clean=metmast_clean.reset_index()

  metmast_clean=metmast_clean.fillna(method='bfill')


In [14]:
def merge_signals_metmast(signals: pd.DataFrame, metmast: pd.DataFrame) -> pd.DataFrame:
    merged_df = pd.merge(signals.reset_index(drop=True), metmast.reset_index(drop=True), on="Timestamp", how="left")
    #merged_df.drop(columns=["index_x"], inplace=True)
    #merged_df = merged_df[merged_df["missing_values"] == 0]
    #merged_df = merged_df[merged_df["metamast_missing_values"] == 0]
    #merged_df.drop(columns=["missing_values", "metamast_missing_values"], inplace=True)
    return merged_df

In [15]:
merged_df = merge_signals_metmast(signals_clean, metmast_clean)

In [16]:
merged_df['Timestamp'] = pd.to_datetime(merged_df['Timestamp'])

In [17]:
failures=pd.concat([failure_2016, failure_2017], axis=0)
failures['Timestamp'] = pd.to_datetime(failures['Timestamp'])
failures['Timestamp'] = failures['Timestamp'].dt.floor('d')

In [18]:
def get_round_minute_diff(datetime_in: datetime) -> timedelta:
    min = datetime_in.minute
    rounded_min = round(min, -1)
    diff = rounded_min - min
    return timedelta(minutes=diff)

In [19]:
def convert_round_minute_to_time(datetime_in: datetime) -> datetime:
    td = get_round_minute_diff(datetime_in)
    return datetime_in + td

In [77]:
days_lookback = 60

def create_failure_list(days_lookback: int, value_function, target_name: str = "Target") -> pd.DataFrame:
    
    ten_mins_of_n_days = int(24 * 60 * days_lookback / 10)
    failure_list = []
    for i, failure in failures.iterrows():
        turbine_id = str(failure["Turbine_ID"])
        failure_ts = str(failure["Timestamp"])
        failure_datetime = datetime.fromisoformat(failure_ts)
        rounded_datetime = convert_round_minute_to_time(failure_datetime)
        for j in range(ten_mins_of_n_days):
            delta = timedelta(minutes=j*10)
            new_datetime = rounded_datetime - delta
            datetime_formated = new_datetime.replace(tzinfo=timezone.utc)
            
            failure_list.append([turbine_id, datetime_formated.isoformat(), value_function(j, ten_mins_of_n_days)])
    
    failure_df = pd.DataFrame(failure_list, columns=["Turbine_ID", "Timestamp", target_name])

    return failure_df

In [21]:
failures.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28 entries, 0 to 11
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   Turbine_ID  28 non-null     object             
 1   Component   28 non-null     object             
 2   Timestamp   28 non-null     datetime64[ns, UTC]
 3   Remarks     28 non-null     object             
dtypes: datetime64[ns, UTC](1), object(3)
memory usage: 1.1+ KB


In [78]:
reg_target_name = "RUL (Target)"
class_target_name = "Failure (Target)"

regression_function = lambda i, j: i / j
#regression_function = lambda rul, max_rul: rul
classif_function = lambda i, j: 1

failure_df_reg = create_failure_list(days_lookback, regression_function, reg_target_name)
failure_df_class = create_failure_list(days_lookback, classif_function, class_target_name)

In [79]:
failure_df_reg['Timestamp'] = pd.to_datetime(failure_df_reg['Timestamp'])
failure_df_class['Timestamp'] = pd.to_datetime(failure_df_class['Timestamp'])

In [88]:
failure_df_reg.to_csv('./data/failures_reg.csv', index=False)

In [81]:
labeled_df_temp = pd.merge(merged_df.reset_index(drop=True), failure_df_reg.reset_index(drop=True), on=["Turbine_ID", "Timestamp"], how="left"); 
labeled_df = pd.merge(labeled_df_temp.reset_index(drop=True), failure_df_class.reset_index(drop=True), on=["Turbine_ID", "Timestamp"], how="left"); 

In [82]:
labeled_df[reg_target_name].fillna(1.0, inplace = True)
labeled_df[class_target_name].fillna(0, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  labeled_df[reg_target_name].fillna(1.0, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  labeled_df[class_target_name].fillna(0, inplace = True)


In [83]:
labeled_df

Unnamed: 0,Timestamp,Turbine_ID,Gen_RPM_Max,Gen_RPM_Min,Gen_RPM_Avg,Gen_RPM_Std,Gen_Bear_Temp_Avg,Gen_Phase1_Temp_Avg,Gen_Phase2_Temp_Avg,Gen_Phase3_Temp_Avg,...,Min_Precipitation,Max_Precipitation,Avg_Precipitation,Max_Raindetection,Anemometer1_Avg_Freq,Anemometer2_Avg_Freq,Pressure_Avg_Freq,metamast_missing_values,RUL (Target),Failure (Target)
0,2016-01-08 23:10:00+00:00,T06,1634.3,1226.9,1416.4,96.4,49.0,62,63,64,...,0.0,0.0,0.0,0.0,164.0,166.0,406.0,0.0,1.0,0.0
1,2016-04-19 12:20:00+00:00,T06,1796.1,1597.0,1680.0,29.8,66.0,97,107,111,...,0.0,0.0,0.0,0.0,258.0,259.0,405.0,0.0,1.0,0.0
2,2016-01-08 23:10:00+00:00,T01,1657.5,1299.2,1495.0,83.8,47.0,62,62,61,...,0.0,0.0,0.0,0.0,164.0,166.0,406.0,0.0,1.0,0.0
3,2016-04-19 12:30:00+00:00,T11,1771.2,1590.0,1677.5,29.4,66.0,111,110,110,...,0.0,0.0,0.0,0.0,187.0,188.0,405.0,0.0,1.0,0.0
4,2016-01-08 23:50:00+00:00,T07,1667.5,1277.4,1481.3,119.1,46.0,73,72,71,...,0.0,0.0,0.0,0.0,157.0,158.0,405.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504281,2017-05-31 10:50:00+00:00,T01,1700.7,1362.5,1619.8,68.0,55.0,89,87,86,...,0.0,0.0,0.0,0.0,160.0,171.0,408.0,0.0,1.0,0.0
504282,2017-06-03 00:20:00+00:00,T06,0.0,0.0,0.0,0.0,42.0,45,46,46,...,0.0,0.0,0.0,0.0,17.0,15.0,409.0,0.0,1.0,0.0
504283,2017-05-31 11:10:00+00:00,T11,1675.5,1299.7,1565.7,102.2,64.0,72,73,73,...,0.0,0.0,0.0,0.0,167.0,187.0,408.0,0.0,1.0,0.0
504284,2017-06-03 00:50:00+00:00,T06,0.0,0.0,0.0,0.0,39.0,41,41,41,...,0.0,0.0,0.0,0.0,39.0,39.0,409.0,0.0,1.0,0.0


In [84]:

def aggregate_signals(signals):
    agg_signals=signals.groupby('Turbine_ID').resample('D').sum()
    agg_signals['Turbine_ID'] = agg_signals.index.get_level_values('Turbine_ID')  
    agg_signals=agg_signals.reset_index('Timestamp')
    return agg_signals



In [85]:
labeled_df['Timestamp'] = pd.to_datetime(labeled_df['Timestamp'])
labeled_df=labeled_df.set_index('Timestamp')
labeled_df_agg=aggregate_signals(labeled_df)

In [92]:
labeled_df_agg

<class 'pandas.core.frame.DataFrame'>
Index: 2924 entries, T01 to T11
Columns: 108 entries, Timestamp to Failure (Target)
dtypes: datetime64[ns, UTC](1), float64(78), int64(28), object(1)
memory usage: 2.4+ MB


In [93]:
#drop the last two columns with rul target
labeled_df_agg.drop(['RUL (Target)', 'Failure (Target)'], axis=1, inplace=True)


In [96]:
#drop the columns metamast_missing_values and missing_values
labeled_df_agg.drop(['metamast_missing_values'], axis=1, inplace=True)


In [129]:
def create_regression_failure_list(failures: pd.DataFrame, days_lookback: int, value_function, target_name: str = "Target") -> pd.DataFrame:
    failure_list = []

    for i in range(len(failures)):
        turbine_id = str(failures.iloc[i]["Turbine_ID"])
        failure_datetime = failures.iloc[i]["Timestamp"]
        rounded_datetime = failure_datetime.replace(hour=0, minute=0, second=0, microsecond=0)  # Round to the start of the day

        for j in range(days_lookback):
            delta = timedelta(days=j)
            new_datetime = rounded_datetime - delta
            datetime_formated = new_datetime.replace(tzinfo=timezone.utc)
            # Calculate the target value using the provided value_function
            target_value = value_function(j, days_lookback)
            failure_list.append([turbine_id, datetime_formated.isoformat(), target_value])
    
    failure_df = pd.DataFrame(failure_list, columns=["Turbine_ID", "Timestamp", target_name])
    return failure_df

In [130]:
reg_target_name = "RUL (Target)"
class_target_name = "Failure (Target)"

regression_function = lambda i, j: i /j
#regression_function = lambda i, j: (j - i) / j
#regression_function = lambda rul, max_rul: rul
classif_function = lambda i, j: 1

days_lookback = 60
failure_df_reg = create_regression_failure_list(failures, days_lookback, regression_function, reg_target_name)
failure_df_class = create_regression_failure_list(failures, days_lookback, classif_function, class_target_name)
#failure_df_class = create_failure_list(days_lookback, classif_function, class_target_name)

In [132]:
failure_df_class

Unnamed: 0,Turbine_ID,Timestamp,Failure (Target)
0,T01,2016-07-18T00:00:00+00:00,1
1,T01,2016-07-17T00:00:00+00:00,1
2,T01,2016-07-16T00:00:00+00:00,1
3,T01,2016-07-15T00:00:00+00:00,1
4,T01,2016-07-14T00:00:00+00:00,1
...,...,...,...
1675,T11,2017-07-19T00:00:00+00:00,1
1676,T11,2017-07-18T00:00:00+00:00,1
1677,T11,2017-07-17T00:00:00+00:00,1
1678,T11,2017-07-16T00:00:00+00:00,1


In [86]:
labeled_df_agg.to_csv('./data/labeled_data_agg.csv', index=False)