In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from datetime import datetime, timedelta, timezone
from sklearn.model_selection import train_test_split

In [5]:
failure_2016 = pd.read_csv('../data/init/failures-2016.csv',sep=';')
failure_2017 = pd.read_csv('../data/init/failures-2017.csv',sep=';')
metmast_2016 = pd.read_csv('../data/init/metmast-2016.csv',sep=';')
metmast_2017 = pd.read_csv('../data/init/metmast-2017.csv',sep=';')
signals_2016 = pd.read_csv('../data/init/signals-2016.csv', sep=';')
signals_2017= pd.read_csv('../data/init/signals-2017.csv', sep=';')



# 1. Cleaning Signal data

Combining signals from both years and aggregating the time series data into recurring once a day.

In [6]:
def signal_preprocess(signals):
    
    signals['Timestamp'] = pd.to_datetime(signals['Timestamp'])
    signals=signals.set_index('Timestamp')
    return signals


In [7]:
signals=pd.concat([signals_2016, signals_2017], axis=0)
signals = signal_preprocess(signals)

Drop the columns with low variance

In [8]:
def get_signals_with_low_variance(df: pd.DataFrame, threshold=0) -> list:
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    cont_data = df.select_dtypes(include=numerics)
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(cont_data)
    inverted_list = ~np.array(selector.get_support())
    return cont_data.columns[inverted_list].tolist()

In [9]:
cols_to_drop=get_signals_with_low_variance(signals)
signals.drop(cols_to_drop, axis=1, inplace=True)

Aggregate the time series dataframe into a daily data frame

In [10]:
def aggregate_signals(signals):
    agg_signals=signals.groupby('Turbine_ID').resample('D').sum()
    agg_signals['Turbine_ID'] = agg_signals.index.get_level_values('Turbine_ID')  
    agg_signals=agg_signals.reset_index('Timestamp')
    return agg_signals


In [11]:
agg_signals=aggregate_signals(signals)
agg_signals=agg_signals.reset_index(drop=True)

Prepare signal for each turbine

In [50]:
"""turbine_names = agg_signals["Turbine_ID"].unique()
def create_df_for_each_turbine(signals: pd.DataFrame) -> list[pd.DataFrame]:
    turbine_dfs = list()

    for turbine in turbine_names:
        test = agg_signals["Turbine_ID"]
        turbine_df = signals[agg_signals["Turbine_ID"] == turbine]
        turbine_df = turbine_df.sort_values("Timestamp")
        turbine_df = turbine_df.reset_index()
        
        turbine_dfs.append(turbine_df)

    return turbine_dfs

turbine_dfs = create_df_for_each_turbine(agg_signals)
#convert list to dataframe
#turbine_signals = pd.concat(turbine_dfs, axis=0)"""


In [12]:
# binary column to indicate if a signal data has any missing values
agg_signals['missing_values'] = agg_signals.isnull().any(axis=1).astype(int)
#fill the missing valeu with backfill and forwardfill
signals_clean=agg_signals.fillna(method='bfill')
#signals_clean=signals_clean.reset_index()

  signals_clean=agg_signals.fillna(method='bfill')


# 2. Clean Metacast data

Combine metamast data for both 2016 and 2017. Aggregate the metamast data into daily time slots

Metmast data do not have data from 2017-01-04 to 2017-05-05

In [13]:
metmast = pd.concat([metmast_2016, metmast_2017], axis=0)
metmast['Timestamp'] = pd.to_datetime(metmast['Timestamp'])
metmast=metmast.set_index('Timestamp')


Drop the columns with low variance

In [14]:
cols_to_drop = get_signals_with_low_variance(metmast)  
metmast = metmast.drop(cols_to_drop, axis=1)
metmast= metmast.drop(["Min_Winddirection2", "Max_Winddirection2", "Avg_Winddirection2", "Var_Winddirection2"], axis=1)

Aggregate the data according to the columns (with mean or sum)

In [15]:
aggregation_rules = {
    'Min_Windspeed1': 'sum',
    'Max_Windspeed1': 'sum',
    'Avg_Windspeed1': 'mean',
    'Var_Windspeed1': 'sum',
    'Min_Windspeed2': 'sum',
    'Max_Windspeed2': 'sum',
    'Avg_Windspeed2': 'mean',
    'Var_Windspeed2': 'sum',
    'Min_AmbientTemp': 'sum',
    'Max_AmbientTemp': 'sum',
    'Avg_AmbientTemp': 'mean',
    'Min_Pressure': 'sum',
    'Max_Pressure': 'sum',
    'Avg_Pressure': 'mean',
    'Min_Humidity': 'sum',
    'Max_Humidity': 'sum',
    'Avg_Humidity': 'mean',
    'Min_Precipitation': 'sum',
    'Max_Precipitation': 'sum',
    'Avg_Precipitation': 'sum',
    'Max_Raindetection': 'sum',
    'Anemometer1_Avg_Freq': 'mean',
    'Anemometer2_Avg_Freq': 'mean',
    'Pressure_Avg_Freq': 'mean',
}


In [16]:
agg_metmast=metmast.resample('D').agg(aggregation_rules)
agg_metmast=agg_metmast.reset_index()


In [17]:
agg_metmast['metamast_missing_values'] = agg_metmast.isnull().any(axis=1).astype(int)
metmast_clean=agg_metmast.fillna(method='bfill')

  metmast_clean=agg_metmast.fillna(method='bfill')


# Merge metamast data and signals data

In [18]:
def merge_signals_metmast(signals: pd.DataFrame, metmast: pd.DataFrame) -> pd.DataFrame:
    merged_df = pd.merge(signals, metmast, on="Timestamp", how="left")
    #merged_df.drop(columns=["index"], inplace=True)
    merged_df = merged_df[merged_df["missing_values"] == 0]
    merged_df = merged_df[merged_df["metamast_missing_values"] == 0 ]
    merged_df.drop(columns=["missing_values", "metamast_missing_values"], inplace=True)
    return merged_df


In [19]:
merged_df = merge_signals_metmast(signals_clean, metmast_clean)

In [20]:
merged_df.reset_index(drop=True)

Unnamed: 0,Timestamp,Turbine_ID,Gen_RPM_Max,Gen_RPM_Min,Gen_RPM_Avg,Gen_RPM_Std,Gen_Bear_Temp_Avg,Gen_Phase1_Temp_Avg,Gen_Phase2_Temp_Avg,Gen_Phase3_Temp_Avg,...,Min_Humidity,Max_Humidity,Avg_Humidity,Min_Precipitation,Max_Precipitation,Avg_Precipitation,Max_Raindetection,Anemometer1_Avg_Freq,Anemometer2_Avg_Freq,Pressure_Avg_Freq
0,2016-01-01 00:00:00+00:00,T01,181289.2,133914.9,158499.3,13332.7,5667.0,8128,8126,8049,...,12425.0,12698,87.229167,0,0,0,0,117.284722,118.152778,416.319444
1,2016-01-02 00:00:00+00:00,T01,100793.6,71082.2,86946.4,8378.4,4907.0,6556,6525,6458,...,12523.0,12755,87.743056,22,85,54,0,79.069444,79.000000,416.340278
2,2016-01-03 00:00:00+00:00,T01,133818.3,108905.3,124205.1,5775.7,6162.0,7969,7768,7627,...,13499.0,13671,94.208333,4,23,14,0,109.472222,109.798611,413.909722
3,2016-01-04 00:00:00+00:00,T01,246412.7,215913.9,234245.0,5730.7,9402.0,14207,13658,13447,...,12603.0,12921,88.534722,6,30,17,0,197.062500,197.652778,408.777778
4,2016-01-05 00:00:00+00:00,T01,240840.6,206798.4,225681.7,7157.1,8107.0,13162,12695,12515,...,10847.0,11252,76.645833,33,89,61,0,197.034722,195.583333,406.881944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2431,2017-12-27 00:00:00+00:00,T11,243986.1,205969.3,228579.7,8038.6,9281.0,12396,12417,12362,...,11275.0,11590,79.347222,9,21,15,0,198.923611,200.076389,411.756944
2432,2017-12-28 00:00:00+00:00,T11,237997.3,192192.0,220250.6,10931.2,8367.0,10300,10401,10322,...,12388.0,12598,86.687500,2,10,6,0,159.951389,161.583333,413.180556
2433,2017-12-29 00:00:00+00:00,T11,188502.9,146124.7,165452.9,13330.9,5920.0,7934,8103,8036,...,11533.0,11954,81.493056,0,0,0,0,100.416667,99.847222,417.979167
2434,2017-12-30 00:00:00+00:00,T11,65884.7,51194.5,57913.6,4789.3,4327.0,5416,5475,5453,...,11581.0,11946,81.638889,0,4,2,0,51.993056,51.888889,419.020833


# 3. Clean Failure data

In [21]:
failures=pd.concat([failure_2016, failure_2017], axis=0)
failures['Timestamp'] = pd.to_datetime(failures['Timestamp'])
failures['Timestamp'] = failures['Timestamp'].dt.floor('d')


In [28]:
failures.to_csv('../data/model_data/failures.csv', index=False)

In [22]:
days_lookback = 60
def create_failure_list(failures: pd.DataFrame, days_lookback: int, value_function, target_name: str = "Target") -> pd.DataFrame:
    failure_list = []

    for i in range(len(failures)):
        turbine_id = str(failures.iloc[i]["Turbine_ID"])
        failure_datetime = failures.iloc[i]["Timestamp"]
        components = failures.iloc[i]["Component"]
        rounded_datetime = failure_datetime.replace(hour=0, minute=0, second=0, microsecond=0)  # Round to the start of the day

        for j in range(days_lookback):
            delta = timedelta(days=j)
            new_datetime = rounded_datetime - delta
            datetime_formated = new_datetime.replace(tzinfo=timezone.utc)
            # Calculate the target value using the provided value_function
            target_value = value_function(j, days_lookback)
            failure_list.append([turbine_id, datetime_formated.isoformat(), components,target_value])
    
    failure_df = pd.DataFrame(failure_list, columns=["Turbine_ID", "Timestamp", "Component",target_name])
    return failure_df

In [23]:
reg_target_name = "RUL (Target)"
class_target_name = "Failure (Target)"
regression_function = lambda i, j: i
classif_function = lambda i, j: 1

In [73]:
components = failures["Component"].unique()
for component in components:
    globals()[f"failure_df_{component}"] = failures[failures["Component"] == "GEARBOX"]
    globals()[f"failure_df_{component}"] = create_failure_list(failures[failures["Component"] == component], days_lookback, classif_function, class_target_name)
    globals()[f"failure_df_{component}"]['Timestamp'] = pd.to_datetime(globals()[f"failure_df_{component}"]['Timestamp'])
    globals()[f"labeled_df_{component}"] = pd.merge(merged_df.reset_index(drop=True), globals()[f"failure_df_{component}"].reset_index(drop=True), on=["Turbine_ID", "Timestamp"], how="left")
    globals()[f"labeled_df_{component}"][class_target_name] = globals()[f"labeled_df_{component}"][class_target_name].fillna(0).astype(int)
    globals()[f"labeled_df_{component}"].drop_duplicates(inplace=True)
    


In [25]:
import os
os.makedirs('./data/model_data', exist_ok=True)

In [63]:
for component in components:
    globals()[f"labeled_df_{component}"].to_csv(f'./data/model_data/labelled_data_{component}.csv', index=False)


In [27]:
failures.to_csv('./data/failures.csv', index=False)