In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from datetime import datetime, timedelta, timezone

In [2]:
# Raw inputs are semicolon-delimited CSV exports: one file per source and year.
failure_2016, failure_2017 = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('./data/init/failures-2016.csv', './data/init/failures-2017.csv')
)
metmast_2016, metmast_2017 = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('./data/init/metmast-2016.csv', './data/init/metmast-2017.csv')
)
signals_2016, signals_2017 = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('./data/init/signals-2016.csv', './data/init/signals-2017.csv')
)

In [3]:
def signal_preprocess(signals):
    """Return ``signals`` re-indexed by its parsed ``Timestamp`` column.

    The ``Timestamp`` column is converted with ``pd.to_datetime`` and moved into
    the index of the returned frame.

    The input frame is not modified: the parsed column is attached to a copy
    (``DataFrame.assign``), so the caller's ``Timestamp`` column keeps its
    original values and the function can safely be re-run on the same input.

    Parameters
    ----------
    signals : pd.DataFrame
        Frame containing a ``Timestamp`` column parseable by ``pd.to_datetime``.

    Returns
    -------
    pd.DataFrame
        New frame indexed by the parsed ``Timestamp``; all other columns unchanged.
    """
    parsed_timestamps = pd.to_datetime(signals['Timestamp'])
    return signals.assign(Timestamp=parsed_timestamps).set_index('Timestamp')

In [4]:
# Stack both years of turbine signals row-wise, then index them by parsed Timestamp.
signals = signal_preprocess(pd.concat([signals_2016, signals_2017], axis=0))

In [5]:
def get_signals_with_low_variance(df: pd.DataFrame, threshold=0) -> list:
    """List the numeric columns of ``df`` that ``VarianceThreshold`` does not keep.

    Only columns whose dtype is one of the listed int/float dtypes are
    considered; every other column is ignored and can never be returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.
    threshold : float, default 0
        Passed straight to ``VarianceThreshold(threshold=...)``.

    Returns
    -------
    list
        Names of the numeric columns rejected by the fitted selector.
    """
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_part = df.select_dtypes(include=numeric_dtypes)
    # get_support() marks the columns the selector keeps; we want the others.
    kept_mask = VarianceThreshold(threshold=threshold).fit(numeric_part).get_support()
    return [
        column
        for column, is_kept in zip(numeric_part.columns, kept_mask)
        if not is_kept
    ]

In [6]:
# Numeric signal columns rejected by the variance filter (default threshold 0).
# Built directly from the single `signals` frame: no one-element loop, and no
# stray `i` / `df` loop variables left behind in the kernel namespace.
cols_to_drop = set(get_signals_with_low_variance(signals))

In [7]:
# Remove the low-variance columns collected in `cols_to_drop`.
signals_clean = signals.drop(columns=list(cols_to_drop))

In [8]:
# Move the Timestamp index back into a regular column (needed as a merge key later).
signals_clean = signals_clean.reset_index()

In [9]:
# Stack both years of met-mast readings and index them by parsed Timestamp.
metmast = pd.concat([metmast_2016, metmast_2017], axis=0)
metmast = (
    metmast
    .assign(Timestamp=pd.to_datetime(metmast['Timestamp']))
    .set_index('Timestamp')
)

In [10]:
# NOTE: `cols_to_drop` is rebound here (it previously held the signal columns).
cols_to_drop = get_signals_with_low_variance(metmast)
metmast_clean = (
    metmast
    # first the columns rejected by the variance filter ...
    .drop(columns=cols_to_drop)
    # ... then the second wind-direction sensor's columns, dropped explicitly
    .drop(columns=["Min_Winddirection2", "Max_Winddirection2", "Avg_Winddirection2", "Var_Winddirection2"])
)


In [11]:
# Flag rows that had at least one missing value *before* filling (1 = had NaN).
metmast_clean['metamast_missing_values'] = metmast_clean.isnull().any(axis=1).astype(int)
# Backward-fill gaps with the next valid observation. `.bfill()` replaces the
# deprecated `fillna(method='bfill')` spelling (which emits a FutureWarning);
# NaNs with no later valid value are left as they are.
# Timestamp is then restored as a regular column for the later merge.
metmast_clean = metmast_clean.bfill().reset_index()

  metmast_clean=metmast_clean.fillna(method='bfill')


In [12]:
def merge_signals_metmast(signals: pd.DataFrame, metmast: pd.DataFrame) -> pd.DataFrame:
    """Left-join met-mast readings onto the turbine signals by ``Timestamp``.

    Both inputs first get a fresh positional index (their existing index is
    discarded), so ``Timestamp`` must be an ordinary column in each frame.
    No rows are filtered: every signal row is kept, and signal rows without a
    matching met-mast timestamp receive NaN in the met-mast columns.

    Parameters
    ----------
    signals : pd.DataFrame
        Left side of the join.
    metmast : pd.DataFrame
        Right side of the join.

    Returns
    -------
    pd.DataFrame
        The merged frame.
    """
    left = signals.reset_index(drop=True)
    right = metmast.reset_index(drop=True)
    return left.merge(right, how="left", on="Timestamp")

In [13]:
# Attach the met-mast readings to every cleaned signal row.
merged_df = merge_signals_metmast(signals=signals_clean, metmast=metmast_clean)

In [14]:
# Make sure the merge key is a datetime column before joining the failure labels.
merged_df = merged_df.assign(Timestamp=pd.to_datetime(merged_df['Timestamp']))

In [15]:
# Combine both years of failure logs, parse the timestamps and keep only the
# GEARBOX component rows, renumbered from 0.
failures = pd.concat([failure_2016, failure_2017], axis=0)
failures['Timestamp'] = pd.to_datetime(failures['Timestamp'])
failures = failures.loc[failures["Component"] == "GEARBOX"].reset_index(drop=True)
failures

Unnamed: 0,Turbine_ID,Component,Timestamp,Remarks
0,T01,GEARBOX,2016-07-18 02:10:00+00:00,Gearbox pump damaged
1,T09,GEARBOX,2016-10-11 08:06:00+00:00,Gearbox repaired
2,T06,GEARBOX,2017-10-17 08:38:00+00:00,Gearbox bearings damaged
3,T09,GEARBOX,2017-10-18 08:32:00+00:00,Gearbox noise


In [16]:
# Same GEARBOX selection under its own name, with a fresh 0..n-1 index.
failures_gearbox = failures.loc[failures["Component"] == "GEARBOX"].reset_index(drop=True)
failures_gearbox

Unnamed: 0,Turbine_ID,Component,Timestamp,Remarks
0,T01,GEARBOX,2016-07-18 02:10:00+00:00,Gearbox pump damaged
1,T09,GEARBOX,2016-10-11 08:06:00+00:00,Gearbox repaired
2,T06,GEARBOX,2017-10-17 08:38:00+00:00,Gearbox bearings damaged
3,T09,GEARBOX,2017-10-18 08:32:00+00:00,Gearbox noise


In [17]:
def get_round_minute_diff(datetime_in: datetime) -> timedelta:
    """Return the offset that moves ``datetime_in``'s minute to the nearest multiple of 10.

    Ties are always rounded up (5 -> 10, 25 -> 30, 45 -> 50). The built-in
    ``round(minute, -1)`` rounds half to even, which sent 5/25/45 down but
    15/35/55 up; integer arithmetic keeps the rule uniform.

    Only the minute field is considered; seconds and microseconds of
    ``datetime_in`` do not influence the result.

    Parameters
    ----------
    datetime_in : datetime
        Any object exposing a ``minute`` attribute in the range 0-59.

    Returns
    -------
    timedelta
        Between -4 and +5 minutes; adding it to ``datetime_in`` lands the
        minute on a 10-minute mark (possibly rolling over into the next hour).
    """
    minute = datetime_in.minute  # avoid shadowing the built-in `min`
    rounded_minute = ((minute + 5) // 10) * 10
    return timedelta(minutes=rounded_minute - minute)

In [18]:
def convert_round_minute_to_time(datetime_in: datetime) -> datetime:
    """Shift ``datetime_in`` by the offset computed by ``get_round_minute_diff``.

    Returns a new datetime; all fields other than those affected by the
    minute shift are carried over unchanged.
    """
    return datetime_in + get_round_minute_diff(datetime_in)

In [28]:
days_lookback = 60

def create_failure_list(days_lookback: int, value_function, target_name: str = "Target",
                        failures_df: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Build 10-minute label rows for the look-back window before each failure.

    For every failure row, the failure time is rounded with
    ``convert_round_minute_to_time`` and one record is produced per 10-minute
    step going backwards from that time, for ``days_lookback`` days. Step 0 is
    the rounded failure time itself.

    Parameters
    ----------
    days_lookback : int
        Length of the window before each failure, in days.
    value_function : callable
        Called as ``value_function(step, total_steps)`` to produce the target
        value of each record; ``total_steps`` is the number of 10-minute steps
        in the window.
    target_name : str, default "Target"
        Name of the target column in the returned frame.
    failures_df : pd.DataFrame, optional
        Frame with ``Turbine_ID`` and ``Timestamp`` columns. When omitted, the
        notebook-level ``failures`` frame is used, as before.

    Returns
    -------
    pd.DataFrame
        Columns ``Turbine_ID``, ``Timestamp`` (ISO-8601 string with a UTC
        offset) and ``target_name``.
    """
    if failures_df is None:
        # Backward-compatible default: fall back to the notebook-level frame.
        failures_df = failures

    ten_mins_of_n_days = int(24 * 60 * days_lookback / 10)
    failure_list = []
    for _, failure in failures_df.iterrows():
        turbine_id = str(failure["Turbine_ID"])
        failure_ts = str(failure["Timestamp"])

        # Round-trip through the string form to get a plain `datetime`.
        failure_datetime = datetime.fromisoformat(failure_ts)
        rounded_datetime = convert_round_minute_to_time(failure_datetime)
        for j in range(ten_mins_of_n_days):
            new_datetime = rounded_datetime - timedelta(minutes=j * 10)
            # The tzinfo is overwritten with UTC (no clock conversion is applied).
            datetime_formated = new_datetime.replace(tzinfo=timezone.utc)

            failure_list.append([turbine_id, datetime_formated.isoformat(), value_function(j, ten_mins_of_n_days)])

    failure_df = pd.DataFrame(failure_list, columns=["Turbine_ID", "Timestamp", target_name])

    return failure_df

In [32]:
# Inspect column dtypes and non-null counts of the filtered failures frame.
failures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   Turbine_ID  4 non-null      object             
 1   Component   4 non-null      object             
 2   Timestamp   4 non-null      datetime64[ns, UTC]
 3   Remarks     4 non-null      object             
dtypes: datetime64[ns, UTC](1), object(3)
memory usage: 260.0+ bytes


In [29]:
reg_target_name = "RUL (Target)"
class_target_name = "Failure (Target)"


def regression_function(i, j):
    """Normalised position in the look-back window: 0 at the failure, approaching 1 at its start.

    (The un-normalised alternative would simply return ``i``.)
    """
    return i / j


def classif_function(i, j):
    """Every record inside a look-back window is labelled 1."""
    return 1


failure_df_reg = create_failure_list(
    days_lookback,
    regression_function,
    reg_target_name,
)
failure_df_class = create_failure_list(
    days_lookback,
    classif_function,
    class_target_name,
)

In [30]:
# The label frames carry ISO-8601 strings; parse them so they can be joined on Timestamp.
failure_df_reg = failure_df_reg.assign(
    Timestamp=pd.to_datetime(failure_df_reg['Timestamp'])
)
failure_df_class = failure_df_class.assign(
    Timestamp=pd.to_datetime(failure_df_class['Timestamp'])
)

In [88]:
# Persist the regression label frame as CSV, without writing the row index.
failure_df_reg.to_csv('./data/failures_reg.csv', index=False)

In [31]:
# Left-join both label frames onto the merged sensor data; rows outside any
# look-back window get NaN in the target columns.
labeled_df_temp = merged_df.reset_index(drop=True).merge(
    failure_df_reg.reset_index(drop=True),
    how="left",
    on=["Turbine_ID", "Timestamp"],
)
labeled_df = labeled_df_temp.reset_index(drop=True).merge(
    failure_df_class.reset_index(drop=True),
    how="left",
    on=["Turbine_ID", "Timestamp"],
)

In [32]:
# Rows outside every look-back window have no label after the left joins:
# give them the "no failure" defaults (regression target 1.0, class 0).
# Assigning the filled column back avoids `df[col].fillna(..., inplace=True)`,
# which is chained assignment: pandas warns about it and, from pandas 3.0,
# it no longer updates `labeled_df` at all.
labeled_df[reg_target_name] = labeled_df[reg_target_name].fillna(1.0)
labeled_df[class_target_name] = labeled_df[class_target_name].fillna(0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  labeled_df[reg_target_name].fillna(1.0, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  labeled_df[class_target_name].fillna(0, inplace = True)


In [37]:
# Default size (width, height) for matplotlib figures created from here on.
plt.rcParams["figure.figsize"] = (10,5)

In [27]:
# Persist the labeled frame as CSV, without writing the row index.
labeled_df.to_csv('./data/labeled_data_gearbox.csv', index=False)

In [33]:

def aggregate_signals(signals):
    """Sum ``signals`` per turbine and per calendar day.

    ``signals`` must be indexed by a datetime index named ``Timestamp`` and
    contain a ``Turbine_ID`` column. Rows are grouped by turbine, resampled
    to daily bins ('D') and summed.

    In the returned frame ``Turbine_ID`` is available both as the index and
    as a column, and ``Timestamp`` (the day) is a regular column.
    """
    return (
        signals
        .groupby('Turbine_ID')
        .resample('D')
        .sum()
        # (Re)write the Turbine_ID column from the group level of the index.
        .assign(Turbine_ID=lambda daily: daily.index.get_level_values('Turbine_ID'))
        .reset_index('Timestamp')
    )



In [34]:
# Index the labeled data by parsed Timestamp, then roll it up to daily sums per turbine.
labeled_df = (
    labeled_df
    .assign(Timestamp=pd.to_datetime(labeled_df['Timestamp']))
    .set_index('Timestamp')
)
labeled_df_agg = aggregate_signals(labeled_df)

In [26]:
# Display the daily aggregated frame.
labeled_df_agg

Unnamed: 0_level_0,Timestamp,Turbine_ID,Gen_RPM_Max,Gen_RPM_Min,Gen_RPM_Avg,Gen_RPM_Std,Gen_Bear_Temp_Avg,Gen_Phase1_Temp_Avg,Gen_Phase2_Temp_Avg,Gen_Phase3_Temp_Avg,...,Avg_Precipitation,Max_Raindetection,Anemometer1_Avg_Freq,Anemometer2_Avg_Freq,Pressure_Avg_Freq,metamast_missing_values,components_x,RUL (Target),components_y,Failure (Target)
Turbine_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T01,2016-01-01 00:00:00+00:00,T01,181289.2,133914.9,158499.3,13332.7,5667.0,8128,8126,8049,...,0.0,0.0,16889.0,17014.0,59950.0,0.0,0,144.0,0,0.0
T01,2016-01-02 00:00:00+00:00,T01,100793.6,71082.2,86946.4,8378.4,4907.0,6556,6525,6458,...,54.0,0.0,11386.0,11376.0,59953.0,0.0,0,144.0,0,0.0
T01,2016-01-03 00:00:00+00:00,T01,133818.3,108905.3,124205.1,5775.7,6162.0,7969,7768,7627,...,14.0,0.0,15764.0,15811.0,59603.0,0.0,0,144.0,0,0.0
T01,2016-01-04 00:00:00+00:00,T01,246412.7,215913.9,234245.0,5730.7,9402.0,14207,13658,13447,...,17.0,0.0,28377.0,28462.0,58864.0,0.0,0,144.0,0,0.0
T01,2016-01-05 00:00:00+00:00,T01,240840.6,206798.4,225681.7,7157.1,8107.0,13162,12695,12515,...,61.0,0.0,28373.0,28164.0,58591.0,0.0,0,144.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T11,2017-12-27 00:00:00+00:00,T11,243986.1,205969.3,228579.7,8038.6,9281.0,12396,12417,12362,...,15.0,0.0,28645.0,28811.0,59293.0,0.0,0,144.0,0,0.0
T11,2017-12-28 00:00:00+00:00,T11,237997.3,192192.0,220250.6,10931.2,8367.0,10300,10401,10322,...,6.0,0.0,23033.0,23268.0,59498.0,0.0,0,144.0,0,0.0
T11,2017-12-29 00:00:00+00:00,T11,188502.9,146124.7,165452.9,13330.9,5920.0,7934,8103,8036,...,0.0,0.0,14460.0,14378.0,60189.0,0.0,0,144.0,0,0.0
T11,2017-12-30 00:00:00+00:00,T11,65884.7,51194.5,57913.6,4789.3,4327.0,5416,5475,5453,...,2.0,0.0,7487.0,7472.0,60339.0,0.0,0,144.0,0,0.0


In [35]:
# Remove both target columns from the aggregated frame.
labeled_df_agg = labeled_df_agg.drop(columns=['RUL (Target)', 'Failure (Target)'])


In [36]:
# Remove the met-mast missing-value flag column from the aggregated frame.
labeled_df_agg = labeled_df_agg.drop(columns=['metamast_missing_values'])


In [166]:
def create_regression_failure_list(failures: pd.DataFrame, days_lookback: int, value_function, target_name: str = "Target") -> pd.DataFrame:
    """Build one label row per day for the look-back window before each failure.

    For every row of ``failures`` the timestamp is truncated to the start of
    its day, and ``days_lookback`` records are produced going backwards one
    day at a time (offset 0 is the failure day itself).

    Parameters
    ----------
    failures : pd.DataFrame
        Needs ``Turbine_ID`` and ``Timestamp`` columns; each timestamp must
        support ``.replace(...)``, subtraction of a ``timedelta`` and
        ``.isoformat()``.
    days_lookback : int
        Number of daily records per failure.
    value_function : callable
        Called as ``value_function(day_offset, days_lookback)`` to produce
        the target value of each record.
    target_name : str, default "Target"
        Name of the target column in the returned frame.

    Returns
    -------
    pd.DataFrame
        Columns ``Turbine_ID``, ``Timestamp`` (ISO-8601 string, tzinfo set to
        UTC) and ``target_name``.
    """
    records = []

    for raw_turbine, failure_moment in zip(failures["Turbine_ID"], failures["Timestamp"]):
        turbine = str(raw_turbine)
        # Truncate to midnight of the failure day.
        day_start = failure_moment.replace(hour=0, minute=0, second=0, microsecond=0)

        for day_offset in range(days_lookback):
            stamp = (day_start - timedelta(days=day_offset)).replace(tzinfo=timezone.utc)
            records.append([turbine, stamp.isoformat(), value_function(day_offset, days_lookback)])

    return pd.DataFrame(records, columns=["Turbine_ID", "Timestamp", target_name])

In [167]:
reg_target_name = "RUL (Target)"
class_target_name = "Failure (Target)"


def regression_function(i, j):
    """Normalised position in the look-back window: 0 on the failure day, approaching 1 at its start.

    (Alternatives that were considered: ``(j - i) / j`` and the raw ``i``.)
    """
    return i / j


def classif_function(i, j):
    """Every record inside a look-back window is labelled 1."""
    return 1


days_lookback = 60
# Daily-resolution label frames; these rebind the names used for the 10-minute ones.
failure_df_reg = create_regression_failure_list(
    failures,
    days_lookback,
    regression_function,
    reg_target_name,
)
failure_df_class = create_regression_failure_list(
    failures,
    days_lookback,
    classif_function,
    class_target_name,
)

In [150]:
# Display the classification label frame.
failure_df_class

Unnamed: 0,Turbine_ID,Timestamp,Failure (Target)
0,T01,2016-07-18T00:00:00+00:00,1
1,T01,2016-07-17T00:00:00+00:00,1
2,T01,2016-07-16T00:00:00+00:00,1
3,T01,2016-07-15T00:00:00+00:00,1
4,T01,2016-07-14T00:00:00+00:00,1
...,...,...,...
235,T09,2017-08-24T00:00:00+00:00,1
236,T09,2017-08-23T00:00:00+00:00,1
237,T09,2017-08-22T00:00:00+00:00,1
238,T09,2017-08-21T00:00:00+00:00,1


In [37]:
# Persist the aggregated frame as CSV, without writing the row index.
labeled_df_agg.to_csv('./data/labeled_data_agg.csv', index=False)

# DUMP

In [None]:
def train_runs(X_train, y_train, model_name):
    """Grid-search a preprocessing + random-forest pipeline and return the best fit.

    The pipeline is ``scaler -> spline -> pt -> <model_name>``. The grid
    switches each preprocessing step on or off (``"passthrough"``) and varies
    the forest's ``max_depth``; candidates are scored with ``f1_weighted``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``GridSearchCV.fit``.
    model_name : str
        Name of the classifier step in the pipeline; also used as the prefix
        of the ``<model_name>__max_depth`` grid key.

    Returns
    -------
    tuple
        ``(best_estimator, cv_results, param_grid)``.
    """
    # Imported here because none of these names are provided by the notebook's
    # import cell; without them the function fails with NameError.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import PowerTransformer, SplineTransformer, StandardScaler

    spline = SplineTransformer(degree=2, n_knots=3)
    pt = PowerTransformer(method="yeo-johnson")
    scaler = StandardScaler()

    base_estimator = Pipeline(
        [
            ("scaler", scaler),
            ("spline", spline),
            ("pt", pt),
            (model_name, RandomForestClassifier(random_state=0)),
        ]
    )
    # Each preprocessing step is either skipped ("passthrough") or applied.
    param_grid = {
        "scaler": ["passthrough", scaler],
        "spline": ["passthrough", spline],
        "pt": ["passthrough", pt],
        "{}__max_depth".format(model_name): [10, 20, 40, 80],
    }

    search = GridSearchCV(base_estimator, param_grid, scoring="f1_weighted", verbose=2).fit(
        X_train, y_train
    )
    results = search.cv_results_
    clf = search.best_estimator_

    return clf, results, param_grid