In [2]:
import os
from datetime import datetime, timedelta, timezone

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

In [3]:
# Raw inputs: one ';'-separated CSV per data source and year.
failure_2016, failure_2017 = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('./data/init/failures-2016.csv', './data/init/failures-2017.csv')
)
metmast_2016, metmast_2017 = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('./data/init/metmast-2016.csv', './data/init/metmast-2017.csv')
)
signals_2016, signals_2017 = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('./data/init/signals-2016.csv', './data/init/signals-2017.csv')
)



# 1. Cleaning Signal data

Combine the signals from both years and aggregate the time series into one record per turbine per day.

In [4]:
def signal_preprocess(signals):
    """Return a new frame indexed by the parsed ``Timestamp`` column.

    Parameters
    ----------
    signals : pd.DataFrame
        Frame with a ``Timestamp`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pd.DataFrame
        Frame whose index is the parsed ``Timestamp`` column (the column is
        moved into the index). The frame passed in is left unmodified.
    """
    # ``assign`` builds a new frame, so the caller's object keeps its
    # original (unparsed) ``Timestamp`` column.
    parsed = signals.assign(Timestamp=pd.to_datetime(signals['Timestamp']))
    return parsed.set_index('Timestamp')


In [5]:
# Stack both years row-wise, then parse the timestamps and use them as the index.
signals = signal_preprocess(pd.concat([signals_2016, signals_2017], axis=0))

In [129]:
signals

Unnamed: 0_level_0,Turbine_ID,Gen_RPM_Max,Gen_RPM_Min,Gen_RPM_Avg,Gen_RPM_Std,Gen_Bear_Temp_Avg,Gen_Phase1_Temp_Avg,Gen_Phase2_Temp_Avg,Gen_Phase3_Temp_Avg,Hyd_Oil_Temp_Avg,...,Grd_Prod_PsbleInd_Avg,Grd_Prod_PsbleInd_Max,Grd_Prod_PsbleInd_Min,Grd_Prod_PsbleInd_Std,Grd_Prod_PsbleCap_Avg,Grd_Prod_PsbleCap_Max,Grd_Prod_PsbleCap_Min,Grd_Prod_PsbleCap_Std,Gen_Bear2_Temp_Avg,Nac_Direction_Avg
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-08 23:10:00+00:00,T06,1634.3,1226.9,1416.4,96.4,49.0,62,63,64,30,...,-998.7,-649.2,-1000.0,18.5,998.7,1000.0,649.2,18.5,40,207.1
2016-04-19 12:20:00+00:00,T06,1796.1,1597.0,1680.0,29.8,66.0,97,107,111,43,...,-615.2,-583.3,-1000.0,91.9,423.8,783.5,406.1,52.7,57,129.9
2016-01-08 23:10:00+00:00,T01,1657.5,1299.2,1495.0,83.8,47.0,62,62,61,29,...,-1000.0,-1000.0,-1000.0,0.0,1000.0,1000.0,1000.0,0.0,49,220.6
2016-04-19 12:30:00+00:00,T11,1771.2,1590.0,1677.5,29.4,66.0,111,110,110,50,...,-639.6,-583.3,-1000.0,132.4,452.2,1000.0,406.1,125.1,56,123.5
2016-01-08 23:50:00+00:00,T07,1667.5,1277.4,1481.3,119.1,46.0,73,72,71,30,...,-1000.0,-1000.0,-1000.0,0.0,998.6,1000.0,884.5,11.1,47,206.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-05-31 10:50:00+00:00,T01,1700.7,1362.5,1619.8,68.0,55.0,89,87,86,38,...,-998.4,-654.3,-1000.0,17.3,953.4,1000.0,443.4,95.5,65,123.8
2017-06-03 00:20:00+00:00,T06,0.0,0.0,0.0,0.0,42.0,45,46,46,40,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36,90.3
2017-05-31 11:10:00+00:00,T11,1675.5,1299.7,1565.7,102.2,64.0,72,73,73,33,...,-1000.0,-1000.0,-1000.0,0.0,1000.0,1000.0,1000.0,0.0,53,106.6
2017-06-03 00:50:00+00:00,T06,0.0,0.0,0.0,0.0,39.0,41,41,41,39,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34,90.3


Drop the columns with low variance

In [6]:
def get_signals_with_low_variance(df: pd.DataFrame, threshold=0) -> list:
    """List the numeric columns of ``df`` that ``VarianceThreshold`` rejects.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; only its int/float columns are inspected.
    threshold : float, default 0
        Passed to ``VarianceThreshold``; columns the fitted selector does
        not support are reported.

    Returns
    -------
    list
        Names of the numeric columns the selector would remove. Empty when
        ``df`` has no numeric columns at all.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    cont_data = df.select_dtypes(include=numerics)
    if cont_data.shape[1] == 0:
        # Nothing numeric to evaluate: skip fitting the selector, which
        # cannot be fitted on a frame without feature columns.
        return []
    # NOTE(review): VarianceThreshold.fit is expected to raise ValueError when
    # no column exceeds the threshold -- confirm whether callers need that
    # case handled.
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(cont_data)
    # get_support() marks the columns that are kept; invert it to get the
    # ones to drop.
    low_variance_mask = ~np.asarray(selector.get_support())
    return cont_data.columns[low_variance_mask].tolist()

In [7]:
# Remove the signal columns reported as low-variance.
cols_to_drop = get_signals_with_low_variance(signals)
signals = signals.drop(columns=cols_to_drop)

Aggregate the time series dataframe into a daily data frame

In [8]:
def aggregate_signals(signals, min_count=1):
    """Sum the signal readings into one row per turbine per calendar day.

    Parameters
    ----------
    signals : pd.DataFrame
        Frame with a datetime index named ``Timestamp`` and a ``Turbine_ID``
        column.
    min_count : int, default 1
        Minimum number of non-null readings a day needs for its sum to be
        reported. With the default, a day without any reading yields NaN
        instead of 0, so a later null check can flag it. Pass ``0`` to get
        plain ``sum()`` behaviour (empty days become 0).

    Returns
    -------
    pd.DataFrame
        Daily sums with ``Timestamp`` and ``Turbine_ID`` available as regular
        columns; the ``Turbine_ID`` group key is still the index.
    """
    # NOTE(review): every column is summed, including *_Avg / *_Std channels,
    # so the daily values are totals rather than means -- confirm intended.
    agg_signals = signals.groupby('Turbine_ID').resample('D').sum(min_count=min_count)
    # (Re)write Turbine_ID from the group key so the column holds the plain id.
    agg_signals['Turbine_ID'] = agg_signals.index.get_level_values('Turbine_ID')
    agg_signals = agg_signals.reset_index('Timestamp')
    return agg_signals


In [9]:
# Daily aggregation, then discard the Turbine_ID index in favour of a plain row number.
agg_signals = aggregate_signals(signals).reset_index(drop=True)

In [9]:
agg_signals

Unnamed: 0,Timestamp,Turbine_ID,Gen_RPM_Max,Gen_RPM_Min,Gen_RPM_Avg,Gen_RPM_Std,Gen_Bear_Temp_Avg,Gen_Phase1_Temp_Avg,Gen_Phase2_Temp_Avg,Gen_Phase3_Temp_Avg,...,Grd_Prod_PsbleInd_Avg,Grd_Prod_PsbleInd_Max,Grd_Prod_PsbleInd_Min,Grd_Prod_PsbleInd_Std,Grd_Prod_PsbleCap_Avg,Grd_Prod_PsbleCap_Max,Grd_Prod_PsbleCap_Min,Grd_Prod_PsbleCap_Std,Gen_Bear2_Temp_Avg,Nac_Direction_Avg
0,2016-01-01 00:00:00+00:00,T01,181289.2,133914.9,158499.3,13332.7,5667.0,8128,8126,8049,...,-92929.7,-68241.0,-109983.9,9823.7,92920.8,109983.9,67537.9,9877.8,5554,32453.4
1,2016-01-02 00:00:00+00:00,T01,100793.6,71082.2,86946.4,8378.4,4907.0,6556,6525,6458,...,-45063.8,-35598.1,-53448.7,4432.3,44758.1,53448.7,33839.1,4969.9,4752,23121.9
2,2016-01-03 00:00:00+00:00,T01,133818.3,108905.3,124205.1,5775.7,6162.0,7969,7768,7627,...,-67850.8,-54531.9,-73000.0,5227.6,63328.1,73000.0,42621.4,8344.0,5503,35495.5
3,2016-01-04 00:00:00+00:00,T01,246412.7,215913.9,234245.0,5730.7,9402.0,14207,13658,13447,...,-133426.6,-100994.2,-143994.0,13072.6,119419.5,143390.4,77022.7,19260.8,8811,38177.5
4,2016-01-05 00:00:00+00:00,T01,240840.6,206798.4,225681.7,7157.1,8107.0,13162,12695,12515,...,-129142.2,-116437.2,-142871.2,6964.7,119370.2,140043.4,101280.0,8765.6,7097,45004.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2919,2017-12-27 00:00:00+00:00,T11,243986.1,205969.3,228579.7,8038.6,9281.0,12396,12417,12362,...,-120201.6,-97406.6,-143258.6,13087.6,107426.6,140996.6,76593.5,15766.5,8355,40865.5
2920,2017-12-28 00:00:00+00:00,T11,237997.3,192192.0,220250.6,10931.2,8367.0,10300,10401,10322,...,-135333.5,-109675.3,-143738.0,8928.4,130039.6,143738.0,94313.4,12493.0,6940,39614.2
2921,2017-12-29 00:00:00+00:00,T11,188502.9,146124.7,165452.9,13330.9,5920.0,7934,8103,8036,...,-85117.9,-37053.6,-122976.5,20889.7,85117.9,122976.5,37053.6,20889.7,5539,43819.8
2922,2017-12-30 00:00:00+00:00,T11,65884.7,51194.5,57913.6,4789.3,4327.0,5416,5475,5453,...,-12643.7,-1886.9,-30372.6,6473.7,12643.7,30372.6,1886.9,6473.7,4220,22927.5


Prepare signal for each turbine

In [50]:
"""turbine_names = agg_signals["Turbine_ID"].unique()
def create_df_for_each_turbine(signals: pd.DataFrame) -> list[pd.DataFrame]:
    turbine_dfs = list()

    for turbine in turbine_names:
        test = agg_signals["Turbine_ID"]
        turbine_df = signals[agg_signals["Turbine_ID"] == turbine]
        turbine_df = turbine_df.sort_values("Timestamp")
        turbine_df = turbine_df.reset_index()
        
        turbine_dfs.append(turbine_df)

    return turbine_dfs

turbine_dfs = create_df_for_each_turbine(agg_signals)
#convert list to dataframe
#turbine_signals = pd.concat(turbine_dfs, axis=0)"""


In [10]:
# Binary flag: 1 when the daily row contains at least one missing value.
agg_signals['missing_values'] = agg_signals.isnull().any(axis=1).astype(int)
# Fill the missing values with a backward fill followed by a forward fill
# (the forward fill covers trailing gaps that a backward fill cannot reach).
# DataFrame.fillna(method=...) is deprecated, so the dedicated methods are used.
signals_clean = agg_signals.bfill().ffill()
#signals_clean=signals_clean.reset_index()

  signals_clean=agg_signals.fillna(method='bfill')


In [12]:
# Persist the cleaned daily signals without the row index.
signals_clean.to_csv(path_or_buf='./data/signals_clean.csv', index=False)

# 2. Clean Metmast data

Combine the metmast data for 2016 and 2017, then aggregate it into daily time slots.

The metmast data has no records from 2017-01-04 to 2017-05-05.

In [11]:
# Stack both years, parse the timestamps and use them as the index.
metmast = (
    pd.concat([metmast_2016, metmast_2017], axis=0)
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .set_index('Timestamp')
)


Drop the columns with low variance

In [12]:
# Drop the low-variance columns first, then the four *_Winddirection2 columns.
cols_to_drop = get_signals_with_low_variance(metmast)
metmast = (
    metmast
    .drop(columns=cols_to_drop)
    .drop(columns=["Min_Winddirection2", "Max_Winddirection2", "Avg_Winddirection2", "Var_Winddirection2"])
)

Aggregate the data according to the columns (with mean or sum)

In [13]:
# Daily reduction applied to each metmast column by ``resample(...).agg``.
# The key order is kept because it determines the column order of the result.
# NOTE(review): Min_*/Max_* columns are summed over the day rather than reduced
# with min/max -- confirm this is intended.
aggregation_rules = dict(
    Min_Windspeed1='sum',
    Max_Windspeed1='sum',
    Avg_Windspeed1='mean',
    Var_Windspeed1='sum',
    Min_Windspeed2='sum',
    Max_Windspeed2='sum',
    Avg_Windspeed2='mean',
    Var_Windspeed2='sum',
    Min_AmbientTemp='sum',
    Max_AmbientTemp='sum',
    Avg_AmbientTemp='mean',
    Min_Pressure='sum',
    Max_Pressure='sum',
    Avg_Pressure='mean',
    Min_Humidity='sum',
    Max_Humidity='sum',
    Avg_Humidity='mean',
    Min_Precipitation='sum',
    Max_Precipitation='sum',
    Avg_Precipitation='sum',
    Max_Raindetection='sum',
    Anemometer1_Avg_Freq='mean',
    Anemometer2_Avg_Freq='mean',
    Pressure_Avg_Freq='mean',
)


In [14]:
# One row per calendar day, reduced per column according to aggregation_rules;
# Timestamp is moved back from the index into a regular column.
agg_metmast = metmast.resample('D').agg(aggregation_rules).reset_index()


In [15]:
# Binary flag: 1 when the daily metmast row contains at least one missing value.
agg_metmast['metamast_missing_values'] = agg_metmast.isnull().any(axis=1).astype(int)
# Backward-fill the gaps; DataFrame.fillna(method='bfill') is deprecated in
# favour of the dedicated DataFrame.bfill().
metmast_clean = agg_metmast.bfill()

  metmast_clean=agg_metmast.fillna(method='bfill')


In [16]:
# Save the back-filled frame: the file is named "metmast_clean", so it should
# hold metmast_clean rather than the unfilled agg_metmast.
metmast_clean.to_csv('./data/metmast_clean.csv', index=False)

# Merge metmast data and signals data

In [17]:
def merge_signals_metmast(signals: pd.DataFrame, metmast: pd.DataFrame) -> pd.DataFrame:
    """Left-join the metmast rows onto the signal rows and keep complete rows only.

    Parameters
    ----------
    signals : pd.DataFrame
        Frame with ``Timestamp`` and ``missing_values`` columns.
    metmast : pd.DataFrame
        Frame with ``Timestamp`` and ``metamast_missing_values`` columns.

    Returns
    -------
    pd.DataFrame
        Joined rows whose two flags both equal 0, with the flag columns
        removed. Signal rows without a metmast match carry a NaN flag after
        the left join and are therefore removed as well.
    """
    flag_columns = ["missing_values", "metamast_missing_values"]
    joined = signals.merge(metmast, how="left", on="Timestamp")
    signals_complete = joined["missing_values"] == 0
    metmast_complete = joined["metamast_missing_values"] == 0
    return joined.loc[signals_complete & metmast_complete].drop(columns=flag_columns)


In [18]:
# Combine the cleaned signal and metmast frames into one feature table.
merged_df = merge_signals_metmast(signals=signals_clean, metmast=metmast_clean)

In [19]:
merged_df.reset_index(drop=True)

Unnamed: 0,Timestamp,Turbine_ID,Gen_RPM_Max,Gen_RPM_Min,Gen_RPM_Avg,Gen_RPM_Std,Gen_Bear_Temp_Avg,Gen_Phase1_Temp_Avg,Gen_Phase2_Temp_Avg,Gen_Phase3_Temp_Avg,...,Min_Humidity,Max_Humidity,Avg_Humidity,Min_Precipitation,Max_Precipitation,Avg_Precipitation,Max_Raindetection,Anemometer1_Avg_Freq,Anemometer2_Avg_Freq,Pressure_Avg_Freq
0,2016-01-01 00:00:00+00:00,T01,181289.2,133914.9,158499.3,13332.7,5667.0,8128,8126,8049,...,12425.0,12698,87.229167,0,0,0,0,117.284722,118.152778,416.319444
1,2016-01-02 00:00:00+00:00,T01,100793.6,71082.2,86946.4,8378.4,4907.0,6556,6525,6458,...,12523.0,12755,87.743056,22,85,54,0,79.069444,79.000000,416.340278
2,2016-01-03 00:00:00+00:00,T01,133818.3,108905.3,124205.1,5775.7,6162.0,7969,7768,7627,...,13499.0,13671,94.208333,4,23,14,0,109.472222,109.798611,413.909722
3,2016-01-04 00:00:00+00:00,T01,246412.7,215913.9,234245.0,5730.7,9402.0,14207,13658,13447,...,12603.0,12921,88.534722,6,30,17,0,197.062500,197.652778,408.777778
4,2016-01-05 00:00:00+00:00,T01,240840.6,206798.4,225681.7,7157.1,8107.0,13162,12695,12515,...,10847.0,11252,76.645833,33,89,61,0,197.034722,195.583333,406.881944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2431,2017-12-27 00:00:00+00:00,T11,243986.1,205969.3,228579.7,8038.6,9281.0,12396,12417,12362,...,11275.0,11590,79.347222,9,21,15,0,198.923611,200.076389,411.756944
2432,2017-12-28 00:00:00+00:00,T11,237997.3,192192.0,220250.6,10931.2,8367.0,10300,10401,10322,...,12388.0,12598,86.687500,2,10,6,0,159.951389,161.583333,413.180556
2433,2017-12-29 00:00:00+00:00,T11,188502.9,146124.7,165452.9,13330.9,5920.0,7934,8103,8036,...,11533.0,11954,81.493056,0,0,0,0,100.416667,99.847222,417.979167
2434,2017-12-30 00:00:00+00:00,T11,65884.7,51194.5,57913.6,4789.3,4327.0,5416,5475,5453,...,11581.0,11946,81.638889,0,4,2,0,51.993056,51.888889,419.020833


Replace the null values in the merged dataframe with mean of that column

In [21]:
merged_df

Unnamed: 0,Timestamp,Turbine_ID,Gen_RPM_Max,Gen_RPM_Min,Gen_RPM_Avg,Gen_RPM_Std,Gen_Bear_Temp_Avg,Gen_Phase1_Temp_Avg,Gen_Phase2_Temp_Avg,Gen_Phase3_Temp_Avg,...,Min_Humidity,Max_Humidity,Avg_Humidity,Min_Precipitation,Max_Precipitation,Avg_Precipitation,Max_Raindetection,Anemometer1_Avg_Freq,Anemometer2_Avg_Freq,Pressure_Avg_Freq
0,2016-01-01 00:00:00+00:00,T01,181289.2,133914.9,158499.3,13332.7,5667.0,8128,8126,8049,...,12425.0,12698,87.229167,0,0,0,0,117.284722,118.152778,416.319444
1,2016-01-02 00:00:00+00:00,T01,100793.6,71082.2,86946.4,8378.4,4907.0,6556,6525,6458,...,12523.0,12755,87.743056,22,85,54,0,79.069444,79.000000,416.340278
2,2016-01-03 00:00:00+00:00,T01,133818.3,108905.3,124205.1,5775.7,6162.0,7969,7768,7627,...,13499.0,13671,94.208333,4,23,14,0,109.472222,109.798611,413.909722
3,2016-01-04 00:00:00+00:00,T01,246412.7,215913.9,234245.0,5730.7,9402.0,14207,13658,13447,...,12603.0,12921,88.534722,6,30,17,0,197.062500,197.652778,408.777778
4,2016-01-05 00:00:00+00:00,T01,240840.6,206798.4,225681.7,7157.1,8107.0,13162,12695,12515,...,10847.0,11252,76.645833,33,89,61,0,197.034722,195.583333,406.881944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2919,2017-12-27 00:00:00+00:00,T11,243986.1,205969.3,228579.7,8038.6,9281.0,12396,12417,12362,...,11275.0,11590,79.347222,9,21,15,0,198.923611,200.076389,411.756944
2920,2017-12-28 00:00:00+00:00,T11,237997.3,192192.0,220250.6,10931.2,8367.0,10300,10401,10322,...,12388.0,12598,86.687500,2,10,6,0,159.951389,161.583333,413.180556
2921,2017-12-29 00:00:00+00:00,T11,188502.9,146124.7,165452.9,13330.9,5920.0,7934,8103,8036,...,11533.0,11954,81.493056,0,0,0,0,100.416667,99.847222,417.979167
2922,2017-12-30 00:00:00+00:00,T11,65884.7,51194.5,57913.6,4789.3,4327.0,5416,5475,5453,...,11581.0,11946,81.638889,0,4,2,0,51.993056,51.888889,419.020833


# 3. Clean Failure data

In [36]:
# Stack both years of failure logs and truncate every timestamp to the start
# of its day. The canonical uppercase 'D' alias is used, matching the
# resample('D') calls elsewhere in this notebook.
failures = pd.concat([failure_2016, failure_2017], axis=0)
failures['Timestamp'] = pd.to_datetime(failures['Timestamp']).dt.floor('D')
#failures=failures[failures["Component"] == "GEARBOX"]


In [37]:
failures.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28 entries, 0 to 11
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   Turbine_ID  28 non-null     object             
 1   Component   28 non-null     object             
 2   Timestamp   28 non-null     datetime64[ns, UTC]
 3   Remarks     28 non-null     object             
dtypes: datetime64[ns, UTC](1), object(3)
memory usage: 1.1+ KB


In [38]:
failures

Unnamed: 0,Turbine_ID,Component,Timestamp,Remarks
0,T01,GEARBOX,2016-07-18 00:00:00+00:00,Gearbox pump damaged
1,T06,GENERATOR,2016-07-11 00:00:00+00:00,Generator replaced
2,T06,GENERATOR,2016-07-24 00:00:00+00:00,Generator temperature sensor failure
3,T06,GENERATOR,2016-09-04 00:00:00+00:00,High temperature generator error
4,T06,GENERATOR,2016-10-27 00:00:00+00:00,Generator replaced
5,T06,GENERATOR,2016-10-02 00:00:00+00:00,Refrigeration system and temperature sensors i...
6,T06,HYDRAULIC_GROUP,2016-04-04 00:00:00+00:00,Error in pitch regulation
7,T07,GENERATOR_BEARING,2016-04-30 00:00:00+00:00,High temperature in generator bearing (replace...
8,T07,TRANSFORMER,2016-07-10 00:00:00+00:00,High temperature transformer
9,T07,TRANSFORMER,2016-08-23 00:00:00+00:00,High temperature transformer. Transformer refr...


In [39]:
# Number of days (the failure day included) that receive a label per failure.
days_lookback = 60


def create_failure_list(failures: pd.DataFrame, days_lookback: int, value_function, target_name: str = "Target") -> pd.DataFrame:
    """Expand every failure into one labelled row per day of its look-back window.

    Parameters
    ----------
    failures : pd.DataFrame
        Frame with ``Turbine_ID``, ``Timestamp`` and ``Component`` columns.
    days_lookback : int
        Number of daily rows generated per failure, starting at the failure
        day (offset 0) and going backwards in time.
    value_function : callable
        Called as ``value_function(offset, days_lookback)``; its result is
        stored in the target column.
    target_name : str, default "Target"
        Name of the target column.

    Returns
    -------
    pd.DataFrame
        Columns ``Turbine_ID``, ``Timestamp`` (ISO-8601 string, tzinfo set to
        UTC), ``components`` and ``target_name``.
    """
    records = []

    for failure in failures.itertuples(index=False):
        turbine = str(failure.Turbine_ID)
        component = failure.Component
        # Start of the failure day.
        day_start = failure.Timestamp.replace(hour=0, minute=0, second=0, microsecond=0)

        records.extend(
            [
                turbine,
                (day_start - timedelta(days=offset)).replace(tzinfo=timezone.utc).isoformat(),
                component,
                value_function(offset, days_lookback),
            ]
            for offset in range(days_lookback)
        )

    return pd.DataFrame(records, columns=["Turbine_ID", "Timestamp", "components", target_name])

In [40]:
reg_target_name = "RUL (Target)"
class_target_name = "Failure (Target)"


def regression_function(i, j):
    """Regression label: the day offset ``i`` counted back from the failure day."""
    return i


def classif_function(i, j):
    """Classification label: 1 for every day inside the look-back window."""
    return 1


# Same look-back expansion twice, once per target definition.
failure_df_reg = create_failure_list(
    failures, days_lookback, regression_function, reg_target_name
)
failure_df_class = create_failure_list(
    failures, days_lookback, classif_function, class_target_name
)


In [41]:
# Keep the RUL target at (at most) two decimal places. Series.round is the
# vectorised replacement for rounding each element through apply().
failure_df_reg[reg_target_name] = failure_df_reg[reg_target_name].round(2)
failure_df_reg


Unnamed: 0,Turbine_ID,Timestamp,components,RUL (Target)
0,T01,2016-07-18T00:00:00+00:00,GEARBOX,0
1,T01,2016-07-17T00:00:00+00:00,GEARBOX,1
2,T01,2016-07-16T00:00:00+00:00,GEARBOX,2
3,T01,2016-07-15T00:00:00+00:00,GEARBOX,3
4,T01,2016-07-14T00:00:00+00:00,GEARBOX,4
...,...,...,...,...
1675,T11,2017-07-19T00:00:00+00:00,HYDRAULIC_GROUP,55
1676,T11,2017-07-18T00:00:00+00:00,HYDRAULIC_GROUP,56
1677,T11,2017-07-17T00:00:00+00:00,HYDRAULIC_GROUP,57
1678,T11,2017-07-16T00:00:00+00:00,HYDRAULIC_GROUP,58


In [110]:
# Persist the expanded failure list (regression target) without the row index.
failure_df_reg.to_csv(path_or_buf='./data/failures_df.csv', index=False)

In [42]:
# Turn the ISO timestamp strings of both label frames back into datetimes.
failure_df_reg = failure_df_reg.assign(
    Timestamp=pd.to_datetime(failure_df_reg['Timestamp'])
)
failure_df_class = failure_df_class.assign(
    Timestamp=pd.to_datetime(failure_df_class['Timestamp'])
)

# 4. Add target variable to the merged data

In [43]:
# Attach the regression target first, then the classification target; both are
# left joins on (Turbine_ID, Timestamp), so feature rows without a label stay.
labeled_df_temp = merged_df.reset_index(drop=True).merge(
    failure_df_reg.reset_index(drop=True),
    how="left",
    on=["Turbine_ID", "Timestamp"],
)
labeled_df = labeled_df_temp.reset_index(drop=True).merge(
    failure_df_class.reset_index(drop=True),
    how="left",
    on=["Turbine_ID", "Timestamp"],
)

In [30]:
# Rows without a matching failure window get the default labels: the full
# look-back horizon as RUL (days_lookback instead of a hard-coded 60, so both
# stay in sync) and 0 for the failure class.
labeled_df[reg_target_name] = labeled_df[reg_target_name].fillna(days_lookback).astype(int)
labeled_df[class_target_name] = labeled_df[class_target_name].fillna(0).astype(int)

In [44]:
# Remove exact duplicate rows, then the second copy of the component column
# produced by the two label merges.
labeled_df = labeled_df.drop_duplicates().drop(columns=["components_y"])

In [45]:
# Rename components_x to components.
labeled_df = labeled_df.rename(columns={'components_x': 'components'})


In [46]:
# Drop the regression (RUL) target column from the labelled frame.
labeled_df = labeled_df.drop(reg_target_name, axis=1)

In [47]:
# Persist the labelled feature table without the row index.
labeled_df.to_csv(path_or_buf='./data/labeled_data.csv', index=False)

In [48]:
labeled_df

Unnamed: 0,Timestamp,Turbine_ID,Gen_RPM_Max,Gen_RPM_Min,Gen_RPM_Avg,Gen_RPM_Std,Gen_Bear_Temp_Avg,Gen_Phase1_Temp_Avg,Gen_Phase2_Temp_Avg,Gen_Phase3_Temp_Avg,...,Avg_Humidity,Min_Precipitation,Max_Precipitation,Avg_Precipitation,Max_Raindetection,Anemometer1_Avg_Freq,Anemometer2_Avg_Freq,Pressure_Avg_Freq,components,Failure (Target)
0,2016-01-01 00:00:00+00:00,T01,181289.2,133914.9,158499.3,13332.7,5667.0,8128,8126,8049,...,87.229167,0,0,0,0,117.284722,118.152778,416.319444,,
1,2016-01-02 00:00:00+00:00,T01,100793.6,71082.2,86946.4,8378.4,4907.0,6556,6525,6458,...,87.743056,22,85,54,0,79.069444,79.000000,416.340278,,
2,2016-01-03 00:00:00+00:00,T01,133818.3,108905.3,124205.1,5775.7,6162.0,7969,7768,7627,...,94.208333,4,23,14,0,109.472222,109.798611,413.909722,,
3,2016-01-04 00:00:00+00:00,T01,246412.7,215913.9,234245.0,5730.7,9402.0,14207,13658,13447,...,88.534722,6,30,17,0,197.062500,197.652778,408.777778,,
4,2016-01-05 00:00:00+00:00,T01,240840.6,206798.4,225681.7,7157.1,8107.0,13162,12695,12515,...,76.645833,33,89,61,0,197.034722,195.583333,406.881944,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3082,2017-12-27 00:00:00+00:00,T11,243986.1,205969.3,228579.7,8038.6,9281.0,12396,12417,12362,...,79.347222,9,21,15,0,198.923611,200.076389,411.756944,,
3083,2017-12-28 00:00:00+00:00,T11,237997.3,192192.0,220250.6,10931.2,8367.0,10300,10401,10322,...,86.687500,2,10,6,0,159.951389,161.583333,413.180556,,
3084,2017-12-29 00:00:00+00:00,T11,188502.9,146124.7,165452.9,13330.9,5920.0,7934,8103,8036,...,81.493056,0,0,0,0,100.416667,99.847222,417.979167,,
3085,2017-12-30 00:00:00+00:00,T11,65884.7,51194.5,57913.6,4789.3,4327.0,5416,5475,5453,...,81.638889,0,4,2,0,51.993056,51.888889,419.020833,,


# Train and Test Split

In [36]:
# 80/20 random split with a fixed seed; the target is re-attached afterwards so
# each CSV holds features and label together.
drop_cols = [class_target_name]
y = labeled_df[class_target_name]
X = labeled_df.drop(columns=drop_cols)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
Train, Test = (
    pd.concat(list(parts), axis=1)
    for parts in ((X_train, y_train), (X_test, y_test))
)



## Save Train Test data as csv

In [37]:
# Make sure the output folder for the train/test CSVs exists before writing.
os.makedirs('./data/first_clean', exist_ok=True)

In [38]:
# Write both splits without the row index.
Train.to_csv(path_or_buf='./data/first_clean/train.csv', index=False)
Test.to_csv(path_or_buf='./data/first_clean/test.csv', index=False)