In [1]:
import pandas as pd 
import numpy as np 
import plotly.graph_objects as go

In [2]:
# Show every column when a DataFrame is rendered (no truncation of wide frames)
pd.options.display.max_columns = None

In [3]:
# Column labels for the raw file, in file order:
# engine id, cycle counter, three operational settings, then sensors 1..21.
cols = [
    'unit_number',
    'time_in_cycles',
    'operational_setting_1',
    'operational_setting_2',
    'operational_setting_3',
    *(f'sensor_{sensor_id}' for sensor_id in range(1, 22)),  # 21 sensors (some constant, some useful)
]

In [4]:
# Load training data
# Split on runs of whitespace instead of a single space: with sep=" " the extra
# spacing in the file produced all-NaN phantom columns that then had to be
# dropped, which would also silently discard a genuinely empty data column.
train_fd001 = pd.read_csv("train_FD001.txt", sep=r"\s+", header=None)
# Assigning the labels doubles as a schema check: pandas raises ValueError if the
# file does not contain exactly len(cols) columns.
train_fd001.columns = cols


In [5]:
# Sanity check: print the (rows, columns) shape, then display the first five rows
print(train_fd001.shape)
train_fd001.head(n=5)

(20631, 26)


Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [6]:
# Last recorded cycle of every engine, indexed by unit_number
max_cycle = train_fd001.groupby("unit_number")["time_in_cycles"].max()

# Attach each engine's last cycle to its rows, derive
# RUL = max_cycle - current cycle, then discard the helper column again.
train_fd001 = (
    train_fd001
    .merge(max_cycle.rename("max_cycle"), left_on="unit_number", right_index=True)
    .assign(RUL=lambda frame: frame["max_cycle"] - frame["time_in_cycles"])
    .drop(columns="max_cycle")
)

train_fd001.head()


Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


In [7]:
# Display the column labels of the training frame to confirm its current schema
train_fd001.columns

Index(['unit_number', 'time_in_cycles', 'operational_setting_1',
       'operational_setting_2', 'operational_setting_3', 'sensor_1',
       'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6', 'sensor_7',
       'sensor_8', 'sensor_9', 'sensor_10', 'sensor_11', 'sensor_12',
       'sensor_13', 'sensor_14', 'sensor_15', 'sensor_16', 'sensor_17',
       'sensor_18', 'sensor_19', 'sensor_20', 'sensor_21', 'RUL'],
      dtype='object')

# Exploratory Data Analysis (EDA)

In [8]:
# Headline numbers for the training set
engine_count = train_fd001["unit_number"].nunique()
# Largest cycle index seen for any engine (a maximum, not a sum over engines)
longest_run = train_fd001["time_in_cycles"].max()

print("No of engine : ", engine_count)
print("Total cycles : ", longest_run)

# Summary statistics of the RUL column
train_fd001["RUL"].describe()

No of engine :  100
Total cycles :  362


count    20631.000000
mean       107.807862
std         68.880990
min          0.000000
25%         51.000000
50%        103.000000
75%        155.000000
max        361.000000
Name: RUL, dtype: float64

In [19]:
# RUL distribution: histogram of the RUL column over all rows, in bins of width 50
rul_hist = go.Histogram(
    x=train_fd001["RUL"],
    xbins={"size": 50},
    marker={"color": "lightpink", "line": {"color": "black", "width": 1}},
)

fig = go.Figure(data=[rul_hist])
fig.update_layout(
    title="RUL Distribution - FD001",
    xaxis_title="RUL Cycles",
    yaxis_title="Frequency",
)
fig.show()

In [20]:
# Engine lifetimes: the highest cycle index reached by each engine
cycles_per_engine = train_fd001.groupby("unit_number")["time_in_cycles"].max()

# One histogram entry per engine, in bins of width 50
lifetime_hist = go.Histogram(
    x=cycles_per_engine,
    xbins={"size": 50},
    marker={"color": "skyblue", "line": {"color": "black", "width": 1}},
)

fig = go.Figure(data=[lifetime_hist])
fig.update_layout(
    title="Engine lifetime distribution - FD001",
    xaxis_title="Cycles until failure",
    yaxis_title="Number of engines",
)

fig.show()