In [1]:
import pandas as pd 
import numpy as np 
import plotly.graph_objects as go

In [2]:
pd.set_option("display.max_columns", None)

In [3]:
cols = ['unit_number', 'time_in_cycles',
        'operational_setting_1', 'operational_setting_2', 'operational_setting_3'] \
        + [f'sensor_{i}' for i in range(1, 22)]  # 21 sensors (some constant, some useful)

In [4]:
# Load training data
train_fd001 = pd.read_csv("train_FD001.txt", sep=" ", header=None)
train_fd001 = train_fd001.dropna(axis=1, how='all')  # remove empty cols caused by spacing
train_fd001.columns = cols


In [5]:
# Quick check
print(train_fd001.shape)
train_fd001.head()

(20631, 26)


Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [6]:
# Compute max cycle for each engine
max_cycle = train_fd001.groupby("unit_number")["time_in_cycles"].max()

# Merge with original data
train_fd001 = train_fd001.merge(max_cycle.to_frame(name='max_cycle'), 
                                left_on='unit_number', right_index=True)

# RUL = max_cycle - current cycle
train_fd001["RUL"] = train_fd001["max_cycle"] - train_fd001["time_in_cycles"]

train_fd001.drop("max_cycle", axis=1, inplace=True)

train_fd001.head()


Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


In [7]:
train_fd001.columns

Index(['unit_number', 'time_in_cycles', 'operational_setting_1',
       'operational_setting_2', 'operational_setting_3', 'sensor_1',
       'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6', 'sensor_7',
       'sensor_8', 'sensor_9', 'sensor_10', 'sensor_11', 'sensor_12',
       'sensor_13', 'sensor_14', 'sensor_15', 'sensor_16', 'sensor_17',
       'sensor_18', 'sensor_19', 'sensor_20', 'sensor_21', 'RUL'],
      dtype='object')

# EDA starts here 

In [8]:
print("No of engine : ", train_fd001["unit_number"].nunique())
print("Total cycles : ", train_fd001["time_in_cycles"].max())
train_fd001["RUL"].describe()

No of engine :  100
Total cycles :  362


count    20631.000000
mean       107.807862
std         68.880990
min          0.000000
25%         51.000000
50%        103.000000
75%        155.000000
max        361.000000
Name: RUL, dtype: float64

In [9]:
#RUL Distribution 
fig = go.Figure(
    data= [
        go.Histogram(
            x=train_fd001["RUL"],
            xbins = dict(size = 50),
            marker= dict(color = "lightpink",line = dict(color = "black", width = 1))
        )
    ]
)
fig.update_layout(
    xaxis_title = "RUL Cycles",
    yaxis_title="Frequency",
    title = "RUL Distribution - FD001"
)
fig.show()

In [10]:
# Number of cycles per engine 
cycles_per_engine = train_fd001.groupby("unit_number")["time_in_cycles"].max()
fig = go.Figure(
    data=[
        go.Histogram(
            x=cycles_per_engine,
            xbins=dict(size = 50),
            marker= dict(color = "skyblue",line = dict(color = "black", width = 1))
        )
    ]
)

fig.update_layout(
    xaxis_title = "Cycles until failure",
    yaxis_title = "Number of engines",
    title = "Engine lifetime distribution - FD001"
    
)

fig.show()

In [11]:
train_fd001.columns

Index(['unit_number', 'time_in_cycles', 'operational_setting_1',
       'operational_setting_2', 'operational_setting_3', 'sensor_1',
       'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6', 'sensor_7',
       'sensor_8', 'sensor_9', 'sensor_10', 'sensor_11', 'sensor_12',
       'sensor_13', 'sensor_14', 'sensor_15', 'sensor_16', 'sensor_17',
       'sensor_18', 'sensor_19', 'sensor_20', 'sensor_21', 'RUL'],
      dtype='object')

In [12]:
sensor_columns = [ c for c in train_fd001.columns if "sensor" in c ]

In [13]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import math

engine_id = 1
subset = train_fd001[train_fd001["unit_number"] == engine_id]

n_features = len(sensor_columns)
n_cols = 3
n_rows = math.ceil(n_features / n_cols)

# Create subplot grid
fig = make_subplots(
    rows=n_rows, cols=n_cols,
    shared_xaxes=True,
    subplot_titles=[f'{s} over cycles (Engine {engine_id})' for s in sensor_columns]
)

# Add traces
for i, s in enumerate(sensor_columns):
    row = (i // n_cols) + 1   # integer division for row index
    col = (i % n_cols) + 1    # modulo for column index

    fig.add_trace(
        go.Scatter(
            x=subset["time_in_cycles"],
            y=subset[s],
            mode="lines",
            name=s
        ),
        row=row, col=col
    )

# Layout adjustments
fig.update_layout(
    height=250*n_rows, width=1200,
    showlegend=False,
    title_text=f"Sensor Trends for Engine {engine_id}"
)

fig.show()


In [14]:
corr = train_fd001[sensor_columns + ["RUL"]].corr()["RUL"].sort_values(ascending= True)
print(corr)

sensor_11   -0.696228
sensor_4    -0.678948
sensor_15   -0.642667
sensor_2    -0.606484
sensor_17   -0.606154
sensor_3    -0.584520
sensor_8    -0.563968
sensor_13   -0.562569
sensor_9    -0.390102
sensor_14   -0.306769
sensor_6    -0.128348
sensor_20    0.629428
sensor_21    0.635662
sensor_7     0.657223
sensor_12    0.671983
RUL          1.000000
sensor_1          NaN
sensor_5          NaN
sensor_10         NaN
sensor_16         NaN
sensor_18         NaN
sensor_19         NaN
Name: RUL, dtype: float64


In [15]:
from sklearn.preprocessing import StandardScaler 

std = StandardScaler()

scaled_df = train_fd001.copy()

scaled_df[sensor_columns] = std.fit_transform(train_fd001[sensor_columns])

In [16]:
import plotly.graph_objects as go

agg_df = scaled_df.groupby("RUL").mean()

fig = go.Figure()

# Add one line per sensor
for s in sensor_columns:
    fig.add_trace(
        go.Scatter(
            x=agg_df.index,  # RUL is the index after groupby
            y=agg_df[s],
            mode="lines",
            name=s
        )
    )

# Layout adjustments
fig.update_layout(
    title="Average Sensor Values vs RUL",
    xaxis_title="RUL",
    yaxis_title="Mean Sensor Value",
    width=1400, height=900
)

fig.show()


In [17]:
scaled_corr = scaled_df[sensor_columns + ["RUL"]].corr()["RUL"].sort_values(ascending= False)
print(scaled_corr)

RUL          1.000000
sensor_12    0.671983
sensor_7     0.657223
sensor_21    0.635662
sensor_20    0.629428
sensor_6    -0.128348
sensor_14   -0.306769
sensor_9    -0.390102
sensor_13   -0.562569
sensor_8    -0.563968
sensor_3    -0.584520
sensor_17   -0.606154
sensor_2    -0.606484
sensor_15   -0.642667
sensor_4    -0.678948
sensor_11   -0.696228
sensor_1          NaN
sensor_5          NaN
sensor_10         NaN
sensor_16         NaN
sensor_18         NaN
sensor_19         NaN
Name: RUL, dtype: float64


In [18]:
Neg_scaled_corr = scaled_df[sensor_columns + ["RUL"]].corr()["RUL"].sort_values(ascending=True)
print(Neg_scaled_corr)

sensor_11   -0.696228
sensor_4    -0.678948
sensor_15   -0.642667
sensor_2    -0.606484
sensor_17   -0.606154
sensor_3    -0.584520
sensor_8    -0.563968
sensor_13   -0.562569
sensor_9    -0.390102
sensor_14   -0.306769
sensor_6    -0.128348
sensor_20    0.629428
sensor_21    0.635662
sensor_7     0.657223
sensor_12    0.671983
RUL          1.000000
sensor_1          NaN
sensor_5          NaN
sensor_10         NaN
sensor_16         NaN
sensor_18         NaN
sensor_19         NaN
Name: RUL, dtype: float64


In [19]:
scaled_corr_df = scaled_corr[:5]
scaled_corr_df = scaled_corr_df.index.to_list()
scaled_corr_df

['RUL', 'sensor_12', 'sensor_7', 'sensor_21', 'sensor_20']

In [20]:
Neg_scaled_corr_df = Neg_scaled_corr[:3]
Neg_scaled_corr_df = Neg_scaled_corr_df.index.to_list()
Neg_scaled_corr_df

['sensor_11', 'sensor_4', 'sensor_15']

In [21]:
scaled_df.columns

Index(['unit_number', 'time_in_cycles', 'operational_setting_1',
       'operational_setting_2', 'operational_setting_3', 'sensor_1',
       'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6', 'sensor_7',
       'sensor_8', 'sensor_9', 'sensor_10', 'sensor_11', 'sensor_12',
       'sensor_13', 'sensor_14', 'sensor_15', 'sensor_16', 'sensor_17',
       'sensor_18', 'sensor_19', 'sensor_20', 'sensor_21', 'RUL'],
      dtype='object')

In [22]:
additional_columns = ["unit_number","time_in_cycles"]
all_cols = Neg_scaled_corr_df + additional_columns
Neg_scaled_df_top_5 = scaled_df[all_cols]
Neg_scaled_df_top_5

Unnamed: 0,sensor_11,sensor_4,sensor_15,unit_number,time_in_cycles
0,-0.266467,-0.925936,-0.603816,1,1
1,-0.191583,-0.643726,-0.275852,1,2
2,-1.015303,-0.525953,-0.649144,1,3
3,-1.539489,-0.784831,-1.971665,1,4
4,-0.977861,-0.301518,-0.339845,1,5
...,...,...,...,...,...
20626,1.980044,2.188375,1.425294,100,196
20627,1.867718,2.738351,1.913240,100,197
20628,2.054927,2.138377,3.265092,100,198
20629,3.178182,1.955051,2.579834,100,199


In [23]:
additional_columns = ["unit_number","time_in_cycles"]
all_cols = scaled_corr_df+ additional_columns
scaled_df_top_5 = scaled_df[all_cols]
scaled_df_top_5

Unnamed: 0,RUL,sensor_12,sensor_7,sensor_21,sensor_20,unit_number,time_in_cycles
0,191,0.334262,1.121141,1.194427,1.348493,1,1
1,190,1.174899,0.431930,1.236922,1.016528,1,2
2,189,1.364721,1.008155,0.503423,0.739891,1,3
3,188,1.961302,1.222827,0.777792,0.352598,1,4
4,187,1.052871,0.714393,1.059552,0.463253,1,5
...,...,...,...,...,...,...,...
20626,4,-2.607969,-2.189329,-2.921113,-1.805173,100,196
20627,3,-2.350355,-2.833345,-1.203764,-2.856395,100,197
20628,2,-1.902919,-2.742957,-3.292481,-2.081810,100,198
20629,1,-2.363913,-3.036719,-2.085072,-2.911722,100,199


In [24]:
bin_edges = np.arange(0, scaled_df_top_5['RUL'].max() + 50, 50)
scaled_df_top_5['RUL_bin'] = pd.cut(scaled_df_top_5['RUL'], bins=bin_edges, right=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [25]:
scaled_df_top_5['RUL_bin_str'] = scaled_df_top_5['RUL_bin'].astype(str)
scaled_df_top_5.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,RUL,sensor_12,sensor_7,sensor_21,sensor_20,unit_number,time_in_cycles,RUL_bin,RUL_bin_str
0,191,0.334262,1.121141,1.194427,1.348493,1,1,"[150, 200)","[150, 200)"
1,190,1.174899,0.43193,1.236922,1.016528,1,2,"[150, 200)","[150, 200)"
2,189,1.364721,1.008155,0.503423,0.739891,1,3,"[150, 200)","[150, 200)"
3,188,1.961302,1.222827,0.777792,0.352598,1,4,"[150, 200)","[150, 200)"
4,187,1.052871,0.714393,1.059552,0.463253,1,5,"[150, 200)","[150, 200)"


In [26]:
import plotly.express as px

sensors = ['sensor_12','sensor_7','sensor_21','sensor_20']

for sensor in sensors:
    fig = px.box(
        scaled_df_top_5, 
        x='RUL_bin_str', 
        y=sensor, 
        points='all',
        title=f'{sensor} Distribution Across RUL Bins'
    )
    fig.show()


In [27]:

sensors = ['sensor_12','sensor_7','sensor_21','sensor_20']

fig = go.Figure()

for sensor in sensors:
    # Compute mean & std grouped by RUL for this sensor
    sensor_rul = train_fd001.groupby("RUL")[sensor].agg(["mean", "std"]).reset_index()

    # Mean line
    fig.add_trace(go.Scatter(
        x=sensor_rul["RUL"],
        y=sensor_rul["mean"],
        mode="lines",
        name=f"Mean {sensor}"
    ))

    # Upper bound (mean + std)
    fig.add_trace(go.Scatter(
        x=sensor_rul["RUL"],
        y=sensor_rul["mean"] + sensor_rul["std"],
        mode="lines",
        line=dict(width=0),
        showlegend=False
    ))

    # Lower bound (mean - std), with fill to previous trace
    fig.add_trace(go.Scatter(
        x=sensor_rul["RUL"],
        y=sensor_rul["mean"] - sensor_rul["std"],
        mode="lines",
        line=dict(width=0),
        fill="tonexty",
        fillcolor="rgba(0,0,255,0.1)",   # light fill
        name=f"{sensor} ±1 std"
    ))

# Layout outside the loop
fig.update_layout(
    title="Sensor trends across all engines (aligned by RUL)",
    xaxis_title="Remaining Useful Life (RUL)",
    yaxis_title="Sensor value",
    template="plotly_white"
)

fig.show()


In [28]:
#Top 3 Negative relation sensors
sensors = ['sensor_11', 'sensor_4', 'sensor_15']

fig = go.Figure()

for sensor in sensors:
    # Compute mean & std grouped by RUL for this sensor
    sensor_rul = train_fd001.groupby("RUL")[sensor].agg(["mean", "std"]).reset_index()

    # Mean line
    fig.add_trace(go.Scatter(
        x=sensor_rul["RUL"],
        y=sensor_rul["mean"],
        mode="lines",
        name=f"Mean {sensor}"
    ))

    # Upper bound (mean + std)
    fig.add_trace(go.Scatter(
        x=sensor_rul["RUL"],
        y=sensor_rul["mean"] + sensor_rul["std"],
        mode="lines",
        line=dict(width=0),
        showlegend=False
    ))

    # Lower bound (mean - std), with fill to previous trace
    fig.add_trace(go.Scatter(
        x=sensor_rul["RUL"],
        y=sensor_rul["mean"] - sensor_rul["std"],
        mode="lines",
        line=dict(width=0),
        fill="tonexty",
        fillcolor="rgba(0,0,255,0.1)",   # light fill
        name=f"{sensor} ±1 std"
    ))

# Layout outside the loop
fig.update_layout(
    title="Sensor trends across all engines (aligned by RUL)",
    xaxis_title="Remaining Useful Life (RUL)",
    yaxis_title="Sensor value",
    template="plotly_white"
)

fig.show()


In [29]:
# Lets select the important features using random forest sampling 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

x = train_fd001[sensor_columns]
y = train_fd001["RUL"]

x_train,x_val,y_train,y_val = train_test_split(x,y, random_state=42,test_size=0.2)

rf = RandomForestRegressor(n_estimators=100, random_state=42,n_jobs=-1)
rf.fit(x_train,y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [30]:
importance = rf.feature_importances_

feature_importance = pd.Series(importance,index=sensor_columns).sort_values(ascending= False)

In [31]:
top_features = feature_importance.head(15)
fig = go.Figure(
    data=[
        go.Bar(
            x = top_features.index,
            y = top_features.values,
            marker_color="steelblue"
        )
    ]
)
# Layout
fig.update_layout(
    title="Top 15 Sensor Importances for Predicting RUL",
    xaxis_title="Features",
    yaxis_title="Importance Score",
    template="plotly_white"
)

fig.show()

In [32]:
top_features

sensor_11    0.404118
sensor_9     0.125799
sensor_4     0.124018
sensor_12    0.055026
sensor_7     0.040342
sensor_14    0.038937
sensor_21    0.035000
sensor_15    0.034101
sensor_3     0.030752
sensor_2     0.029646
sensor_20    0.026782
sensor_13    0.022718
sensor_8     0.021735
sensor_17    0.010332
sensor_6     0.000693
dtype: float64

In [33]:
import xgboost as xgb 

model = xgb.XGBRegressor(random_state = 42)
model.fit(x_train,y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [34]:
xgb_feature_importance = model.feature_importances_
print("Feature Importances (Gain):", xgb_feature_importance)

# Get feature importances by 'weight'
booster = model.get_booster()
feature_importances_weight = booster.get_score(importance_type='weight')
print("Feature Importances (Weight):", feature_importances_weight)

Feature Importances (Gain): [0.         0.01492269 0.0165465  0.15593447 0.         0.00652616
 0.0317844  0.02365291 0.10911884 0.         0.40819025 0.03826921
 0.02732023 0.02945343 0.02846144 0.         0.03510546 0.
 0.         0.04738929 0.02732473]
Feature Importances (Weight): {'sensor_2': 621.0, 'sensor_3': 469.0, 'sensor_4': 476.0, 'sensor_6': 7.0, 'sensor_7': 439.0, 'sensor_8': 250.0, 'sensor_9': 435.0, 'sensor_11': 341.0, 'sensor_12': 404.0, 'sensor_13': 234.0, 'sensor_14': 441.0, 'sensor_15': 363.0, 'sensor_17': 98.0, 'sensor_20': 298.0, 'sensor_21': 410.0}


In [35]:
series_xgb_feature_importance = pd.Series(xgb_feature_importance, index = sensor_columns).sort_values(ascending= False)
series_xgb_feature_importance.head(10)

sensor_11    0.408190
sensor_4     0.155934
sensor_9     0.109119
sensor_20    0.047389
sensor_12    0.038269
sensor_17    0.035105
sensor_7     0.031784
sensor_14    0.029453
sensor_15    0.028461
sensor_21    0.027325
dtype: float32

In [36]:
top_features = series_xgb_feature_importance.head(15)
fig = go.Figure(
    data=[
        go.Bar(
            x = top_features.index,
            y = top_features.values,
            marker_color="steelblue"
        )
    ]
)
# Layout
fig.update_layout(
    title="Top 15 Sensor Importances for Predicting RUL",
    xaxis_title="Features",
    yaxis_title="Importance Score",
    template="plotly_white"
)

fig.show()

In [37]:
# normalize per engine to remove engine-specific bias
def normalize_per_engine(df, sensors):
    df_norm = df.copy()
    for sensor in sensors:
        df_norm[sensor] = df.groupby("unit_number")[sensor].transform(
            lambda x: (x - x.mean()) / x.std()
        )
    return df_norm


def add_rolling_features(df, sensors, windows=[5, 10, 20]):
    df_feat = df.copy()
    for sensor in sensors:
        for w in windows:
            df_feat[f"{sensor}_mean_{w}"] = df.groupby("unit_number")[sensor].transform(
                lambda x: x.rolling(w, min_periods=1).mean()
            )
            df_feat[f"{sensor}_std_{w}"] = df.groupby("unit_number")[sensor].transform(
                lambda x: x.rolling(w, min_periods=1).std()
            )
    return df_feat

def add_diff_features(df, sensors):
    df_feat = df.copy()
    for sensor in sensors:
        df_feat[f"{sensor}_diff"] = df.groupby("unit_number")[sensor].diff().fillna(0)
    return df_feat


In [38]:
selected_sensors = ["sensor_11", "sensor_4", "sensor_9", "sensor_20", 
                    "sensor_12", "sensor_17", "sensor_7"]

def feature_engineering_pipeline(df, sensors=selected_sensors):
    df_proc = normalize_per_engine(df, sensors)
    df_proc = add_rolling_features(df_proc, sensors)
    df_proc = add_diff_features(df_proc, sensors)
    return df_proc

train_feat = feature_engineering_pipeline(train_fd001, selected_sensors)


In [39]:
train_feat.head()

Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL,sensor_11_mean_5,sensor_11_std_5,sensor_11_mean_10,sensor_11_std_10,sensor_11_mean_20,sensor_11_std_20,sensor_4_mean_5,sensor_4_std_5,sensor_4_mean_10,sensor_4_std_10,sensor_4_mean_20,sensor_4_std_20,sensor_9_mean_5,sensor_9_std_5,sensor_9_mean_10,sensor_9_std_10,sensor_9_mean_20,sensor_9_std_20,sensor_20_mean_5,sensor_20_std_5,sensor_20_mean_10,sensor_20_std_10,sensor_20_mean_20,sensor_20_std_20,sensor_12_mean_5,sensor_12_std_5,sensor_12_mean_10,sensor_12_std_10,sensor_12_mean_20,sensor_12_std_20,sensor_17_mean_5,sensor_17_std_5,sensor_17_mean_10,sensor_17_std_10,sensor_17_mean_20,sensor_17_std_20,sensor_7_mean_5,sensor_7_std_5,sensor_7_mean_10,sensor_7_std_10,sensor_7_mean_20,sensor_7_std_20,sensor_11_diff,sensor_4_diff,sensor_9_diff,sensor_20_diff,sensor_12_diff,sensor_17_diff,sensor_7_diff
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,-0.777777,14.62,21.61,1.011159,2388.06,-0.422617,1.3,-0.1642,0.267725,2388.02,8138.62,8.4195,0.03,-0.578837,2388,100.0,1.317066,23.419,191,-0.1642,,-0.1642,,-0.1642,,-0.777777,,-0.777777,,-0.777777,,-0.422617,,-0.422617,,-0.422617,,1.317066,,1.317066,,1.317066,,0.267725,,0.267725,,0.267725,,-0.578837,,-0.578837,,-0.578837,,1.011159,,1.011159,,1.011159,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,-0.481242,14.62,21.61,0.341134,2388.04,-0.854226,1.3,-0.08967,1.0953,2388.07,8131.49,8.4318,0.03,-0.578837,2388,100.0,0.957781,23.4236,190,-0.126935,0.052701,-0.126935,0.052701,-0.126935,0.052701,-0.62951,0.209682,-0.62951,0.209682,-0.62951,0.209682,-0.638422,0.305194,-0.638422,0.305194,-0.638422,0.305194,1.137423,0.254053,1.137423,0.254053,1.137423,0.254053,0.681512,0.585184,0.681512,0.585184,0.681512,0.585184,-0.578837,0.0,-0.578837,0.0,-0.578837,0.0,0.676146,0.473779,0.676146,0.473779,0.676146,0.473779,0.074531,0.296535,-0.431609,-0.359285,0.827576,0.0,-0.670025
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,-0.357492,14.62,21.61,0.901318,2388.08,0.95161,1.3,-0.909507,1.282172,2388.03,8133.23,8.4178,0.03,-1.934162,2388,100.0,0.658377,23.3442,189,-0.387792,0.453352,-0.387792,0.453352,-0.387792,0.453352,-0.538837,0.215981,-0.538837,0.215981,-0.538837,0.215981,-0.108411,0.94303,-0.108411,0.94303,-0.108411,0.94303,0.977741,0.329798,0.977741,0.329798,0.977741,0.329798,0.881732,0.539893,0.881732,0.539893,0.881732,0.539893,-1.030612,0.782497,-1.030612,0.782497,-1.030612,0.782497,0.751204,0.359352,0.751204,0.359352,0.751204,0.359352,-0.819837,0.123751,1.805836,-0.299404,0.186872,-1.355325,0.560185
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,-0.62951,14.62,21.61,1.110015,2388.11,0.247191,1.3,-1.431221,1.869484,2388.08,8133.83,8.3682,0.03,-0.578837,2388,100.0,0.239211,23.3739,188,-0.648649,0.639691,-0.648649,0.639691,-0.648649,0.639691,-0.561505,0.182082,-0.561505,0.182082,-0.561505,0.182082,-0.019511,0.790242,-0.019511,0.790242,-0.019511,0.790242,0.793109,0.45702,0.793109,0.45702,0.793109,0.45702,1.12867,0.661994,1.12867,0.661994,1.12867,0.661994,-0.917668,0.677663,-0.917668,0.677663,-0.917668,0.677663,0.840906,0.343913,0.840906,0.343913,0.840906,0.343913,-0.521714,-0.272018,-0.704418,-0.419166,0.587312,1.355325,0.208696
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,-0.121665,14.62,21.61,0.615734,2388.06,1.401542,1.3,-0.872241,0.975168,2388.04,8133.8,8.4294,0.03,0.098826,2388,100.0,0.358973,23.4044,187,-0.693368,0.562941,-0.693368,0.562941,-0.693368,0.562941,-0.473537,0.252106,-0.473537,0.252106,-0.473537,0.252106,0.2647,0.933938,0.2647,0.933938,0.2647,0.933938,0.706282,0.440846,0.706282,0.440846,0.706282,0.440846,1.09797,0.577399,1.09797,0.577399,1.09797,0.577399,-0.714369,0.742342,-0.714369,0.742342,-0.714369,0.742342,0.795872,0.3144,0.795872,0.3144,0.795872,0.3144,0.55898,0.507845,1.15435,0.119762,-0.894316,0.677663,-0.494281


In [40]:
train_feat.describe()

Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL,sensor_11_mean_5,sensor_11_std_5,sensor_11_mean_10,sensor_11_std_10,sensor_11_mean_20,sensor_11_std_20,sensor_4_mean_5,sensor_4_std_5,sensor_4_mean_10,sensor_4_std_10,sensor_4_mean_20,sensor_4_std_20,sensor_9_mean_5,sensor_9_std_5,sensor_9_mean_10,sensor_9_std_10,sensor_9_mean_20,sensor_9_std_20,sensor_20_mean_5,sensor_20_std_5,sensor_20_mean_10,sensor_20_std_10,sensor_20_mean_20,sensor_20_std_20,sensor_12_mean_5,sensor_12_std_5,sensor_12_mean_10,sensor_12_std_10,sensor_12_mean_20,sensor_12_std_20,sensor_17_mean_5,sensor_17_std_5,sensor_17_mean_10,sensor_17_std_10,sensor_17_mean_20,sensor_17_std_20,sensor_7_mean_5,sensor_7_std_5,sensor_7_mean_10,sensor_7_std_10,sensor_7_mean_20,sensor_7_std_20,sensor_11_diff,sensor_4_diff,sensor_9_diff,sensor_20_diff,sensor_12_diff,sensor_17_diff,sensor_7_diff
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20531.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,51.506568,108.807862,-9e-06,2e-06,100.0,518.67,642.680934,1590.523119,7.022426e-16,14.62,21.609803,-3.530155e-16,2388.096652,-1.670022e-15,1.3,-2.555488e-15,-1.684073e-14,2388.096152,8143.752722,8.442146,0.03,4.298179e-16,2388.0,100.0,7.239401e-16,23.289705,107.807862,-0.033229,0.410006,-0.07181,0.427025,-0.141055,0.442871,-0.032192,0.470403,-0.069541,0.488947,-0.135877,0.504009,-0.015269,0.481911,-0.033808,0.505574,-0.067214,0.525224,0.02944,0.575151,0.063368,0.596578,0.124123,0.609952,0.03211,0.447671,0.069154,0.466659,0.136054,0.482669,-0.027847,0.61434,-0.059757,0.639203,-0.117069,0.653595,0.031321,0.493793,0.067472,0.513659,0.132423,0.52937,0.017004,0.016845,0.007251,-0.015498,-0.016543,0.014916,-0.016148
std,29.227633,68.88099,0.002187,0.000293,0.0,0.0,0.500053,6.13115,0.9975977,1.7764e-15,0.001389,0.9975977,0.070985,0.9975977,0.0,0.9975977,0.9975977,0.071919,19.076176,0.037505,1.3878120000000003e-17,0.9975977,0.0,0.0,0.9975977,0.108251,68.88099,0.883699,0.163128,0.834563,0.124217,0.76021,0.105749,0.859165,0.180521,0.807415,0.132224,0.734377,0.109212,0.807363,0.365278,0.746754,0.339561,0.670886,0.32325,0.80533,0.216147,0.748643,0.156611,0.677194,0.122981,0.86737,0.181674,0.817503,0.139777,0.744327,0.11927,0.780895,0.240251,0.722628,0.172647,0.651767,0.134341,0.847304,0.201418,0.795799,0.151822,0.723009,0.126625,0.620674,0.709892,0.855333,0.866487,0.678528,0.925843,0.75458
min,1.0,1.0,-0.0087,-0.0006,100.0,518.67,641.21,1571.04,-3.180582,14.62,21.6,-4.209499,2387.9,-3.649711,1.3,-2.499798,-4.081211,2387.88,8099.94,8.3249,0.03,-3.280361,2388.0,100.0,-3.784428,22.8942,0.0,-1.978298,0.0,-1.978298,0.0,-1.978298,0.0,-3.180582,0.005255,-3.180582,0.005255,-3.180582,0.005255,-2.49204,0.00749,-2.212567,0.00749,-1.930974,0.00749,-2.921147,0.0,-2.519506,0.0,-2.207862,0.0,-3.061567,0.0,-2.650826,0.0,-2.323213,0.0,-2.245171,0.0,-2.245171,0.0,-2.245171,0.0,-3.136826,0.013362,-2.755423,0.013362,-2.282867,0.013362,-2.655136,-3.268001,-4.804008,-3.554281,-2.895759,-4.54712,-3.524846
25%,26.0,52.0,-0.0015,-0.0002,100.0,518.67,642.325,1586.26,-0.7143964,14.62,21.61,-0.5647206,2388.05,-0.6752526,1.3,-0.713423,-0.5429033,2388.04,8133.245,8.4149,0.03,-0.7336946,2388.0,100.0,-0.5889016,23.2218,51.0,-0.677758,0.290601,-0.685936,0.338222,-0.707231,0.368043,-0.658684,0.341942,-0.666754,0.398983,-0.684429,0.431853,-0.62537,0.169271,-0.622401,0.191836,-0.628566,0.210468,-0.387042,0.41851,-0.326016,0.487268,-0.231748,0.525719,-0.409215,0.316729,-0.344646,0.368043,-0.232274,0.398764,-0.596723,0.436433,-0.58561,0.517658,-0.582666,0.566541,-0.417206,0.348594,-0.350037,0.406307,-0.247138,0.439815,-0.387385,-0.45302,-0.324297,-0.587827,-0.462877,-0.674078,-0.5091
50%,52.0,104.0,0.0,0.0,100.0,518.67,642.64,1590.1,-0.2021277,14.62,21.61,0.1897677,2388.09,-0.20527,1.3,-0.236708,0.2152893,2388.09,8140.54,8.4389,0.03,-0.1040418,2388.0,100.0,0.1415764,23.2979,103.0,-0.342808,0.392749,-0.383168,0.416764,-0.435025,0.436187,-0.32045,0.454749,-0.350268,0.479028,-0.403935,0.496116,-0.162435,0.383914,-0.143405,0.444365,-0.15092,0.497204,0.262365,0.55547,0.303399,0.585805,0.353634,0.602937,0.325386,0.428141,0.362593,0.454266,0.412703,0.471002,-0.236845,0.596264,-0.280002,0.628647,-0.33612,0.647585,0.302969,0.469546,0.347936,0.49923,0.399146,0.519918,0.0,0.007822,0.010074,0.0,-0.013275,0.0,-0.014558
75%,77.0,156.0,0.0015,0.0003,100.0,518.67,643.0,1594.38,0.5509863,14.62,21.61,0.70784,2388.14,0.5717016,1.3,0.5170163,0.7169309,2388.14,8148.31,8.4656,0.03,0.5881359,2388.0,100.0,0.7036988,23.3668,155.0,0.400313,0.512474,0.335464,0.505297,0.230209,0.50969,0.393284,0.579304,0.322919,0.569222,0.223544,0.569636,0.396248,0.724238,0.354136,0.770589,0.310755,0.797278,0.615521,0.710458,0.612252,0.694233,0.621425,0.688207,0.665145,0.555695,0.671786,0.550454,0.686827,0.558979,0.362339,0.768568,0.293081,0.743044,0.198614,0.734412,0.6462,0.617857,0.651206,0.60759,0.663909,0.60772,0.422414,0.490164,0.341042,0.565688,0.429746,0.685628,0.470577
max,100.0,362.0,0.0087,0.0006,100.0,518.67,644.53,1616.91,3.852251,14.62,21.61,2.824867,2388.56,3.73905,1.3,4.654818,2.476173,2388.56,8293.72,8.5848,0.03,4.609756,2388.0,100.0,3.162842,23.6184,361.0,3.107155,1.304752,2.741084,1.304752,2.415171,1.304752,3.012636,2.373585,2.684807,2.373585,2.372534,2.373585,3.210835,2.269816,2.968476,2.098332,2.610119,2.098332,2.002187,1.763148,2.002187,1.763148,2.002187,1.763148,1.856282,1.543516,1.856282,1.543516,1.856282,1.543516,2.982047,2.137494,2.4547,2.137494,2.072047,2.137494,2.824867,1.975184,2.824867,1.975184,2.824867,1.975184,2.841387,3.356757,4.763086,3.817271,3.146142,4.661394,3.201734


In [41]:
train_feat.columns

Index(['unit_number', 'time_in_cycles', 'operational_setting_1',
       'operational_setting_2', 'operational_setting_3', 'sensor_1',
       'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6', 'sensor_7',
       'sensor_8', 'sensor_9', 'sensor_10', 'sensor_11', 'sensor_12',
       'sensor_13', 'sensor_14', 'sensor_15', 'sensor_16', 'sensor_17',
       'sensor_18', 'sensor_19', 'sensor_20', 'sensor_21', 'RUL',
       'sensor_11_mean_5', 'sensor_11_std_5', 'sensor_11_mean_10',
       'sensor_11_std_10', 'sensor_11_mean_20', 'sensor_11_std_20',
       'sensor_4_mean_5', 'sensor_4_std_5', 'sensor_4_mean_10',
       'sensor_4_std_10', 'sensor_4_mean_20', 'sensor_4_std_20',
       'sensor_9_mean_5', 'sensor_9_std_5', 'sensor_9_mean_10',
       'sensor_9_std_10', 'sensor_9_mean_20', 'sensor_9_std_20',
       'sensor_20_mean_5', 'sensor_20_std_5', 'sensor_20_mean_10',
       'sensor_20_std_10', 'sensor_20_mean_20', 'sensor_20_std_20',
       'sensor_12_mean_5', 'sensor_12_std_5', 'sens

In [42]:
from sklearn.model_selection import train_test_split 
import xgboost as xgb 

x = train_feat.drop(columns='RUL')
y = train_feat["RUL"]

X_train , X_test , Y_train , Y_test = train_test_split(x , y , random_state=42, test_size=0.2)

model = xgb.XGBRegressor(random_state = 42)
model.fit(X_train,Y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [43]:
y_pred = model.predict(X_test)

In [44]:
print(y_pred)

[141.80855    4.720626 198.92766  ... 116.54945   90.208145 172.57639 ]


In [45]:
from sklearn.metrics import accuracy_score,root_mean_squared_error

rmse = root_mean_squared_error(Y_test,y_pred)
print(f'RMSE Score test :{rmse:.2f}')

RMSE Score test :10.75


In [46]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    "n_estimators": [200, 500, 800],
    "max_depth": [4, 6, 8, 10],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.7, 0.9, 1.0],
    "colsample_bytree": [0.7, 0.9, 1.0]
}

xgb_model = xgb.XGBRegressor(random_state=42)
random_search = RandomizedSearchCV(
    xgb_model, param_distributions=param_grid,
    n_iter=10, scoring="neg_root_mean_squared_error", 
    cv=3, verbose=2, n_jobs=-1, random_state=42
)

random_search.fit(X_train, Y_train)

print("Best params:", random_search.best_params_)
print("Best RMSE:", -random_search.best_score_)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best params: {'subsample': 0.7, 'n_estimators': 800, 'max_depth': 8, 'learning_rate': 0.01, 'colsample_bytree': 0.9}
Best RMSE: 9.737546602884928


In [47]:
best_params = random_search.best_params_

In [48]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import xgboost as xgb


# Imputer
imputer = SimpleImputer(strategy="mean")

# Wrap SVR in a pipeline so it imputes before fitting
svr_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("svr", SVR(kernel='rbf', C=100, gamma=0.1))
])

# Define base learners
base_learners = [
    ('xgb', xgb.XGBRegressor(**best_params, random_state=42)),
    ('rf', RandomForestRegressor(n_estimators=300, random_state=42)),
    ('svr', svr_pipe),  # <- now safe
]

stack = StackingRegressor(
    estimators=base_learners,
    final_estimator=Ridge(alpha=1.0),
    cv=3,
    n_jobs=-1
)

stack.fit(X_train, Y_train)
y_pred_stack = stack.predict(X_test)

print("Stacked RMSE:", mean_squared_error(Y_test, y_pred_stack))


Stacked RMSE: 78.54643943006677


In [49]:
print("Stacked RMSE:", mean_squared_error(Y_test, y_pred_stack))

Stacked RMSE: 78.54643943006677


In [50]:
sqrt = mean_squared_error(Y_test,y_pred_stack)
rmse = np.sqrt(sqrt)
print(f'Stacked RMSE Value : {rmse:.2f}')

Stacked RMSE Value : 8.86
