### **Exploratory Data Analysis (EDA)**

#### **1. Imports**

In [50]:
import numpy as np
import pandas as pd

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from scipy.stats import skew, gaussian_kde
from sklearn.preprocessing import StandardScaler

# Disable pandas' row/column truncation so rendered DataFrame/Series output
# shows every row and every column.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

#### **2. Functions**

In [51]:
def add_RUL(df):
    """Attach a ``RUL`` column: cycles left until each engine's last recorded cycle.

    For every row, RUL = (largest ``time_in_cycles`` of that ``engine_id``)
    minus the row's own ``time_in_cycles``.

    The frame passed in is modified in place and is also returned.
    """
    cycles = df["time_in_cycles"]
    # Broadcast each engine's final cycle back onto all of that engine's rows.
    last_cycle = cycles.groupby(df["engine_id"]).transform("max")
    df["RUL"] = last_cycle - cycles
    return df

def outlier_detection(df):
    """Count values lying outside the 1.5 * IQR fences.

    Works on a Series (returns a scalar count) or a DataFrame (returns a
    per-column Series of counts). A value is counted when it is strictly
    below Q1 - 1.5*IQR or strictly above Q3 + 1.5*IQR.
    """
    q1, q3 = df.quantile(0.25), df.quantile(0.75)
    fence_margin = 1.5 * (q3 - q1)
    too_low = df < (q1 - fence_margin)
    too_high = df > (q3 + fence_margin)
    return (too_low | too_high).sum()

def plot_cycles_per_engine(df, engine_id_col='engine_id', title='Number of cycles per engine'):
    """Bar chart of the number of rows (cycles) recorded for each engine.

    Bars are colour-coded: the engine(s) with the fewest rows, the engine(s)
    with the most rows, and the engine(s) closest to the median row count are
    highlighted; everything else is grey.

    Parameters
    ----------
    df : DataFrame with one row per (engine, cycle).
    engine_id_col : name of the column identifying the engine.
    title : figure title.

    Returns
    -------
    None. The figure is displayed via ``fig.show()``.
    """
    rows_per_engine = df.groupby(engine_id_col).size().reset_index(name='cycles')
    cycles = rows_per_engine['cycles']

    # With an even number of engines the median is the mean of the two middle
    # counts and may not equal any engine's count, so exact equality would
    # never highlight a bar. Mark the engine(s) nearest to the median instead;
    # when the median is an observed count this matches exact equality.
    distance_to_median = (cycles - cycles.median()).abs()
    is_median = distance_to_median == distance_to_median.min()

    # np.select picks the first matching condition, so the priority is
    # min > max > median > other.
    rows_per_engine['highlight'] = np.select(
        [cycles == cycles.min(), cycles == cycles.max(), is_median],
        ['Shortest (min)', 'Longest (max)', 'Median'],
        default='Other',
    )

    color_map = {
        'Shortest (min)': '#E74C3C',
        'Longest (max)': '#27AE60',
        'Median': '#F39C12',
        'Other': '#BDC3C7',
    }
    fig = px.bar(
        rows_per_engine, x=engine_id_col, y='cycles', color='highlight',
        title=title,
        labels={engine_id_col: 'Engine ID', 'cycles': 'Cycles (lifecycle length)'},
        color_discrete_map=color_map,
        category_orders={'highlight': ['Shortest (min)', 'Longest (max)', 'Median', 'Other']},
    )
    fig.update_layout(xaxis_title='Engine ID', yaxis_title='Cycles', legend_title='')
    fig.show()

def histogram_plot_RUL(df, column_name="RUL"):
    """Show a 50-bin histogram of ``column_name`` (defaults to the RUL target)."""
    histogram = px.histogram(
        df,
        x=column_name,
        nbins=50,
        title=f"Distribution: {column_name}",
    )
    axis_titles = {"xaxis_title": column_name, "yaxis_title": "Count"}
    histogram.update_layout(**axis_titles)
    histogram.show()

def rul_vs_cycle_plot(df):
    """Line plot of RUL against cycle, one line per engine, for engine_id <= 100.

    ``engine_id`` is cast to string on a filtered copy so plotly treats it as
    a discrete category (one colour per engine). The legend is hidden because
    up to 100 entries would clutter the figure.
    """
    first_engines = (
        df.loc[df['engine_id'] <= 100]
        .assign(engine_id=lambda part: part['engine_id'].astype(str))
    )

    line_options = dict(
        x='time_in_cycles',
        y='RUL',
        color='engine_id',
        title='RUL vs Cycle (Engines 1-100)',
        labels={'time_in_cycles': 'Cycle', 'RUL': 'Remaining Useful Life'},
    )
    fig = px.line(first_engines, **line_options)
    fig.update_layout(showlegend=False)
    fig.show()

def correlation_sensors_rul(df):
    """Horizontal bar chart of each sensor column's correlation with ``RUL``.

    Sensor columns are those whose name contains 'sensor'. Columns whose
    correlation is NaN are omitted. Bars are coloured on a diverging scale
    fixed to [-1, 1].
    """
    sensor_names = [name for name in df.columns if 'sensor' in name]
    ranked = (
        df[sensor_names]
        .corrwith(df['RUL'])
        .dropna()
        .sort_values()
    )

    fig = px.bar(
        x=ranked.values,
        y=ranked.index,
        orientation='h',
        title='Correlation of sensors with RUL',
        labels={'x': 'Correlation', 'y': 'Sensor'},
        color=ranked.values,
        color_continuous_scale='RdBu_r',
        range_color=[-1, 1],
    )
    fig.update_layout(height=500, yaxis={'categoryorder': 'total ascending'})
    fig.show()

def get_sensor_column_names(df):
    """Return, in column order, the names of ``df`` columns containing 'sensor'."""
    return list(filter(lambda name: 'sensor' in name, df.columns))

def sensor_measurements_distribution(df):
    """Show one 50-bin histogram (600x300) for every sensor column of ``df``."""
    figure_size = dict(height=300, width=600)
    for sensor_name in get_sensor_column_names(df):
        histogram = px.histogram(
            df, x=sensor_name, nbins=50, title=f"Distribution: {sensor_name}"
        )
        histogram.update_layout(**figure_size)
        histogram.show()

def correlation_heatmap(df):
    """Show an annotated heatmap of pairwise correlations between sensor columns.

    The colour scale is fixed to [-1, 1] and each cell is labelled with the
    coefficient rounded to two decimals.
    """
    sensor_corr = df[get_sensor_column_names(df)].corr()
    heatmap_options = {
        "color_continuous_scale": "RdBu_r",
        "zmin": -1,
        "zmax": 1,
        "title": "Correlation Heatmap",
        "text_auto": ".2f",
        "height": 800,
        "width": 900,
    }
    heatmap = px.imshow(sensor_corr, **heatmap_options)
    heatmap.show()

def engine_lifecycle_distribution(df):
    """Plot the distribution of engine lifetimes with a count-scaled KDE overlay.

    The lifetime of an engine is its largest ``time_in_cycles``. Lifetimes are
    binned into 30 equal-width bins and drawn as bars; a Gaussian KDE, scaled
    to the count axis, is drawn on top.

    Parameters
    ----------
    df : DataFrame with ``engine_id`` and ``time_in_cycles`` columns.

    Returns
    -------
    None. The figure is displayed via ``fig.show()``.
    """
    n_bins = 30
    lifetimes = df.groupby('engine_id')['time_in_cycles'].max()

    # Bin explicitly with numpy instead of go.Histogram(nbinsx=...): plotly
    # treats nbinsx only as an upper bound and picks its own bin size, so the
    # drawn bins would not match the width used to scale the KDE below.
    counts, edges = np.histogram(lifetimes, bins=n_bins)
    centers = (edges[:-1] + edges[1:]) / 2
    bin_width = edges[1] - edges[0]

    fig = go.Figure()
    fig.add_trace(go.Bar(x=centers, y=counts, name='Engines', marker_color='#636EFA', opacity=0.7))

    # gaussian_kde needs at least two distinct values (it fails on a singular
    # covariance), so the density line is skipped for degenerate input.
    if lifetimes.nunique() > 1:
        kde = gaussian_kde(lifetimes.to_numpy(dtype=float))
        x_line = np.linspace(lifetimes.min(), lifetimes.max(), 200)
        # Area under the KDE is 1; multiply by n * bin_width to put it on the
        # same scale as the per-bin counts.
        kde_vals = kde(x_line) * len(lifetimes) * bin_width
        fig.add_trace(go.Scatter(
            x=x_line, y=kde_vals, mode='lines',
            name='Density', line=dict(color='red', width=2),
        ))

    fig.update_layout(
        title='Distribution of Maximum Engine Life (Total Cycles)',
        xaxis_title='Total Cycles until Failure',
        yaxis_title='Number of Engines',
        bargap=0.1, showlegend=True,
    )
    fig.show()

def plot_sensor_rul_corr_by_condition(df, sensor_col, op1='operational_setting_1', op2='operational_setting_2'):
    """Bar chart of corr(``sensor_col``, RUL) computed within each operating condition.

    A condition label is built by rounding ``op1`` and ``op2`` to two decimals
    and joining them as ``"<op1>_<op2>"``. Settings that round to different
    two-decimal values therefore show up as separate bars. Works on a copy;
    the caller's frame is not modified.
    """
    work = df.copy()
    first_setting = work[op1].round(2).astype(str)
    second_setting = work[op2].round(2).astype(str)
    work['_condition'] = first_setting + '_' + second_setting

    def _sensor_rul_corr(group):
        return group[sensor_col].corr(group['RUL'])

    per_condition = work.groupby('_condition').apply(_sensor_rul_corr)
    corrs = per_condition.reset_index(name='corr_RUL').sort_values('corr_RUL')

    fig = px.bar(
        corrs,
        x='_condition',
        y='corr_RUL',
        title=f'{sensor_col} — correlation with RUL by condition',
        labels={'_condition': 'Condition', 'corr_RUL': 'Corr(sensor, RUL)'},
        color='corr_RUL',
        color_continuous_scale='RdBu_r',
        range_color=[-1, 1],
    )
    fig.update_layout(height=400, xaxis_tickangle=-45, showlegend=False)
    fig.show()

def smooth_sensors_per_engine(df, sensor_cols=None, window=5):
    """Return a copy of ``df`` with sensor columns smoothed per engine.

    Rows are sorted by (engine_id, time_in_cycles) and re-indexed from 0.
    Each column in ``sensor_cols`` (default: every column whose name contains
    'sensor') is replaced by its trailing rolling mean over ``window`` cycles,
    computed separately for each engine. The first cycles of an engine use
    however many readings are available (``min_periods=1``), so no NaNs are
    introduced. The input frame is left untouched.
    """
    columns = get_sensor_column_names(df) if sensor_cols is None else sensor_cols

    # sort_values/reset_index already hand back a new frame.
    ordered = df.sort_values(['engine_id', 'time_in_cycles']).reset_index(drop=True)

    def _trailing_mean(readings):
        return readings.rolling(window=window, min_periods=1).mean()

    for name in columns:
        ordered[name] = ordered.groupby('engine_id')[name].transform(_trailing_mean)
    return ordered


#### **3. Exploratory Data Analysis**

In [52]:
# Column layout of the C-MAPSS text files: engine id, cycle counter,
# three operational settings, then 21 sensor measurements.
header_names = (
    ['engine_id', 'time_in_cycles']
    + [f'operational_setting_{k}' for k in range(1, 4)]
    + [f'sensor_measurement_{k}' for k in range(1, 22)]
)

In [53]:
# Reading the FD002 training set: whitespace-delimited text with no header row,
# so column names are supplied from header_names.
df_2 = pd.read_csv("../data/CMAPSSData/train_FD002.txt", sep=r"\s+", header=None, names=header_names)
df_2.head()

Unnamed: 0,engine_id,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,sensor_measurement_5,sensor_measurement_6,sensor_measurement_7,sensor_measurement_8,sensor_measurement_9,sensor_measurement_10,sensor_measurement_11,sensor_measurement_12,sensor_measurement_13,sensor_measurement_14,sensor_measurement_15,sensor_measurement_16,sensor_measurement_17,sensor_measurement_18,sensor_measurement_19,sensor_measurement_20,sensor_measurement_21
0,1,1,34.9983,0.84,100.0,449.44,555.32,1358.61,1137.23,5.48,8.0,194.64,2222.65,8341.91,1.02,42.02,183.06,2387.72,8048.56,9.3461,0.02,334,2223,100.0,14.73,8.8071
1,1,2,41.9982,0.8408,100.0,445.0,549.9,1353.22,1125.78,3.91,5.71,138.51,2211.57,8303.96,1.02,42.2,130.42,2387.66,8072.3,9.3774,0.02,330,2212,100.0,10.41,6.2665
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1256.76,1047.45,7.05,9.02,175.71,1915.11,8001.42,0.94,36.69,164.22,2028.03,7864.87,10.8941,0.02,309,1915,84.93,14.08,8.6723
3,1,4,42.0077,0.8416,100.0,445.0,549.51,1354.03,1126.38,3.91,5.71,138.46,2211.58,8303.96,1.02,41.96,130.72,2387.61,8068.66,9.3528,0.02,329,2212,100.0,10.59,6.4701
4,1,5,25.0005,0.6203,60.0,462.54,537.07,1257.71,1047.93,7.05,9.03,175.05,1915.1,7993.23,0.94,36.89,164.31,2028.0,7861.23,10.8963,0.02,309,1915,84.93,14.13,8.5286


In [54]:
# 1) Load test data (same format as train)
data_path = "../data/CMAPSSData"  # adjust if your path is different
df_test = pd.read_csv(f"{data_path}/test_FD002.txt", sep=r"\s+", names=header_names, header=None)

# 2) Load true RUL at end of each test trajectory (one value per engine, in engine order)
rul_test = pd.read_csv(f"{data_path}/RUL_FD002.txt", sep=r"\s+", header=None, names=['RUL_last_cycle'])

# The lookup below maps engine_id k to row k-1 of rul_test, which is only valid
# when the engine ids are exactly 1..N with N = number of RUL rows.
assert set(df_test['engine_id']) == set(range(1, len(rul_test) + 1)), \
    "engine ids in the test set must be 1..N to align with the RUL file"

# 3) RUL_last_cycle = RUL at the last cycle of that engine. For earlier cycles: RUL = RUL_last_cycle + (last_cycle - current_cycle)
last_cycle_per_engine = df_test.groupby('engine_id')['time_in_cycles'].transform('max')
df_test['RUL'] = rul_test['RUL_last_cycle'].values[df_test['engine_id'].values - 1] + (last_cycle_per_engine - df_test['time_in_cycles'])
df_test.head()

Unnamed: 0,engine_id,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,sensor_measurement_5,sensor_measurement_6,sensor_measurement_7,sensor_measurement_8,sensor_measurement_9,sensor_measurement_10,sensor_measurement_11,sensor_measurement_12,sensor_measurement_13,sensor_measurement_14,sensor_measurement_15,sensor_measurement_16,sensor_measurement_17,sensor_measurement_18,sensor_measurement_19,sensor_measurement_20,sensor_measurement_21,RUL
0,1,1,9.9987,0.2502,100.0,489.05,605.03,1497.17,1304.99,10.52,15.49,394.54,2318.96,8763.8,1.26,45.61,371.69,2388.18,8114.1,8.6476,0.03,369,2319,100.0,28.42,17.1551,275
1,1,2,20.0026,0.7,100.0,491.19,607.82,1481.2,1246.11,9.35,13.66,334.36,2323.95,8713.21,1.08,44.26,315.32,2388.12,8053.06,9.2405,0.02,364,2324,100.0,24.29,14.8039,274
2,1,3,35.0045,0.84,100.0,449.44,556.0,1359.08,1128.36,5.48,8.0,193.55,2222.67,8340.2,1.02,41.8,183.04,2387.75,8053.04,9.3472,0.02,333,2223,100.0,14.98,8.9125,273
3,1,4,42.0066,0.841,100.0,445.0,550.17,1349.69,1127.89,3.91,5.71,138.74,2211.58,8313.85,1.02,42.21,130.4,2387.72,8066.9,9.3961,0.02,332,2212,100.0,10.35,6.4181,272
4,1,5,24.9985,0.6213,60.0,462.54,536.72,1253.18,1050.69,7.05,9.03,175.75,1915.1,7997.13,0.94,36.76,164.56,2028.05,7865.66,10.8682,0.02,305,1915,84.93,14.31,8.574,271


In [55]:
# Adding target variable: RUL = each engine's last recorded cycle minus the current cycle
# (add_RUL writes the column onto df_2 and returns the same frame)
df_2 = add_RUL(df_2)
df_2.head()

Unnamed: 0,engine_id,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,sensor_measurement_5,sensor_measurement_6,sensor_measurement_7,sensor_measurement_8,sensor_measurement_9,sensor_measurement_10,sensor_measurement_11,sensor_measurement_12,sensor_measurement_13,sensor_measurement_14,sensor_measurement_15,sensor_measurement_16,sensor_measurement_17,sensor_measurement_18,sensor_measurement_19,sensor_measurement_20,sensor_measurement_21,RUL
0,1,1,34.9983,0.84,100.0,449.44,555.32,1358.61,1137.23,5.48,8.0,194.64,2222.65,8341.91,1.02,42.02,183.06,2387.72,8048.56,9.3461,0.02,334,2223,100.0,14.73,8.8071,148
1,1,2,41.9982,0.8408,100.0,445.0,549.9,1353.22,1125.78,3.91,5.71,138.51,2211.57,8303.96,1.02,42.2,130.42,2387.66,8072.3,9.3774,0.02,330,2212,100.0,10.41,6.2665,147
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1256.76,1047.45,7.05,9.02,175.71,1915.11,8001.42,0.94,36.69,164.22,2028.03,7864.87,10.8941,0.02,309,1915,84.93,14.08,8.6723,146
3,1,4,42.0077,0.8416,100.0,445.0,549.51,1354.03,1126.38,3.91,5.71,138.46,2211.58,8303.96,1.02,41.96,130.72,2387.61,8068.66,9.3528,0.02,329,2212,100.0,10.59,6.4701,145
4,1,5,25.0005,0.6203,60.0,462.54,537.07,1257.71,1047.93,7.05,9.03,175.05,1915.1,7993.23,0.94,36.89,164.31,2028.0,7861.23,10.8963,0.02,309,1915,84.93,14.13,8.5286,144


In [56]:
# Checking for NA or NULL values (and dtypes) via the per-column non-null counts
df_2.info()

<class 'pandas.DataFrame'>
RangeIndex: 53759 entries, 0 to 53758
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   engine_id              53759 non-null  int64  
 1   time_in_cycles         53759 non-null  int64  
 2   operational_setting_1  53759 non-null  float64
 3   operational_setting_2  53759 non-null  float64
 4   operational_setting_3  53759 non-null  float64
 5   sensor_measurement_1   53759 non-null  float64
 6   sensor_measurement_2   53759 non-null  float64
 7   sensor_measurement_3   53759 non-null  float64
 8   sensor_measurement_4   53759 non-null  float64
 9   sensor_measurement_5   53759 non-null  float64
 10  sensor_measurement_6   53759 non-null  float64
 11  sensor_measurement_7   53759 non-null  float64
 12  sensor_measurement_8   53759 non-null  float64
 13  sensor_measurement_9   53759 non-null  float64
 14  sensor_measurement_10  53759 non-null  float64
 15  sensor_measur

In [57]:
# Size of the training frame after adding the RUL column
print(f"(Rows, Columns): ({df_2.shape[0]},{df_2.shape[1]})")

(Rows, Columns): (53759,27)


In [58]:
# Number of cycles (rows) per engine, with the shortest, longest and median lifecycles highlighted
plot_cycles_per_engine(df_2)

##### **<span style="color: #B0B0B0 ;">As evident from the provided graph, the distribution of the number of cycles per engine is as follows: the minimum number of cycles is associated with <span style="color: #E74C3C;">engine_id = 244</span> (128 cycles), while the maximum number of cycles is associated with <span style="color: #27AE60;">engine_id = 112</span> (378 cycles)</span>**

In [59]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
df_2.describe()

Unnamed: 0,engine_id,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,sensor_measurement_5,sensor_measurement_6,sensor_measurement_7,sensor_measurement_8,sensor_measurement_9,sensor_measurement_10,sensor_measurement_11,sensor_measurement_12,sensor_measurement_13,sensor_measurement_14,sensor_measurement_15,sensor_measurement_16,sensor_measurement_17,sensor_measurement_18,sensor_measurement_19,sensor_measurement_20,sensor_measurement_21,RUL
count,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0
mean,131.082981,109.154746,23.998407,0.572056,94.04602,472.910207,579.672399,1419.971013,1205.442024,8.031986,11.600746,282.606787,2228.879188,8525.200837,1.094962,42.985172,266.069034,2334.557253,8066.597682,9.329654,0.023326,348.309511,2228.806358,97.756838,20.789296,12.473423,108.154746
std,74.463862,69.180569,14.747376,0.310016,14.237735,26.389707,37.289399,105.946341,119.123428,3.613839,5.431802,146.005306,145.209816,335.812013,0.127469,3.232372,137.659507,128.068271,84.83795,0.749335,0.004711,27.754515,145.32798,5.364067,9.869331,5.921615,69.180569
min,1.0,1.0,0.0,0.0,60.0,445.0,535.53,1243.73,1023.77,3.91,5.71,136.8,1914.77,7985.56,0.93,36.23,129.12,2027.61,7848.36,8.3357,0.02,303.0,1915.0,84.93,10.18,6.0105,0.0
25%,68.0,52.0,10.0046,0.2507,100.0,445.0,549.57,1352.76,1123.655,3.91,5.72,139.935,2211.88,8321.66,1.02,41.91,131.52,2387.9,8062.14,8.6778,0.02,331.0,2212.0,100.0,10.91,6.5463,51.0
50%,131.0,104.0,25.0013,0.7,100.0,462.54,555.98,1369.18,1138.89,7.05,9.03,194.66,2223.07,8361.2,1.02,42.39,183.2,2388.08,8082.54,9.3109,0.02,335.0,2223.0,100.0,14.88,8.9292,103.0
75%,195.0,157.0,41.998,0.84,100.0,491.19,607.34,1499.37,1306.85,10.52,15.49,394.08,2323.96,8778.03,1.26,45.35,371.26,2388.17,8127.195,9.3869,0.03,369.0,2324.0,100.0,28.47,17.0832,156.0
max,260.0,378.0,42.008,0.842,100.0,518.67,644.52,1612.88,1439.23,14.62,21.61,555.82,2388.39,9215.66,1.3,48.51,523.37,2390.48,8268.5,11.0669,0.03,399.0,2388.0,100.0,39.34,23.5901,377.0


In [60]:
# Checking for duplicates: number of rows that exactly repeat an earlier row
df_2.duplicated().sum()

np.int64(0)

In [61]:
# Checking for constant columns: distinct-value count per column
# (very low counts flag constant or near-constant columns)
df_2.nunique()

engine_id                  260
time_in_cycles             378
operational_setting_1      536
operational_setting_2      105
operational_setting_3        2
sensor_measurement_1         6
sensor_measurement_2      1590
sensor_measurement_3     12305
sensor_measurement_4     15411
sensor_measurement_5         6
sensor_measurement_6        14
sensor_measurement_7      2067
sensor_measurement_8       897
sensor_measurement_9     22434
sensor_measurement_10        9
sensor_measurement_11      681
sensor_measurement_12     1672
sensor_measurement_13      514
sensor_measurement_14    14905
sensor_measurement_15     8464
sensor_measurement_16        2
sensor_measurement_17       53
sensor_measurement_18        6
sensor_measurement_19        2
sensor_measurement_20      510
sensor_measurement_21    17837
RUL                        378
dtype: int64

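##### **<span style="color: #B0B0B0 ;">The columns dropped below are not strictly constant: per the <code>nunique()</code> output above they take only 2–9 distinct values, so they are treated as near-constant and removed</span>**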

In [62]:
# Dropping the low-cardinality ("constant") columns identified by nunique() above.
# Reassigning instead of inplace=True, together with errors='ignore', keeps the
# cell re-runnable: a second execution is a no-op rather than a KeyError.
low_cardinality_cols = [
    'operational_setting_3',
    'sensor_measurement_1',
    'sensor_measurement_5',
    'sensor_measurement_10',
    'sensor_measurement_16',
    'sensor_measurement_18',
    'sensor_measurement_19',
]
df_2 = df_2.drop(columns=low_cardinality_cols, errors='ignore')

In [63]:
# Share of each sensor's readings that fall outside the 1.5 * IQR fences
for sensor_name in get_sensor_column_names(df_2):
    readings = df_2[sensor_name]
    outlier_pct = round((outlier_detection(readings) / readings.shape[0]) * 100, 2)
    print(f"{sensor_name}: {outlier_pct}%")

sensor_measurement_2: 0.0%
sensor_measurement_3: 0.0%
sensor_measurement_4: 0.0%
sensor_measurement_6: 0.0%
sensor_measurement_7: 0.0%
sensor_measurement_8: 14.88%
sensor_measurement_9: 0.0%
sensor_measurement_11: 6.39%
sensor_measurement_12: 0.0%
sensor_measurement_13: 17.93%
sensor_measurement_14: 14.97%
sensor_measurement_15: 14.88%
sensor_measurement_17: 0.0%
sensor_measurement_20: 0.0%
sensor_measurement_21: 0.0%


##### **<span style="color: #B0B0B0 ;">Since the CMAPSS dataset includes simulated measurement noise and does not provide physical sensor bounds, extreme values were not removed blindly. Instead, smoothing-based feature engineering will be used to preserve potential degradation signals while mitigating noise.</span>**

In [64]:
# Distribution of the RUL target over all training rows (50-bin histogram)
histogram_plot_RUL(df_2)

In [65]:
# Sample skewness of RUL over all rows (positive = longer right tail)
print("RUL skewness:", skew(df_2["RUL"]))

RUL skewness: 0.5008502686554939


##### **<span style="color: #B0B0B0;">RUL is right‑skewed in the pooled data because every engine contributes rows with low RUL (end of life), but only long-lived engines contribute rows with high RUL. So we see more low-RUL rows overall — that’s the data structure, not a bias in how RUL behaves per engine.</span>**

In [66]:
# RUL trajectory per engine (engine_id <= 100), used as a sanity check of the target
rul_vs_cycle_plot(df_2)

##### **<span style="color: #B0B0B0;">RUL vs cycle plot confirms that remaining useful life decreases as time-in-cycles increases for each engine (sanity check on the RUL definition).</span>**

In [67]:
# Target-imbalance check: most rows come from the healthy phase,
# few rows come from the near-failure region (RUL < 20 cycles).
print(f"Checking for RUL Imbalance: {round((df_2['RUL'] < 20).mean().item() * 100,2)}%")


Checking for RUL Imbalance: 9.67%


##### **<span style="color: #B0B0B0;">In the pooled data (one row per cycle), 9.67% of <em>rows</em> have RUL &lt; 20, i.e. 9.67% of all cycle records fall in the “last 20 cycles before failure” for some engine. Every engine contributes some of those rows; the percentage simply describes how much of the dataset is in that critical slice. This can inform loss weighting if we want the model to emphasize accuracy when RUL is low.</span>**

In [68]:
# Correlation of each remaining sensor column with RUL, sorted, as a horizontal bar chart
correlation_sensors_rul(df_2)

##### **<span style="color: #B0B0B0;">As evident from the correlation graph above, sensors (12, 7, 8, 13, 21, 20) exhibit positive correlation with RUL, while others exhibit negative correlation with RUL. In terms of degradation: as the engine degrades (RUL decreases), sensors 12, 7, 8, 13, 21, 20 tend to <em>decrease</em> (downward trend toward failure), whereas the remaining sensors tend to <em>increase</em> (upward trend toward failure). So we get two degradation patterns — some sensors drop and others rise as the unit approaches end of life.</span>**

In [69]:
# One histogram per remaining sensor column
sensor_measurements_distribution(df_2)

**FD002 sensor distribution results:** Many sensors show **multi-modal distributions with several distinct peaks** (e.g. sensor_2: three clusters around ~540–560, ~605–610, ~642; sensor_3: four peaks across ~1250–1600; sensor_7: six peaks from ~137 to ~556). This matches FD002’s **6 operating conditions**: each condition shifts sensor values, so the pooled histogram has multiple modes rather than a single bell. So for FD002, “many peaks” is the expected shape for most sensors, not just two; constant/near-constant sensors still show tight spikes.

##### **Dropping sensor 13 because its readings fall into only two tight clusters (≈2028 and ≈2388), so it is effectively constant within each cluster**

In [70]:
# Reassign instead of inplace=True, with errors='ignore', so re-running the
# cell is a no-op rather than a KeyError on the already-removed column.
df_2 = df_2.drop(columns=['sensor_measurement_13'], errors='ignore')
df_2.head()

Unnamed: 0,engine_id,time_in_cycles,operational_setting_1,operational_setting_2,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,sensor_measurement_6,sensor_measurement_7,sensor_measurement_8,sensor_measurement_9,sensor_measurement_11,sensor_measurement_12,sensor_measurement_14,sensor_measurement_15,sensor_measurement_17,sensor_measurement_20,sensor_measurement_21,RUL
0,1,1,34.9983,0.84,555.32,1358.61,1137.23,8.0,194.64,2222.65,8341.91,42.02,183.06,8048.56,9.3461,334,14.73,8.8071,148
1,1,2,41.9982,0.8408,549.9,1353.22,1125.78,5.71,138.51,2211.57,8303.96,42.2,130.42,8072.3,9.3774,330,10.41,6.2665,147
2,1,3,24.9988,0.6218,537.31,1256.76,1047.45,9.02,175.71,1915.11,8001.42,36.69,164.22,7864.87,10.8941,309,14.08,8.6723,146
3,1,4,42.0077,0.8416,549.51,1354.03,1126.38,5.71,138.46,2211.58,8303.96,41.96,130.72,8068.66,9.3528,329,10.59,6.4701,145
4,1,5,25.0005,0.6203,537.07,1257.71,1047.93,9.03,175.05,1915.1,7993.23,36.89,164.31,7861.23,10.8963,309,14.13,8.5286,144


In [71]:
# Correlation between sensor 21 and RUL within each operating condition
# (condition = operational settings 1 and 2, each rounded to 2 decimals)
plot_sensor_rul_corr_by_condition(df_2, 'sensor_measurement_21')

##### **What the plot signifies**
Each bar is one **operating condition** (e.g. a combination of operational settings). The **height** of the bar is the **Pearson correlation** between the sensor and RUL *within that condition* — i.e. how strongly the sensor moves with RUL when we restrict to that condition. So the plot shows whether, and how much, the sensor–RUL relationship **depends on operating condition**.

Pearson correlation is a number that measures how strong and in which direction two numeric variables have a linear relationship.


##### **Results**
- All bars are **positive** for this sensor: within every condition, the sensor is positively correlated with RUL (sensor tends to decrease as RUL decreases).
- **Correlation strength varies by condition:** strongest (e.g. ~0.64) in condition `0.0_0.0`, weakest (e.g. ~0.28) in conditions such as `42.0_0.84` and `42.01_0.84`. So the same sensor is a stronger RUL indicator under some conditions and weaker under others.

##### **What we deduce**
- **Operating condition matters:** the sensor–RUL relationship is condition-dependent. The model should either include condition (e.g. operational settings) as features so it can use the sensor differently per regime, or we can use condition-specific models/weights if needed.
- **Interpretation:** under conditions with high correlation, the sensor tracks RUL well (clear downward trend as the engine degrades); under others the trend is weaker but still positive.

In [72]:
# Pairwise correlation matrix of the remaining sensor columns
correlation_heatmap(df_2)

#### Correlation Heatmap – Sensor Relationships

##### 1. One large positively correlated group (red block)

The heatmap shows **one main cluster** of sensors that are highly positively correlated with each other:

**Main group (positive with each other, ~0.90–1.00):**
- Sensors: sensor_2, sensor_3, sensor_4, sensor_6, sensor_7, sensor_9, sensor_11, sensor_12, sensor_17, sensor_20, sensor_21
- These tend to **move together** (increase or decrease in sync). As one goes up, the others tend to go up.
- Pairwise correlations within this group are very strong (e.g. s2–s3, s2–s4, s6–s7, s6–s12, s9–s3, s9–s4 in the 0.96–1.00 range).

**Sensor_8 and sensor_14:**  
Still positively correlated with the main group but with **more varied strength**: sensor_8 is very strong with s11 (~0.97) and s3/s9 (~0.90), and weaker with s6, s7, s12, s20, s21 (~0.59–0.68). Sensor_14 is strong with s8 (~0.92) and s11 (~0.89), and weaker with s2, s6, s7, s12, s20, s21 (~0.41–0.62). So they sit in the same “positive” block but add some variation in how tightly they track the rest.

---

##### 2. One anti-correlated sensor (sensor_15 — blue column/row)

- **sensor_15** is **strongly negatively correlated** with almost all other sensors (roughly **−0.60 to −0.97**).
- Strongest negative: with s8 and s11 (~−0.97), with s14 (~−0.96); also strong with s9, s3, s17, s4, s2 (~−0.85 to −0.89); moderate with s12, s7, s20, s21, s6 (~−0.60 to −0.69).
- **Interpretation:** sensor_15 moves **in the opposite direction** to the main group — when most sensors go up, s15 tends to go down, and vice versa. It provides a **distinct, inverse signal** relative to the rest.

---

##### 3. Multicollinearity within the main group

- Within the large positive block, many pairwise correlations are **0.90–1.00** → very high **redundancy**; several sensors carry almost the same information.
- **For modeling:**  
  - **Tree-based models:** Can keep all sensors; multicollinearity is usually not an issue.  
  - **Linear / neural nets:** Consider **PCA** or **feature selection** to reduce redundancy and improve stability.

---

##### 4. Summary table

| Observation | Implication |
|-------------|-------------|
| One large positively correlated block (most sensors) | Most sensors move together; high redundancy. |
| sensor_15 strongly negative with others | Single “inverse” sensor; captures opposite trend; useful complementary signal. |
| sensor_8, sensor_14 in positive block but varied strength | Part of main trend but with weaker links to some sensors (s6, s7, s12, s20, s21). |
| Very high correlations (0.90–1.00) within main group | Strong multicollinearity; PCA or feature selection can help for linear models. |

**Conclusion:** The heatmap shows one dominant **positively correlated** sensor group (s2, s3, s4, s6, s7, s9, s11, s12, s17, s20, s21) with very high pairwise correlations, and **sensor_15** as the single strongly **anti-correlated** sensor. Sensor_8 and sensor_14 belong to the positive block but with more varied correlation strengths. For linear or neural models, PCA or feature selection is recommended to handle redundancy; tree-based models can retain all sensors.

In [73]:
# Distribution of total lifetime (max time_in_cycles) per engine, with a KDE overlay
engine_lifecycle_distribution(df_2)

#### **4. Preprocessing**

In [74]:
# Applied to df_2 after RUL has been added and low-information columns dropped.
# --- Optional smoothing (before split, before normalization) ---
# NOTE: this cell reassigns df_2, so running it again smooths already-smoothed data.
WINDOW = 5  # rolling window: current cycle plus up to 4 preceding cycles of the same engine
df_2 = smooth_sensors_per_engine(df_2, window=WINDOW)
# Then later: split → normalize → model

In [75]:
# Features: operational settings + remaining sensors (no engine_id, no time_in_cycles, no RUL)
exclude = ['engine_id', 'time_in_cycles', 'RUL']
feature_cols = [c for c in df_2.columns if c not in exclude]

# The training sensors were smoothed with a per-engine rolling mean; the test
# sensors must go through the same transform, otherwise the model is evaluated
# on inputs with a different noise profile than it was trained on.
# smooth_sensors_per_engine returns a new, (engine_id, time_in_cycles)-sorted
# frame, so df_test itself is left unchanged and this cell stays re-runnable.
df_test_smoothed = smooth_sensors_per_engine(
    df_test, sensor_cols=get_sensor_column_names(df_2), window=WINDOW
)

X_train = df_2[feature_cols]
X_test = df_test_smoothed[feature_cols]
y_train = df_2['RUL']   # RUL is the target — model learns from (X_train, y_train)
y_test = df_test_smoothed['RUL']   # taken from the same frame as X_test so rows stay aligned

# Fit the scaler on training data only, then apply the same scaling to test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### **5. Model**