In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, root_mean_squared_error  
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

In [2]:
# Load the data
train = pd.read_csv('train_FD004.csv', low_memory=False)

In [3]:
# Define a function to calculate Fisher score using first 50 and last 50 samples of each engine
def fisher_score_sensor(df, sensor, start_cycles=50, end_cycles=50):
    # Extract beginning of life (first 50 cycles) and end of life (last 50 cycles) samples
    begin_life = df[df['time, in cycles'] <= start_cycles][sensor]
    end_life = df[df['time, in cycles'] >= (df['time, in cycles'].max() - end_cycles + 1)][sensor]
    
    # Calculate the Fisher score for the sensor
    mean_diff = abs(begin_life.mean() - end_life.mean())
    within_var = begin_life.var() + end_life.var()
    fisher_score = mean_diff / within_var
    return fisher_score

# Apply Fisher score calculation across each sensor
sensor_columns = [col for col in train.columns if col.startswith('sensor')]
fisher_scores = {sensor: fisher_score_sensor(train, sensor) for sensor in sensor_columns}

# Select the top sensors based on Fisher scores
top_sensors = sorted(fisher_scores, key=fisher_scores.get, reverse=True)[:6]  # adjust N as needed

print("Top sensors selected based on Fisher score:", top_sensors)

Top sensors selected based on Fisher score: ['sensor measurement 16', 'sensor measurement 10', 'sensor measurement 15', 'sensor measurement 11', 'sensor measurement 19', 'sensor measurement 14']


In [4]:
fisher_scores

{'sensor measurement 1': np.float64(0.00027096688041915846),
 'sensor measurement 2': np.float64(0.0008795484488569448),
 'sensor measurement 3': np.float64(0.0009666748110904218),
 'sensor measurement 4': np.float64(0.0008261329018411685),
 'sensor measurement 5': np.float64(0.0016987683680191714),
 'sensor measurement 6': np.float64(9.538825608326564e-05),
 'sensor measurement 7': np.float64(0.00020888324490909727),
 'sensor measurement 8': np.float64(0.0006157218587662951),
 'sensor measurement 9': np.float64(0.00035922781652775046),
 'sensor measurement 10': np.float64(0.6475512292574864),
 'sensor measurement 11': np.float64(0.05240475644794676),
 'sensor measurement 12': np.float64(0.0002248292184767426),
 'sensor measurement 13': np.float64(0.0009301093625797129),
 'sensor measurement 14': np.float64(0.0048030469459598366),
 'sensor measurement 15': np.float64(0.3087389919722746),
 'sensor measurement 16': np.float64(28.309157820238113),
 'sensor measurement 17': np.float64(0.00

In [5]:
# Calculate RUL for each engine
grouped_by_engine = train.groupby('unit number')

# Initialize the RUL column
train['RUL'] = 0

# Calculate RUL for each engine
for name, group in grouped_by_engine:
    max_cycle = group['time, in cycles'].max()
    train.loc[train['unit number'] == name, 'RUL'] = max_cycle - group['time, in cycles']

# Display a few rows to check
print(train[['unit number', 'time, in cycles', 'RUL']].head())

   unit number  time, in cycles  RUL
0            1                1  320
1            1                2  319
2            1                3  318
3            1                4  317
4            1                5  316


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Prepare the data with only the top sensors
X = train[top_sensors]
y = train['RUL']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate RMSE
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse}")

# Print RUL predictions for each engine in the test set
engine_rul_predictions = pd.DataFrame({
    'True RUL': y_test,
    'Predicted RUL': y_pred
})

# Reset index for clearer display
engine_rul_predictions = engine_rul_predictions.reset_index(drop=True)
print(engine_rul_predictions)

RMSE: 74.48613865948836
       True RUL  Predicted RUL
0           179     134.658476
1            76     146.565319
2           205     204.078543
3           170     162.451981
4           127     189.581628
...         ...            ...
12245        98     117.053188
12246       110     121.098028
12247       184     142.254835
12248       106     145.645389
12249       235     149.992966

[12250 rows x 2 columns]


In [7]:
# Save the Linear Regression model as h5 file
joblib.dump(model, 'linear_regression_model.h5')

['linear_regression_model.h5']

In [8]:
# Load the model, test_FD004 and test_FD004_RUL datasets
model = joblib.load('linear_regression_model.h5')
test = pd.read_csv('test_FD004.csv', low_memory=False)
RUL = pd.read_csv('RUL_FD004.csv', low_memory=False)

In [9]:
test.shape, RUL.shape 

((41214, 26), (248, 1))

In [10]:
# Extract the last row of each engine in the test set
test_last_cycle = test.groupby('unit number').last().reset_index()

# Shape of the test_last_cycle
test_last_cycle.shape

(248, 26)

In [11]:
# Concatenate the test_last_cycle with the RUL dataset
test_last_cycle = pd.concat([test_last_cycle, RUL], axis=1)
test_last_cycle.head()

Unnamed: 0,unit number,"time, in cycles",operational setting 1,operational setting 2,operational setting 3,sensor measurement 1,sensor measurement 2,sensor measurement 3,sensor measurement 4,sensor measurement 5,...,sensor measurement 13,sensor measurement 14,sensor measurement 15,sensor measurement 16,sensor measurement 17,sensor measurement 18,sensor measurement 19,sensor measurement 20,sensor measurement 21,RUL
0,1,230,25.007,0.6214,60,462.54,537.66,1264.31,1046.41,7.05,...,2028.53,7890.31,10.7615,0.02,308,1915,84.93,14.41,8.6329,22
1,2,153,41.9989,0.84,100,445.0,549.96,1354.05,1133.55,3.91,...,2387.72,8073.44,9.3925,0.02,331,2212,100.0,10.58,6.4325,39
2,3,141,42.0005,0.8401,100,445.0,549.47,1341.06,1118.9,3.91,...,2388.18,8095.58,9.2974,0.02,330,2212,100.0,10.61,6.3488,107
3,4,208,25.0018,0.6207,60,462.54,536.06,1253.49,1038.53,7.05,...,2028.3,7878.63,10.8396,0.02,306,1915,84.93,14.41,8.5696,75
4,5,51,25.0039,0.62,60,462.54,537.36,1263.6,1052.52,7.05,...,2028.24,7873.75,10.9094,0.02,307,1915,84.93,14.19,8.6248,149


In [12]:
test_last_cycle.shape

(248, 27)

In [13]:
# Calculate the RUL for each engine in the test set using the Linear Regression model
X_test_last_cycle = test_last_cycle[top_sensors]
X_test_last_cycle = scaler.transform(X_test_last_cycle)
test_last_cycle['Predicted RUL'] = model.predict(X_test_last_cycle)

# Display the RUL predictions
print(test_last_cycle[['unit number', 'RUL', 'Predicted RUL']])

     unit number  RUL  Predicted RUL
0              1   22     113.793939
1              2   39     126.468956
2              3  107     117.539487
3              4   75     142.060327
4              5  149     110.997716
..           ...  ...            ...
243          244   35      41.449154
244          245  131     152.901124
245          246  194     172.380927
246          247  112     100.743529
247          248   26      77.622697

[248 rows x 3 columns]


In [14]:
# Calculate RMSE for the RUL predictions
rmse = root_mean_squared_error(test_last_cycle['RUL'], test_last_cycle['Predicted RUL'])
print(f"RMSE: {rmse}")

RMSE: 61.61085018466975
