# *Intelligent Decision Support  System for Monitoring the TurboFan Engine Performance using Machine Learning Algorithm*

# **This is done by Swathi Kalyan K U and Roshini Banu L**

 *Importing initially important libraries*

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
import sklearn
from sklearn.metrics import mean_squared_error, r2_score
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import random
import warnings
np.random.seed(34)
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
#import pyearth

*Defining features names*

In [2]:
index_names = ['unit_number', 'time_cycles']
setting_names = ['setting_1', 'setting_2', 'setting_3']
sensor_names = ['s_{}'.format(i+1) for i in range(0,21)]
col_names = index_names + setting_names + sensor_names

*Importing train and validation data*

In [3]:
dftrain = pd.read_csv('train_FD001.txt',sep='\s+',header=None,index_col=False,names=col_names)
dfvalid = pd.read_csv('test_FD001.txt',sep='\s+',header=None,index_col=False,names=col_names)
y_valid = pd.read_csv('RUL_FD001.txt',sep='\s+',header=None,index_col=False,names=['RUL'])
dfvalid.shape


(13096, 26)

In [4]:
train = dftrain.copy()
valid = dfvalid.copy()

In [5]:
train.head()

Unnamed: 0,unit_number,time_cycles,setting_1,setting_2,setting_3,s_1,s_2,s_3,s_4,s_5,...,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20,s_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [6]:
dftrain = pd.read_csv('train_FD001.txt',sep='\s+',header=None,index_col=False )

# *Data inspection*

In [7]:
df_train_1 = train.iloc[:,5:26]
df_train_1.head()
valid_1=valid.iloc[:,5:]
valid_1.head()

Unnamed: 0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,s_10,...,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20,s_21
0,518.67,643.02,1585.29,1398.21,14.62,21.61,553.9,2388.04,9050.17,1.3,...,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735
1,518.67,641.71,1588.45,1395.42,14.62,21.61,554.85,2388.01,9054.42,1.3,...,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916
2,518.67,642.46,1586.94,1401.34,14.62,21.61,554.11,2388.05,9056.96,1.3,...,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166
3,518.67,642.44,1584.12,1406.42,14.62,21.61,554.07,2388.03,9045.29,1.3,...,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737
4,518.67,642.51,1587.19,1401.92,14.62,21.61,554.16,2388.01,9044.55,1.3,...,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413


In [8]:
train.head()

Unnamed: 0,unit_number,time_cycles,setting_1,setting_2,setting_3,s_1,s_2,s_3,s_4,s_5,...,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20,s_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [9]:


train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20631 entries, 0 to 20630
Data columns (total 26 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   unit_number  20631 non-null  int64  
 1   time_cycles  20631 non-null  int64  
 2   setting_1    20631 non-null  float64
 3   setting_2    20631 non-null  float64
 4   setting_3    20631 non-null  float64
 5   s_1          20631 non-null  float64
 6   s_2          20631 non-null  float64
 7   s_3          20631 non-null  float64
 8   s_4          20631 non-null  float64
 9   s_5          20631 non-null  float64
 10  s_6          20631 non-null  float64
 11  s_7          20631 non-null  float64
 12  s_8          20631 non-null  float64
 13  s_9          20631 non-null  float64
 14  s_10         20631 non-null  float64
 15  s_11         20631 non-null  float64
 16  s_12         20631 non-null  float64
 17  s_13         20631 non-null  float64
 18  s_14         20631 non-null  float64
 19  s_15

In [10]:
train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
unit_number,20631.0,51.506568,29.22763,1.0,26.0,52.0,77.0,100.0
time_cycles,20631.0,108.807862,68.88099,1.0,52.0,104.0,156.0,362.0
setting_1,20631.0,-9e-06,0.002187313,-0.0087,-0.0015,0.0,0.0015,0.0087
setting_2,20631.0,2e-06,0.0002930621,-0.0006,-0.0002,0.0,0.0003,0.0006
setting_3,20631.0,100.0,0.0,100.0,100.0,100.0,100.0,100.0
s_1,20631.0,518.67,6.537152e-11,518.67,518.67,518.67,518.67,518.67
s_2,20631.0,642.680934,0.5000533,641.21,642.325,642.64,643.0,644.53
s_3,20631.0,1590.523119,6.13115,1571.04,1586.26,1590.1,1594.38,1616.91
s_4,20631.0,1408.933782,9.000605,1382.25,1402.36,1408.04,1414.555,1441.49
s_5,20631.0,14.62,3.3947e-12,14.62,14.62,14.62,14.62,14.62


In [11]:
print('Shape of the train dataset : ',train.shape)
print('Shape of the validation dataset : ',valid.shape)

Shape of the train dataset :  (20631, 26)
Shape of the validation dataset :  (13096, 26)


In [12]:
#Cheking the presence of Nan values 
print('Total Null values in the train dataset : ',train.isna().sum())

Total Null values in the train dataset :  unit_number    0
time_cycles    0
setting_1      0
setting_2      0
setting_3      0
s_1            0
s_2            0
s_3            0
s_4            0
s_5            0
s_6            0
s_7            0
s_8            0
s_9            0
s_10           0
s_11           0
s_12           0
s_13           0
s_14           0
s_15           0
s_16           0
s_17           0
s_18           0
s_19           0
s_20           0
s_21           0
dtype: int64


In [13]:
train.loc[:,['unit_number','time_cycles']].describe()

Unnamed: 0,unit_number,time_cycles
count,20631.0,20631.0
mean,51.506568,108.807862
std,29.227633,68.88099
min,1.0,1.0
25%,26.0,52.0
50%,52.0,104.0
75%,77.0,156.0
max,100.0,362.0


**Add RUL column to the data**

In [15]:
def add_RUL_column(df):
    train_grouped_by_unit = df.groupby(by='unit_number') 
    max_time_cycles = train_grouped_by_unit['time_cycles'].max() 
    merged = df.merge(max_time_cycles.to_frame(name='max_time_cycle'), left_on='unit_number',right_index=True)
    merged["RUL"] = merged["max_time_cycle"] - merged['time_cycles']
    merged = merged.drop("max_time_cycle", axis=1) 
    return merged

In [16]:
train = add_RUL_column(train)

In [17]:
train[['unit_number','RUL']]

Unnamed: 0,unit_number,RUL
0,1,191
1,1,190
2,1,189
3,1,188
4,1,187
...,...,...
20626,100,4
20627,100,3
20628,100,2
20629,100,1


In [18]:
train.head()

Unnamed: 0,unit_number,time_cycles,setting_1,setting_2,setting_3,s_1,s_2,s_3,s_4,s_5,...,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20,s_21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


In [19]:
from sklearn.model_selection import train_test_split
drop_labels = index_names+setting_names
X_train=train.drop(columns=drop_labels).copy()
X_train, X_test, y_train, y_test=train_test_split(X_train,X_train['RUL'], test_size=0.3, random_state=42)


In [20]:
X_test.columns

Index(['s_1', 's_2', 's_3', 's_4', 's_5', 's_6', 's_7', 's_8', 's_9', 's_10',
       's_11', 's_12', 's_13', 's_14', 's_15', 's_16', 's_17', 's_18', 's_19',
       's_20', 's_21', 'RUL'],
      dtype='object')

In [21]:
y_train

12862     54
9936     185
12025    101
14526     24
16747    137
        ... 
11284    107
11964    162
5390      74
860      255
15795    157
Name: RUL, Length: 14441, dtype: int64

In [22]:
# drop_labels=['unit_number','time_cycles','setting_1','setting_2','setting_3','s_1', 's_5','s_6','s_10',  's_16', 's_18', 's_19','RUL']
# x_train_1=train.drop(columns=drop_labels, axis=1)
# train_sample = train.drop(columns=drop_labels, axis=1)

In [23]:
#train_sample

In [24]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
#Droping the target variable
X_train.drop(columns=['RUL'], inplace=True)
X_test.drop(columns=['RUL'], inplace=True)
#Scaling X_train and X_test
X_train_s=scaler.fit_transform(X_train)
X_test_s=scaler.fit_transform(X_test)
#Conserve only the last occurence of each unit to match the length of y_valid
X_valid = valid.groupby('unit_number').last().reset_index().drop(columns=drop_labels)
#scaling X_valid
X_valid_s=scaler.fit_transform(X_valid)

In [25]:
X_valid.shape

(100, 21)

In [26]:
X_train.head()

Unnamed: 0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,s_10,...,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20,s_21
12862,518.67,643.08,1592.07,1402.73,14.62,21.61,554.1,2388.09,9062.5,1.3,...,521.78,2388.04,8133.34,8.4301,0.03,393,2388,100.0,38.67,23.3724
9936,518.67,642.21,1580.72,1394.09,14.62,21.61,553.34,2387.99,9066.77,1.3,...,522.67,2388.03,8144.06,8.3837,0.03,392,2388,100.0,39.08,23.3686
12025,518.67,642.09,1586.25,1404.03,14.62,21.61,554.27,2388.06,9061.07,1.3,...,521.7,2388.01,8139.32,8.4244,0.03,391,2388,100.0,38.96,23.3025
14526,518.67,643.52,1597.95,1423.87,14.62,21.61,552.22,2388.21,9031.18,1.3,...,520.08,2388.16,8109.8,8.493,0.03,394,2388,100.0,38.54,23.1882
16747,518.67,642.34,1586.62,1401.82,14.62,21.61,554.13,2388.06,9054.55,1.3,...,522.24,2388.03,8128.29,8.4081,0.03,392,2388,100.0,39.08,23.4485


In [27]:
X_valid.head()

Unnamed: 0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,s_10,...,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20,s_21
0,518.67,642.58,1581.22,1398.91,14.62,21.61,554.42,2388.08,9056.4,1.3,...,521.79,2388.06,8130.11,8.4024,0.03,393,2388,100.0,38.81,23.3552
1,518.67,642.55,1586.59,1410.83,14.62,21.61,553.52,2388.1,9044.77,1.3,...,521.74,2388.09,8126.9,8.4505,0.03,391,2388,100.0,38.81,23.2618
2,518.67,642.88,1589.75,1418.89,14.62,21.61,552.59,2388.16,9049.26,1.3,...,520.83,2388.14,8131.46,8.4119,0.03,395,2388,100.0,38.93,23.274
3,518.67,642.78,1594.53,1406.88,14.62,21.61,552.64,2388.13,9051.3,1.3,...,521.88,2388.11,8133.64,8.4634,0.03,395,2388,100.0,38.58,23.2581
4,518.67,642.27,1589.94,1419.36,14.62,21.61,553.29,2388.1,9053.99,1.3,...,521.0,2388.15,8125.74,8.4362,0.03,394,2388,100.0,38.75,23.4117


In [28]:
#R2 score & RMSE & MAER
def evaluate(y_true, y_hat, label='test'):
    mse = mean_squared_error(y_true, y_hat)
    rmse = np.sqrt(mse)
    variance = r2_score(y_true, y_hat)
    print('{} set RMSE:{}, R2:{}'.format(label, rmse, variance))

In [29]:
y_train.head()

12862     54
9936     185
12025    101
14526     24
16747    137
Name: RUL, dtype: int64

In [30]:
from sklearn.linear_model import LinearRegression
lr=LinearRegression() #Instantiation
lr.fit(X_train_s, y_train) #Fitting 

y_lr_train = lr.predict(X_train_s) #Prediction on train data
evaluate(y_train,y_lr_train, label='train')

y_lr_test = lr.predict(X_test_s)  #Prediction on test data
evaluate(y_test, y_lr_test, label='test')

y_lr_valid= lr.predict(X_valid_s) #Prediction on validation data
evaluate(y_valid, y_lr_valid, label='valid')

train set RMSE:44.7994102333131, R2:0.583093872579288
test set RMSE:46.09812141308128, R2:0.5360561298414108
valid set RMSE:43.004188685510925, R2:-0.07093219111566151


In [31]:
from sklearn.svm import SVR
#import tensorflow as tf
regressor = SVR(kernel='rbf')

In [32]:
reg=SVR() #Instantiation
reg.fit(X_train_s, y_train) #Fitting 

y_svr_train = reg.predict(X_train_s) #Prediction on train data
evaluate(y_train,y_svr_train, label='train')

y_svr_test = reg.predict(X_test_s)  #Prediction on test data
evaluate(y_test, y_svr_test, label='test')

y_svr_valid= reg.predict(X_valid_s) #Prediction on validation data
evaluate(y_valid, y_svr_valid, label='valid')

train set RMSE:43.56448464126648, R2:0.605761670162996
test set RMSE:47.26718449996111, R2:0.5122262167433573
valid set RMSE:27.118861018173632, R2:0.57412382329099


In [33]:
x_train_1=train
x_train_1.head()

Unnamed: 0,unit_number,time_cycles,setting_1,setting_2,setting_3,s_1,s_2,s_3,s_4,s_5,...,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20,s_21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


In [34]:
df_train_1

Unnamed: 0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,s_10,...,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20,s_21
0,518.67,641.82,1589.70,1400.60,14.62,21.61,554.36,2388.06,9046.19,1.3,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190
1,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236
2,518.67,642.35,1587.99,1404.20,14.62,21.61,554.26,2388.08,9052.94,1.3,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,518.67,642.37,1582.85,1406.22,14.62,21.61,554.00,2388.06,9055.15,1.3,...,522.19,2388.04,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,518.67,643.49,1597.98,1428.63,14.62,21.61,551.43,2388.19,9065.52,1.3,...,519.49,2388.26,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735
20627,518.67,643.54,1604.50,1433.58,14.62,21.61,550.86,2388.23,9065.11,1.3,...,519.68,2388.22,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594
20628,518.67,643.42,1602.46,1428.18,14.62,21.61,550.94,2388.24,9065.90,1.3,...,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333
20629,518.67,643.23,1605.26,1426.53,14.62,21.61,550.68,2388.25,9073.72,1.3,...,519.67,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.0640


In [35]:
sensor_cols=df_train_1.columns
sensor_cols

Index(['s_1', 's_2', 's_3', 's_4', 's_5', 's_6', 's_7', 's_8', 's_9', 's_10',
       's_11', 's_12', 's_13', 's_14', 's_15', 's_16', 's_17', 's_18', 's_19',
       's_20', 's_21'],
      dtype='object')


# LAG FEATURE

In [36]:
lag_size = 2
for col in sensor_cols:
    for i in range(1,lag_size+1):
        df_train_1[f"{col}_lag{i}"] = df_train_1[col].shift(i)

In [37]:
df_train_1.head()

Unnamed: 0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,s_10,...,s_17_lag1,s_17_lag2,s_18_lag1,s_18_lag2,s_19_lag1,s_19_lag2,s_20_lag1,s_20_lag2,s_21_lag1,s_21_lag2
0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,...,,,,,,,,,,
1,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,...,392.0,,2388.0,,100.0,,39.06,,23.419,
2,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,...,392.0,392.0,2388.0,2388.0,100.0,100.0,39.0,39.06,23.4236,23.419
3,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,...,390.0,392.0,2388.0,2388.0,100.0,100.0,38.95,39.0,23.3442,23.4236
4,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,...,392.0,390.0,2388.0,2388.0,100.0,100.0,38.88,38.95,23.3739,23.3442


In [38]:
df_train_l= pd.concat([df_train_1,train['RUL']],axis=1)
df_train_l

Unnamed: 0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,s_10,...,s_17_lag2,s_18_lag1,s_18_lag2,s_19_lag1,s_19_lag2,s_20_lag1,s_20_lag2,s_21_lag1,s_21_lag2,RUL
0,518.67,641.82,1589.70,1400.60,14.62,21.61,554.36,2388.06,9046.19,1.3,...,,,,,,,,,,191
1,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,...,,2388.0,,100.0,,39.06,,23.4190,,190
2,518.67,642.35,1587.99,1404.20,14.62,21.61,554.26,2388.08,9052.94,1.3,...,392.0,2388.0,2388.0,100.0,100.0,39.00,39.06,23.4236,23.4190,189
3,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,...,392.0,2388.0,2388.0,100.0,100.0,38.95,39.00,23.3442,23.4236,188
4,518.67,642.37,1582.85,1406.22,14.62,21.61,554.00,2388.06,9055.15,1.3,...,390.0,2388.0,2388.0,100.0,100.0,38.88,38.95,23.3739,23.3442,187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,518.67,643.49,1597.98,1428.63,14.62,21.61,551.43,2388.19,9065.52,1.3,...,394.0,2388.0,2388.0,100.0,100.0,38.14,38.38,23.1923,23.1324,4
20627,518.67,643.54,1604.50,1433.58,14.62,21.61,550.86,2388.23,9065.11,1.3,...,394.0,2388.0,2388.0,100.0,100.0,38.49,38.14,22.9735,23.1923,3
20628,518.67,643.42,1602.46,1428.18,14.62,21.61,550.94,2388.24,9065.90,1.3,...,397.0,2388.0,2388.0,100.0,100.0,38.30,38.49,23.1594,22.9735,2
20629,518.67,643.23,1605.26,1426.53,14.62,21.61,550.68,2388.25,9073.72,1.3,...,395.0,2388.0,2388.0,100.0,100.0,38.44,38.30,22.9333,23.1594,1


In [39]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [40]:
imputer = SimpleImputer(strategy ='mean')
df_train_l = pd.DataFrame(imputer.fit_transform(df_train_l), columns=df_train_l.columns)
#dftm_valid = pd.DataFrame(imputer.fit_transform(dftm_valid), columns=dftm_valid.columns)

In [41]:
df_train_l.head()

Unnamed: 0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,s_10,...,s_17_lag2,s_18_lag1,s_18_lag2,s_19_lag1,s_19_lag2,s_20_lag1,s_20_lag2,s_21_lag1,s_21_lag2,RUL
0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,...,393.210432,2388.0,2388.0,100.0,100.0,38.816292,38.816318,23.289717,23.289728,191.0
1,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,...,393.210432,2388.0,2388.0,100.0,100.0,39.06,38.816318,23.419,23.289728,190.0
2,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,...,392.0,2388.0,2388.0,100.0,100.0,39.0,39.06,23.4236,23.419,189.0
3,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,...,392.0,2388.0,2388.0,100.0,100.0,38.95,39.0,23.3442,23.4236,188.0
4,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,...,390.0,2388.0,2388.0,100.0,100.0,38.88,38.95,23.3739,23.3442,187.0


In [42]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df_train_l.drop(columns=['RUL']),df_train_l['RUL'],test_size=0.3,random_state=42)

In [43]:
print(X_train.isnull().sum())

s_1          0
s_2          0
s_3          0
s_4          0
s_5          0
            ..
s_19_lag2    0
s_20_lag1    0
s_20_lag2    0
s_21_lag1    0
s_21_lag2    0
Length: 63, dtype: int64


In [44]:
valid_1.columns

Index(['s_1', 's_2', 's_3', 's_4', 's_5', 's_6', 's_7', 's_8', 's_9', 's_10',
       's_11', 's_12', 's_13', 's_14', 's_15', 's_16', 's_17', 's_18', 's_19',
       's_20', 's_21'],
      dtype='object')

In [45]:
y_valid.shape

(100, 1)

In [46]:
valid_1 = valid.groupby('unit_number').last().reset_index().drop(columns=drop_labels)
valid_1.shape

(100, 21)

In [47]:
valid_1.head()

Unnamed: 0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,s_10,...,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20,s_21
0,518.67,642.58,1581.22,1398.91,14.62,21.61,554.42,2388.08,9056.4,1.3,...,521.79,2388.06,8130.11,8.4024,0.03,393,2388,100.0,38.81,23.3552
1,518.67,642.55,1586.59,1410.83,14.62,21.61,553.52,2388.1,9044.77,1.3,...,521.74,2388.09,8126.9,8.4505,0.03,391,2388,100.0,38.81,23.2618
2,518.67,642.88,1589.75,1418.89,14.62,21.61,552.59,2388.16,9049.26,1.3,...,520.83,2388.14,8131.46,8.4119,0.03,395,2388,100.0,38.93,23.274
3,518.67,642.78,1594.53,1406.88,14.62,21.61,552.64,2388.13,9051.3,1.3,...,521.88,2388.11,8133.64,8.4634,0.03,395,2388,100.0,38.58,23.2581
4,518.67,642.27,1589.94,1419.36,14.62,21.61,553.29,2388.1,9053.99,1.3,...,521.0,2388.15,8125.74,8.4362,0.03,394,2388,100.0,38.75,23.4117


In [48]:
lag_size = 2
for col in sensor_cols:
    for i in range(1,lag_size+1):
        valid_1[f"{col}_lag{i}"] = valid_1[col].shift(i)
valid_1

Unnamed: 0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,s_10,...,s_17_lag1,s_17_lag2,s_18_lag1,s_18_lag2,s_19_lag1,s_19_lag2,s_20_lag1,s_20_lag2,s_21_lag1,s_21_lag2
0,518.67,642.58,1581.22,1398.91,14.62,21.61,554.42,2388.08,9056.40,1.3,...,,,,,,,,,,
1,518.67,642.55,1586.59,1410.83,14.62,21.61,553.52,2388.10,9044.77,1.3,...,393.0,,2388.0,,100.0,,38.81,,23.3552,
2,518.67,642.88,1589.75,1418.89,14.62,21.61,552.59,2388.16,9049.26,1.3,...,391.0,393.0,2388.0,2388.0,100.0,100.0,38.81,38.81,23.2618,23.3552
3,518.67,642.78,1594.53,1406.88,14.62,21.61,552.64,2388.13,9051.30,1.3,...,395.0,391.0,2388.0,2388.0,100.0,100.0,38.93,38.81,23.2740,23.2618
4,518.67,642.27,1589.94,1419.36,14.62,21.61,553.29,2388.10,9053.99,1.3,...,395.0,395.0,2388.0,2388.0,100.0,100.0,38.58,38.93,23.2581,23.2740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,518.67,642.30,1590.88,1397.94,14.62,21.61,553.99,2388.03,9062.41,1.3,...,394.0,393.0,2388.0,2388.0,100.0,100.0,38.94,38.63,23.3458,23.2933
96,518.67,642.59,1582.96,1410.92,14.62,21.61,554.05,2388.06,9076.36,1.3,...,391.0,394.0,2388.0,2388.0,100.0,100.0,38.96,38.94,23.4606,23.3458
97,518.67,642.68,1599.51,1415.47,14.62,21.61,553.44,2388.13,9062.34,1.3,...,395.0,391.0,2388.0,2388.0,100.0,100.0,38.61,38.96,23.2953,23.4606
98,518.67,642.00,1585.03,1397.98,14.62,21.61,554.75,2388.01,9067.16,1.3,...,394.0,395.0,2388.0,2388.0,100.0,100.0,38.76,38.61,23.3608,23.2953


In [49]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [50]:
imputer = SimpleImputer(strategy ='mean')
valid_lag = pd.DataFrame(imputer.fit_transform(valid_1), columns=valid_1.columns)

In [51]:
print(valid_lag.isnull().sum())
y_train.shape

s_1          0
s_2          0
s_3          0
s_4          0
s_5          0
            ..
s_19_lag2    0
s_20_lag1    0
s_20_lag2    0
s_21_lag1    0
s_21_lag2    0
Length: 63, dtype: int64


(14441,)

In [52]:
X_train.columns


Index(['s_1', 's_2', 's_3', 's_4', 's_5', 's_6', 's_7', 's_8', 's_9', 's_10',
       's_11', 's_12', 's_13', 's_14', 's_15', 's_16', 's_17', 's_18', 's_19',
       's_20', 's_21', 's_1_lag1', 's_1_lag2', 's_2_lag1', 's_2_lag2',
       's_3_lag1', 's_3_lag2', 's_4_lag1', 's_4_lag2', 's_5_lag1', 's_5_lag2',
       's_6_lag1', 's_6_lag2', 's_7_lag1', 's_7_lag2', 's_8_lag1', 's_8_lag2',
       's_9_lag1', 's_9_lag2', 's_10_lag1', 's_10_lag2', 's_11_lag1',
       's_11_lag2', 's_12_lag1', 's_12_lag2', 's_13_lag1', 's_13_lag2',
       's_14_lag1', 's_14_lag2', 's_15_lag1', 's_15_lag2', 's_16_lag1',
       's_16_lag2', 's_17_lag1', 's_17_lag2', 's_18_lag1', 's_18_lag2',
       's_19_lag1', 's_19_lag2', 's_20_lag1', 's_20_lag2', 's_21_lag1',
       's_21_lag2'],
      dtype='object')

In [53]:
reg=SVR() #Instantiation
reg.fit(X_train, y_train) #Fitting 

y_svr_train_lag = reg.predict(X_train) #Prediction on train data
evaluate(y_train,y_svr_train_lag, label='train')

y_svr_tes_lag = reg.predict(X_test)  #Prediction on test data
evaluate(y_test, y_svr_test, label='test')

y_svr_valid_lag= reg.predict(valid_lag) #Prediction on validation data
evaluate(y_valid, y_svr_valid_lag, label='valid')

train set RMSE:69.63509136914342, R2:-0.0072794743053077315
test set RMSE:47.26718449996111, R2:0.5122262167433573
valid set RMSE:48.79979309333518, R2:-0.37903858285091396


# *RANDOM FOREST REGRESSION*

In [104]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
x = dftm.drop("RUL",axis=1)
y = dftm.iloc[:,-1].values
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)
# Create a random forest regressor object
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model to the training data
rf.fit(x_train,y_train)

# Make predictions on the validation data
rf_test_pred = rf.predict(x_test)
rf_train_pred = rf.predict(x_train)

In [105]:
rmse_train = np.sqrt(mean_squared_error(y_train, rf_train_pred))
rmse = np.sqrt(mean_squared_error(y_test,rf_test_pred))
print("RMSE for training data:", rmse_train)
print("RMSE for training data:", rmse)


RMSE for training data: 6.563932365206643
RMSE for training data: 17.202078507887332


In [106]:
# Calculate the R-squared score
r2_train = r2_score(y_train, rf_train_pred)
r2_test = r2_score(y_test, rf_test_pred)

print('R-squared score:', r2_train)
print('R-squared score:', r2_test)



R-squared score: 0.9910013339403791
R-squared score: 0.9352320476369129


Since the training accuracy is greater than testing accuracy the Random Forest model is overfit

# *Support Vector Regression*

In [43]:
scaler = StandardScaler()
x_train_scale = scaler.fit_transform(x_train)

In [44]:
from sklearn.svm import SVR

svr = SVR(kernel = 'rbf', C=1.0, epsilon = 0.1)
svr.fit(x_train_scale,y_train)

In [45]:
svr_train_pred = svr.predict(x_train_scale)
svr_test_pred = svr.predict(x_test)

In [46]:
rmse_train_svr = np.sqrt(mean_squared_error(y_train, svr_train_pred))
rmse_test_svr = np.sqrt(mean_squared_error(y_test, svr_test_pred))
print("RMSE for training data:", rmse_train_svr)
print("RMSE for training data:", rmse_test_svr)


RMSE for training data: 42.049242440346326
RMSE for training data: 77.65971551231758


In [47]:
r2_train_svr = r2_score(y_train, svr_train_pred)
r2_test_svr = r2_score(y_test, svr_test_pred)
print('R-squared score:', r2_train_svr)
print('R-squared score:', r2_test_svr)

R-squared score: 0.63071106720779
R-squared score: -0.3200485574160825


**LSTM WITHOUT FEATURE ENGINEERING**

In [48]:
df=pd.concat([train['unit_number'],train_sample,train['RUL']],axis=1) 
df.head()

Unnamed: 0,unit_number,s_2,s_3,s_4,s_7,s_8,s_9,s_11,s_12,s_13,s_14,s_15,s_17,s_20,s_21,RUL
0,1,641.82,1589.7,1400.6,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.419,191
1,1,642.15,1591.82,1403.14,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.0,23.4236,190
2,1,642.35,1587.99,1404.2,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,390,38.95,23.3442,189
3,1,642.35,1582.79,1401.87,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,392,38.88,23.3739,188
4,1,642.37,1582.85,1406.22,554.0,2388.06,9055.15,47.28,522.19,2388.04,8133.8,8.4294,393,38.9,23.4044,187


In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from sklearn.preprocessing import MinMaxScaler

In [51]:


# 3. Normalize the dataset
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df.iloc[:, :-1])  # Scale only the sensor data, not the ID and RUL columns

# 4. Create sequences for the LSTM
def create_sequences(data, seq_length):
    X = []
    y = []
    for i in range(seq_length, len(data)):
        X.append(data[i-seq_length:i, :])
        y.append(data[i, -1])
    return np.array(X), np.array(y)

seq_length = 50  # Set the length of the sequences
X, y = create_sequences(df_scaled, seq_length)

# 5. Split the dataset into training and testing sets
split = 0.8  # Set the fraction of the dataset to use for training
split_index = int(split * len(df_scaled))
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# 6. Define the LSTM model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(units=50))
model.add(Dense(units=1))

# 7. Train the model
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=50, batch_size=32)

# 8. Evaluate the model
train_loss = model.evaluate(X_train, y_train)
test_loss = model.evaluate(X_test, y_test)
print("Training loss:", train_loss)
print("Testing loss:", test_loss)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Training loss: 0.007249157875776291
Testing loss: 0.007553464733064175


In [52]:

from sklearn.metrics import mean_squared_error

# assuming you have already trained and evaluated your LSTM model on the train and test sets

# get the predicted values for the training and testing sets
lstm_train_pred = model.predict(X_train)
lstm_test_pred = model.predict(X_test)

# reshape y_train_pred, y_train, y_test_pred, and y_test to match dimensions
lstm_train_pred = lstm_train_pred.reshape(y_train.shape)
lstm_test_pred = lstm_test_pred.reshape(y_test.shape)

# calculate root mean squared error for the training and testing sets
train_rmse_lstm = np.sqrt(mean_squared_error(y_train, lstm_train_pred))
test_rmse_lstm = np.sqrt(mean_squared_error(y_test, lstm_test_pred))

print("Training RMSE: ", train_rmse_lstm)
print("Testing RMSE: ", test_rmse_lstm)


Training RMSE:  0.08514197861016423
Testing RMSE:  0.08691066740805763


In [53]:
r2_train_lstm = r2_score(y_train, lstm_train_pred)
r2_test_lstm = r2_score(y_test, lstm_test_pred)
print('R-squared score:', r2_train_lstm)
print('R-squared score:', r2_test_lstm)

R-squared score: 0.6778387060278884
R-squared score: 0.6509588687749854


*POLYNOMIAL FEATURE*

In [54]:
sensors_taken = train.iloc[:,[6,7,8,11,12,13,15,16,17,18,19,21,24,25]]
sensors_taken

Unnamed: 0,s_2,s_3,s_4,s_7,s_8,s_9,s_11,s_12,s_13,s_14,s_15,s_17,s_20,s_21
0,641.82,1589.70,1400.60,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.4190
1,642.15,1591.82,1403.14,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.00,23.4236
2,642.35,1587.99,1404.20,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,390,38.95,23.3442
3,642.35,1582.79,1401.87,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,392,38.88,23.3739
4,642.37,1582.85,1406.22,554.00,2388.06,9055.15,47.28,522.19,2388.04,8133.80,8.4294,393,38.90,23.4044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,643.49,1597.98,1428.63,551.43,2388.19,9065.52,48.07,519.49,2388.26,8137.60,8.4956,397,38.49,22.9735
20627,643.54,1604.50,1433.58,550.86,2388.23,9065.11,48.04,519.68,2388.22,8136.50,8.5139,395,38.30,23.1594
20628,643.42,1602.46,1428.18,550.94,2388.24,9065.90,48.09,520.01,2388.24,8141.05,8.5646,398,38.44,22.9333
20629,643.23,1605.26,1426.53,550.68,2388.25,9073.72,48.39,519.67,2388.23,8139.29,8.5389,395,38.29,23.0640


In [58]:
dftm_p= pd.concat([train['unit_number'],train_sample,train['RUL']],axis=1)
dftm_p

Unnamed: 0,unit_number,s_2,s_3,s_4,s_7,s_8,s_9,s_11,s_12,s_13,s_14,s_15,s_17,s_20,s_21,RUL
0,1,641.82,1589.70,1400.60,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.4190,191
1,1,642.15,1591.82,1403.14,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.00,23.4236,190
2,1,642.35,1587.99,1404.20,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,390,38.95,23.3442,189
3,1,642.35,1582.79,1401.87,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,392,38.88,23.3739,188
4,1,642.37,1582.85,1406.22,554.00,2388.06,9055.15,47.28,522.19,2388.04,8133.80,8.4294,393,38.90,23.4044,187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,643.49,1597.98,1428.63,551.43,2388.19,9065.52,48.07,519.49,2388.26,8137.60,8.4956,397,38.49,22.9735,4
20627,100,643.54,1604.50,1433.58,550.86,2388.23,9065.11,48.04,519.68,2388.22,8136.50,8.5139,395,38.30,23.1594,3
20628,100,643.42,1602.46,1428.18,550.94,2388.24,9065.90,48.09,520.01,2388.24,8141.05,8.5646,398,38.44,22.9333,2
20629,100,643.23,1605.26,1426.53,550.68,2388.25,9073.72,48.39,519.67,2388.23,8139.29,8.5389,395,38.29,23.0640,1


In [1]:
from sklearn.preprocessing import PolynomialFeatures

x_p = dftm_p.drop("RUL",axis=1)
y_p = dftm.iloc[:,-1].values
 

NameError: name 'dftm_p' is not defined

In [63]:
poly = PolynomialFeatures(degree=2, interaction_only=True)
x_poly = poly.fit_transform(x_p)

In [64]:
df_poly = pd.DataFrame(x_poly,columns=poly.get_feature_names(x_p.columns))
df_poly['RUL']=y_p
df_poly


Unnamed: 0,1,unit_number,s_2,s_3,s_4,s_7,s_8,s_9,s_11,s_12,...,s_14 s_17,s_14 s_20,s_14 s_21,s_15 s_17,s_15 s_20,s_15 s_21,s_17 s_20,s_17 s_21,s_20 s_21,RUL
0,1.0,1.0,641.82,1589.70,1400.60,554.36,2388.06,9046.19,47.47,521.66,...,3190339.04,317894.4972,190598.341780,3300.4440,328.865670,197.176270,15311.52,9180.2480,914.746140,191.0
1,1.0,1.0,642.15,1591.82,1403.14,553.75,2388.04,9044.07,47.49,522.28,...,3187544.08,317128.1100,190468.769164,3305.2656,328.840200,197.503110,15288.00,9182.0512,913.520400,190.0
2,1.0,1.0,642.35,1587.99,1404.20,554.26,2388.08,9052.94,47.27,522.42,...,3171959.70,316789.3085,189863.747766,3282.9420,327.873310,196.506807,15190.50,9104.2380,909.256590,189.0
3,1.0,1.0,642.35,1582.79,1401.87,554.45,2388.11,9049.48,47.13,522.86,...,3188461.36,316243.3104,190119.329037,3280.3344,325.355616,195.597470,15240.96,9162.5688,908.777232,188.0
4,1.0,1.0,642.37,1582.85,1406.22,554.00,2388.06,9055.15,47.28,522.19,...,3196583.40,316404.8200,190366.708720,3312.7542,327.903660,197.285049,15287.70,9197.9292,910.431160,187.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,1.0,100.0,643.49,1597.98,1428.63,551.43,2388.19,9065.52,48.07,519.49,...,3230627.20,313216.2240,186949.153600,3372.7532,326.995644,195.173667,15280.53,9120.4795,884.250015,4.0
20627,1.0,100.0,643.54,1604.50,1433.58,550.86,2388.23,9065.11,48.04,519.68,...,3213917.50,311627.9500,188436.458100,3362.9905,326.082370,197.176816,15128.50,9147.9630,887.005020,3.0
20628,1.0,100.0,643.42,1602.46,1428.18,550.94,2388.24,9065.90,48.09,520.01,...,3240137.90,312941.9620,186701.141965,3408.7108,329.223224,196.414541,15299.12,9127.4534,881.556052,2.0
20629,1.0,100.0,643.23,1605.26,1426.53,550.68,2388.25,9073.72,48.39,519.67,...,3215019.55,311653.4141,187724.584560,3372.8655,326.954481,196.941190,15124.55,9110.2800,883.120560,1.0


In [65]:
y_poly = df_poly['RUL'].values
x_poly = df_poly.drop(["RUL"],axis=1).values

In [66]:
x_poly_train,x_poly_test,y_poly_train,y_poly_test=train_test_split(x_poly,y_poly,test_size=0.2,random_state=42)

In [67]:
model = SVR(kernel="rbf")
model.fit(x_poly_train,y_poly_train)

In [68]:
y_poly_train_pred = model.predict(x_poly_train)
y_poly_test_pred = model.predict(x_poly_test)


In [69]:
from sklearn.metrics import mean_absolute_error
rmse_train_p=np.sqrt(mean_squared_error(y_poly_train,y_poly_train_pred))
rmse_test_p=np.sqrt(mean_squared_error(y_poly_test,y_poly_test_pred))

In [70]:
print("Training Rmse:",rmse_train_p)
print("testing rmse:",rmse_test_p)

Training Rmse: 69.31233480696638
testing rmse: 67.83094411119875


*LSTM WITH POLYNOMIAL FEATURE*

In [71]:
split = 0.8  # Set the fraction of the dataset to use for training
split_index = int(split * len(df_scaled))
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# 6. Define the LSTM model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(units=50))
model.add(Dense(units=1))

# 7. Train the model
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=50, batch_size=32)

# 8. Evaluate the model
train_loss = model.evaluate(X_train, y_train)
test_loss = model.evaluate(X_test, y_test)
print("Training loss:", train_loss)
print("Testing loss:", test_loss)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Training loss: 0.007231167983263731
Testing loss: 0.007606515195220709


In [73]:
import numpy as np
from sklearn.metrics import mean_squared_error

# assuming you have already trained and evaluated your LSTM model on the train and test sets

# get the predicted values for the training and testing sets
p_train_pred = model.predict(X_train)
p_test_pred = model.predict(X_test)

# reshape y_train_pred, y_train, y_test_pred, and y_test to match dimensions
p_train_pred = p_train_pred.reshape(y_train.shape)
p_test_pred = p_test_pred.reshape(y_test.shape)

# calculate root mean squared error for the training and testing sets
p_train_rmse = np.sqrt(mean_squared_error(y_train, p_train_pred))
p_test_rmse = np.sqrt(mean_squared_error(y_test, p_test_pred))

print("Training RMSE: ", p_train_rmse)
print("Testing RMSE: ", p_test_rmse)



Training RMSE:  0.08503628211461602
Testing RMSE:  0.08721535060872752


In [74]:
r2_train_p = r2_score(y_train, p_train_pred)
r2_test_p = r2_score(y_test, p_test_pred)
print('R-squared score:', r2_train_p)
print('R-squared score:', r2_test_p)

R-squared score: 0.6786380810113415
R-squared score: 0.6485073082754782


*LSTM WITH LAG FEATURE*

In [75]:
dftm= pd.concat([train['unit_number'],x_train_1,train['RUL']],axis=1)
dftm

Unnamed: 0,unit_number,s_2,s_3,s_4,s_7,s_8,s_9,s_11,s_12,s_13,...,s_14_lag2,s_15_lag1,s_15_lag2,s_17_lag1,s_17_lag2,s_20_lag1,s_20_lag2,s_21_lag1,s_21_lag2,RUL
0,1,641.82,1589.70,1400.60,554.36,2388.06,9046.19,47.47,521.66,2388.02,...,,,,,,,,,,191
1,1,642.15,1591.82,1403.14,553.75,2388.04,9044.07,47.49,522.28,2388.07,...,,8.4195,,392.0,,39.06,,23.4190,,190
2,1,642.35,1587.99,1404.20,554.26,2388.08,9052.94,47.27,522.42,2388.03,...,8138.62,8.4318,8.4195,392.0,392.0,39.00,39.06,23.4236,23.4190,189
3,1,642.35,1582.79,1401.87,554.45,2388.11,9049.48,47.13,522.86,2388.08,...,8131.49,8.4178,8.4318,390.0,392.0,38.95,39.00,23.3442,23.4236,188
4,1,642.37,1582.85,1406.22,554.00,2388.06,9055.15,47.28,522.19,2388.04,...,8133.23,8.3682,8.4178,392.0,390.0,38.88,38.95,23.3739,23.3442,187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,643.49,1597.98,1428.63,551.43,2388.19,9065.52,48.07,519.49,2388.26,...,8139.67,8.5519,8.5215,394.0,394.0,38.14,38.38,23.1923,23.1324,4
20627,100,643.54,1604.50,1433.58,550.86,2388.23,9065.11,48.04,519.68,2388.22,...,8142.90,8.4956,8.5519,397.0,394.0,38.49,38.14,22.9735,23.1923,3
20628,100,643.42,1602.46,1428.18,550.94,2388.24,9065.90,48.09,520.01,2388.24,...,8137.60,8.5139,8.4956,395.0,397.0,38.30,38.49,23.1594,22.9735,2
20629,100,643.23,1605.26,1426.53,550.68,2388.25,9073.72,48.39,519.67,2388.23,...,8136.50,8.5646,8.5139,398.0,395.0,38.44,38.30,22.9333,23.1594,1


In [76]:
dftm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20631 entries, 0 to 20630
Data columns (total 44 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   unit_number  20631 non-null  int64  
 1   s_2          20631 non-null  float64
 2   s_3          20631 non-null  float64
 3   s_4          20631 non-null  float64
 4   s_7          20631 non-null  float64
 5   s_8          20631 non-null  float64
 6   s_9          20631 non-null  float64
 7   s_11         20631 non-null  float64
 8   s_12         20631 non-null  float64
 9   s_13         20631 non-null  float64
 10  s_14         20631 non-null  float64
 11  s_15         20631 non-null  float64
 12  s_17         20631 non-null  int64  
 13  s_20         20631 non-null  float64
 14  s_21         20631 non-null  float64
 15  s_2_lag1     20630 non-null  float64
 16  s_2_lag2     20629 non-null  float64
 17  s_3_lag1     20630 non-null  float64
 18  s_3_lag2     20629 non-null  float64
 19  s_4_

In [77]:
print(dftm.isnull().sum())

unit_number    0
s_2            0
s_3            0
s_4            0
s_7            0
s_8            0
s_9            0
s_11           0
s_12           0
s_13           0
s_14           0
s_15           0
s_17           0
s_20           0
s_21           0
s_2_lag1       1
s_2_lag2       2
s_3_lag1       1
s_3_lag2       2
s_4_lag1       1
s_4_lag2       2
s_7_lag1       1
s_7_lag2       2
s_8_lag1       1
s_8_lag2       2
s_9_lag1       1
s_9_lag2       2
s_11_lag1      1
s_11_lag2      2
s_12_lag1      1
s_12_lag2      2
s_13_lag1      1
s_13_lag2      2
s_14_lag1      1
s_14_lag2      2
s_15_lag1      1
s_15_lag2      2
s_17_lag1      1
s_17_lag2      2
s_20_lag1      1
s_20_lag2      2
s_21_lag1      1
s_21_lag2      2
RUL            0
dtype: int64


In [78]:
imputer = SimpleImputer(strategy ='mean')
dftm = pd.DataFrame(imputer.fit_transform(dftm), columns=dftm.columns)

In [79]:
print(dftm.isnull().sum())

unit_number    0
s_2            0
s_3            0
s_4            0
s_7            0
s_8            0
s_9            0
s_11           0
s_12           0
s_13           0
s_14           0
s_15           0
s_17           0
s_20           0
s_21           0
s_2_lag1       0
s_2_lag2       0
s_3_lag1       0
s_3_lag2       0
s_4_lag1       0
s_4_lag2       0
s_7_lag1       0
s_7_lag2       0
s_8_lag1       0
s_8_lag2       0
s_9_lag1       0
s_9_lag2       0
s_11_lag1      0
s_11_lag2      0
s_12_lag1      0
s_12_lag2      0
s_13_lag1      0
s_13_lag2      0
s_14_lag1      0
s_14_lag2      0
s_15_lag1      0
s_15_lag2      0
s_17_lag1      0
s_17_lag2      0
s_20_lag1      0
s_20_lag2      0
s_21_lag1      0
s_21_lag2      0
RUL            0
dtype: int64


Unnamed: 0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,s_10,...,s_17_lag1,s_17_lag2,s_18_lag1,s_18_lag2,s_19_lag1,s_19_lag2,s_20_lag1,s_20_lag2,s_21_lag1,s_21_lag2
12862,518.67,643.08,1592.07,1402.73,14.62,21.61,554.10,2388.09,9062.50,1.3,...,391.0,393.0,2388.0,2388.0,100.0,100.0,38.97,38.97,23.3322,23.3445
9936,518.67,642.21,1580.72,1394.09,14.62,21.61,553.34,2387.99,9066.77,1.3,...,393.0,394.0,2388.0,2388.0,100.0,100.0,39.05,39.04,23.4449,23.3796
12025,518.67,642.09,1586.25,1404.03,14.62,21.61,554.27,2388.06,9061.07,1.3,...,392.0,392.0,2388.0,2388.0,100.0,100.0,39.05,38.91,23.4335,23.2813
14526,518.67,643.52,1597.95,1423.87,14.62,21.61,552.22,2388.21,9031.18,1.3,...,396.0,395.0,2388.0,2388.0,100.0,100.0,38.73,38.46,23.1722,23.1483
16747,518.67,642.34,1586.62,1401.82,14.62,21.61,554.13,2388.06,9054.55,1.3,...,390.0,390.0,2388.0,2388.0,100.0,100.0,38.95,39.09,23.4119,23.3369
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,518.67,642.19,1592.99,1390.35,14.62,21.61,553.93,2388.06,9039.80,1.3,...,392.0,393.0,2388.0,2388.0,100.0,100.0,38.88,38.90,23.4126,23.4147
11964,518.67,641.81,1580.59,1404.09,14.62,21.61,554.78,2388.01,9065.90,1.3,...,390.0,391.0,2388.0,2388.0,100.0,100.0,38.94,39.18,23.4023,23.4199
5390,518.67,642.50,1600.91,1415.54,14.62,21.61,553.50,2388.09,9061.03,1.3,...,393.0,393.0,2388.0,2388.0,100.0,100.0,38.82,38.84,23.3558,23.2138
860,518.67,641.93,1578.03,1396.28,14.62,21.61,554.52,2387.97,9062.88,1.3,...,392.0,391.0,2388.0,2388.0,100.0,100.0,38.96,38.87,23.4476,23.3806


In [54]:
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df_train_l.iloc[:, :-1])  # Scale only the sensor data, not the ID and RUL columns

# 4. Create sequences for the LSTM
def create_sequences(data, seq_length):
    X = []
    y = []
    for i in range(seq_length, len(data)):
        X.append(data[i-seq_length:i, :])
        y.append(data[i, -1])
    return np.array(X), np.array(y)

seq_length = 50  # Set the length of the sequences
X, y = create_sequences(df_scaled, seq_length)

# 5. Split the dataset into training and testing sets
split = 0.8  # Set the fraction of the dataset to use for training
split_index = int(split * len(df_scaled))
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# 6. Define the LSTM model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(units=50))
model.add(Dense(units=1))

# 7. Train the model
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=50, batch_size=32)

# 8. Evaluate the model
train_loss = model.evaluate(X_train, y_train)
test_loss = model.evaluate(X_test, y_test)
print("Training loss:", train_loss)
print("Testing loss:", test_loss)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Training loss: 2.424498688924359e-06
Testing loss: 2.6321922632632777e-06


In [57]:
valid_lag["RUL"]=y_valid
valid_lag
print(l_valid_pred.shape)

(20581, 1)


In [56]:
# get the predicted values for the training and testing sets
l_train_pred = model.predict(X_train)
l_test_pred = model.predict(X_test)
l_valid_pred = model.predict(X) 

# reshape y_train_pred, y_train, y_test_pred, and y_test to match dimensions
l_train_pred = l_train_pred.reshape(y_train.shape)
l_test_pred = l_test_pred.reshape(y_test.shape)
#l_valid_pred = l_valid_pred.reshape(y_valid.shape)

# calculate root mean squared error for the training and testing sets
l_train_rmse = np.sqrt(mean_squared_error(y_train, l_train_pred))
l_test_rmse = np.sqrt(mean_squared_error(y_test, l_test_pred))
l_valid_rmse= np.sqrt(mean_squared_error(y,l_valid_pred))


print("Training RMSE: ", l_train_rmse)
print("Testing RMSE: ", l_test_rmse)
print("Valid RMSE: ", l_valid_rmse)

Training RMSE:  0.001557080065060417
Testing RMSE:  0.0016224044252643576
Valid RMSE:  0.0015702363515232036


In [58]:
r2_train_l = r2_score(y_train, l_train_pred)
r2_test_l= r2_score(y_test, l_test_pred)
r2_valid_l = r2_score(y,l_valid_pred)
print('R-squared score:', r2_train_l)
print('R-squared score:', r2_test_l)
print('R-squared score:', r2_valid_l)

R-squared score: 0.9998922705772013
R-squared score: 0.9998780994469209
R-squared score: 0.9998895666788477
