# **Import**

In [1]:
import warnings

warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error

# **Data Load**

In [3]:
cd /content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Data

/content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Data


In [5]:
index_names = ['unit_number', 'time_cycles']
setting_names = ['setting_1', 'setting_2', 'setting_3']
sensor_names = [f's_{i + 1}' for i in range(21)]
col_names = index_names + setting_names + sensor_names

train_df = pd.read_csv('./train_FD004.csv')

In [6]:
max_cycle = train_df.groupby('unit_number')['time_cycles'].max().reset_index()
max_cycle.columns = ['unit_number', 'max_cycle']

train_df = train_df.merge(max_cycle, on='unit_number', how='left')

train_df['RUL'] = train_df['max_cycle'] - train_df['time_cycles']

In [7]:
test_df = pd.read_csv('./test_FD004.csv')
rul_df = pd.read_csv('./RUL_FD004.csv')

In [14]:
results = []

using_sensors = [
    's_2', 's_3', 's_4', 's_7', 's_8', 's_9', 's_10', 's_11', 's_12',
    's_13', 's_14', 's_15', 's_17', 's_20', 's_21'
]

In [8]:
train_df['setting_1'] = train_df['setting_1'].round(1)
train_df['setting_2'] = train_df['setting_2'].round(1)
train_df['setting_group'] = train_df[['setting_1','setting_2','setting_3']].astype(str).agg('_'.join, axis=1)
train_df = pd.get_dummies(train_df, columns=['setting_group'])
train_df.head()

Unnamed: 0.1,Unnamed: 0,unit_number,time_cycles,setting_1,setting_2,setting_3,s_1,s_2,s_3,s_4,...,s_21,max_cycle,RUL,setting_group_0.0_0.0_100.0,setting_group_10.0_0.2_100.0,setting_group_10.0_0.3_100.0,setting_group_20.0_0.7_100.0,setting_group_25.0_0.6_60.0,setting_group_35.0_0.8_100.0,setting_group_42.0_0.8_100.0
0,0,1,1,42.0,0.8,100.0,445.0,549.68,1343.43,1112.93,...,6.367,321,320,False,False,False,False,False,False,True
1,1,1,2,20.0,0.7,100.0,491.19,606.07,1477.61,1237.5,...,14.6552,321,319,False,False,False,True,False,False,False
2,2,1,3,42.0,0.8,100.0,445.0,548.95,1343.12,1117.05,...,6.4213,321,318,False,False,False,False,False,False,True
3,3,1,4,42.0,0.8,100.0,445.0,548.7,1341.24,1118.03,...,6.4176,321,317,False,False,False,False,False,False,True
4,4,1,5,25.0,0.6,60.0,462.54,536.1,1255.23,1033.59,...,8.6754,321,316,False,False,False,False,True,False,False


In [10]:
setting_group = [col for col in train_df.columns if col.startswith('setting_group')]
setting_group

['setting_group_0.0_0.0_100.0',
 'setting_group_10.0_0.2_100.0',
 'setting_group_10.0_0.3_100.0',
 'setting_group_20.0_0.7_100.0',
 'setting_group_25.0_0.6_60.0',
 'setting_group_35.0_0.8_100.0',
 'setting_group_42.0_0.8_100.0']

In [11]:
test_df['setting_1'] = test_df['setting_1'].round(1)
test_df['setting_2'] = test_df['setting_2'].round(1)
test_df['setting_group'] = test_df[['setting_1','setting_2','setting_3']].astype(str).agg('_'.join, axis=1)
test_df = pd.get_dummies(test_df, columns=['setting_group'])
test_df.head()

Unnamed: 0.1,Unnamed: 0,unit_number,time_cycles,setting_1,setting_2,setting_3,s_1,s_2,s_3,s_4,...,s_19,s_20,s_21,setting_group_0.0_0.0_100.0,setting_group_10.0_0.2_100.0,setting_group_10.0_0.3_100.0,setting_group_20.0_0.7_100.0,setting_group_25.0_0.6_60.0,setting_group_35.0_0.8_100.0,setting_group_42.0_0.8_100.0
0,0,1,1,20.0,0.7,100.0,491.19,606.67,1481.04,1227.81,...,100.0,24.31,14.7007,False,False,False,True,False,False,False
1,1,1,2,25.0,0.6,60.0,462.54,536.22,1256.17,1031.48,...,84.93,14.36,8.5748,False,False,False,False,True,False,False
2,2,1,3,42.0,0.8,100.0,445.0,549.23,1340.13,1105.88,...,100.0,10.39,6.4365,False,False,False,False,False,False,True
3,3,1,4,42.0,0.8,100.0,445.0,549.19,1339.7,1107.26,...,100.0,10.56,6.2367,False,False,False,False,False,False,True
4,4,1,5,35.0,0.8,100.0,449.44,555.1,1353.04,1117.8,...,100.0,14.85,8.9326,False,False,False,False,False,True,False


## Raw Data

In [15]:
# RUL Clipping X
raw_results = []

scaler_dict = {
    'None': None,
    'MinMaxScaler': MinMaxScaler(),
    'StandardScaler': StandardScaler(),
    'RobustScaler': RobustScaler()
}

for scaler_name, scaler in scaler_dict.items():
    train_x_sensors = train_df[using_sensors].copy()
    train_x_settings = train_df[setting_group].copy()
    train_y = train_df['RUL'].copy().values.ravel()
    test_x_sensors = test_df.groupby('unit_number').tail(1)[using_sensors]
    test_x_settings = test_df.groupby('unit_number').tail(1)[setting_group]

    if scaler:
        scaler.fit(train_x_sensors)

        train_x_sensors = scaler.transform(train_x_sensors)
        test_x_sensors = scaler.transform(test_x_sensors)

    train_x = np.concatenate([train_x_sensors, train_x_settings], axis=1)
    test_x = np.concatenate([test_x_sensors, test_x_settings], axis=1)

    lr = LinearRegression()
    lr_model = lr.fit(train_x, train_y)

    pred = lr_model.predict(test_x)
    true = rul_df['RUL'].copy().values.ravel()

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred)**0.5

    print(f'Scaler: {scaler_name}, MAE: {mae}, RMSE: {rmse}')

    raw_results.append({
    'Data': 'Raw',
    'Model': 'Linear',
    'Scaler': scaler_name,
    'MAE': mae,
    'RMSE': rmse
    })
results.extend(raw_results)

Scaler: None, MAE: 39.09381364907474, RMSE: 48.4865204328425
Scaler: MinMaxScaler, MAE: 39.09381364906865, RMSE: 48.486520432835476
Scaler: StandardScaler, MAE: 39.093813649069226, RMSE: 48.48652043283618
Scaler: RobustScaler, MAE: 39.093813649067314, RMSE: 48.48652043283441


In [16]:
pd.DataFrame(raw_results)

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Raw,Linear,,39.093814,48.48652
1,Raw,Linear,MinMaxScaler,39.093814,48.48652
2,Raw,Linear,StandardScaler,39.093814,48.48652
3,Raw,Linear,RobustScaler,39.093814,48.48652


In [17]:
# RUL Clipping O
raw_clipping_results = []

scaler_dict = {
    'None': None,
    'MinMaxScaler': MinMaxScaler(),
    'StandardScaler': StandardScaler(),
    'RobustScaler': RobustScaler()
}

for scaler_name, scaler in scaler_dict.items():
    train_x_sensors = train_df[using_sensors].copy()
    train_x_settings = train_df[setting_group].copy()
    train_y = train_df['RUL']
    test_x_sensors = test_df.groupby('unit_number').tail(1)[using_sensors]
    test_x_settings = test_df.groupby('unit_number').tail(1)[setting_group]

    train_y.loc[train_y > 125] = 125
    train_y = train_y.values.ravel()

    if scaler:
        scaler.fit(train_x_sensors)

        train_x_sensors = scaler.transform(train_x_sensors)
        test_x_sensors = scaler.transform(test_x_sensors)

    train_x = np.concatenate([train_x_sensors, train_x_settings], axis=1)
    test_x = np.concatenate([test_x_sensors, test_x_settings], axis=1)

    lr = LinearRegression()
    lr_model = lr.fit(train_x, train_y)

    pred = lr_model.predict(test_x)
    true = rul_df['RUL'].copy()
    true.loc[true > 125] = 125
    true = true.values.ravel()

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred)**0.5

    print(f'Scaler: {scaler_name}, MAE: {mae}, RMSE: {rmse}')

    raw_clipping_results.append({
    'Data': 'Raw (RUL Clipping)',
    'Model': 'Linear',
    'Scaler': scaler_name,
    'MAE': mae,
    'RMSE': rmse
    })
results.extend(raw_clipping_results)

Scaler: None, MAE: 21.77097262750639, RMSE: 26.051967994079774
Scaler: MinMaxScaler, MAE: 21.77097262750583, RMSE: 26.05196799407926
Scaler: StandardScaler, MAE: 21.770972627505692, RMSE: 26.051967994079142
Scaler: RobustScaler, MAE: 21.77097262750533, RMSE: 26.05196799407767


In [18]:
pd.DataFrame(raw_clipping_results)

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Raw (RUL Clipping),Linear,,21.770973,26.051968
1,Raw (RUL Clipping),Linear,MinMaxScaler,21.770973,26.051968
2,Raw (RUL Clipping),Linear,StandardScaler,21.770973,26.051968
3,Raw (RUL Clipping),Linear,RobustScaler,21.770973,26.051968


## **Cumulative Stat**

In [24]:
def make_cum_stat(df, is_train=True):
    tmp = df.copy()
    stat_df = []

    for unit in tmp['unit_number'].unique():
        unit_data = tmp[tmp['unit_number'] == unit].sort_values('time_cycles')

        for t in range(2, len(unit_data) + 1):
            sub = unit_data.iloc[:t]
            tc_values = sub['time_cycles'].values

            features = {
                'unit_number': unit,
                'time_cycles': sub['time_cycles'].iloc[-1]
            }

            for sensor in using_sensors:
                sub_data = sub[sensor].values

                features[f'{sensor}_mean']   = sub_data.mean()
                features[f'{sensor}_std']    = sub_data.std()
                features[f'{sensor}_min']    = sub_data.min()
                features[f'{sensor}_max']    = sub_data.max()
                features[f'{sensor}_last']   = sub_data[-1]
                features[f'{sensor}_median'] = np.median(sub_data)
                features[f'{sensor}_trend']  = np.polyfit(tc_values, sub_data, 1)[0]

            for setting in setting_group:
                features[setting] = sub[setting].iloc[-1]

            if is_train:
                features['RUL'] = sub['RUL'].iloc[-1]

            stat_df.append(features)

    return pd.DataFrame(stat_df)

In [None]:
cum_stat_train_df = make_cum_stat(train_df, is_train=True)
cum_stat_test_df = make_cum_stat(test_df, is_train=False)

In [30]:
cum_stat_train_df.to_csv('/content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Data/cum_stat_train_FD004.csv', index=False)
cum_stat_test_df.to_csv('/content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Data/cum_stat_test_FD004.csv', index=False)

In [38]:
stat_cols = [col for col in cum_stat_train_df.columns if col.startswith('s_')]

In [39]:
# RUL Clipping X
cum_stat_results = []

scaler_dict = {
    'None': None,
    'MinMaxScaler': MinMaxScaler(),
    'StandardScaler': StandardScaler(),
    'RobustScaler': RobustScaler()
}

for scaler_name, scaler in scaler_dict.items():
    train_x_sensors = cum_stat_train_df[stat_cols].copy()
    train_x_settings = cum_stat_train_df[setting_group].copy()
    train_y = cum_stat_train_df['RUL'].copy().values.ravel()
    test_x_sensors = cum_stat_test_df.groupby('unit_number').tail(1)[stat_cols]
    test_x_settings = cum_stat_test_df.groupby('unit_number').tail(1)[setting_group]

    if scaler:
        scaler.fit(train_x_sensors)

        train_x_sensors = scaler.transform(train_x_sensors)
        test_x_sensors = scaler.transform(test_x_sensors)

    train_x = np.concatenate([train_x_sensors, train_x_settings], axis=1)
    test_x = np.concatenate([test_x_sensors, test_x_settings], axis=1)

    lr = LinearRegression()
    lr_model = lr.fit(train_x, train_y)

    pred = lr_model.predict(test_x)
    true = rul_df['RUL'].copy().values.ravel()

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred)**0.5

    print(f'Scaler: {scaler_name}, MAE: {mae}, RMSE: {rmse}')

    cum_stat_results.append({
    'Data': 'Cumulative Stat',
    'Model': 'Linear',
    'Scaler': scaler_name,
    'MAE': mae,
    'RMSE': rmse
    })
results.extend(cum_stat_results)

Scaler: None, MAE: 25.534237001673173, RMSE: 32.52336923879927
Scaler: MinMaxScaler, MAE: 25.534237001673613, RMSE: 32.52336923879923
Scaler: StandardScaler, MAE: 25.53423700167362, RMSE: 32.523369238799404
Scaler: RobustScaler, MAE: 25.53423700167367, RMSE: 32.52336923880006


In [40]:
pd.DataFrame(cum_stat_results)

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Cumulative Stat,Linear,,25.534237,32.523369
1,Cumulative Stat,Linear,MinMaxScaler,25.534237,32.523369
2,Cumulative Stat,Linear,StandardScaler,25.534237,32.523369
3,Cumulative Stat,Linear,RobustScaler,25.534237,32.523369


In [41]:
# RUL Clipping O
cum_stat_rul_clipping_results = []

scaler_dict = {
    'None': None,
    'MinMaxScaler': MinMaxScaler(),
    'StandardScaler': StandardScaler(),
    'RobustScaler': RobustScaler()
}

for scaler_name, scaler in scaler_dict.items():
    train_x_sensors = cum_stat_train_df[stat_cols].copy()
    train_x_settings = cum_stat_train_df[setting_group].copy()
    train_y = cum_stat_train_df['RUL'].copy()
    test_x_sensors = cum_stat_test_df.groupby('unit_number').tail(1)[stat_cols]
    test_x_settings = cum_stat_test_df.groupby('unit_number').tail(1)[setting_group]

    train_y.loc[train_y > 125] = 125
    train_y = train_y.values.ravel()

    if scaler:
        scaler.fit(train_x_sensors)

        train_x_sensors = scaler.transform(train_x_sensors)
        test_x_sensors = scaler.transform(test_x_sensors)

    train_x = np.concatenate([train_x_sensors, train_x_settings], axis=1)
    test_x = np.concatenate([test_x_sensors, test_x_settings], axis=1)

    lr = LinearRegression()
    lr_model = lr.fit(train_x, train_y)

    pred = lr_model.predict(test_x)
    true = rul_df['RUL'].copy()
    true.loc[true > 125] = 125
    true = true.values.ravel()

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred)**0.5

    print(f'Scaler: {scaler_name}, MAE: {mae}, RMSE: {rmse}')

    cum_stat_rul_clipping_results.append({
    'Data': 'Cumulative Stat (RUL Clipping)',
    'Model': 'Linear',
    'Scaler': scaler_name,
    'MAE': mae,
    'RMSE': rmse
    })
results.extend(cum_stat_rul_clipping_results)

Scaler: None, MAE: 17.630382444866868, RMSE: 21.542401808402165
Scaler: MinMaxScaler, MAE: 17.63038244486736, RMSE: 21.542401808402655
Scaler: StandardScaler, MAE: 17.630382444867344, RMSE: 21.54240180840253
Scaler: RobustScaler, MAE: 17.63038244486693, RMSE: 21.54240180840254


In [42]:
pd.DataFrame(cum_stat_rul_clipping_results)

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Cumulative Stat (RUL Clipping),Linear,,17.630382,21.542402
1,Cumulative Stat (RUL Clipping),Linear,MinMaxScaler,17.630382,21.542402
2,Cumulative Stat (RUL Clipping),Linear,StandardScaler,17.630382,21.542402
3,Cumulative Stat (RUL Clipping),Linear,RobustScaler,17.630382,21.542402


# **Result**

In [43]:
result = pd.DataFrame(results)
result.to_csv('/content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Results_FD004/Linear_Regression.csv')

In [44]:
result = result.sort_values(by='MAE')
result.head()

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
12,Cumulative Stat (RUL Clipping),Linear,,17.630382,21.542402
15,Cumulative Stat (RUL Clipping),Linear,RobustScaler,17.630382,21.542402
14,Cumulative Stat (RUL Clipping),Linear,StandardScaler,17.630382,21.542402
13,Cumulative Stat (RUL Clipping),Linear,MinMaxScaler,17.630382,21.542402
7,Raw (RUL Clipping),Linear,RobustScaler,21.770973,26.051968
