# **Import**

In [1]:
import warnings

# Install a global "ignore" filter: every Python warning raised after this point is hidden.
# NOTE(review): this also hides library warnings (e.g. from pandas) that could point at real problems.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# !pip install catboost
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error

# **Data Load**

In [3]:
cd /content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Data

/content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Data


In [4]:
# Column layout: 2 index columns, 3 operating settings, then sensors s_1 .. s_21.
index_names = ['unit_number', 'time_cycles']
setting_names = [f'setting_{k}' for k in (1, 2, 3)]
sensor_names = ['s_' + str(n) for n in range(1, 22)]
col_names = [*index_names, *setting_names, *sensor_names]

# Load the training set from the current working directory. `col_names` is not passed here,
# so read_csv uses its default header handling.
# NOTE(review): the displayed frame carries 'Unnamed: 0.1' / 'Unnamed: 0' columns, which look like
# index columns saved by earlier to_csv calls — confirm they are not needed.
train_df = pd.read_csv('./train_FD004.csv')

In [5]:
# Last recorded cycle of every engine, kept as a small lookup table.
max_cycle = train_df.groupby('unit_number')['time_cycles'].max().reset_index()
max_cycle.columns = ['unit_number', 'max_cycle']

# Broadcast each engine's last cycle onto its rows with a group-wise transform.
# Unlike merging `max_cycle` back in, assigning the column is safe to re-run:
# a second merge would produce 'max_cycle_x' / 'max_cycle_y' and break the line below.
train_df['max_cycle'] = train_df.groupby('unit_number')['time_cycles'].transform('max')

# Remaining useful life = cycles left until the engine's last recorded cycle.
train_df['RUL'] = train_df['max_cycle'] - train_df['time_cycles']

In [6]:
# Load the test trajectories and the file whose 'RUL' column is used as ground truth
# when scoring predictions further down.
test_df, rul_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./test_FD004.csv', './RUL_FD004.csv')
)

In [7]:
# Accumulator that every experiment cell below adds its metric rows to.
results = list()

# Sensor channels fed to the models, listed by sensor number
# (s_1, s_5, s_6, s_16, s_18 and s_19 are left out).
using_sensors = [
    f's_{n}'
    for n in (2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 20, 21)
]

In [8]:
# Re-running this cell must not stack a second copy of the indicator columns,
# so discard whatever a previous run produced before encoding again.
stale_cols = [col for col in train_df.columns if col.startswith('setting_group')]
train_df = train_df.drop(columns=stale_cols)

# Round settings 1 and 2 to one decimal, then join the three settings into one
# label per row and one-hot encode that label.
# NOTE(review): the resulting columns include both '10.0_0.2_100.0' and '10.0_0.3_100.0';
# this may be one operating condition split in two by the one-decimal rounding of
# setting_2 — confirm before relying on these groups.
train_df['setting_1'] = train_df['setting_1'].round(1)
train_df['setting_2'] = train_df['setting_2'].round(1)
train_df['setting_group'] = train_df[['setting_1','setting_2','setting_3']].astype(str).agg('_'.join, axis=1)
train_df = pd.get_dummies(train_df, columns=['setting_group'])
train_df.head()

Unnamed: 0.1,Unnamed: 0,unit_number,time_cycles,setting_1,setting_2,setting_3,s_1,s_2,s_3,s_4,...,s_21,max_cycle,RUL,setting_group_0.0_0.0_100.0,setting_group_10.0_0.2_100.0,setting_group_10.0_0.3_100.0,setting_group_20.0_0.7_100.0,setting_group_25.0_0.6_60.0,setting_group_35.0_0.8_100.0,setting_group_42.0_0.8_100.0
0,0,1,1,42.0,0.8,100.0,445.0,549.68,1343.43,1112.93,...,6.367,321,320,False,False,False,False,False,False,True
1,1,1,2,20.0,0.7,100.0,491.19,606.07,1477.61,1237.5,...,14.6552,321,319,False,False,False,True,False,False,False
2,2,1,3,42.0,0.8,100.0,445.0,548.95,1343.12,1117.05,...,6.4213,321,318,False,False,False,False,False,False,True
3,3,1,4,42.0,0.8,100.0,445.0,548.7,1341.24,1118.03,...,6.4176,321,317,False,False,False,False,False,False,True
4,4,1,5,25.0,0.6,60.0,462.54,536.1,1255.23,1033.59,...,8.6754,321,316,False,False,False,False,True,False,False


In [9]:
# Names of the one-hot operating-setting columns, in the order they appear in train_df.
is_setting_dummy = train_df.columns.str.startswith('setting_group')
setting_group = train_df.columns[is_setting_dummy].tolist()
setting_group

['setting_group_0.0_0.0_100.0',
 'setting_group_10.0_0.2_100.0',
 'setting_group_10.0_0.3_100.0',
 'setting_group_20.0_0.7_100.0',
 'setting_group_25.0_0.6_60.0',
 'setting_group_35.0_0.8_100.0',
 'setting_group_42.0_0.8_100.0']

In [10]:
# Re-running this cell must not stack a second copy of the indicator columns,
# so discard whatever a previous run produced before encoding again.
stale_cols = [col for col in test_df.columns if col.startswith('setting_group')]
test_df = test_df.drop(columns=stale_cols)

# Apply the same rounding / labelling / one-hot encoding that was applied to train_df.
test_df['setting_1'] = test_df['setting_1'].round(1)
test_df['setting_2'] = test_df['setting_2'].round(1)
test_df['setting_group'] = test_df[['setting_1','setting_2','setting_3']].astype(str).agg('_'.join, axis=1)
test_df = pd.get_dummies(test_df, columns=['setting_group'])

# The test set is encoded independently of the training set, so a setting group
# that never occurs in test would leave its column missing and make
# `test_df[setting_group]` fail later. Add any such column as all-False.
for col in setting_group:
    if col not in test_df.columns:
        test_df[col] = False
test_df.head()

Unnamed: 0.1,Unnamed: 0,unit_number,time_cycles,setting_1,setting_2,setting_3,s_1,s_2,s_3,s_4,...,s_19,s_20,s_21,setting_group_0.0_0.0_100.0,setting_group_10.0_0.2_100.0,setting_group_10.0_0.3_100.0,setting_group_20.0_0.7_100.0,setting_group_25.0_0.6_60.0,setting_group_35.0_0.8_100.0,setting_group_42.0_0.8_100.0
0,0,1,1,20.0,0.7,100.0,491.19,606.67,1481.04,1227.81,...,100.0,24.31,14.7007,False,False,False,True,False,False,False
1,1,1,2,25.0,0.6,60.0,462.54,536.22,1256.17,1031.48,...,84.93,14.36,8.5748,False,False,False,False,True,False,False
2,2,1,3,42.0,0.8,100.0,445.0,549.23,1340.13,1105.88,...,100.0,10.39,6.4365,False,False,False,False,False,False,True
3,3,1,4,42.0,0.8,100.0,445.0,549.19,1339.7,1107.26,...,100.0,10.56,6.2367,False,False,False,False,False,False,True
4,4,1,5,35.0,0.8,100.0,449.44,555.1,1353.04,1117.8,...,100.0,14.85,8.9326,False,False,False,False,False,True,False


## **Raw Data**

In [11]:
# RUL Clipping X (target is NOT clipped)
# Baseline: fit each boosting regressor on the selected raw sensor columns plus the
# one-hot setting columns and score it on the last stored row of every test engine.
raw_results = []

model_dict = {
    # XGBoost's scikit-learn wrapper names its logging option `verbosity`;
    # `verbose` is not one of its constructor parameters.
    'XGBoost': XGBRegressor(verbosity=0, random_state=42),
    'LightGBM': LGBMRegressor(verbose=0, random_state=42),
    'CatBoost': CatBoostRegressor(verbose=0, random_state=42)
}

# Features and targets are the same for every model, so build them once
# instead of on every loop iteration.
train_x_sensors = train_df[using_sensors]
train_x_settings = train_df[setting_group]
train_y = train_df['RUL'].copy().values.ravel()

# NOTE(review): tail(1) is the "last cycle" only if each unit's rows are ordered by
# time_cycles, and the comparison with rul_df assumes both list units in the same
# order — confirm.
test_last_rows = test_df.groupby('unit_number').tail(1)
test_x_sensors = test_last_rows[using_sensors]
test_x_settings = test_last_rows[setting_group]

train_x = np.concatenate([train_x_sensors, train_x_settings], axis=1)
test_x = np.concatenate([test_x_sensors, test_x_settings], axis=1)
true = rul_df['RUL'].copy().values.ravel()

for model_name, model in model_dict.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred)**0.5

    print(f'Model: {model_name}, MAE: {mae}, RMSE: {rmse}')

    raw_results.append({
        'Data': 'Raw',
        'Model': model_name,
        'Scaler': 'None',
        'MAE': mae,
        'RMSE': rmse
    })

# Replace rows left by an earlier run of this cell instead of appending duplicates.
results = [row for row in results if row['Data'] != 'Raw'] + raw_results

Model: XGBoost, MAE: 32.80341720581055, RMSE: 45.25286889860893
Model: LightGBM, MAE: 33.13850756448758, RMSE: 44.56902200553508
Model: CatBoost, MAE: 32.35418212803426, RMSE: 43.43519971531071


In [12]:
# Show the raw-feature runs ordered from lowest to highest MAE.
pd.DataFrame(data=raw_results).sort_values('MAE', ascending=True)

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
2,Raw,CatBoost,,32.354182,43.4352
0,Raw,XGBoost,,32.803417,45.252869
1,Raw,LightGBM,,33.138508,44.569022


In [13]:
# RUL Clipping O (target IS clipped)
# Same features as the unclipped baseline, but RUL values above `rul_cap` are
# replaced by `rul_cap` in both the training target and the reference values.
raw_clipping_results = []
rul_cap = 125

model_dict = {
    # XGBoost's scikit-learn wrapper names its logging option `verbosity`;
    # `verbose` is not one of its constructor parameters.
    'XGBoost': XGBRegressor(verbosity=0, random_state=42),
    'LightGBM': LGBMRegressor(verbose=0, random_state=42),
    'CatBoost': CatBoostRegressor(verbose=0, random_state=42)
}

# Features and targets are the same for every model, so build them once
# instead of on every loop iteration.
train_x_sensors = train_df[using_sensors]
train_x_settings = train_df[setting_group]

# clip() returns a new Series. The previous in-place `.loc[...] = 125` on
# `train_df['RUL']` (taken without a copy) could write the cap back into train_df
# and silently change the data seen by any cell run afterwards.
train_y = train_df['RUL'].clip(upper=rul_cap).values.ravel()

# NOTE(review): tail(1) is the "last cycle" only if each unit's rows are ordered by
# time_cycles, and the comparison with rul_df assumes both list units in the same
# order — confirm.
test_last_rows = test_df.groupby('unit_number').tail(1)
test_x_sensors = test_last_rows[using_sensors]
test_x_settings = test_last_rows[setting_group]

train_x = np.concatenate([train_x_sensors, train_x_settings], axis=1)
test_x = np.concatenate([test_x_sensors, test_x_settings], axis=1)

# The reference RUL is capped as well, so these metrics are measured against a
# different target than the unclipped runs.
true = rul_df['RUL'].clip(upper=rul_cap).values.ravel()

for model_name, model in model_dict.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred)**0.5

    print(f'Model: {model_name}, MAE: {mae}, RMSE: {rmse}')

    raw_clipping_results.append({
        'Data': 'Raw (RUL Clipping)',
        'Model': model_name,
        'Scaler': 'None',
        'MAE': mae,
        'RMSE': rmse
    })

# Replace rows left by an earlier run of this cell instead of appending duplicates.
results = [row for row in results if row['Data'] != 'Raw (RUL Clipping)'] + raw_clipping_results

Model: XGBoost, MAE: 14.307032585144043, RMSE: 19.851915304568614
Model: LightGBM, MAE: 14.689087310447462, RMSE: 19.842515523684515
Model: CatBoost, MAE: 14.403875294281798, RMSE: 19.742004337822475


In [14]:
# Show the clipped-RUL raw-feature runs ordered from lowest to highest MAE.
pd.DataFrame(data=raw_clipping_results).sort_values('MAE', ascending=True)

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Raw (RUL Clipping),XGBoost,,14.307033,19.851915
2,Raw (RUL Clipping),CatBoost,,14.403875,19.742004
1,Raw (RUL Clipping),LightGBM,,14.689087,19.842516


## **Cumulative Stat**

In [15]:
# Load the pre-computed cumulative-statistic feature tables for train and test
# from the current working directory.
cum_stat_train_df, cum_stat_test_df = map(
    pd.read_csv,
    ('./cum_stat_train_FD004.csv', './cum_stat_test_FD004.csv'),
)

In [16]:
# Every column whose name begins with 's_' is used as a model feature for the cumulative-stat runs.
stat_cols = list(filter(lambda name: name.startswith('s_'), cum_stat_train_df.columns))

In [17]:
# RUL Clipping X (target is NOT clipped)
# Fit each boosting regressor on the 's_'-prefixed cumulative-stat columns plus the
# one-hot setting columns and score it on the last stored row of every test engine.
cum_stat_results = []

model_dict = {
    # XGBoost's scikit-learn wrapper names its logging option `verbosity`;
    # `verbose` is not one of its constructor parameters.
    'XGBoost': XGBRegressor(verbosity=0, random_state=42),
    'LightGBM': LGBMRegressor(verbose=0, random_state=42),
    'CatBoost': CatBoostRegressor(verbose=0, random_state=42)
}

# Features and targets are the same for every model, so build them once
# instead of on every loop iteration.
train_x_sensors = cum_stat_train_df[stat_cols]
train_x_settings = cum_stat_train_df[setting_group]
train_y = cum_stat_train_df['RUL'].copy().values.ravel()

# NOTE(review): tail(1) is the "last cycle" only if each unit's rows are ordered by
# cycle, and the comparison with rul_df assumes both list units in the same
# order — confirm.
test_last_rows = cum_stat_test_df.groupby('unit_number').tail(1)
test_x_sensors = test_last_rows[stat_cols]
test_x_settings = test_last_rows[setting_group]

train_x = np.concatenate([train_x_sensors, train_x_settings], axis=1)
test_x = np.concatenate([test_x_sensors, test_x_settings], axis=1)
true = rul_df['RUL'].copy().values.ravel()

for model_name, model in model_dict.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred)**0.5

    print(f'Model: {model_name}, MAE: {mae}, RMSE: {rmse}')

    cum_stat_results.append({
        'Data': 'Cumulative Stat',
        'Model': model_name,
        'Scaler': 'None',
        'MAE': mae,
        'RMSE': rmse
    })

# Replace rows left by an earlier run of this cell instead of appending duplicates.
results = [row for row in results if row['Data'] != 'Cumulative Stat'] + cum_stat_results

Model: XGBoost, MAE: 22.5842342376709, RMSE: 30.925778042898028
Model: LightGBM, MAE: 22.161568298759573, RMSE: 30.434082679582048
Model: CatBoost, MAE: 22.284992368478264, RMSE: 30.052233212829375


In [18]:
# Tabulate the cumulative-stat runs in the order the models were fitted.
pd.DataFrame(data=cum_stat_results)

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Cumulative Stat,XGBoost,,22.584234,30.925778
1,Cumulative Stat,LightGBM,,22.161568,30.434083
2,Cumulative Stat,CatBoost,,22.284992,30.052233


In [19]:
# RUL Clipping O (target IS clipped)
# Same cumulative-stat features as the unclipped run, but RUL values above `rul_cap`
# are replaced by `rul_cap` in both the training target and the reference values.
cum_stat_rul_clipping_results = []
rul_cap = 125

model_dict = {
    # XGBoost's scikit-learn wrapper names its logging option `verbosity`;
    # `verbose` is not one of its constructor parameters.
    'XGBoost': XGBRegressor(verbosity=0, random_state=42),
    'LightGBM': LGBMRegressor(verbose=0, random_state=42),
    'CatBoost': CatBoostRegressor(verbose=0, random_state=42)
}

# Features and targets are the same for every model, so build them once
# instead of on every loop iteration.
train_x_sensors = cum_stat_train_df[stat_cols]
train_x_settings = cum_stat_train_df[setting_group]

# clip() returns a new Series, so the source frame's 'RUL' column is left untouched.
train_y = cum_stat_train_df['RUL'].clip(upper=rul_cap).values.ravel()

# NOTE(review): tail(1) is the "last cycle" only if each unit's rows are ordered by
# cycle, and the comparison with rul_df assumes both list units in the same
# order — confirm.
test_last_rows = cum_stat_test_df.groupby('unit_number').tail(1)
test_x_sensors = test_last_rows[stat_cols]
test_x_settings = test_last_rows[setting_group]

train_x = np.concatenate([train_x_sensors, train_x_settings], axis=1)
test_x = np.concatenate([test_x_sensors, test_x_settings], axis=1)

# The reference RUL is capped as well, so these metrics are measured against a
# different target than the unclipped runs.
true = rul_df['RUL'].clip(upper=rul_cap).values.ravel()

for model_name, model in model_dict.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred)**0.5

    print(f'Model: {model_name}, MAE: {mae}, RMSE: {rmse}')

    cum_stat_rul_clipping_results.append({
        'Data': 'Cumulative Stat (RUL Clipping)',
        'Model': model_name,
        'Scaler': 'None',
        'MAE': mae,
        'RMSE': rmse
    })

# Replace rows left by an earlier run of this cell instead of appending duplicates.
results = [
    row for row in results if row['Data'] != 'Cumulative Stat (RUL Clipping)'
] + cum_stat_rul_clipping_results

Model: XGBoost, MAE: 14.183903694152832, RMSE: 19.826312620332118
Model: LightGBM, MAE: 13.54042827217269, RMSE: 18.731705154766058
Model: CatBoost, MAE: 13.683681308532416, RMSE: 18.547913842925634


In [20]:
# Tabulate the clipped-RUL cumulative-stat runs in the order the models were fitted.
pd.DataFrame(data=cum_stat_rul_clipping_results)

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Cumulative Stat (RUL Clipping),XGBoost,,14.183904,19.826313
1,Cumulative Stat (RUL Clipping),LightGBM,,13.540428,18.731705
2,Cumulative Stat (RUL Clipping),CatBoost,,13.683681,18.547914


# **Result**

In [21]:
# Gather every recorded run into one table and write it to the results folder on Drive.
# The frame's index is written as well (to_csv default).
result = pd.DataFrame(data=results)
result.to_csv(
    path_or_buf='/content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Results_FD004/Boosting_Regression.csv'
)

In [22]:
# Order all runs by MAE, lowest first, and preview the five best.
result = result.sort_values('MAE', ascending=True)
result.head(n=5)

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
10,Cumulative Stat (RUL Clipping),LightGBM,,13.540428,18.731705
11,Cumulative Stat (RUL Clipping),CatBoost,,13.683681,18.547914
9,Cumulative Stat (RUL Clipping),XGBoost,,14.183904,19.826313
3,Raw (RUL Clipping),XGBoost,,14.307033,19.851915
5,Raw (RUL Clipping),CatBoost,,14.403875,19.742004
