# **Import**

In [None]:
import warnings

# Silence every warning for the rest of the session. This keeps cell output
# compact, but it also hides deprecation and invalid-parameter warnings.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# !pip install catboost
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error

# **Data Load**

In [None]:
cd /content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Data

/content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Data


In [None]:
# Column-name groups: identifiers, three operating settings, sensors s_1..s_21.
index_names = ['unit_number', 'time_cycles']
setting_names = [f'setting_{k}' for k in range(1, 4)]
sensor_names = [f's_{k}' for k in range(1, 22)]
col_names = [*index_names, *setting_names, *sensor_names]

# Training trajectories. NOTE(review): `col_names` is not passed to read_csv, so
# the CSV presumably already has a header row with these names — confirm.
train_df = pd.read_csv('./train_FD001.csv')

In [None]:
# Per-unit final observed cycle, kept as a small lookup table.
max_cycle = train_df.groupby('unit_number')['time_cycles'].max().reset_index()
max_cycle.columns = ['unit_number', 'max_cycle']

# Attach the per-unit maximum by column assignment rather than by re-binding
# train_df to a merge result: re-running this cell then just overwrites the two
# columns, whereas a second merge would create max_cycle_x / max_cycle_y and
# make the 'max_cycle' lookup below fail.
train_df['max_cycle'] = train_df.groupby('unit_number')['time_cycles'].transform('max')

# RUL = cycles left until the unit's last recorded cycle (0 on that last row).
train_df['RUL'] = train_df['max_cycle'] - train_df['time_cycles']

In [None]:
# Test trajectories and the reference RUL values used for scoring.
# NOTE(review): later cells compare the prediction for each test unit's last row
# with rul_df['RUL'] by position, so rul_df rows are assumed to follow the order
# in which units appear in test_df — confirm.
test_df = pd.read_csv('./test_FD001.csv')
rul_df = pd.read_csv('./RUL_FD001.csv')

In [None]:
# Accumulates one metrics row per (experiment, model) across the notebook.
results = list()

# Sensor columns used as model inputs in every experiment below.
using_sensors = [
    f's_{sensor_id}'
    for sensor_id in (2, 3, 4, 7, 8, 9, 11, 12, 13, 14, 15, 17, 20, 21)
]

## Raw Data

In [None]:
# Raw sensor readings, WITHOUT RUL clipping: every training row is one cycle and
# each test unit is scored on its last row only.
raw_results = []

model_dict = {
    # XGBoost's logging parameter is `verbosity`; `verbose` is not an
    # XGBRegressor argument and is only forwarded as an unused booster setting.
    'XGBoost': XGBRegressor(verbosity=0, random_state=42),
    'LightGBM': LGBMRegressor(verbose=0, random_state=42),
    'CatBoost': CatBoostRegressor(verbose=0, random_state=42)
}

# The data is the same for every model, so prepare it once outside the loop.
train_x = train_df[using_sensors].copy()
train_y = train_df['RUL'].to_numpy()
# Last row of each test unit; assumes rows are in cycle order within a unit.
test_x = test_df.groupby('unit_number').tail(1)[using_sensors].copy()
true = rul_df['RUL'].to_numpy()

for model_name, model in model_dict.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred) ** 0.5

    print(f'Model: {model_name}, MAE: {mae}, RMSE: {rmse}')

    raw_results.append({
        'Data': 'Raw',
        'Model': model_name,
        'Scaler': 'None',
        'MAE': mae,
        'RMSE': rmse
    })

# NOTE: re-running this cell appends these rows to `results` a second time.
results.extend(raw_results)

Model: XGBoost, MAE: 25.59726333618164, RMSE: 37.31629155254277
Model: LightGBM, MAE: 23.612490436601, RMSE: 33.50621924900844
Model: CatBoost, MAE: 23.241181749871053, RMSE: 33.00056960061909


In [None]:
# Rank this experiment's models by MAE, lowest (best) first.
pd.DataFrame(raw_results).sort_values(by='MAE')

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
2,Raw,CatBoost,,23.241182,33.00057
1,Raw,LightGBM,,23.61249,33.506219
0,Raw,XGBoost,,25.597263,37.316292


In [None]:
# Raw sensor readings, WITH RUL clipping: targets above the cap are set to it.
raw_clipping_results = []

# Upper bound applied to the training targets and to the reference RUL.
rul_cap = 125

model_dict = {
    # XGBoost's logging parameter is `verbosity`; `verbose` is not an
    # XGBRegressor argument and is only forwarded as an unused booster setting.
    'XGBoost': XGBRegressor(verbosity=0, random_state=42),
    'LightGBM': LGBMRegressor(verbose=0, random_state=42),
    'CatBoost': CatBoostRegressor(verbose=0, random_state=42)
}

# The data is the same for every model, so prepare it once outside the loop.
train_x = train_df[using_sensors].copy()
train_y = train_df['RUL'].clip(upper=rul_cap).to_numpy()
# Last row of each test unit; assumes rows are in cycle order within a unit.
test_x = test_df.groupby('unit_number').tail(1)[using_sensors].copy()
# The reference RUL is clipped too, so these scores are measured against capped
# values and are not directly comparable with the unclipped experiment.
true = rul_df['RUL'].clip(upper=rul_cap).to_numpy()

for model_name, model in model_dict.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred) ** 0.5

    print(f'Model: {model_name}, MAE: {mae}, RMSE: {rmse}')

    raw_clipping_results.append({
        'Data': 'Raw (RUL Clipping)',
        'Model': model_name,
        'Scaler': 'None',
        'MAE': mae,
        'RMSE': rmse
    })

# NOTE: re-running this cell appends these rows to `results` a second time.
results.extend(raw_clipping_results)

Model: XGBoost, MAE: 12.655447959899902, RMSE: 17.6076262857311
Model: LightGBM, MAE: 11.754091711691029, RMSE: 16.769257011014545
Model: CatBoost, MAE: 11.679642024809432, RMSE: 17.03259696815047


In [None]:
# Rank this experiment's models by MAE, lowest (best) first.
pd.DataFrame(raw_clipping_results).sort_values(by='MAE')

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
2,Raw (RUL Clipping),CatBoost,,11.679642,17.032597
1,Raw (RUL Clipping),LightGBM,,11.754092,16.769257
0,Raw (RUL Clipping),XGBoost,,12.655448,17.607626


## Global Stat

In [None]:
def make_global_stat(df, is_train=True):
    """Collapse each unit's whole history into one row of summary features.

    For every unit in ``df`` and every sensor listed in the module-level
    ``using_sensors``, the row holds the mean, std, min, max, last value,
    median and linear trend (slope of a degree-1 fit against ``time_cycles``).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``unit_number``, ``time_cycles`` and the ``using_sensors``
        columns, plus ``RUL`` when ``is_train`` is true. Not modified.
    is_train : bool, default True
        When true, add an ``RUL`` column holding the unit's largest RUL value.

    Returns
    -------
    pandas.DataFrame
        One row per unit, in order of first appearance in ``df``.

    Notes
    -----
    NOTE(review): with ``is_train=True`` the features summarise the unit's
    entire history (including its final reading) while the label is the
    largest RUL in that history. Test rows built with ``is_train=False`` are
    scored elsewhere against the RUL remaining after the last reading, so the
    two may not describe the same quantity — confirm this is intended.
    """
    stat_rows = []

    # One groupby pass instead of re-filtering the whole frame for every unit;
    # sort=False keeps units in order of first appearance.
    for unit, unit_data in df.groupby('unit_number', sort=False):
        # Order by cycle so that "last" and the trend never depend on the
        # row order of the input frame.
        unit_data = unit_data.sort_values('time_cycles', kind='stable')
        cycles = unit_data['time_cycles']
        features = {'unit_number': unit}

        for sensor in using_sensors:
            values = unit_data[sensor]
            features[f'{sensor}_mean'] = values.mean()
            features[f'{sensor}_std'] = values.std()
            features[f'{sensor}_min'] = values.min()
            features[f'{sensor}_max'] = values.max()
            features[f'{sensor}_last'] = values.iloc[-1]
            features[f'{sensor}_median'] = values.median()
            # Linear trend: slope of the least-squares line through the series.
            features[f'{sensor}_trend'] = np.polyfit(cycles, values, 1)[0]

        if is_train:
            features['RUL'] = unit_data['RUL'].max()

        stat_rows.append(features)

    return pd.DataFrame(stat_rows)

In [None]:
# One summary row per unit for both splits; only the training frame is built
# with an RUL column.
stat_train_df = make_global_stat(train_df, is_train=True)
stat_test_df = make_global_stat(test_df, is_train=False)

In [None]:
# Whole-history summary features, WITHOUT RUL clipping.
# NOTE(review): the training rows summarise complete histories labelled with the
# unit's largest RUL, while test rows are scored against the RUL remaining after
# the last reading; the recorded errors for this experiment are very large,
# which is consistent with that mismatch — confirm before relying on it.
global_stat_results = []

model_dict = {
    # XGBoost's logging parameter is `verbosity`; `verbose` is not an
    # XGBRegressor argument and is only forwarded as an unused booster setting.
    'XGBoost': XGBRegressor(verbosity=0, random_state=42),
    'LightGBM': LGBMRegressor(verbose=0, random_state=42)
}

# The data is the same for every model, so prepare it once outside the loop.
train_x = stat_train_df.drop(columns=['unit_number', 'RUL'])
train_y = stat_train_df['RUL'].to_numpy()
# stat_test_df already holds exactly one row per unit, so no per-unit
# selection is needed before dropping the identifier.
test_x = stat_test_df.drop(columns='unit_number')
true = rul_df['RUL'].to_numpy()

for model_name, model in model_dict.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred) ** 0.5

    print(f'Model: {model_name}, MAE: {mae}, RMSE: {rmse}')

    global_stat_results.append({
        'Data': 'Global Stat',
        'Model': model_name,
        'Scaler': 'None',
        'MAE': mae,
        'RMSE': rmse
    })

# NOTE: re-running this cell appends these rows to `results` a second time.
results.extend(global_stat_results)

In [None]:
# Rank this experiment's models by MAE, lowest (best) first.
pd.DataFrame(global_stat_results).sort_values(by='MAE')

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
1,Global Stat,LightGBM,,164.515277,168.410275
0,Global Stat,XGBoost,,220.257751,223.818137


In [None]:
# Whole-history summary features, WITH RUL clipping.
# NOTE(review): the recorded run shows identical MAE/RMSE for both models, which
# is consistent with every per-unit target exceeding the cap and collapsing to
# one constant value — verify before interpreting these scores.
global_stat_rul_clipping_results = []

# Upper bound applied to the training targets and to the reference RUL.
rul_cap = 125

model_dict = {
    # XGBoost's logging parameter is `verbosity`; `verbose` is not an
    # XGBRegressor argument and is only forwarded as an unused booster setting.
    'XGBoost': XGBRegressor(verbosity=0, random_state=42),
    'LightGBM': LGBMRegressor(verbose=0, random_state=42),
}

# The data is the same for every model, so prepare it once outside the loop.
train_x = stat_train_df.drop(columns=['unit_number', 'RUL'])
train_y = stat_train_df['RUL'].clip(upper=rul_cap).to_numpy()
# stat_test_df already holds exactly one row per unit, so no per-unit
# selection is needed before dropping the identifier.
test_x = stat_test_df.drop(columns='unit_number')
# The reference RUL is clipped too, so these scores are measured against capped
# values and are not directly comparable with the unclipped experiment.
true = rul_df['RUL'].clip(upper=rul_cap).to_numpy()

for model_name, model in model_dict.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred) ** 0.5

    print(f'Model: {model_name}, MAE: {mae}, RMSE: {rmse}')

    global_stat_rul_clipping_results.append({
        'Data': 'Global Stat (RUL Clipping)',
        'Model': model_name,
        'Scaler': 'None',
        'MAE': mae,
        'RMSE': rmse
    })

# NOTE: re-running this cell appends these rows to `results` a second time.
results.extend(global_stat_rul_clipping_results)

In [None]:
# Rank this experiment's models by MAE, lowest (best) first.
pd.DataFrame(global_stat_rul_clipping_results).sort_values(by='MAE')

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Global Stat (RUL Clipping),XGBoost,,50.549999,64.507131
1,Global Stat (RUL Clipping),LightGBM,,50.55,64.507131


## **Cumulative Stat**

In [None]:
def make_cum_stat(df, is_train=True):
    """Build expanding-window summary features for every unit.

    For each unit, every prefix of its history containing at least two rows
    (rows 1..t, ordered by ``time_cycles``) becomes one output row. Each row
    holds, per sensor in the module-level ``using_sensors``, the prefix's
    mean, std, min, max, last value, median and linear trend (slope of a
    degree-1 fit against ``time_cycles``).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``unit_number``, ``time_cycles`` and the ``using_sensors``
        columns, plus ``RUL`` when ``is_train`` is true. Not modified.
    is_train : bool, default True
        When true, add an ``RUL`` column holding the RUL of the prefix's
        last row.

    Returns
    -------
    pandas.DataFrame
        One row per (unit, prefix), with ``unit_number`` and the prefix's
        last ``time_cycles``. Units with fewer than two rows contribute no
        rows at all.
    """
    stat_rows = []

    # One groupby pass instead of re-filtering the whole frame for every unit;
    # sort=False keeps units in order of first appearance.
    for unit, unit_data in df.groupby('unit_number', sort=False):
        # Order by cycle so that each prefix really is "the first t cycles".
        unit_data = unit_data.sort_values('time_cycles', kind='stable')
        cycles = unit_data['time_cycles']
        # Pull each sensor column once per unit rather than once per prefix.
        sensor_columns = {sensor: unit_data[sensor] for sensor in using_sensors}

        # Start at two observations: the trend needs at least two points.
        for t in range(2, len(unit_data) + 1):
            window_cycles = cycles.iloc[:t]
            features = {'unit_number': unit, 'time_cycles': window_cycles.iloc[-1]}

            for sensor, column in sensor_columns.items():
                window = column.iloc[:t]
                features[f'{sensor}_mean'] = window.mean()
                features[f'{sensor}_std'] = window.std()
                features[f'{sensor}_min'] = window.min()
                features[f'{sensor}_max'] = window.max()
                features[f'{sensor}_last'] = window.iloc[-1]
                features[f'{sensor}_median'] = window.median()
                features[f'{sensor}_trend'] = np.polyfit(window_cycles, window, 1)[0]

            if is_train:
                features['RUL'] = unit_data['RUL'].iloc[t - 1]

            stat_rows.append(features)

    return pd.DataFrame(stat_rows)

In [None]:
# Expanding-window features for both splits. Every prefix of every unit's
# history is summarised separately, so this is a comparatively slow step.
cum_stat_train_df = make_cum_stat(train_df, is_train=True)
cum_stat_test_df = make_cum_stat(test_df, is_train=False)

In [None]:
# Expanding-window summary features, WITHOUT RUL clipping.
cum_stat_results = []

model_dict = {
    # XGBoost's logging parameter is `verbosity`; `verbose` is not an
    # XGBRegressor argument and is only forwarded as an unused booster setting.
    'XGBoost': XGBRegressor(verbosity=0, random_state=42),
    'LightGBM': LGBMRegressor(verbose=0, random_state=42),
    'CatBoost': CatBoostRegressor(verbose=0, random_state=42)
}

# The data is the same for every model, so prepare it once outside the loop.
# `time_cycles` stays in the feature matrix for both splits.
train_x = cum_stat_train_df.drop(columns=['unit_number', 'RUL'])
train_y = cum_stat_train_df['RUL'].to_numpy()
# Last row per unit = the prefix covering the unit's full recorded history.
test_x = cum_stat_test_df.groupby('unit_number').tail(1).drop(columns='unit_number')
true = rul_df['RUL'].to_numpy()

for model_name, model in model_dict.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred) ** 0.5

    print(f'Model: {model_name}, MAE: {mae}, RMSE: {rmse}')

    cum_stat_results.append({
        'Data': 'Cumulative Stat',
        'Model': model_name,
        'Scaler': 'None',
        'MAE': mae,
        'RMSE': rmse
    })

# NOTE: re-running this cell appends these rows to `results` a second time.
results.extend(cum_stat_results)

Model: XGBoost, MAE: 15.298989295959473, RMSE: 24.809834603769655
Model: LightGBM, MAE: 13.203343278153557, RMSE: 21.33030836087428
Model: CatBoost, MAE: 13.588148593012095, RMSE: 20.67245184359415


In [None]:
# Rank this experiment's models by MAE, lowest (best) first.
pd.DataFrame(cum_stat_results).sort_values(by='MAE')

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
1,Cumulative Stat,LightGBM,,13.203343,21.330308
2,Cumulative Stat,CatBoost,,13.588149,20.672452
0,Cumulative Stat,XGBoost,,15.298989,24.809835


In [None]:
# Expanding-window summary features, WITH RUL clipping.
cum_stat_rul_clipping_results = []

# Upper bound applied to the training targets and to the reference RUL.
rul_cap = 125

model_dict = {
    # XGBoost's logging parameter is `verbosity`; `verbose` is not an
    # XGBRegressor argument and is only forwarded as an unused booster setting.
    'XGBoost': XGBRegressor(verbosity=0, random_state=42),
    'LightGBM': LGBMRegressor(verbose=0, random_state=42),
    'CatBoost': CatBoostRegressor(verbose=0, random_state=42)
}

# The data is the same for every model, so prepare it once outside the loop.
# `time_cycles` stays in the feature matrix for both splits.
train_x = cum_stat_train_df.drop(columns=['unit_number', 'RUL'])
train_y = cum_stat_train_df['RUL'].clip(upper=rul_cap).to_numpy()
# Last row per unit = the prefix covering the unit's full recorded history.
test_x = cum_stat_test_df.groupby('unit_number').tail(1).drop(columns='unit_number')
# The reference RUL is clipped too, so these scores are measured against capped
# values and are not directly comparable with the unclipped experiment.
true = rul_df['RUL'].clip(upper=rul_cap).to_numpy()

for model_name, model in model_dict.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred) ** 0.5

    print(f'Model: {model_name}, MAE: {mae}, RMSE: {rmse}')

    cum_stat_rul_clipping_results.append({
        'Data': 'Cumulative Stat (RUL Clipping)',
        'Model': model_name,
        'Scaler': 'None',
        'MAE': mae,
        'RMSE': rmse
    })

# NOTE: re-running this cell appends these rows to `results` a second time.
results.extend(cum_stat_rul_clipping_results)

Model: XGBoost, MAE: 8.567877769470215, RMSE: 12.321972606329426
Model: LightGBM, MAE: 7.670902609458941, RMSE: 11.071520398675641
Model: CatBoost, MAE: 7.629197432932172, RMSE: 10.553240477562863


In [None]:
# Rank this experiment's models by MAE, lowest (best) first.
pd.DataFrame(cum_stat_rul_clipping_results).sort_values(by='MAE')

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
2,Cumulative Stat (RUL Clipping),CatBoost,,7.629197,10.55324
1,Cumulative Stat (RUL Clipping),LightGBM,,7.670903,11.07152
0,Cumulative Stat (RUL Clipping),XGBoost,,8.567878,12.321973


# **Result**

In [None]:
# Combine the metrics rows of all experiments and persist them as CSV.
# NOTE(review): absolute, machine-specific path; to_csv does not create missing
# directories, so the Results folder must already exist.
result = pd.DataFrame(results)
result.to_csv('/content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Results/Boosting_Regression.csv', index=False)

In [None]:
# Overall leaderboard: re-bind `result` to the MAE-sorted frame and show the
# five rows with the lowest MAE across all experiments.
result = result.sort_values(by='MAE')
result.head()

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
15,Cumulative Stat (RUL Clipping),CatBoost,,7.629197,10.55324
14,Cumulative Stat (RUL Clipping),LightGBM,,7.670903,11.07152
13,Cumulative Stat (RUL Clipping),XGBoost,,8.567878,12.321973
5,Raw (RUL Clipping),CatBoost,,11.679642,17.032597
4,Raw (RUL Clipping),LightGBM,,11.754092,16.769257
