# **Import**

In [1]:
import warnings

# Install a blanket "ignore" filter: no category, message or module is given,
# so every warning raised later in this kernel session is suppressed.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error

# **Data Load**

In [3]:
# Work from the project's data folder so the relative CSV paths used below resolve.
# Written as an explicit line magic so the cell does not depend on IPython automagic.
%cd /content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Data

/content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Data


In [4]:
# Column-name groups: two index columns, three settings, sensors s_1 .. s_21.
index_names = ['unit_number', 'time_cycles']
setting_names = [f'setting_{k}' for k in range(1, 4)]
sensor_names = [f's_{k}' for k in range(1, 22)]

# Full ordered list: index columns, then settings, then sensors.
col_names = [*index_names, *setting_names, *sensor_names]

# Load the FD001 training table. No `names=` argument is passed, so the column
# names come from the file's own header row; `col_names` is not used here.
train_df = pd.read_csv('./train_FD001.csv')

In [5]:
# Largest recorded cycle of every engine, kept as its own two-column table
# ('unit_number', 'max_cycle') exactly as before.
max_cycle = (
    train_df.groupby('unit_number')['time_cycles']
    .max()
    .reset_index(name='max_cycle')
)

# Broadcast the per-engine maximum onto every row with `transform` rather than
# merging `max_cycle` back into `train_df`. Plain column assignment keeps this
# cell idempotent: re-running the merge version produces suffixed duplicate
# columns (max_cycle_x / max_cycle_y) and then fails on train_df['max_cycle'].
train_df['max_cycle'] = train_df.groupby('unit_number')['time_cycles'].transform('max')

# RUL = number of cycles between this row and the engine's largest recorded cycle.
train_df['RUL'] = train_df['max_cycle'] - train_df['time_cycles']

In [6]:
# Test trajectories plus the table of reference RUL values used for scoring.
# NOTE(review): the evaluation cells pair rul_df rows with test units purely by
# position (last row of each unit, in file order) — confirm both files share
# that ordering.
test_df = pd.read_csv('./test_FD001.csv')
rul_df = pd.read_csv('./RUL_FD001.csv')

In [7]:
# Running list of metric records; every experiment cell below extends it.
results = list()

# Sensor columns used as model inputs / summarised by the feature builders.
using_sensors = [
    f's_{sensor_id}'
    for sensor_id in (2, 3, 4, 7, 8, 9, 11, 12, 13, 14, 15, 17, 20, 21)
]

## Raw Data

In [8]:
# Experiment: raw sensor readings, RUL target NOT clipped.
raw_results = []

dt = DecisionTreeRegressor(random_state=42)

# Both estimators are seeded so repeated runs give the same fits.
model_dict = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'Bagging': BaggingRegressor(estimator=dt, random_state=42)
}

# The data does not depend on the model, so it is prepared once instead of
# being rebuilt (and copied) on every loop iteration.
train_x = train_df[using_sensors].copy()
train_y = train_df['RUL'].copy().values
# Only the last row of each test engine is scored.
test_x = test_df.groupby('unit_number').tail(1)[using_sensors].copy()
true = rul_df['RUL'].copy().values

for model_name, model in model_dict.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    mae = mean_absolute_error(true, pred)
    # Square root of the MSE gives the RMSE.
    rmse = mean_squared_error(true, pred) ** 0.5

    print(f'Model: {model_name}, MAE: {mae}, RMSE: {rmse}')

    raw_results.append({
        'Data': 'Raw',
        'Model': model_name,
        'Scaler': 'None',
        'MAE': mae,
        'RMSE': rmse,
    })

# Publish this experiment's records to the notebook-wide list.
results.extend(raw_results)

Model: RandomForest, MAE: 24.726300000000005, RMSE: 34.08721395186178
Model: Bagging, MAE: 27.27999999999999, RMSE: 37.17031073316445


In [9]:
# Scores of the raw-data experiment, lowest MAE first.
pd.DataFrame(raw_results).sort_values(by='MAE')

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Raw,RandomForest,,24.7263,34.087214
1,Raw,Bagging,,27.28,37.170311


In [10]:
# Experiment: raw sensor readings, RUL target clipped.
raw_clipping_results = []

# Upper bound applied to both the training target and the reference RUL.
rul_cap = 125

dt = DecisionTreeRegressor(random_state=42)

# Both estimators are seeded so repeated runs give the same fits.
model_dict = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'Bagging': BaggingRegressor(estimator=dt, random_state=42)
}

# Model-independent data, prepared once outside the loop.
train_x = train_df[using_sensors].copy()
# clip(upper=...) replaces the boolean-mask assignment; values above the cap
# become the cap, everything else is unchanged.
train_y = train_df['RUL'].clip(upper=rul_cap).values
# Only the last row of each test engine is scored.
test_x = test_df.groupby('unit_number').tail(1)[using_sensors].copy()
# The reference values are capped too, so the metrics below are measured
# against the clipped ground truth.
true = rul_df['RUL'].clip(upper=rul_cap).values

for model_name, model in model_dict.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    mae = mean_absolute_error(true, pred)
    # Square root of the MSE gives the RMSE.
    rmse = mean_squared_error(true, pred) ** 0.5

    print(f'Model: {model_name}, MAE: {mae}, RMSE: {rmse}')

    raw_clipping_results.append({
        'Data': 'Raw (RUL Clipping)',
        'Model': model_name,
        'Scaler': 'None',
        'MAE': mae,
        'RMSE': rmse,
    })

# Publish this experiment's records to the notebook-wide list.
results.extend(raw_clipping_results)

Model: RandomForest, MAE: 12.163000000000002, RMSE: 17.186845842096798
Model: Bagging, MAE: 12.531999999999998, RMSE: 18.60559055767916


In [11]:
# Scores of the raw-data experiment with clipped RUL, lowest MAE first.
pd.DataFrame(raw_clipping_results).sort_values(by='MAE')

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Raw (RUL Clipping),RandomForest,,12.163,17.186846
1,Raw (RUL Clipping),Bagging,,12.532,18.605591


## Global Stat

In [12]:
def make_global_stat(df, is_train=True, sensors=None):
    """Summarise each unit's whole recorded history as a single feature row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'unit_number', 'time_cycles' and every column listed in
        ``sensors``; also 'RUL' when ``is_train`` is true.
    is_train : bool, default True
        When true, a 'RUL' column holding the unit's maximum 'RUL' value is
        added to each row.
    sensors : sequence of str, optional
        Sensor columns to summarise. Defaults to the notebook-level
        ``using_sensors`` list, which keeps existing two-argument calls working.

    Returns
    -------
    pandas.DataFrame
        One row per unit, in order of first appearance, with 'unit_number',
        seven statistics per sensor (mean, std, min, max, last, median, trend)
        and, for training data, 'RUL'.
    """
    if sensors is None:
        sensors = using_sensors

    rows = []

    # A single groupby pass replaces one boolean mask over the whole frame per
    # unit; sort=False keeps units in order of first appearance.
    for unit, unit_data in df.groupby('unit_number', sort=False):
        cycles = unit_data['time_cycles']
        features = {'unit_number': unit}

        for sensor in sensors:
            readings = unit_data[sensor]
            features[f'{sensor}_mean'] = readings.mean()
            features[f'{sensor}_std'] = readings.std()
            features[f'{sensor}_min'] = readings.min()
            features[f'{sensor}_max'] = readings.max()
            features[f'{sensor}_last'] = readings.iloc[-1]
            features[f'{sensor}_median'] = readings.median()
            # Linear trend: slope of a first-degree least-squares fit of the
            # readings against the cycle counter.
            features[f'{sensor}_trend'] = np.polyfit(cycles, readings, 1)[0]

        if is_train:
            # NOTE(review): the label is the unit's MAXIMUM RUL, while the
            # features above summarise every row of the unit. Confirm this is
            # the intended target for features taken over the full history.
            features['RUL'] = unit_data['RUL'].max()

        rows.append(features)

    return pd.DataFrame(rows)

In [13]:
# One summary row per unit: the training frame carries the 'RUL' label,
# the test frame (is_train=False) does not.
stat_train_df = make_global_stat(train_df, is_train=True)
stat_test_df = make_global_stat(test_df, is_train=False)

In [14]:
# Experiment: per-unit global statistics, RUL target NOT clipped.
#
# NOTE(review): stat_train_df['RUL'] is each unit's maximum RUL while its
# features summarise the unit's whole recorded history; the test features are
# scored against rul_df. The training target therefore does not describe the
# point in life at which the features were taken, which is consistent with the
# very large errors this cell reports — confirm whether this set-up is intended.
global_stat_results = []

dt = DecisionTreeRegressor(random_state=42)

# Both estimators are seeded so repeated runs give the same fits.
model_dict = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'Bagging': BaggingRegressor(estimator=dt, random_state=42)
}

# Model-independent data, prepared once outside the loop.
train_x = stat_train_df.drop(columns=['unit_number', 'RUL']).copy()
train_y = stat_train_df['RUL'].copy().values
test_x = stat_test_df.groupby('unit_number').tail(1).drop(columns='unit_number').copy()
true = rul_df['RUL'].copy().values

for model_name, model in model_dict.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    mae = mean_absolute_error(true, pred)
    # Square root of the MSE gives the RMSE.
    rmse = mean_squared_error(true, pred) ** 0.5

    print(f'Model: {model_name}, MAE: {mae}, RMSE: {rmse}')

    global_stat_results.append({
        'Data': 'Global Stat',
        'Model': model_name,
        'Scaler': 'None',
        'MAE': mae,
        'RMSE': rmse,
    })

# Publish this experiment's records to the notebook-wide list.
results.extend(global_stat_results)

Model: RandomForest, MAE: 189.92850000000007, RMSE: 192.25897116909786
Model: Bagging, MAE: 181.90600000000003, RMSE: 184.07141331559336


In [15]:
# Scores of the global-statistics experiment, lowest MAE first.
pd.DataFrame(global_stat_results).sort_values(by='MAE')

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
1,Global Stat,Bagging,,181.906,184.071413
0,Global Stat,RandomForest,,189.9285,192.258971


In [16]:
# Experiment: per-unit global statistics, RUL target clipped.
#
# NOTE(review): stat_train_df['RUL'] holds each unit's maximum RUL. If every
# one of those values exceeds the cap, the clipped target is a constant and
# both models can only predict that constant — the identical scores printed
# for both models suggest this; verify with np.unique(train_y).
global_stat_rul_clipping_results = []

# Upper bound applied to both the training target and the reference RUL.
rul_cap = 125

dt = DecisionTreeRegressor(random_state=42)

# Both estimators are seeded so repeated runs give the same fits.
model_dict = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'Bagging': BaggingRegressor(estimator=dt, random_state=42)
}

# Model-independent data, prepared once outside the loop.
train_x = stat_train_df.drop(columns=['unit_number', 'RUL']).copy()
# clip(upper=...) replaces the boolean-mask assignment.
train_y = stat_train_df['RUL'].clip(upper=rul_cap).values
test_x = stat_test_df.groupby('unit_number').tail(1).drop(columns='unit_number').copy()
# The reference values are capped too, so the metrics below are measured
# against the clipped ground truth.
true = rul_df['RUL'].clip(upper=rul_cap).values

for model_name, model in model_dict.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    mae = mean_absolute_error(true, pred)
    # Square root of the MSE gives the RMSE.
    rmse = mean_squared_error(true, pred) ** 0.5

    print(f'Model: {model_name}, MAE: {mae}, RMSE: {rmse}')

    global_stat_rul_clipping_results.append({
        'Data': 'Global Stat (RUL Clipping)',
        'Model': model_name,
        'Scaler': 'None',
        'MAE': mae,
        'RMSE': rmse,
    })

# Publish this experiment's records to the notebook-wide list.
results.extend(global_stat_rul_clipping_results)

Model: RandomForest, MAE: 50.55, RMSE: 64.50713138870772
Model: Bagging, MAE: 50.55, RMSE: 64.50713138870772


In [17]:
# Scores of the global-statistics experiment with clipped RUL, lowest MAE first.
pd.DataFrame(global_stat_rul_clipping_results).sort_values(by='MAE')

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Global Stat (RUL Clipping),RandomForest,,50.55,64.507131
1,Global Stat (RUL Clipping),Bagging,,50.55,64.507131


## **Cumulative Stat**

In [18]:
def make_cum_stat(df, is_train=True, sensors=None):
    """Build one feature row for every growing prefix of each unit's history.

    For a unit with T rows, prefixes made of the first 2, 3, ..., T rows are
    each summarised; units with fewer than two rows contribute nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'unit_number', 'time_cycles' and every column listed in
        ``sensors``; also 'RUL' when ``is_train`` is true.
    is_train : bool, default True
        When true, each row gets the 'RUL' value of the prefix's last row.
    sensors : sequence of str, optional
        Sensor columns to summarise. Defaults to the notebook-level
        ``using_sensors`` list, which keeps existing two-argument calls working.

    Returns
    -------
    pandas.DataFrame
        Columns: 'unit_number', 'time_cycles' (taken from the prefix's last
        row), seven statistics per sensor (mean, std, min, max, last, median,
        trend) and, for training data, 'RUL'.

    Notes
    -----
    Every prefix is summarised from scratch, so the work grows quadratically
    with the number of rows a unit has.
    """
    if sensors is None:
        sensors = using_sensors

    rows = []

    # A single groupby pass replaces one boolean mask over the whole frame per
    # unit; sort=False keeps units in order of first appearance.
    for unit, unit_data in df.groupby('unit_number', sort=False):
        # Start at two rows: the fitted slope needs at least two points.
        for t in range(2, len(unit_data) + 1):
            sub_data = unit_data.iloc[:t]  # rows 1..t of this unit
            cycles = sub_data['time_cycles']
            features = {'unit_number': unit, 'time_cycles': cycles.iloc[-1]}

            for sensor in sensors:
                readings = sub_data[sensor]
                features[f'{sensor}_mean'] = readings.mean()
                features[f'{sensor}_std'] = readings.std()
                features[f'{sensor}_min'] = readings.min()
                features[f'{sensor}_max'] = readings.max()
                features[f'{sensor}_last'] = readings.iloc[-1]
                features[f'{sensor}_median'] = readings.median()
                # Slope of a first-degree least-squares fit of the readings
                # against the cycle counter.
                features[f'{sensor}_trend'] = np.polyfit(cycles, readings, 1)[0]

            if is_train:
                # Label = RUL at the prefix's last row, i.e. at the same point
                # in the history that the features end on.
                features['RUL'] = sub_data['RUL'].iloc[-1]

            rows.append(features)

    return pd.DataFrame(rows)

In [19]:
# One row per history prefix of every unit: the training frame carries the
# 'RUL' label, the test frame (is_train=False) does not.
cum_stat_train_df = make_cum_stat(train_df, is_train=True)
cum_stat_test_df = make_cum_stat(test_df, is_train=False)

In [20]:
# Experiment: cumulative (prefix) statistics, RUL target NOT clipped.
cum_stat_results = []

dt = DecisionTreeRegressor(random_state=42)

# Both estimators are seeded so repeated runs give the same fits.
model_dict = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'Bagging': BaggingRegressor(estimator=dt, random_state=42)
}

# Model-independent data, prepared once outside the loop.
# 'time_cycles' is not dropped, so it is used as a feature on both sides.
train_x = cum_stat_train_df.drop(columns=['unit_number', 'RUL']).copy()
train_y = cum_stat_train_df['RUL'].copy().values
# Score only the last (longest-prefix) row of each test unit.
test_x = cum_stat_test_df.groupby('unit_number').tail(1).drop(columns='unit_number').copy()
true = rul_df['RUL'].copy().values

for model_name, model in model_dict.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    mae = mean_absolute_error(true, pred)
    # Square root of the MSE gives the RMSE.
    rmse = mean_squared_error(true, pred) ** 0.5

    print(f'Model: {model_name}, MAE: {mae}, RMSE: {rmse}')

    cum_stat_results.append({
        'Data': 'Cumulative Stat',
        'Model': model_name,
        'Scaler': 'None',
        'MAE': mae,
        'RMSE': rmse,
    })

# Publish this experiment's records to the notebook-wide list.
results.extend(cum_stat_results)

Model: RandomForest, MAE: 13.151, RMSE: 21.337424258799377
Model: Bagging, MAE: 13.963999999999999, RMSE: 22.048387696155924


In [21]:
# Scores of the cumulative-statistics experiment, lowest MAE first.
pd.DataFrame(cum_stat_results).sort_values(by='MAE')

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Cumulative Stat,RandomForest,,13.151,21.337424
1,Cumulative Stat,Bagging,,13.964,22.048388


In [22]:
# Experiment: cumulative (prefix) statistics, RUL target clipped.
cum_stat_rul_clipping_results = []

# Upper bound applied to both the training target and the reference RUL.
rul_cap = 125

dt = DecisionTreeRegressor(random_state=42)

# Both estimators are seeded so repeated runs give the same fits.
model_dict = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'Bagging': BaggingRegressor(estimator=dt, random_state=42)
}

# Model-independent data, prepared once outside the loop.
# 'time_cycles' is not dropped, so it is used as a feature on both sides.
train_x = cum_stat_train_df.drop(columns=['unit_number', 'RUL']).copy()
# clip(upper=...) replaces the boolean-mask assignment.
train_y = cum_stat_train_df['RUL'].clip(upper=rul_cap).values
# Score only the last (longest-prefix) row of each test unit.
test_x = cum_stat_test_df.groupby('unit_number').tail(1).drop(columns='unit_number').copy()
# The reference values are capped too, so the metrics below are measured
# against the clipped ground truth.
true = rul_df['RUL'].clip(upper=rul_cap).values

for model_name, model in model_dict.items():
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    mae = mean_absolute_error(true, pred)
    # Square root of the MSE gives the RMSE.
    rmse = mean_squared_error(true, pred) ** 0.5

    print(f'Model: {model_name}, MAE: {mae}, RMSE: {rmse}')

    cum_stat_rul_clipping_results.append({
        'Data': 'Cumulative Stat (RUL Clipping)',
        'Model': model_name,
        'Scaler': 'None',
        'MAE': mae,
        'RMSE': rmse,
    })

# Publish this experiment's records to the notebook-wide list.
results.extend(cum_stat_rul_clipping_results)

Model: RandomForest, MAE: 7.5183, RMSE: 10.653501349321733
Model: Bagging, MAE: 8.29, RMSE: 11.502851820309605


In [23]:
# Scores of the cumulative-statistics experiment with clipped RUL, lowest MAE first.
pd.DataFrame(cum_stat_rul_clipping_results).sort_values(by='MAE')

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Cumulative Stat (RUL Clipping),RandomForest,,7.5183,10.653501
1,Cumulative Stat (RUL Clipping),Bagging,,8.29,11.502852


# **Result**

In [26]:
# Keep one record per (Data, Model, Scaler). Each experiment cell extends
# `results`, so re-running a cell would otherwise leave duplicate rows in the
# saved table; a later record replaces an earlier one with the same key while
# keeping the key's original position.
latest_by_experiment = {}
for record in results:
    experiment_key = (record['Data'], record['Model'], record['Scaler'])
    latest_by_experiment[experiment_key] = record

result = pd.DataFrame(list(latest_by_experiment.values()))

# Persist the combined table without the row index.
result.to_csv('/content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Results/Bagging_Regression.csv', index=False)

In [27]:
# Rank every experiment by MAE (ascending) and show the five best rows.
# NOTE(review): the "(RUL Clipping)" experiments are scored against a clipped
# reference RUL, the others against the unclipped one, so their MAE/RMSE values
# are not measured on the same target.
result = result.sort_values(by='MAE')
result.head()

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
10,Cumulative Stat (RUL Clipping),RandomForest,,7.5183,10.653501
11,Cumulative Stat (RUL Clipping),Bagging,,8.29,11.502852
2,Raw (RUL Clipping),RandomForest,,12.163,17.186846
3,Raw (RUL Clipping),Bagging,,12.532,18.605591
8,Cumulative Stat,RandomForest,,13.151,21.337424
