# **Import**

In [1]:
import warnings

warnings.filterwarnings('ignore')

In [26]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error

# **Data Load**

In [3]:
cd /content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Data

/content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Data


In [35]:
index_names = ['unit_number', 'time_cycles']
setting_names = ['setting_1', 'setting_2', 'setting_3']
sensor_names = [f's_{i + 1}' for i in range(21)]
col_names = index_names + setting_names + sensor_names

train_df = pd.read_csv('./train_FD001.csv')

In [36]:
max_cycle = train_df.groupby('unit_number')['time_cycles'].max().reset_index()
max_cycle.columns = ['unit_number', 'max_cycle']

train_df = train_df.merge(max_cycle, on='unit_number', how='left')

train_df['RUL'] = train_df['max_cycle'] - train_df['time_cycles']

In [37]:
test_df = pd.read_csv('./test_FD001.csv')
rul_df = pd.read_csv('./RUL_FD001.csv')

In [140]:
results = []

using_sensors = [
    's_2', 's_3', 's_4', 's_7', 's_8', 's_9', 's_11', 's_12',
    's_13', 's_14', 's_15', 's_17', 's_20', 's_21'
]

## Raw Data

In [141]:
# RUL Clipping X
raw_results = []

scaler_dict = {
    'None': None,
    'MinMaxScaler': MinMaxScaler(),
    'StandardScaler': StandardScaler(),
    'RobustScaler': RobustScaler()
}

for scaler_name, scaler in scaler_dict.items():
    train_x = train_df[using_sensors]
    train_y = train_df['RUL'].values.ravel()
    test_x = test_df.groupby('unit_number').tail(1)[using_sensors]

    if scaler:
        scaler.fit(train_x)

        train_x = scaler.transform(train_x)
        test_x = scaler.transform(test_x)

    lr = LinearRegression()
    lr_model = lr.fit(train_x, train_y)

    pred = lr_model.predict(test_x)
    true = rul_df['RUL'].values.ravel()

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred)**0.5

    print(f'Scaler: {scaler_name}, MAE: {mae}, RMSE: {rmse}')

    raw_results.append({
    'Data': 'Raw',
    'Model': 'Linear',
    'Scaler': scaler_name,
    'MAE': mae,
    'RMSE': rmse
    })
results.extend(raw_results)

Scaler: None, MAE: 16.608642027839814, RMSE: 20.82473081333295
Scaler: MinMaxScaler, MAE: 16.608642027843313, RMSE: 20.82473081333581
Scaler: StandardScaler, MAE: 16.608642027842972, RMSE: 20.824730813335364
Scaler: RobustScaler, MAE: 16.60864202784297, RMSE: 20.82473081333536


In [142]:
pd.DataFrame(raw_results)

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Raw,Linear,,16.608642,20.824731
1,Raw,Linear,MinMaxScaler,16.608642,20.824731
2,Raw,Linear,StandardScaler,16.608642,20.824731
3,Raw,Linear,RobustScaler,16.608642,20.824731


In [143]:
# RUL Clipping O
raw_clipping_results = []

scaler_dict = {
    'None': None,
    'MinMaxScaler': MinMaxScaler(),
    'StandardScaler': StandardScaler(),
    'RobustScaler': RobustScaler()
}

for scaler_name, scaler in scaler_dict.items():
    train_x = train_df[using_sensors]
    train_y = train_df['RUL']
    test_x = test_df.groupby('unit_number').tail(1)[using_sensors]

    train_y.loc[train_y > 125] = 125
    train_y = train_y.values.ravel()

    if scaler:
        scaler.fit(train_x)

        train_x = scaler.transform(train_x)
        test_x = scaler.transform(test_x)

    lr = LinearRegression()
    lr_model = lr.fit(train_x, train_y)

    pred = lr_model.predict(test_x)
    true = rul_df['RUL']
    true.loc[true > 125] = 125
    true = true.values.ravel()

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred)**0.5

    print(f'Scaler: {scaler_name}, MAE: {mae}, RMSE: {rmse}')

    raw_clipping_results.append({
    'Data': 'Raw (RUL Clipping)',
    'Model': 'Linear',
    'Scaler': scaler_name,
    'MAE': mae,
    'RMSE': rmse
    })
results.extend(raw_clipping_results)

Scaler: None, MAE: 16.608642027839814, RMSE: 20.82473081333295
Scaler: MinMaxScaler, MAE: 16.608642027843313, RMSE: 20.82473081333581
Scaler: StandardScaler, MAE: 16.608642027842972, RMSE: 20.824730813335364
Scaler: RobustScaler, MAE: 16.60864202784297, RMSE: 20.82473081333536


In [144]:
pd.DataFrame(raw_clipping_results)

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Raw (RUL Clipping),Linear,,16.608642,20.824731
1,Raw (RUL Clipping),Linear,MinMaxScaler,16.608642,20.824731
2,Raw (RUL Clipping),Linear,StandardScaler,16.608642,20.824731
3,Raw (RUL Clipping),Linear,RobustScaler,16.608642,20.824731


## Global Stat

In [126]:
def make_global_stat(df, is_train=True):
    tmp_df = df.copy()
    stat_df = []

    for unit in tmp_df['unit_number'].unique():
        unit_data = tmp_df[tmp_df['unit_number'] == unit]
        features = {'unit_number': unit}

        for sensor in using_sensors:
            features[f'{sensor}_mean'] = unit_data[sensor].mean()
            features[f'{sensor}_std'] = unit_data[sensor].std()
            features[f'{sensor}_min'] = unit_data[sensor].min()
            features[f'{sensor}_max'] = unit_data[sensor].max()
            features[f'{sensor}_last'] = unit_data[sensor].iloc[-1]
            features[f'{sensor}_median'] = unit_data[sensor].median()
            features[f'{sensor}_trend'] = np.polyfit(unit_data['time_cycles'], unit_data[sensor], 1)[0]  # 선형 추세

        if is_train:
            features['RUL'] = unit_data['RUL'].max()

        stat_df.append(features)

    return pd.DataFrame(stat_df)

In [127]:
stat_train_df = make_global_stat(train_df, is_train=True)
stat_test_df = make_global_stat(test_df, is_train=False)

In [145]:
# RUL Clipping X
global_stat_results = []

scaler_dict = {
    'None': None,
    'MinMaxScaler': MinMaxScaler(),
    'StandardScaler': StandardScaler(),
    'RobustScaler': RobustScaler()
}

for scaler_name, scaler in scaler_dict.items():
    train_x = stat_train_df.drop(columns=['unit_number', 'RUL'])
    train_y = stat_train_df['RUL'].values.ravel()
    test_x = stat_test_df.groupby('unit_number').tail(1).drop(columns='unit_number')

    if scaler:
        scaler.fit(train_x)

        train_x = scaler.transform(train_x)
        test_x = scaler.transform(test_x)

    lr = LinearRegression()
    lr_model = lr.fit(train_x, train_y)

    pred = lr_model.predict(test_x)
    true = rul_df['RUL'].values.ravel()

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred)**0.5

    print(f'Scaler: {scaler_name}, MAE: {mae}, RMSE: {rmse}')

    global_stat_results.append({
    'Data': 'Global Stat',
    'Model': 'Linear',
    'Scaler': scaler_name,
    'MAE': mae,
    'RMSE': rmse
    })
results.extend(global_stat_results)

Scaler: None, MAE: 50.55, RMSE: 64.50713138870772
Scaler: MinMaxScaler, MAE: 50.55, RMSE: 64.50713138870772
Scaler: StandardScaler, MAE: 50.55, RMSE: 64.50713138870772
Scaler: RobustScaler, MAE: 50.55, RMSE: 64.50713138870772


In [146]:
pd.DataFrame(global_stat_results)

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Global Stat,Linear,,50.55,64.507131
1,Global Stat,Linear,MinMaxScaler,50.55,64.507131
2,Global Stat,Linear,StandardScaler,50.55,64.507131
3,Global Stat,Linear,RobustScaler,50.55,64.507131


In [147]:
# RUL Clipping O
global_stat_rul_clipping_results = []

scaler_dict = {
    'None': None,
    'MinMaxScaler': MinMaxScaler(),
    'StandardScaler': StandardScaler(),
    'RobustScaler': RobustScaler()
}

for scaler_name, scaler in scaler_dict.items():
    train_x = stat_train_df.drop(columns=['unit_number', 'RUL'])
    train_y = stat_train_df['RUL']
    test_x = stat_test_df.groupby('unit_number').tail(1).drop(columns='unit_number')

    train_y.loc[train_y > 125] = 125
    train_y = train_y.values.ravel()

    if scaler:
        scaler.fit(train_x)

        train_x = scaler.transform(train_x)
        test_x = scaler.transform(test_x)

    lr = LinearRegression()
    lr_model = lr.fit(train_x, train_y)

    pred = lr_model.predict(test_x)
    true = rul_df['RUL']
    true.loc[true > 125] = 125
    true = true.values.ravel()

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred)**0.5

    print(f'Scaler: {scaler_name}, MAE: {mae}, RMSE: {rmse}')

    global_stat_rul_clipping_results.append({
    'Data': 'Global Stat (RUL Clipping)',
    'Model': 'Linear',
    'Scaler': scaler_name,
    'MAE': mae,
    'RMSE': rmse
    })
results.extend(global_stat_rul_clipping_results)

Scaler: None, MAE: 50.55, RMSE: 64.50713138870772
Scaler: MinMaxScaler, MAE: 50.55, RMSE: 64.50713138870772
Scaler: StandardScaler, MAE: 50.55, RMSE: 64.50713138870772
Scaler: RobustScaler, MAE: 50.55, RMSE: 64.50713138870772


In [148]:
pd.DataFrame(global_stat_rul_clipping_results)

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Global Stat (RUL Clipping),Linear,,50.55,64.507131
1,Global Stat (RUL Clipping),Linear,MinMaxScaler,50.55,64.507131
2,Global Stat (RUL Clipping),Linear,StandardScaler,50.55,64.507131
3,Global Stat (RUL Clipping),Linear,RobustScaler,50.55,64.507131


## **Cumulative Stat**

In [104]:
def make_cum_stat(df, is_train=True):
    tmp_df = df.copy()
    stat_df = []

    for unit in tmp_df['unit_number'].unique():
        unit_data = tmp_df[tmp_df['unit_number'] == unit]
        for t in range(2, len(unit_data) + 1):  # 최소 2개 시점부터
            sub_data = unit_data.iloc[:t]  # 1~t
            features = {'unit_number': unit, 'time_cycles': sub_data['time_cycles'].iloc[-1]}

            for sensor in using_sensors:
                features[f'{sensor}_mean'] = sub_data[sensor].mean()
                features[f'{sensor}_std'] = sub_data[sensor].std()
                features[f'{sensor}_min'] = sub_data[sensor].min()
                features[f'{sensor}_max'] = sub_data[sensor].max()
                features[f'{sensor}_last'] = sub_data[sensor].iloc[-1]
                features[f'{sensor}_median'] = sub_data[sensor].median()
                # trend는 최소 2개 이상 시점에서만 계산 가능
                features[f'{sensor}_trend'] = np.polyfit(sub_data['time_cycles'], sub_data[sensor], 1)[0]

            if is_train:
                features['RUL'] = sub_data['RUL'].iloc[-1]

            stat_df.append(features)

    return pd.DataFrame(stat_df)

In [105]:
cum_stat_train_df = make_cum_stat(train_df, is_train=True)
cum_stat_test_df = make_cum_stat(test_df, is_train=False)

In [149]:
# RUL Clipping X
cum_stat_results = []

scaler_dict = {
    'None': None,
    'MinMaxScaler': MinMaxScaler(),
    'StandardScaler': StandardScaler(),
    'RobustScaler': RobustScaler()
}

for scaler_name, scaler in scaler_dict.items():
    train_x = cum_stat_train_df.drop(columns=['unit_number', 'RUL'])
    train_y = cum_stat_train_df['RUL'].values.ravel()
    test_x = cum_stat_test_df.groupby('unit_number').tail(1).drop(columns='unit_number')

    if scaler:
        scaler.fit(train_x)

        train_x = scaler.transform(train_x)
        test_x = scaler.transform(test_x)

    lr = LinearRegression()
    lr_model = lr.fit(train_x, train_y)

    pred = lr_model.predict(test_x)
    true = rul_df['RUL'].values.ravel()

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred)**0.5

    print(f'Scaler: {scaler_name}, MAE: {mae}, RMSE: {rmse}')

    cum_stat_results.append({
    'Data': 'Cumulative Stat',
    'Model': 'Linear',
    'Scaler': scaler_name,
    'MAE': mae,
    'RMSE': rmse
    })
results.extend(cum_stat_results)

Scaler: None, MAE: 15.669487401435617, RMSE: 18.858082944183987
Scaler: MinMaxScaler, MAE: 15.669487401422423, RMSE: 18.858082944167297
Scaler: StandardScaler, MAE: 15.669487401422128, RMSE: 18.85808294416693
Scaler: RobustScaler, MAE: 15.669487401422124, RMSE: 18.858082944166927


In [150]:
pd.DataFrame(cum_stat_results)

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Cumulative Stat,Linear,,15.669487,18.858083
1,Cumulative Stat,Linear,MinMaxScaler,15.669487,18.858083
2,Cumulative Stat,Linear,StandardScaler,15.669487,18.858083
3,Cumulative Stat,Linear,RobustScaler,15.669487,18.858083


In [151]:
# RUL Clipping O
cum_stat_rul_clipping_results = []

scaler_dict = {
    'None': None,
    'MinMaxScaler': MinMaxScaler(),
    'StandardScaler': StandardScaler(),
    'RobustScaler': RobustScaler()
}

for scaler_name, scaler in scaler_dict.items():
    train_x = cum_stat_train_df.drop(columns=['unit_number', 'RUL'])
    train_y = cum_stat_train_df['RUL']
    test_x = cum_stat_test_df.groupby('unit_number').tail(1).drop(columns='unit_number')

    train_y.loc[train_y > 125] = 125
    train_y = train_y.values.ravel()

    if scaler:
        scaler.fit(train_x)

        train_x = scaler.transform(train_x)
        test_x = scaler.transform(test_x)

    lr = LinearRegression()
    lr_model = lr.fit(train_x, train_y)

    pred = lr_model.predict(test_x)
    true = rul_df['RUL']
    true.loc[true > 125] = 125
    true = true.values.ravel()

    mae = mean_absolute_error(true, pred)
    rmse = mean_squared_error(true, pred)**0.5

    print(f'Scaler: {scaler_name}, MAE: {mae}, RMSE: {rmse}')

    cum_stat_rul_clipping_results.append({
    'Data': 'Cumulative Stat (RUL Clipping)',
    'Model': 'Linear',
    'Scaler': scaler_name,
    'MAE': mae,
    'RMSE': rmse
    })
results.extend(cum_stat_rul_clipping_results)

Scaler: None, MAE: 15.669487401435617, RMSE: 18.858082944183987
Scaler: MinMaxScaler, MAE: 15.669487401422423, RMSE: 18.858082944167297
Scaler: StandardScaler, MAE: 15.669487401422128, RMSE: 18.85808294416693
Scaler: RobustScaler, MAE: 15.669487401422124, RMSE: 18.858082944166927


In [152]:
pd.DataFrame(cum_stat_rul_clipping_results)

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
0,Cumulative Stat (RUL Clipping),Linear,,15.669487,18.858083
1,Cumulative Stat (RUL Clipping),Linear,MinMaxScaler,15.669487,18.858083
2,Cumulative Stat (RUL Clipping),Linear,StandardScaler,15.669487,18.858083
3,Cumulative Stat (RUL Clipping),Linear,RobustScaler,15.669487,18.858083


# **Result**

In [153]:
result = pd.DataFrame(results)
result.to_csv('/content/drive/MyDrive/[Projects]/Kaggle/NASA 터보팬 제트 엔진의 잔존 수명 예측 및 예지보전 방안/Results/Linear_Regression.csv')

In [154]:
result = result.sort_values(by='MAE')
result.head()

Unnamed: 0,Data,Model,Scaler,MAE,RMSE
23,Cumulative Stat (RUL Clipping),Linear,RobustScaler,15.669487,18.858083
19,Cumulative Stat,Linear,RobustScaler,15.669487,18.858083
18,Cumulative Stat,Linear,StandardScaler,15.669487,18.858083
22,Cumulative Stat (RUL Clipping),Linear,StandardScaler,15.669487,18.858083
17,Cumulative Stat,Linear,MinMaxScaler,15.669487,18.858083
