In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from typing import Tuple, List

In [2]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def extract_data_set_name(file_path: str) -> str:
    """
    Extracts the specific data set from a file path
    """
    return (file_path
                .split('/')[-1]
                .split('_')[-1]
                .split('.')[0])


def calc_rul(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates the remaining useful life (RUL) for each engine 
    """
    df = df.sort_values(['time', 'unit_number', 'data_set'], ascending=[False, True, True])
    df['RUL'] = df.groupby(['data_set', 'unit_number']).cumcount()
    return df


def gen_col_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generates the columns for training data set
    """
    cols = (['unit_number', 'time'] 
            + [f'op_setting_{i+1}' for i in range(3)]
            + [f'sensor_{i+1}' for i in range(df.shape[1] - 5)])
    df.columns = cols
    return df


def drop_empty_cols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drops empty columns from dataframe
    """
    na_by_col = df.isna().sum()
    row_cnt = df.shape[0]
    cols_to_keep = [True if na_cnt != row_cnt else False for na_cnt in na_by_col]
    return df.iloc[:, cols_to_keep]


def load_ts_data(file_paths: List[str]) -> pd.DataFrame:
    """
    Loads time series data
    Args:
        - file_paths: paths to the files to load
    Returns:
        - pandas dataframe containing data with expected column names & dropped empty columns
    """
    frames = []
    for file_path in file_paths:
        data_set = extract_data_set_name(file_path)
        df = pd.read_csv(file_path, sep=' ', header=None)
        df = (df.pipe(drop_empty_cols)
                .pipe(gen_col_names))
        df['data_set'] = data_set
        frames.append(df)
    return pd.concat(frames)


def load_rul_data(file_paths: List[str]) -> pd.DataFrame:
    """
    Loads RUL files (remaining useful life) for test data
    Args:
        - file_paths: paths to RUL files to load
    Returns:
        - np.ndarray with expected RULs of test data
    """
    frames = []
    for file_path in file_paths:
        data_set = extract_data_set_name(file_path)
        df = pd.read_csv(file_path, header=None)
        df.columns = ['RUL']
        df['data_set'] = data_set
        frames.append(df)
    return pd.concat(frames)


def preprocess(df: pd.DataFrame) -> np.ndarray:
    """
    Cleaning input data before training or inference;
    dropping columns that do not have much predictive power; 
    see analysis described below:
    https://towardsdatascience.com/predictive-maintenance-of-turbofan-engines-ec54a083127
    """
    SENSOR_COLS_TO_DROP = [f'sensor_{i}' for i in (1, 5, 6, 10, 16, 18, 19)]
    SETTING_COLS_TO_DROP = [f'op_setting_{i}' for i in range(1, 3+1)]
    COLS_TO_DROP = SENSOR_COLS_TO_DROP + SETTING_COLS_TO_DROP + ['unit_number', 'time', 'data_set']
    return df.drop(columns=COLS_TO_DROP)

In [4]:
# load training files
DRIVE_DATA_DIR = '/content/drive/MyDrive/fourth_brain/projects/data/CMAPSSData_small'
TRAIN_TS_FILES = [f'train_FD00{i}.txt' for i in range(1, 4+1)]
TEST_TS_FILES = [f'test_FD00{i}.txt' for i in range(1, 4+1)]
TEST_RUL_FILES = [f'RUL_FD00{i}.txt' for i in range(1, 4+1)]
train_df = load_ts_data([f'{DRIVE_DATA_DIR}/{TRAIN_TS_FILE}' for TRAIN_TS_FILE in TRAIN_TS_FILES])
test_df = load_ts_data([f'{DRIVE_DATA_DIR}/{TEST_TS_FILE}' for TEST_TS_FILE in TEST_TS_FILES])
y_test = load_rul_data([f'{DRIVE_DATA_DIR}/{TEST_RUL_FILE}' for TEST_RUL_FILE in TEST_RUL_FILES])
train_df = (train_df
                .pipe(calc_rul)
                .pipe(preprocess))
train_df.head()

Unnamed: 0,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_20,sensor_21,RUL
29223,549.99,1361.11,1146.56,142.52,2212.48,8378.69,42.56,133.47,2388.57,8140.53,9.0833,334,10.86,6.4898,0
29222,537.32,1268.93,1052.87,179.94,1915.86,8043.94,36.94,168.65,2028.76,7906.75,10.641,307,14.54,8.6194,1
29221,605.97,1508.05,1316.18,404.79,2319.55,8842.34,45.88,381.68,2388.86,8184.69,8.4239,375,29.11,17.5285,2
29220,605.6,1515.56,1329.75,405.08,2319.57,8837.55,46.08,381.37,2388.8,8187.4,8.4663,372,29.25,17.5117,3
29219,556.09,1377.6,1143.91,199.35,2223.8,8405.69,42.68,188.57,2388.87,8118.05,9.0881,338,15.15,9.0975,4


In [5]:
train_df.describe()

Unnamed: 0,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_20,sensor_21,RUL
count,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0,160359.0
mean,597.361022,1467.035653,1260.956434,359.729968,2273.829707,8677.553696,44.212049,338.789821,2349.645243,8088.950972,9.054747,360.698801,25.942709,15.5657,122.331338
std,42.478516,118.175261,136.300073,174.133835,142.426613,374.657454,3.426342,164.19348,111.167242,80.623257,0.751581,31.02143,11.691422,7.015067,83.538146
min,535.48,1242.67,1023.77,136.17,1914.72,7984.51,36.04,128.31,2027.57,7845.78,8.1563,302.0,10.16,6.0105,0.0
25%,549.96,1357.36,1126.83,175.71,2212.12,8334.77,42.01,164.79,2387.97,8070.53,8.43925,332.0,14.33,8.6013,56.0
50%,605.93,1492.81,1271.74,341.69,2319.37,8764.2,44.93,321.69,2388.07,8118.59,9.0301,367.0,24.92,14.9535,113.0
75%,642.34,1586.59,1402.2,553.29,2388.05,9055.85,47.34,521.34,2388.16,8139.41,9.3442,392.0,38.82,23.2946,172.0
max,645.11,1616.91,1441.49,570.81,2388.64,9244.59,48.53,537.49,2390.49,8293.72,11.0669,400.0,39.89,23.9505,542.0


In [6]:
test_df.shape

(104897, 27)

In [7]:
y_test.shape

(707, 2)

In [8]:
y_train = train_df['RUL'].values
X_train = train_df.drop(columns=['RUL'])
model = LinearRegression(normalize=True)

In [9]:
pipeline = make_pipeline(model)
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('linearregression',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=True))],
         verbose=False)

In [10]:
pipeline.named_steps

{'linearregression': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)}

In [11]:
test_df.head()

Unnamed: 0,unit_number,time,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,data_set
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,21.61,553.9,2388.04,9050.17,1.3,47.2,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735,FD001
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,21.61,554.85,2388.01,9054.42,1.3,47.5,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916,FD001
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,21.61,554.11,2388.05,9056.96,1.3,47.5,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166,FD001
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,21.61,554.07,2388.03,9045.29,1.3,47.28,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737,FD001
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,21.61,554.16,2388.01,9044.55,1.3,47.31,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413,FD001


In [12]:
# only want to make predictions based on the last timestamp (cycle) for each engine in test set
test_df = test_df.sort_values(['data_set', 'unit_number', 'time'], ascending=[True, True, True])
test_df = (test_df
            .groupby(['unit_number', 'data_set'])
            .last()
            .reset_index()
            .pipe(preprocess))
predictions = np.floor(pipeline.predict(test_df))

In [13]:
test_rmse = mean_squared_error(y_test['RUL'].values, predictions, squared=False)
print(f'Test Set RMSE: {test_rmse}')

Test Set RMSE: 79.24241716987838


In [14]:
y_test[:10]

Unnamed: 0,RUL,data_set
0,112,FD001
1,98,FD001
2,69,FD001
3,82,FD001
4,91,FD001
5,93,FD001
6,91,FD001
7,95,FD001
8,111,FD001
9,96,FD001


In [15]:
predictions[:10]

array([157., -15., 124.,  99., 147., 124., 120.,  84.,  80., 145.])

In [16]:
y_test[-10:]

Unnamed: 0,RUL,data_set
238,166,FD004
239,98,FD004
240,176,FD004
241,81,FD004
242,118,FD004
243,35,FD004
244,131,FD004
245,194,FD004
246,112,FD004
247,26,FD004


In [17]:
predictions[-10:]

array([ 17., 139.,  -7.,  68.,  17., 121., 114., 105., 144., 132.])

In [18]:
## TODO: 1) analysis of errors for each data set type