In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from typing import Tuple
import joblib

In [2]:
def calc_rul(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates the remaining useful life (RUL) for each engine 
    """
    df['RUL'] = (df.sort_values(['time'], ascending=False)
                    .groupby(['unit_number']).cumcount())
    return df

def gen_col_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generates the columns for training data set
    """
    cols = (['unit_number', 'time'] 
            + [f'op_setting_{i+1}' for i in range(3)]
            + [f'sensor_{i+1}' for i in range(df.shape[1] - 5)])
    df.columns = cols
    return df

def drop_empty_cols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drops empty columns from dataframe
    """
    na_by_col = df.isna().sum()
    row_cnt = df.shape[0]
    cols_to_keep = [True if na_cnt != row_cnt else False for na_cnt in na_by_col]
    return df.iloc[:, cols_to_keep]


def load_ts_data(file_path: str) -> pd.DataFrame:
    """
    Loads time series data
    Args:
        - drive_path (str): path to the file
    Returns:
        - pandas dataframe containing data with expected column names
    """
    df = pd.read_csv(file_path, sep=' ', header=None)
    df = (df.pipe(drop_empty_cols)
            .pipe(gen_col_names))
    return df

def preprocess(df: pd.DataFrame) -> np.ndarray:
    """
    Cleaning input data before training or inference;
    dropping columns that do not have much predictive power; 
    see analysis described below:
    https://towardsdatascience.com/predictive-maintenance-of-turbofan-engines-ec54a083127
    """
    SENSOR_COLS_TO_DROP = [f'sensor_{i}' for i in (1, 5, 6, 10, 16, 18, 19)]
    SETTING_COLS_TO_DROP = [f'op_setting_{i}' for i in range(1, 3+1)]
    COLS_TO_DROP = SENSOR_COLS_TO_DROP + SETTING_COLS_TO_DROP + ['unit_number', 'time']
    return df.drop(columns=COLS_TO_DROP)



In [3]:
# load first training file
DATA_DIR = '../data/raw/CMAPSSData_small'
TRAIN_TS_FILE = 'train_FD001.txt'
TEST_TS_FILE = 'test_FD001.txt'
TEST_RUL_FILE = 'RUL_FD001.txt'
train_df = load_ts_data(f'{DATA_DIR}/{TRAIN_TS_FILE}')
test_df = load_ts_data(f'{DATA_DIR}/{TEST_TS_FILE}')
y_test = pd.read_csv(f'{DATA_DIR}/{TEST_RUL_FILE}', header=None)
y_test.columns = ['rul']
train_df = (train_df
                .pipe(calc_rul)
                .pipe(preprocess))
train_df.head()

Unnamed: 0,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_20,sensor_21,RUL
0,641.82,1589.7,1400.6,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.419,191
1,642.15,1591.82,1403.14,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.0,23.4236,190
2,642.35,1587.99,1404.2,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,390,38.95,23.3442,189
3,642.35,1582.79,1401.87,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,392,38.88,23.3739,188
4,642.37,1582.85,1406.22,554.0,2388.06,9055.15,47.28,522.19,2388.04,8133.8,8.4294,393,38.9,23.4044,187


In [4]:
test_df.head()

Unnamed: 0,unit_number,time,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,...,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,...,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413


In [5]:
test_df.columns.tolist()

['unit_number',
 'time',
 'op_setting_1',
 'op_setting_2',
 'op_setting_3',
 'sensor_1',
 'sensor_2',
 'sensor_3',
 'sensor_4',
 'sensor_5',
 'sensor_6',
 'sensor_7',
 'sensor_8',
 'sensor_9',
 'sensor_10',
 'sensor_11',
 'sensor_12',
 'sensor_13',
 'sensor_14',
 'sensor_15',
 'sensor_16',
 'sensor_17',
 'sensor_18',
 'sensor_19',
 'sensor_20',
 'sensor_21']

In [6]:
train_df.describe()

Unnamed: 0,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_20,sensor_21,RUL
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,642.680934,1590.523119,1408.933782,553.367711,2388.096652,9065.242941,47.541168,521.41347,2388.096152,8143.752722,8.442146,393.210654,38.816271,23.289705,107.807862
std,0.500053,6.13115,9.000605,0.885092,0.070985,22.08288,0.267087,0.737553,0.071919,19.076176,0.037505,1.548763,0.180746,0.108251,68.88099
min,641.21,1571.04,1382.25,549.85,2387.9,9021.73,46.85,518.69,2387.88,8099.94,8.3249,388.0,38.14,22.8942,0.0
25%,642.325,1586.26,1402.36,552.81,2388.05,9053.1,47.35,520.96,2388.04,8133.245,8.4149,392.0,38.7,23.2218,51.0
50%,642.64,1590.1,1408.04,553.44,2388.09,9060.66,47.51,521.48,2388.09,8140.54,8.4389,393.0,38.83,23.2979,103.0
75%,643.0,1594.38,1414.555,554.01,2388.14,9069.42,47.7,521.95,2388.14,8148.31,8.4656,394.0,38.95,23.3668,155.0
max,644.53,1616.91,1441.49,556.06,2388.56,9244.59,48.53,523.38,2388.56,8293.72,8.5848,400.0,39.43,23.6184,361.0


In [7]:
y_train = train_df['RUL'].values
X_train = train_df.drop(columns=['RUL'])
model = LinearRegression(normalize=True)

In [8]:
pipeline = make_pipeline(model)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('linearregression', LinearRegression(normalize=True))])

In [9]:
pipeline.named_steps

{'linearregression': LinearRegression(normalize=True)}

In [10]:
test_df.head()

Unnamed: 0,unit_number,time,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,...,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,...,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413


In [11]:
# only want to make predictions based on the last timestamp (cycle) for each engine in test set
test = (test_df
            .groupby('unit_number')
            .last()
            .reset_index()
            .pipe(preprocess))
predictions = np.floor(pipeline.predict(test))

In [12]:
test_mse = mean_squared_error(y_test, predictions, squared=False)
print(f'Test Set MSE: {test_mse}')

Test Set MSE: 31.621511665320494


In [13]:
y_test[:10]

Unnamed: 0,rul
0,112
1,98
2,69
3,82
4,91
5,93
6,91
7,95
8,111
9,96


In [14]:
predictions[:10]

array([158., 122.,  84.,  97., 112., 122., 130.,  95., 113., 117.])

In [15]:
y_test[-10:]

Unnamed: 0,rul
90,38
91,20
92,85
93,55
94,128
95,137
96,82
97,59
98,117
99,20


In [16]:
predictions[-10:]

array([ 52.,  34.,  63.,  70., 145., 166., 108.,  98., 161.,   2.])

In [17]:
# store serialized pipeline
pipeline_name = 'baseline_regression_pipeline.gz'
file_path = f'../models/{pipeline_name}'
joblib.dump(pipeline, file_path)

['../models/baseline_regression_pipeline.gz']

In [18]:
# store test & RUL data for streamlit simulation
test_data = 'test_FD001.feather'
rul_data = 'rul_FD001.feather'
test_data_file_path = f'../data/processed/{test_data}'
rul_data_file_path = f'../data/processed/{rul_data}'
test_df.to_feather(test_data_file_path)
y_test.to_feather(rul_data_file_path)

In [19]:
## TODO: 1) make a method for generating validation data from training data (different procedure for time series data; has to be sequential slices for validation)
## TODO: 2) hyperparameter tuning for different models