In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from typing import Tuple
import joblib

In [2]:
def calc_rul(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates the remaining useful life (RUL) for each engine.

    Within every ``unit_number`` the rows are ordered by ``time`` from latest
    to earliest and numbered 0, 1, 2, ...; that number is the row's RUL, so
    the last recorded cycle of each unit gets 0.

    Args:
        - df (pd.DataFrame): frame with ``unit_number`` and ``time`` columns
    Returns:
        - a new dataframe equal to ``df`` plus an integer ``RUL`` column;
          the frame passed in is left unmodified
    """
    # Work on a positional 0..n-1 index so the result can be written back in
    # the caller's row order even when df.index contains duplicate labels.
    keys = df[['unit_number', 'time']].reset_index(drop=True)
    rul = (keys.sort_values('time', ascending=False, kind='stable')
               .groupby('unit_number')
               .cumcount()
               .sort_index())
    # assign() works on a copy, so the caller's frame is never mutated.
    return df.assign(RUL=rul.to_numpy())

def gen_col_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generates the column names for the training data set.

    Columns are labelled purely by position: ``unit_number``, ``time``,
    ``op_setting_1..3`` and then ``sensor_1..sensor_k`` for every remaining
    column.

    Args:
        - df (pd.DataFrame): frame with at least 5 columns
    Returns:
        - a new dataframe carrying the generated column names; the frame
          passed in keeps its original labels
    Raises:
        - ValueError: if df has fewer columns than the 5 fixed ones
    """
    fixed_cols = (['unit_number', 'time']
                  + [f'op_setting_{i}' for i in range(1, 4)])
    sensor_cnt = df.shape[1] - len(fixed_cols)
    if sensor_cnt < 0:
        raise ValueError(
            f'expected at least {len(fixed_cols)} columns, got {df.shape[1]}')
    cols = fixed_cols + [f'sensor_{i}' for i in range(1, sensor_cnt + 1)]
    # set_axis returns a relabelled frame instead of renaming df in place.
    return df.set_axis(cols, axis=1)

def drop_empty_cols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes every column that holds nothing but missing values
    """
    # A column survives when at least one of its cells is non-null. Selection
    # is positional, so the column labels themselves play no role.
    has_data = df.notna().any(axis=0).to_numpy(dtype=bool)
    return df.iloc[:, has_data]


def load_ts_data(file_path: str) -> pd.DataFrame:
    """
    Reads a headerless, single-space-delimited time series file
    Args:
        - file_path (str): path to the file
    Returns:
        - pandas dataframe with all-empty columns removed and the
          expected column names applied
    """
    raw = pd.read_csv(file_path, sep=' ', header=None)
    # Empty columns must go first because the names are assigned by position.
    non_empty = drop_empty_cols(raw)
    return gen_col_names(non_empty)

def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleaning input data before training or inference;
    dropping columns that do not have much predictive power; 
    see analysis described below:
    https://towardsdatascience.com/predictive-maintenance-of-turbofan-engines-ec54a083127

    Args:
        - df (pd.DataFrame): frame containing every column that is dropped
          below (identifiers, op settings and the listed sensors)
    Returns:
        - a new dataframe without those columns; any other column present
          in df is passed through unchanged
    Raises:
        - KeyError: if one of the columns to drop is missing from df
    """
    SENSOR_COLS_TO_DROP = [f'sensor_{i}' for i in (1, 5, 6, 10, 16, 18, 19)]
    SETTING_COLS_TO_DROP = [f'op_setting_{i}' for i in range(1, 3+1)]
    # unit_number and time are also removed along with the settings/sensors.
    COLS_TO_DROP = SENSOR_COLS_TO_DROP + SETTING_COLS_TO_DROP + ['unit_number', 'time']
    return df.drop(columns=COLS_TO_DROP)



In [3]:
# load first training file
DATA_DIR = '../data/raw/CMAPSSData_small'
TRAIN_TS_FILE = 'train_FD001.txt'
# Upper bound for the RUL target: every larger value is replaced by the cap.
# NOTE(review): 125 is presumably a modelling choice -- confirm where the
# value comes from before changing it.
MAX_RUL = 125
# Single chain: load -> label with RUL -> cap, without overwriting a column
# on an already-bound frame.
train_df = (load_ts_data(f'{DATA_DIR}/{TRAIN_TS_FILE}')
                .pipe(calc_rul)
                .assign(RUL=lambda frame: frame['RUL'].clip(upper=MAX_RUL)))
train_df.head()

Unnamed: 0,unit_number,time,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,125
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,125
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,125
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,125
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,125


In [4]:
# RUL values for the rows of engine unit 1, selected in a single .loc lookup
train_df.loc[train_df['unit_number'] == 1, 'RUL']

0      125
1      125
2      125
3      125
4      125
      ... 
187      4
188      3
189      2
190      1
191      0
Name: RUL, Length: 192, dtype: int64

In [5]:
# Persist the labelled training frame in feather format for the streamlit simulation
train_data = 'train_FD001.feather'
train_data_file_path = '../data/processed/' + train_data
train_df.to_feather(train_data_file_path)