## Data Processing

In [1]:
# Data = CMAPSSData
# Source = https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/
# Dataset = Turbofan Engine Degradation Simulation Data Set

In [2]:
# Data to be used - Train and Test data for subset FD001

In [19]:
import pandas as pd
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt

### Notebook options

In [22]:
# Show full cell contents without truncation. None is the supported "no limit"
# value; the former sentinel -1 was deprecated in pandas 1.0 and is rejected by
# newer pandas releases.
pd.set_option('display.max_colwidth', None) # Column width
# Default size applied to every figure created in this notebook.
plt.rcParams.update({'figure.figsize': [15, 10]})

### Data Directories

In [23]:
# Relative locations of the data folders. Both keep their trailing slash because
# file names are appended with plain string concatenation when the input is read.
# OUTPUT_DIR is presumably where processed data is written - confirm against later cells.
INPUT_DIR, OUTPUT_DIR = '../data/raw/CMAPSSData/', '../data/interim/'

### Read the input data

In [33]:
# Names assigned positionally to the columns of the header-less, space-separated
# file: 'unit', 'time_cycles', 'setting1'..'setting3' and 'meas01'..'meas26'.
# NOTE(review): the recorded df.head() output shows meas22..meas26 empty in the
# first rows - they look like padding columns rather than real measurements;
# confirm before relying on them.
col_headers = (
    ['unit', 'time_cycles']
    + ['setting{}'.format(i) for i in range(1, 4)]
    + ['meas{:02d}'.format(i) for i in range(1, 27)]
)

# Training file for subset FD001, located under INPUT_DIR.
input_file = INPUT_DIR + 'train_FD001.txt'

df = pd.read_csv(input_file, sep=' ', header=None, names=col_headers)

In [34]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,unit,time_cycles,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,...,meas17,meas18,meas19,meas20,meas21,meas22,meas23,meas24,meas25,meas26
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,392,2388,100.0,39.06,23.419,,,,,
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,392,2388,100.0,39.0,23.4236,,,,,
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,390,2388,100.0,38.95,23.3442,,,,,
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,392,2388,100.0,38.88,23.3739,,,,,
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,393,2388,100.0,38.9,23.4044,,,,,


In [39]:
# Summary statistics for the identifier and settings columns.
cols = ['unit', 'time_cycles'] + ['setting{}'.format(i) for i in range(1, 4)]
df.loc[:, cols].describe()

Unnamed: 0,unit,time_cycles,setting1,setting2,setting3
count,20631.0,20631.0,20631.0,20631.0,20631.0
mean,51.506568,108.807862,-9e-06,2e-06,100.0
std,29.227633,68.88099,0.002187,0.000293,0.0
min,1.0,1.0,-0.0087,-0.0006,100.0
25%,26.0,52.0,-0.0015,-0.0002,100.0
50%,52.0,104.0,0.0,0.0,100.0
75%,77.0,156.0,0.0015,0.0003,100.0
max,100.0,362.0,0.0087,0.0006,100.0


In [36]:
# Summary statistics for measurement columns meas01..meas07.
cols = ['meas{:02d}'.format(i) for i in range(1, 8)]
df.loc[:, cols].describe()

Unnamed: 0,meas01,meas02,meas03,meas04,meas05,meas06,meas07
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,518.67,642.680934,1590.523119,1408.933782,14.62,21.609803,553.367711
std,6.537152e-11,0.500053,6.13115,9.000605,3.3947e-12,0.001389,0.885092
min,518.67,641.21,1571.04,1382.25,14.62,21.6,549.85
25%,518.67,642.325,1586.26,1402.36,14.62,21.61,552.81
50%,518.67,642.64,1590.1,1408.04,14.62,21.61,553.44
75%,518.67,643.0,1594.38,1414.555,14.62,21.61,554.01
max,518.67,644.53,1616.91,1441.49,14.62,21.61,556.06


In [37]:
# Summary statistics for measurement columns meas08..meas14.
cols = ['meas{:02d}'.format(i) for i in range(8, 15)]
df.loc[:, cols].describe()

Unnamed: 0,meas08,meas09,meas10,meas11,meas12,meas13,meas14
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,2388.096652,9065.242941,1.3,47.541168,521.41347,2388.096152,8143.752722
std,0.070985,22.08288,4.660829e-13,0.267087,0.737553,0.071919,19.076176
min,2387.9,9021.73,1.3,46.85,518.69,2387.88,8099.94
25%,2388.05,9053.1,1.3,47.35,520.96,2388.04,8133.245
50%,2388.09,9060.66,1.3,47.51,521.48,2388.09,8140.54
75%,2388.14,9069.42,1.3,47.7,521.95,2388.14,8148.31
max,2388.56,9244.59,1.3,48.53,523.38,2388.56,8293.72


In [38]:
# Summary statistics for measurement columns meas15..meas21.
cols = ['meas{:02d}'.format(i) for i in range(15, 22)]
df.loc[:, cols].describe()

Unnamed: 0,meas15,meas16,meas17,meas18,meas19,meas20,meas21
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,8.442146,0.03,393.210654,2388.0,100.0,38.816271,23.289705
std,0.037505,1.556432e-14,1.548763,0.0,0.0,0.180746,0.108251
min,8.3249,0.03,388.0,2388.0,100.0,38.14,22.8942
25%,8.4149,0.03,392.0,2388.0,100.0,38.7,23.2218
50%,8.4389,0.03,393.0,2388.0,100.0,38.83,23.2979
75%,8.4656,0.03,394.0,2388.0,100.0,38.95,23.3668
max,8.5848,0.03,400.0,2388.0,100.0,39.43,23.6184


In [42]:
# Count of non-null 'time_cycles' entries for each value of 'unit'.
df.groupby('unit')[['time_cycles']].count()

Unnamed: 0_level_0,time_cycles
unit,Unnamed: 1_level_1
1,192
2,287
3,179
4,189
5,269
6,188
7,259
8,150
9,201
10,222
