# Exploratory Data Analysis

## Import packages and libraries

In [1]:
# Standard library: used only to adjust the module search path below.
import sys

# Insert the relative folder '../src' into the module search path at index 1,
# i.e. directly after the first entry (not at the very beginning of the list).
# This statement must stay above the project-local imports further down.
# NOTE(review): presumably utils / exploration / processing / visualization
# live in '../src', and the relative path assumes the notebook is started from
# its own folder — confirm.
sys.path.insert(1, '../src')

# data manipulation package
import pandas as pd

# pandas display limits: 30 columns and 50 rows
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 50)

# library containing utility functions (load_data is called from it below)
import utils
# library containing data exploration functions (missing_values, duplicate_rows)
import exploration
# library containing data processing functions (extract_rul)
import processing
# library containing data visualization functions
# (not referenced in the part of the notebook visible here)
import visualization

## Load data and display metadata

In [2]:
# Column names handed to the loader, in file order (26 entries, matching the
# 26 columns reported for the train/test frames in the output below).
dataschema = [
    "Engine_no", "Time",
    "Altitude", "Mach", "TRA",
    "T2", "T24", "T30", "T50",
    "P2", "P15", "P30",
    "Nf", "Nc", "epr", "Ps30", "phi",
    "NRf", "NRc", "BPR", "farB", "htBleed",
    "Nf_dmd", "PCNfR_dmd",
    "W31", "W32",
]

# Load all datasets; the result is indexed by dataset name further down
# (e.g. dataframes['train_FD001']).
# NOTE(review): the meaning of the first two arguments is defined by
# utils.load_data and is not visible in this notebook — confirm there.
dataframes = utils.load_data("RUL", ["RUL"], dataschema)

  0%|          | 0/14 [00:00<?, ?it/s]


------------------------------
----------RUL_FD001-----------
------------------------------
Number of rows : 100 
Number of columns : 1 
------------------------------
RUL    int64

------------------------------
----------RUL_FD002-----------
------------------------------
Number of rows : 259 
Number of columns : 1 
------------------------------
RUL    int64

------------------------------
----------RUL_FD003-----------
------------------------------
Number of rows : 100 
Number of columns : 1 
------------------------------
RUL    int64

------------------------------
----------RUL_FD004-----------
------------------------------
Number of rows : 248 
Number of columns : 1 
------------------------------
RUL    int64

------------------------------
----------test_FD001----------
------------------------------
Number of rows : 13096 
Number of columns : 26 
------------------------------
Engine_no      int64
Time           int64
Altitude     float64
Mach         float64
TRA        

In [3]:
# Show five randomly drawn rows of the RUL_FD001 frame. random_state pins the
# draw so the same rows are displayed on every Restart & Run All.
dataframes['RUL_FD001'].sample(5, random_state=42)

Unnamed: 0,RUL
86,116
95,137
65,14
35,19
64,128


In [4]:
# Show five randomly drawn rows of the test_FD001 frame. random_state pins the
# draw so the same rows are displayed on every Restart & Run All.
dataframes['test_FD001'].sample(5, random_state=42)

Unnamed: 0,Engine_no,Time,Altitude,Mach,TRA,T2,T24,T30,T50,P2,P15,P30,Nf,Nc,epr,Ps30,phi,NRf,NRc,BPR,farB,htBleed,Nf_dmd,PCNfR_dmd,W31,W32
8553,65,57,0.0011,-0.0003,100.0,518.67,642.12,1582.86,1394.85,14.62,21.6,553.99,2387.93,9059.26,1.3,47.19,522.45,2387.98,8146.49,8.3924,0.03,393,2388,100.0,39.02,23.4389
6926,54,91,0.0001,-0.0,100.0,518.67,642.13,1581.71,1395.38,14.62,21.61,554.61,2388.04,9053.71,1.3,47.24,521.97,2388.04,8139.87,8.4045,0.03,391,2388,100.0,38.99,23.39
12512,96,64,0.0059,-0.0005,100.0,518.67,641.71,1578.14,1402.39,14.62,21.61,554.16,2388.03,9068.63,1.3,47.18,522.48,2387.99,8143.85,8.364,0.03,391,2388,100.0,38.92,23.42
5618,45,106,-0.0025,0.0002,100.0,518.67,642.6,1592.62,1408.46,14.62,21.61,553.48,2388.14,9061.8,1.3,47.42,521.82,2388.14,8145.48,8.4682,0.03,394,2388,100.0,38.87,23.3062
12818,99,18,-0.0011,0.0003,100.0,518.67,641.79,1590.18,1388.79,14.62,21.61,554.65,2387.98,9064.55,1.3,47.12,522.42,2388.01,8138.37,8.3759,0.03,392,2388,100.0,39.21,23.4465


In [5]:
# Show five randomly drawn rows of the train_FD001 frame. random_state pins the
# draw so the same rows are displayed on every Restart & Run All.
dataframes['train_FD001'].sample(5, random_state=42)

Unnamed: 0,Engine_no,Time,Altitude,Mach,TRA,T2,T24,T30,T50,P2,P15,P30,Nf,Nc,epr,Ps30,phi,NRf,NRc,BPR,farB,htBleed,Nf_dmd,PCNfR_dmd,W31,W32
16667,83,76,-0.0001,0.0002,100.0,518.67,641.82,1579.26,1405.68,14.62,21.61,554.82,2388.04,9055.91,1.3,47.19,522.2,2388.0,8142.69,8.412,0.03,391,2388,100.0,38.86,23.4263
13684,69,54,0.001,-0.0002,100.0,518.67,642.22,1592.65,1399.91,14.62,21.61,554.06,2388.06,9059.05,1.3,47.44,522.23,2388.1,8139.88,8.4224,0.03,394,2388,100.0,38.94,23.4411
15997,80,45,0.0029,-0.0005,100.0,518.67,642.68,1591.31,1411.79,14.62,21.61,552.86,2388.08,9047.17,1.3,47.37,521.34,2388.07,8131.3,8.4331,0.03,394,2388,100.0,38.7,23.2211
13442,68,11,0.003,0.0001,100.0,518.67,642.89,1588.11,1412.51,14.62,21.61,554.08,2388.06,9059.76,1.3,47.33,521.98,2388.07,8145.18,8.4532,0.03,392,2388,100.0,38.88,23.3126
6504,33,93,-0.0056,-0.0001,100.0,518.67,642.64,1586.28,1409.2,14.62,21.61,554.66,2388.0,9059.15,1.3,47.26,522.36,2388.08,8148.71,8.4256,0.03,392,2388,100.0,38.97,23.5023


The data loaded successfully and does not contain any string variables; the columns are numeric.

## Check for missing values

In [6]:
# Run the project's missing-value check over every loaded frame and keep the
# returned result for later use.
# NOTE(review): the structure of the returned object is defined in
# exploration.missing_values and is not visible here — confirm before relying on it.
missing_data = exploration.missing_values(dataframes)

  0%|          | 0/12 [00:00<?, ?it/s]

--------------------------------------------------
--------------------------------------------------

RUL_FD001 has 1 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


--------------------------------------------------
--------------------------------------------------

RUL_FD002 has 1 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


--------------------------------------------------
--------------------------------------------------

RUL_FD003 has 1 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


--------------------------------------------------
--------------------------------------------------

RUL_FD004 has 1 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


--------------------------------------------------
--------------------------------------------------

test_FD001 has 26 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


--------------------------------------------------
--------------------------------------------------

test_FD002 has 26 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


--------------------------------------------------
--------------------------------------------------

test_FD003 has 26 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


--------------------------------------------------
--------------------------------------------------

test_FD004 has 26 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


--------------------------------------------------
--------------------------------------------------

train_FD001 has 26 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


--------------------------------------------------
--------------------------------------------------

train_FD002 has 26 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


--------------------------------------------------
--------------------------------------------------

train_FD003 has 26 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


--------------------------------------------------
--------------------------------------------------

train_FD004 has 26 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


There is no missing data.

## Check for duplicates

In [7]:
# Run the project's duplicate-row check over every loaded frame and keep the
# returned result for later use.
# NOTE(review): the structure of the returned object is defined in
# exploration.duplicate_rows and is not visible here — confirm before relying on it.
duplicated_data = exploration.duplicate_rows(dataframes)

  0%|          | 0/12 [00:00<?, ?it/s]

--------------------------------------------------
--------------------------------------------------

RUL_FD001 has 29 duplicated rows.

--------------------------------------------------
--------------------------------------------------

RUL_FD002 has 127 duplicated rows.

--------------------------------------------------
--------------------------------------------------

RUL_FD003 has 31 duplicated rows.

--------------------------------------------------
--------------------------------------------------

RUL_FD004 has 117 duplicated rows.

--------------------------------------------------
--------------------------------------------------

test_FD001 has 0 duplicated rows.

--------------------------------------------------
--------------------------------------------------

test_FD002 has 0 duplicated rows.

--------------------------------------------------
--------------------------------------------------

test_FD003 has 0 duplicated rows.

--------------------------------

The RUL dataframes contain duplicated rows. This is expected: they hold a single RUL column and no Engine_no column, so engines that share the same RUL value show up as duplicate rows.

## Remaining Useful Lifetime extraction

Since this is a predictive maintenance project based on RUL prediction, we are going to generate a new column, "Remaining Useful Lifetime (RUL)".
<br>
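* Train dataset: engine operation-to-failure data
> Invert the time column, i.e. subtract the current time unit from the engine's maximum time unit: RUL = max(Time) − Time
* Test dataset: engine operation data; the RUL at the last time unit of each engine is found in the RUL dataset
> Subtract the current time unit from the engine's maximum time unit, then add the RUL of the last time unit: RUL = max(Time) − Time + RUL at the last time unit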

In [8]:
# Hand the loaded frames to the project's RUL-extraction step and rebind
# `dataframes` to its result; the cells below read an 'RUL' column from the
# train and test frames.
# NOTE(review): the name is rebound to the function's own output, so running
# this cell twice feeds already-processed frames back in — confirm that
# processing.extract_rul tolerates that, or re-run from the load cell.
dataframes = processing.extract_rul(dataframes)

  0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
# First five rows of the training frame, restricted to the engine id, the time
# unit and the RUL column.
dataframes['train_FD001'][['Engine_no', 'Time', 'RUL']].head()

Unnamed: 0,Engine_no,Time,RUL
0,1,1,191
1,1,2,190
2,1,3,189
3,1,4,188
4,1,5,187


In [10]:
# First five rows of the test frame, restricted to the engine id, the time
# unit and the RUL column.
dataframes['test_FD001'][['Engine_no', 'Time', 'RUL']].head()

Unnamed: 0,Engine_no,Time,RUL
0,1,1,142
1,1,2,141
2,1,3,140
3,1,4,139
4,1,5,138
