## Data Preprocessing

In [1]:
# Data = CMAPSSData
# Source = https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/
# Dataset = Turbofan Engine Degradation Simulation Data Set

In [2]:
# Data to be used - Train data for case 3 (train_case3.csv / FD003)

In [3]:
import pandas as pd
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt

### Notebook options

In [4]:
# Column width: None means "do not truncate cell contents".
# The old sentinel -1 was deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)
# Default (width, height) applied to every plot drawn in this notebook.
figure_size = [21, 10]
plt.rcParams['figure.figsize'] = figure_size

### Data Directories

In [5]:
# Data locations, given as relative paths.
# Interim and output artifacts currently share one directory.
INT_DIR = OUTPUT_DIR = '../data/interim/'
INPUT_DIR = '../data/raw/CMAPSSData/'

### Read the input data

In [8]:
# Load the case-3 training table; the path is OUTPUT_DIR followed by the file name.
input_file = ''.join([OUTPUT_DIR, 'train_case3.csv'])
df_data = pd.read_csv(filepath_or_buffer=input_file)

### Data Imputation

In [9]:
# Not required as there are no missing values
# Remove the additional columns

### Data Standardization

In [10]:
# Preview the first five rows of the loaded frame.
df_data.head(n=5)

Unnamed: 0,unit,time_cycles,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,...,meas12,meas13,meas14,meas15,meas16,meas17,meas18,meas19,meas20,meas21
0,1,1,-0.0005,0.0004,100.0,518.67,642.36,1583.23,1396.84,14.62,...,522.31,2388.01,8145.32,8.4246,0.03,391,2388,100.0,39.11,23.3537
1,1,2,0.0008,-0.0003,100.0,518.67,642.5,1584.69,1396.89,14.62,...,522.42,2388.03,8152.85,8.4403,0.03,392,2388,100.0,38.99,23.4491
2,1,3,-0.0014,-0.0002,100.0,518.67,642.18,1582.35,1405.61,14.62,...,522.03,2388.0,8150.17,8.3901,0.03,391,2388,100.0,38.85,23.3669
3,1,4,-0.002,0.0001,100.0,518.67,642.92,1585.61,1392.27,14.62,...,522.49,2388.08,8146.56,8.3878,0.03,392,2388,100.0,38.96,23.2951
4,1,5,0.0016,0.0,100.0,518.67,641.68,1588.63,1397.65,14.62,...,522.58,2388.03,8147.8,8.3869,0.03,392,2388,100.0,39.14,23.4583


In [12]:
from sklearn.preprocessing import StandardScaler

# Identifier columns that must not be scaled or fed to PCA.
exclude_cols = ['unit', 'time_cycles']
# Build the lookup set once instead of once per column.
excluded = set(exclude_cols)

df_pca = df_data.copy()
features = [x for x in df_pca.columns.values if x not in excluded]
print(features)

# Separating out the features
x = df_pca.loc[:, features].values

# Standardizing the features.
# The fitted scaler is kept in `scaler` so that the same per-column
# parameters can be re-applied to other data later.
scaler = StandardScaler()
x = scaler.fit_transform(x)

['setting1', 'setting2', 'setting3', 'meas01', 'meas02', 'meas03', 'meas04', 'meas05', 'meas06', 'meas07', 'meas08', 'meas09', 'meas10', 'meas11', 'meas12', 'meas13', 'meas14', 'meas15', 'meas16', 'meas17', 'meas18', 'meas19', 'meas20', 'meas21']


In [13]:
# Note: df_pca is an unmodified copy of df_data; the standardized values
# live in the array `x`, not in this frame.
df_pca.head(n=5)

Unnamed: 0,unit,time_cycles,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,...,meas12,meas13,meas14,meas15,meas16,meas17,meas18,meas19,meas20,meas21
0,1,1,-0.0005,0.0004,100.0,518.67,642.36,1583.23,1396.84,14.62,...,522.31,2388.01,8145.32,8.4246,0.03,391,2388,100.0,39.11,23.3537
1,1,2,0.0008,-0.0003,100.0,518.67,642.5,1584.69,1396.89,14.62,...,522.42,2388.03,8152.85,8.4403,0.03,392,2388,100.0,38.99,23.4491
2,1,3,-0.0014,-0.0002,100.0,518.67,642.18,1582.35,1405.61,14.62,...,522.03,2388.0,8150.17,8.3901,0.03,391,2388,100.0,38.85,23.3669
3,1,4,-0.002,0.0001,100.0,518.67,642.92,1585.61,1392.27,14.62,...,522.49,2388.08,8146.56,8.3878,0.03,392,2388,100.0,38.96,23.2951
4,1,5,0.0016,0.0,100.0,518.67,641.68,1588.63,1397.65,14.62,...,522.58,2388.03,8147.8,8.3869,0.03,392,2388,100.0,39.14,23.4583


### Writing standardized data to file

In [14]:
# Name the artifact after the case that was actually loaded (train_case3.csv),
# so it cannot clash with the standardized data of another case.
std_file = OUTPUT_DIR + 'train_case3_std.pkl'
with open(std_file, 'wb') as f:
    # x is the standardized feature array produced above.
    pkl.dump(x, f, protocol=pkl.HIGHEST_PROTOCOL)

### Reading back from pickle

In [15]:
# Reload the array from the pickle written by this notebook.
# (pickle.load must only be used on files from a trusted source.)
with open(std_file, mode='rb') as std_handle:
    x = pkl.load(std_handle)

### PCA Projection

In [16]:
from sklearn.decomposition import PCA

# Keep every component: one per column of the standardized feature matrix.
# Taking the count from `x` itself (instead of "all columns minus 2") keeps
# it correct if the list of excluded columns ever changes.
num_components = x.shape[1]
pca = PCA(n_components=num_components)
principalComponents = pca.fit_transform(x)

In [17]:
# Print a markdown-style table of the variance explained by each component.
# The running total uses its own name so the builtin `sum` is not shadowed
# for the rest of the kernel session.
cumulative_ratio = 0
print('| Component | Individual Contribution | Cumulative |')
print('| -- | -- | -- |')
for idx, val in enumerate(pca.explained_variance_ratio_, start=1):
    cumulative_ratio += val
    # Ratios are fractions; multiply by 100 to show percentages.
    print('| Component {} | {:10.2f} | {:10.2f} |'.format(
        idx, val * 100, cumulative_ratio * 100))

| Component | Individual Contribution | Cumulative |
| -- | -- | -- |
| Component 1 |      41.38 |      41.38 |
| Component 2 |      26.37 |      67.75 |
| Component 3 |       5.58 |      73.33 |
| Component 4 |       5.56 |      78.89 |
| Component 5 |       5.13 |      84.02 |
| Component 6 |       4.34 |      88.36 |
| Component 7 |       2.40 |      90.76 |
| Component 8 |       1.88 |      92.64 |
| Component 9 |       1.70 |      94.34 |
| Component 10 |       1.39 |      95.73 |
| Component 11 |       1.12 |      96.84 |
| Component 12 |       0.89 |      97.73 |
| Component 13 |       0.77 |      98.50 |
| Component 14 |       0.70 |      99.20 |
| Component 15 |       0.34 |      99.53 |
| Component 16 |       0.21 |      99.74 |
| Component 17 |       0.20 |      99.94 |
| Component 18 |       0.06 |     100.00 |
| Component 19 |       0.00 |     100.00 |
| Component 20 |       0.00 |     100.00 |
| Component 21 |       0.00 |     100.00 |
| Component 22 |       0.00 |     10

### Create a principal dataframe with all the components

In [18]:
# Column labels pc1 .. pcN, one per principal component.
pca_col_list = ['pc' + str(number) for number in range(1, num_components + 1)]
# Wrap the projected data in a frame with those labels.
principalDf = pd.DataFrame(principalComponents, columns=pca_col_list)

### Find the influence of features in each component

In [19]:
# One row per principal component, one column per original feature:
# each cell is that feature's entry in pca.components_ for that component.
df_infl = pd.DataFrame(
    data=pca.components_,
    index=pca_col_list,
    columns=features,
)

In [20]:
# Display the component-by-feature table built from pca.components_.
df_infl

Unnamed: 0,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,meas06,meas07,...,meas12,meas13,meas14,meas15,meas16,meas17,meas18,meas19,meas20,meas21
pc1,-0.002085383,0.00143224,1.110223e-16,2.775558e-17,0.2007096,0.2458672,0.2294103,-0.0,0.02339025,0.3012531,...,0.3057597,0.3117744,0.2816727,-0.1731526,-0.0,0.2529474,-0.0,-0.0,0.1655271,0.1677629
pc2,0.001412365,0.001976861,-3.6429190000000005e-17,1.110223e-16,0.290541,0.2293067,0.2954265,0.0,0.2699801,-0.2435912,...,-0.2359161,0.08950946,-0.02027811,0.3779747,0.0,0.2403067,0.0,0.0,-0.3693626,-0.3692227
pc3,-0.05949306,0.9681333,9.714451000000001e-17,3.2959750000000005e-17,0.006163016,0.001570609,0.006276187,-2.220446e-16,-0.0964371,-0.02387594,...,-0.02202118,-0.0812203,0.1371296,0.006295483,-0.0,-0.001377298,-0.0,-0.0,-0.0272735,-0.02858743
pc4,0.995062,0.07655044,-1.387779e-17,0.0,-0.001163485,-0.002007247,0.002409231,-8.239937000000001e-18,0.01262933,0.008636138,...,0.007751478,0.02055021,-0.03787903,-0.0002079442,0.0,0.005965849,0.0,0.0,0.005653472,0.009881174
pc5,0.07704745,-0.2376649,2.775558e-16,5.5511150000000004e-17,0.0378333,0.007270527,0.04848516,2.775558e-17,-0.4128787,-0.08471069,...,-0.08110976,-0.3387421,0.5112637,0.02082428,-0.0,0.01204499,-0.0,-0.0,-0.1262942,-0.1163628
pc6,-0.01659121,0.01538238,-2.220446e-16,-2.151057e-16,0.2159252,0.1190866,0.2074491,5.5511150000000004e-17,-0.703873,0.01001363,...,0.01374884,-0.06301656,-0.3943318,-0.1004301,-0.0,0.1055387,-0.0,-0.0,-0.06266141,-0.06131332
pc7,0.001705671,0.009055215,-1.94289e-16,-9.714451000000001e-17,0.3061818,-0.1072757,0.1753775,-1.665335e-16,0.4532112,0.1100922,...,0.1130216,-0.445535,-0.1046986,0.01522925,-0.0,-0.06575331,-0.0,-0.0,0.1040932,0.1142095
pc8,0.003995168,-0.0001422305,2.775558e-16,-1.387779e-17,-0.3523474,0.874402,-0.01221429,1.110223e-16,0.07712295,0.005146526,...,0.006064479,-0.1321626,-0.03798255,-0.001965378,-0.0,-0.2505473,-0.0,-0.0,0.02274226,0.01941419
pc9,0.004791765,-0.004893685,-1.110223e-16,0.0,0.7097728,0.1600013,-0.1543561,1.110223e-16,-0.05522115,-0.001623332,...,-0.004815912,0.08661797,0.04663376,-0.04905289,0.0,-0.5937005,0.0,0.0,0.04697681,0.0553527
pc10,-0.001646016,-0.001884247,0.0,-6.245005e-17,0.2805007,0.2478956,-0.3252382,-1.526557e-16,0.02911479,0.01941625,...,0.01971,-0.1771982,-0.04886112,-0.09793808,0.0,0.6452661,0.0,0.0,0.2157077,0.1689953


### Write the principal components to file

In [21]:
principalDf_file = INT_DIR + 'fd003_components_' + str(num_components) + '.csv'
# to_csv is given the path and opens/closes the file itself, so no separate
# open() is needed; the previous binary handle was never written to.
principalDf.to_csv(principalDf_file, header=True, index=False)

print('Writing to ', principalDf_file)

Writing to  ../data/interim/fd003_components_24.csv
