## Data Preprocessing

In [11]:
# Data = CMAPSSData
# Source = https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/
# Dataset = Turbofan Engine Degradation Simulation Data Set

In [12]:
# Data to be used - Train and Test data for 1

In [13]:
import pandas as pd
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt

### Notebook options

In [14]:
# Show full cell contents instead of truncating long strings.
# None means "no limit"; the old -1 sentinel has been deprecated since
# pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None) # Column width
# Default figure size (width, height) applied to every plot in the notebook.
plt.rcParams['figure.figsize'] = [21, 10] # Size of the plots

### Data Directories

In [43]:
# Data locations, relative to the notebook's working directory.
# Kept as plain strings with a trailing slash because other cells build
# file names from them by string concatenation.
INPUT_DIR, INT_DIR, OUTPUT_DIR = (
    '../data/raw/CMAPSSData/',  # raw input files
    '../data/interim/',         # intermediate artefacts
    '../data/interim/',         # outputs (currently the same folder as INT_DIR)
)

### Read the input data

In [23]:
# Build the path of the training CSV under OUTPUT_DIR and load it into a DataFrame.
input_file = ''.join((OUTPUT_DIR, 'train_case4.csv'))
df_data = pd.read_csv(filepath_or_buffer=input_file)

### Data Imputation

In [24]:
# Not required as there are no missing values
# Remove the additional columns

### Data Standardization

In [25]:
# Preview the first five rows of the loaded data.
df_data.head(n=5)

Unnamed: 0,unit,time_cycles,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,...,meas12,meas13,meas14,meas15,meas16,meas17,meas18,meas19,meas20,meas21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [26]:
from sklearn.preprocessing import StandardScaler

# Identifier columns that are left out of scaling and PCA.
exclude_cols = ['unit', 'time_cycles']

# Work on a copy of the frame read from input_file so df_data stays untouched.
# (df_data is the only frame this notebook loads; no `df_raw` is defined anywhere.)
df_pca = df_data.copy()

# Build the exclusion set once instead of once per column.
excluded = set(exclude_cols)
features = [col for col in df_pca.columns if col not in excluded]
print(features)

# Separating out the features as a plain 2-D array (rows x feature columns)
x = df_pca.loc[:, features].values

# Standardizing the features: StandardScaler centres each column and scales it
# to unit variance; the result replaces x.
x = StandardScaler().fit_transform(x)

['setting1', 'setting2', 'setting3', 'meas01', 'meas02', 'meas03', 'meas04', 'meas05', 'meas06', 'meas07', 'meas08', 'meas09', 'meas10', 'meas11', 'meas12', 'meas13', 'meas14', 'meas15', 'meas16', 'meas17', 'meas18', 'meas19', 'meas20', 'meas21']


In [35]:
# Preview the first five rows of the frame used for PCA.
df_pca.head(n=5)

Unnamed: 0,unit,time_cycles,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,...,meas12,meas13,meas14,meas15,meas16,meas17,meas18,meas19,meas20,meas21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


### Writing standardized data to file

In [30]:
# Persist the standardized feature matrix x as a pickle under OUTPUT_DIR.
# NOTE(review): the input read earlier is 'train_case4.csv' while this file is
# named case1 — confirm which case is intended.
std_file = OUTPUT_DIR + 'train_case1_std.pkl'
with open(std_file, mode='wb') as pkl_out:
    pkl.dump(x, pkl_out, protocol=pkl.HIGHEST_PROTOCOL)

### Reading back from pickle

In [31]:
# Reload the standardized matrix from std_file; x is replaced by the unpickled object.
# Unpickling is acceptable here only because this notebook wrote the file itself.
with open(std_file, mode='rb') as pkl_in:
    x = pkl.load(pkl_in)

### PCA Projection

In [36]:
from sklearn.decomposition import PCA

# Keep every component: one per standardized feature column in x.
# The count is taken from the matrix that is actually fitted, not from
# "DataFrame columns minus 2", so it stays correct if the excluded-column
# list ever changes.
num_components = x.shape[1]
pca = PCA(n_components=num_components)

# Fit the PCA on x and project x onto the components in one step.
principalComponents = pca.fit_transform(x)

In [37]:
# Print a markdown table of the variance explained by each principal component,
# as a percentage, together with the running total.
# The accumulator is deliberately not called `sum`: a top-level assignment in a
# notebook cell would shadow the builtin for every later cell.
cumulative = 0.0
print('| Component | Individual Contribution | Cumulative |')
print('| -- | -- | -- |')
for idx, val in enumerate(pca.explained_variance_ratio_, start=1):
    cumulative += val
    print('| Component {} | {:10.2f} | {:10.2f} |'.format(idx, val * 100, cumulative * 100))

| Component | Individual Contribution | Cumulative |
| -- | -- | -- |
| Component 1 |      53.10 |      53.10 |
| Component 2 |      12.35 |      65.44 |
| Component 3 |       5.95 |      71.40 |
| Component 4 |       5.84 |      77.23 |
| Component 5 |       5.66 |      82.89 |
| Component 6 |       2.43 |      85.33 |
| Component 7 |       2.17 |      87.49 |
| Component 8 |       2.05 |      89.54 |
| Component 9 |       1.80 |      91.34 |
| Component 10 |       1.70 |      93.05 |
| Component 11 |       1.50 |      94.55 |
| Component 12 |       1.21 |      95.76 |
| Component 13 |       1.15 |      96.91 |
| Component 14 |       1.04 |      97.95 |
| Component 15 |       1.01 |      98.97 |
| Component 16 |       0.86 |      99.83 |
| Component 17 |       0.17 |     100.00 |
| Component 18 |       0.00 |     100.00 |
| Component 19 |       0.00 |     100.00 |
| Component 20 |       0.00 |     100.00 |
| Component 21 |       0.00 |     100.00 |
| Component 22 |       0.00 |     100.00 |

### Create a principal dataframe with all the components

In [38]:
# Column labels pc1 .. pcN, one per principal component.
pca_col_list = ['pc' + str(number) for number in range(1, num_components + 1)]
# Wrap the projected data in a DataFrame using those labels.
principalDf = pd.DataFrame(principalComponents, columns=pca_col_list)

### Find the influence of features in each component

In [39]:
# Influence table: rows are labelled by component (pc1..pcN), columns by feature name,
# and the values are taken from pca.components_.
df_infl = pd.DataFrame(data=pca.components_, index=pca_col_list, columns=features)

In [40]:
# Display the influence of each feature on each principal component.
df_infl

Unnamed: 0,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,meas06,meas07,...,meas12,meas13,meas14,meas15,meas16,meas17,meas18,meas19,meas20,meas21
pc1,0.002381516,0.005228829,-2.220446e-16,1.387779e-17,0.2727388,0.2595994,0.3005868,2.6469780000000004e-23,0.06235197,-0.2978555,...,-0.3038496,0.2809392,0.06595054,0.2865402,0.0,0.2687237,0.0,0.0,-0.282204,-0.2831342
pc2,-0.005823868,-0.01383703,1.387779e-17,2.220446e-16,0.01832876,0.0774208,0.01047304,-2.168404e-19,-0.04795679,0.05160007,...,0.06194938,-0.2450678,0.6662043,0.02156861,-0.0,0.07809534,-0.0,-0.0,-0.01900673,-0.0235428
pc3,0.6707551,0.7283068,0.0,0.0,-0.004185765,-0.01636545,0.001459961,0.0,0.1363168,-0.001572462,...,0.01128901,-0.009511458,0.01246022,0.002540448,0.0,-0.001378696,0.0,0.0,0.003505066,-0.001129396
pc4,0.7233193,-0.6027876,5.5511150000000004e-17,-0.0,0.01578858,-0.002129311,0.008052908,-0.0,-0.334726,-0.006510885,...,-0.005835685,0.003147119,-0.01420636,0.002978677,-0.0,-0.0003763976,-0.0,-0.0,-0.006757683,-0.02033935
pc5,-0.1614577,0.3253443,-2.775558e-17,5.5511150000000004e-17,0.01708512,0.03141878,0.01644704,-0.0,-0.9287713,-0.01552952,...,-0.01960024,0.02353502,-0.02459173,0.00707847,-0.0,0.01330539,-0.0,-0.0,-0.01316073,-0.01580802
pc6,0.017778,1.214089e-05,5.5511150000000004e-17,-2.775558e-17,-0.1890668,0.9335705,-0.03499099,-2.775558e-17,0.01727805,0.02900203,...,0.02158579,-0.02320394,-0.04110234,-0.07659331,-0.0,-0.2528993,-0.0,-0.0,0.09936542,0.06247518
pc7,0.008672085,-0.006785374,1.665335e-16,-2.220446e-16,-0.5558117,0.08237136,-0.03130741,-1.387779e-17,-0.001591533,-0.01032542,...,0.02650289,0.004252296,-0.04186451,-0.03287767,0.0,0.8126799,0.0,0.0,0.1115777,0.06106952
pc8,0.003291134,0.002418043,2.498002e-16,1.31839e-16,0.7359001,0.1671457,-0.09297022,1.231654e-16,0.009526035,0.0923377,...,0.06030746,-0.03933439,-0.049531,-0.1819975,0.0,0.3957538,0.0,0.0,0.4067382,0.2022768
pc9,-0.009964122,0.003652277,-1.665335e-16,5.5511150000000004e-17,-0.07548996,-0.0489355,-0.01598012,8.326673e-17,0.004370487,-0.01360828,...,-0.02231262,0.02355806,0.009628178,-0.1092976,0.0,-0.0696701,0.0,0.0,0.5829516,-0.7955712
pc10,0.002209943,-0.005335745,1.873501e-16,7.632783000000001e-17,-0.09186914,-0.05512166,0.06745213,-5.5511150000000004e-17,-0.009910794,-0.05907713,...,-0.05245136,0.04680884,0.03693687,0.746966,0.0,-0.1172983,0.0,0.0,0.5406478,0.3212577


### Write the principal components to file

In [44]:
# Write the projected data to INT_DIR as CSV, with a header row and without the index.
# to_csv opens and closes the file itself when given a path, so no surrounding
# open() is needed. The earlier binary-mode handle was never used and opened the
# same path a second time.
principalDf_file = INT_DIR + 'components_' + str(num_components) + '.csv'
principalDf.to_csv(principalDf_file, header=True, index=False)

print('Writing to ', principalDf_file)

Writing to  ../data/interim/components_24.csv
