## Data Preprocessing

In [1]:
# Data = CMAPSSData
# Source = https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/
# Dataset = Turbofan Engine Degradation Simulation Data Set

In [2]:
# Data to be used - train and test data for case 4 (this notebook reads train_case4.csv)

In [3]:
import pandas as pd
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt

### Notebook options

In [4]:
# Show full cell contents instead of truncating long values. `None` is the
# supported "no limit" setting; the old `-1` sentinel was deprecated in
# pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None) # Column width
plt.rcParams['figure.figsize'] = [21, 10] # Size of the plots

### Data Directories

In [5]:
# Folder layout; all paths are relative to the notebook's working directory.
INPUT_DIR = '../data/raw/CMAPSSData/'      # raw data folder
INT_DIR = OUTPUT_DIR = '../data/interim/'  # interim and output folders share one path

### Read the input data

In [7]:
# Load train_case4.csv into a DataFrame.
# NOTE(review): the path is built from OUTPUT_DIR even though this file is an
# input; INT_DIR currently holds the same value - confirm which one is intended.
input_file = ''.join((OUTPUT_DIR, 'train_case4.csv'))
df_data = pd.read_csv(filepath_or_buffer=input_file)

### Data Imputation

In [8]:
# Not required as there are no missing values
# Remove the additional columns

### Data Standardization

In [9]:
# Preview the first rows of the loaded data.
df_data.head()

Unnamed: 0,unit,time_cycles,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,...,meas12,meas13,meas14,meas15,meas16,meas17,meas18,meas19,meas20,meas21
0,1,1,42.0049,0.84,100.0,445.0,549.68,1343.43,1112.93,3.91,...,129.78,2387.99,8074.83,9.3335,0.02,330,2212,100.0,10.62,6.367
1,1,2,20.002,0.7002,100.0,491.19,606.07,1477.61,1237.5,9.35,...,312.59,2387.73,8046.13,9.1913,0.02,361,2324,100.0,24.37,14.6552
2,1,3,42.0038,0.8409,100.0,445.0,548.95,1343.12,1117.05,3.91,...,129.62,2387.97,8066.62,9.4007,0.02,329,2212,100.0,10.48,6.4213
3,1,4,42.0,0.84,100.0,445.0,548.7,1341.24,1118.03,3.91,...,129.8,2388.02,8076.05,9.3369,0.02,328,2212,100.0,10.54,6.4176
4,1,5,25.0063,0.6207,60.0,462.54,536.1,1255.23,1033.59,7.05,...,164.11,2028.08,7865.8,10.8366,0.02,305,1915,84.93,14.03,8.6754


In [12]:
from sklearn.preprocessing import StandardScaler

# Columns that are left out of the feature matrix.
exclude_cols = ['unit', 'time_cycles']
excluded = set(exclude_cols)  # built once instead of once per column

# Work on a copy so df_data keeps the values that were read from disk.
df_pca = df_data.copy()
features = [col for col in df_pca.columns.values if col not in excluded]
print(features)

# Separating out the features as a plain array (one column per entry in `features`)
x = df_pca.loc[:, features].values

# Standardizing the features. The fitted scaler is kept in `scaler` so the same
# per-column statistics can later be applied to other data with
# scaler.transform(...) instead of being thrown away after this cell.
scaler = StandardScaler()
x = scaler.fit_transform(x)

['setting1', 'setting2', 'setting3', 'meas01', 'meas02', 'meas03', 'meas04', 'meas05', 'meas06', 'meas07', 'meas08', 'meas09', 'meas10', 'meas11', 'meas12', 'meas13', 'meas14', 'meas15', 'meas16', 'meas17', 'meas18', 'meas19', 'meas20', 'meas21']


In [13]:
# df_pca is still an unscaled copy of df_data; the standardized values live in the array `x`.
df_pca.head()

Unnamed: 0,unit,time_cycles,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,...,meas12,meas13,meas14,meas15,meas16,meas17,meas18,meas19,meas20,meas21
0,1,1,42.0049,0.84,100.0,445.0,549.68,1343.43,1112.93,3.91,...,129.78,2387.99,8074.83,9.3335,0.02,330,2212,100.0,10.62,6.367
1,1,2,20.002,0.7002,100.0,491.19,606.07,1477.61,1237.5,9.35,...,312.59,2387.73,8046.13,9.1913,0.02,361,2324,100.0,24.37,14.6552
2,1,3,42.0038,0.8409,100.0,445.0,548.95,1343.12,1117.05,3.91,...,129.62,2387.97,8066.62,9.4007,0.02,329,2212,100.0,10.48,6.4213
3,1,4,42.0,0.84,100.0,445.0,548.7,1341.24,1118.03,3.91,...,129.8,2388.02,8076.05,9.3369,0.02,328,2212,100.0,10.54,6.4176
4,1,5,25.0063,0.6207,60.0,462.54,536.1,1255.23,1033.59,7.05,...,164.11,2028.08,7865.8,10.8366,0.02,305,1915,84.93,14.03,8.6754


### Writing standardized data to file

In [14]:
# Persist the standardized matrix `x` using the highest available pickle protocol.
# NOTE(review): the file name says "case1" while the data was read from
# train_case4.csv - confirm this is intended and does not overwrite case-1 output.
std_file = ''.join((OUTPUT_DIR, 'train_case1_std.pkl'))
with open(std_file, mode='wb') as sink:
    pkl.dump(x, sink, protocol=pkl.HIGHEST_PROTOCOL)

### Reading back from pickle

In [15]:
# Reload the matrix from std_file, replacing the in-memory `x`.
# Only unpickle files this notebook wrote itself: loading an untrusted pickle
# can execute arbitrary code.
with open(std_file, mode='rb') as source:
    x = pkl.load(source)

### PCA Projection

In [16]:
from sklearn.decomposition import PCA

# Keep one component per feature column. The count is taken from the matrix
# that is actually fed to PCA rather than from `len(df_pca.columns) - 2`, whose
# hard-coded 2 silently goes wrong if exclude_cols or the frame's columns change.
num_components = x.shape[1]
pca = PCA(n_components=num_components)
principalComponents = pca.fit_transform(x)

In [17]:
# Print a markdown table with each component's share of the explained variance
# and the running total, both in percent.
# The accumulator is deliberately not called `sum`: rebinding that name would
# shadow the builtin for every later cell in the kernel session.
cumulative = 0
print('| Component | Individual Contribution | Cumulative |')
print('| -- | -- | -- |')
for idx, ratio in enumerate(pca.explained_variance_ratio_, start=1):
    cumulative += ratio
    print('| Component {} | {:10.2f} | {:10.2f} |'.format(idx, ratio * 100, cumulative * 100))

| Component | Individual Contribution | Cumulative |
| -- | -- | -- |
| Component 1 |      77.36 |      77.36 |
| Component 2 |      19.96 |      97.32 |
| Component 3 |       1.99 |      99.31 |
| Component 4 |       0.32 |      99.63 |
| Component 5 |       0.17 |      99.80 |
| Component 6 |       0.08 |      99.87 |
| Component 7 |       0.06 |      99.94 |
| Component 8 |       0.03 |      99.97 |
| Component 9 |       0.01 |      99.98 |
| Component 10 |       0.01 |      99.99 |
| Component 11 |       0.00 |      99.99 |
| Component 12 |       0.00 |     100.00 |
| Component 13 |       0.00 |     100.00 |
| Component 14 |       0.00 |     100.00 |
| Component 15 |       0.00 |     100.00 |
| Component 16 |       0.00 |     100.00 |
| Component 17 |       0.00 |     100.00 |
| Component 18 |       0.00 |     100.00 |
| Component 19 |       0.00 |     100.00 |
| Component 20 |       0.00 |     100.00 |
| Component 21 |       0.00 |     100.00 |
| Component 22 |       0.00 |     10

### Create a principal dataframe with all the components

In [18]:
# Label the projected columns pc1..pcN (N = num_components) and wrap the
# transformed array in a DataFrame.
pca_col_list = ['pc{}'.format(k) for k in range(1, num_components + 1)]
principalDf = pd.DataFrame(principalComponents, columns=pca_col_list)

### Find the influence of features in each component

In [19]:
# Table of pca.components_: one row per principal component (pc1..pcN),
# one column per original feature.
df_infl = pd.DataFrame(data=pca.components_, index=pca_col_list, columns=features)

In [20]:
# Display the component-by-feature table held in df_infl.
df_infl

Unnamed: 0,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,meas06,meas07,...,meas12,meas13,meas14,meas15,meas16,meas17,meas18,meas19,meas20,meas21
pc1,-0.1918326,-0.1902702,0.134038,0.2050538,0.2272332,0.2304192,0.2312604,0.2023484,0.2112382,0.2207809,...,0.2208847,0.1340462,0.1720726,-0.2017193,0.1978628,0.2305234,0.2007744,0.134038,0.2183341,0.2183252
pc2,-0.2528281,-0.224989,-0.372202,0.1951405,0.04992904,-0.02605992,0.01615133,0.2202126,0.1853463,0.137226,...,0.1366031,-0.3722172,-0.2854304,0.2136636,0.1093572,-0.02383815,-0.224442,-0.372202,0.1512058,0.1512458
pc3,0.0030096,0.3747889,0.029917,0.254155,0.2343977,0.1243408,-0.00406073,0.1087961,0.1006613,0.06484302,...,0.06414031,0.02965742,-0.2865961,0.1827713,-0.6058065,0.123748,0.1364011,0.029917,0.07268785,0.07281377
pc4,-0.06612618,0.4082688,0.022216,0.1559085,0.142581,0.102404,0.01765449,-0.06733375,-0.07704072,-0.114435,...,-0.1150906,0.0224856,-0.2648069,0.2413564,0.720895,0.0967685,0.09529706,0.022216,-0.08543471,-0.08561453
pc5,0.2662067,-0.1199661,-0.215858,0.241405,0.1506325,0.1904659,0.3005466,-0.1377372,-0.1454312,-0.1419619,...,-0.1406696,-0.2127385,0.396078,0.2271548,-0.0952457,0.1910131,-0.07823206,-0.215858,-0.2062264,-0.2062628
pc6,-0.03752511,0.3945981,-0.085296,-0.08688524,-0.07617898,-0.04480378,-0.1786839,0.144424,0.1269869,0.1335515,...,0.1330046,-0.07648369,0.6698796,-0.04326624,0.1203402,-0.04467515,-0.1105257,-0.085296,0.1532536,0.1533578
pc7,-0.6417909,0.3127121,-0.069105,0.1326305,0.04921603,0.03426633,-0.05465682,-0.1298255,-0.1660282,-0.1968213,...,-0.1993818,-0.06626092,0.1503434,-0.08976713,-0.1792721,0.01498798,0.0303624,-0.069105,-0.06323966,-0.06398643
pc8,-0.3260764,0.129212,0.085038,-0.4410196,-0.2622783,0.1241476,0.285921,0.06312492,0.07868721,0.02484918,...,0.02627985,0.08783683,0.03001177,0.4937584,-0.08017433,0.1156307,-0.114532,0.085038,0.05123857,0.0515571
pc9,-0.06780917,-0.3499347,0.155903,0.1009284,0.1056095,-0.1716301,-0.1644091,0.03872439,0.0626606,-0.03268063,...,-0.03332069,0.1520006,0.233457,0.6639145,0.003132137,-0.1592652,0.1731455,0.155903,-0.02718434,-0.02860072
pc10,0.04107158,-0.07746393,0.004321,-0.05770882,-0.05162662,0.8920734,-0.2189289,-0.008644736,-0.003518581,-0.00216603,...,-0.002025265,0.004005092,-0.0006545577,0.02451696,-0.0006448269,-0.3490947,-0.02366434,0.004321,-0.01060541,-0.009053768


### Write the principal components to file

In [22]:
# Write the principal-component DataFrame as CSV (header row, no index column).
# to_csv() is given the path and opens the file itself, so no surrounding
# open() is needed: the previous `open(..., 'wb')` handle was never used and
# only opened the same path a second time while pandas was writing to it.
principalDf_file = INT_DIR + 'fd004_components_' + str(num_components) + '.csv'
principalDf.to_csv(principalDf_file, header=True, index=False)

print('Writing to ', principalDf_file)

Writing to  ../data/interim/fd004_components_24.csv
