## Data Preprocessing

In [2]:
# Data = CMAPSSData
# Source = https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/
# Dataset = Turbofan Engine Degradation Simulation Data Set

In [3]:
# Data to be used - Train and Test data for case 2 (FD002); this notebook processes the training file train_case2.csv

In [4]:
import pandas as pd
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt

### Notebook options

In [5]:
# Show full cell contents instead of truncating long values. `None` is the
# supported "no limit" value; the former `-1` sentinel was deprecated in
# pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None) # Column width
# Default figure size (width, height in inches) for every plot in the notebook.
plt.rcParams['figure.figsize'] = [21, 10] # Size of the plots

### Data Directories

In [6]:
# Data folders, given relative to the notebook's working directory.
# INT_DIR and OUTPUT_DIR resolve to the same folder.
_DATA_ROOT = '../data/'
INPUT_DIR = _DATA_ROOT + 'raw/CMAPSSData/'
INT_DIR = _DATA_ROOT + 'interim/'
OUTPUT_DIR = _DATA_ROOT + 'interim/'

### Read the input data

In [7]:
# Load the case 2 training table. The file lives in the interim folder, so
# the path is built from INT_DIR rather than OUTPUT_DIR (both currently point
# at the same directory, so the resolved path is unchanged).
input_file = INT_DIR + 'train_case2.csv'
df_data = pd.read_csv(input_file)

### Data Imputation

In [8]:
# Not required as there are no missing values
# Remove the additional columns

### Data Standardization

In [9]:
# Preview the first rows of the loaded table before any scaling is applied.
df_data.head()

Unnamed: 0,unit,time_cycles,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,...,meas12,meas13,meas14,meas15,meas16,meas17,meas18,meas19,meas20,meas21
0,1,1,34.9983,0.84,100.0,449.44,555.32,1358.61,1137.23,5.48,...,183.06,2387.72,8048.56,9.3461,0.02,334,2223,100.0,14.73,8.8071
1,1,2,41.9982,0.8408,100.0,445.0,549.9,1353.22,1125.78,3.91,...,130.42,2387.66,8072.3,9.3774,0.02,330,2212,100.0,10.41,6.2665
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1256.76,1047.45,7.05,...,164.22,2028.03,7864.87,10.8941,0.02,309,1915,84.93,14.08,8.6723
3,1,4,42.0077,0.8416,100.0,445.0,549.51,1354.03,1126.38,3.91,...,130.72,2387.61,8068.66,9.3528,0.02,329,2212,100.0,10.59,6.4701
4,1,5,25.0005,0.6203,60.0,462.54,537.07,1257.71,1047.93,7.05,...,164.31,2028.0,7861.23,10.8963,0.02,309,1915,84.93,14.13,8.5286


In [10]:
from sklearn.preprocessing import StandardScaler

# Identifier columns that must not enter the feature matrix.
exclude_cols = ['unit', 'time_cycles']
excluded = set(exclude_cols)  # built once instead of on every comprehension step

# Work on a copy so df_data stays untouched.
df_pca = df_data.copy()
features = [col for col in df_pca.columns if col not in excluded]
print(features)

# Separating out the features as a plain 2-D array (rows = cycles, columns = features)
x = df_pca.loc[:, features].values

# Standardizing the features. The fitted scaler is kept in `scaler` so the
# exact same transformation can later be applied to other data via
# scaler.transform(...), instead of being thrown away after fitting.
scaler = StandardScaler()
x = scaler.fit_transform(x)

['setting1', 'setting2', 'setting3', 'meas01', 'meas02', 'meas03', 'meas04', 'meas05', 'meas06', 'meas07', 'meas08', 'meas09', 'meas10', 'meas11', 'meas12', 'meas13', 'meas14', 'meas15', 'meas16', 'meas17', 'meas18', 'meas19', 'meas20', 'meas21']


In [11]:
# df_pca is an unmodified copy of df_data; the standardized values live in the
# array `x`, so this preview still shows the original (unscaled) readings.
df_pca.head()

Unnamed: 0,unit,time_cycles,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,...,meas12,meas13,meas14,meas15,meas16,meas17,meas18,meas19,meas20,meas21
0,1,1,34.9983,0.84,100.0,449.44,555.32,1358.61,1137.23,5.48,...,183.06,2387.72,8048.56,9.3461,0.02,334,2223,100.0,14.73,8.8071
1,1,2,41.9982,0.8408,100.0,445.0,549.9,1353.22,1125.78,3.91,...,130.42,2387.66,8072.3,9.3774,0.02,330,2212,100.0,10.41,6.2665
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1256.76,1047.45,7.05,...,164.22,2028.03,7864.87,10.8941,0.02,309,1915,84.93,14.08,8.6723
3,1,4,42.0077,0.8416,100.0,445.0,549.51,1354.03,1126.38,3.91,...,130.72,2387.61,8068.66,9.3528,0.02,329,2212,100.0,10.59,6.4701
4,1,5,25.0005,0.6203,60.0,462.54,537.07,1257.71,1047.93,7.05,...,164.31,2028.0,7861.23,10.8963,0.02,309,1915,84.93,14.13,8.5286


### Writing standardized data to file

In [12]:
# Persist the standardized feature matrix. The data comes from
# train_case2.csv, so the artefact is named for case 2; the previous
# 'train_case1_std.pkl' name would overwrite the case 1 output.
# NOTE(review): update any downstream reader that still expects the old name.
std_file = OUTPUT_DIR + 'train_case2_std.pkl'
with open(std_file, 'wb') as f:
    pkl.dump(x, f, protocol=pkl.HIGHEST_PROTOCOL)

### Reading back from pickle

In [13]:
# Reload the standardized matrix from the pickle written in the previous cell.
# Unpickling is only acceptable here because this notebook produced the file.
with open(std_file, 'rb') as pkl_in:
    x = pkl.load(pkl_in)

### PCA Projection

In [14]:
from sklearn.decomposition import PCA

# Keep one component per input feature. The count is read from the feature
# matrix itself rather than from "number of dataframe columns minus 2", which
# silently breaks if the set of excluded columns ever changes.
num_components = x.shape[1]
pca = PCA(n_components=num_components)
# Rows of principalComponents are samples, columns are component scores.
principalComponents = pca.fit_transform(x)

In [15]:
# Print a Markdown table with the percentage of variance explained by each
# principal component and the running cumulative percentage.
# The running total is deliberately not called `sum`, which would shadow the
# builtin for every later cell in the kernel.
cumulative = 0.0
print('| Component | Individual Contribution | Cumulative |')
print('| -- | -- | -- |')
for idx, val in enumerate(pca.explained_variance_ratio_, start=1):
    cumulative += val
    print('| Component {} | {:10.2f} | {:10.2f} |'.format(idx, val * 100, cumulative * 100))

| Component | Individual Contribution | Cumulative |
| -- | -- | -- |
| Component 1 |      77.35 |      77.35 |
| Component 2 |      19.96 |      97.31 |
| Component 3 |       1.91 |      99.22 |
| Component 4 |       0.41 |      99.62 |
| Component 5 |       0.17 |      99.79 |
| Component 6 |       0.10 |      99.89 |
| Component 7 |       0.07 |      99.96 |
| Component 8 |       0.03 |      99.98 |
| Component 9 |       0.01 |      99.99 |
| Component 10 |       0.00 |      99.99 |
| Component 11 |       0.00 |     100.00 |
| Component 12 |       0.00 |     100.00 |
| Component 13 |       0.00 |     100.00 |
| Component 14 |       0.00 |     100.00 |
| Component 15 |       0.00 |     100.00 |
| Component 16 |       0.00 |     100.00 |
| Component 17 |       0.00 |     100.00 |
| Component 18 |       0.00 |     100.00 |
| Component 19 |       0.00 |     100.00 |
| Component 20 |       0.00 |     100.00 |
| Component 21 |       0.00 |     100.00 |
| Component 22 |       0.00 |     10

### Create a principal dataframe with all the components

In [16]:
# Column labels pc1 .. pcN, one per principal component.
pca_col_list = ['pc{}'.format(k) for k in range(1, num_components + 1)]
# Wrap the component scores in a labelled dataframe.
principalDf = pd.DataFrame(principalComponents, columns=pca_col_list)

### Find the influence of features in each component

In [17]:
# Component loadings: one row per principal component, one column per feature.
df_infl = pd.DataFrame(
    data=pca.components_,
    index=pca_col_list,
    columns=features,
)

In [18]:
# Display the loadings table (rows = components pc1..pcN, columns = input features).
df_infl

Unnamed: 0,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,meas06,meas07,...,meas12,meas13,meas14,meas15,meas16,meas17,meas18,meas19,meas20,meas21
pc1,-0.1918573,-0.1898804,0.134141,0.2051653,0.2273461,0.2304723,0.2313113,0.2023973,0.2112972,0.2207928,...,0.2208956,0.1341708,0.1714853,-0.2018051,0.1978547,0.2305683,0.2008428,0.134141,0.2183516,0.2183471
pc2,-0.2528392,-0.2248669,-0.372245,0.1950288,0.04928854,-0.02603471,0.01654839,0.2201963,0.1852538,0.1372298,...,0.1366101,-0.3722195,-0.2853409,0.214318,0.1076223,-0.02387213,-0.2245897,-0.372245,0.1511926,0.1512212
pc3,0.006757462,0.3850051,0.028096,0.255356,0.2351739,0.1253994,-0.006539875,0.1087197,0.1008249,0.06601313,...,0.06531707,0.02798432,-0.2874767,0.1776719,-0.5972289,0.1248774,0.1352747,0.028096,0.07389938,0.07391879
pc4,-0.03629464,0.4095069,0.020667,0.1469899,0.1359477,0.0954425,0.008244184,-0.05732009,-0.06659438,-0.09954069,...,-0.1002076,0.02094326,-0.2623034,0.2163077,0.7397053,0.09069014,0.08861036,0.020667,-0.07576903,-0.07570141
pc5,0.2465268,-0.1197215,-0.204378,0.2590742,0.1637613,0.1635845,0.2570327,-0.120487,-0.129924,-0.1413537,...,-0.1402874,-0.201241,0.4699619,0.2883297,-0.07340406,0.1679546,-0.06083637,-0.204378,-0.2004068,-0.2000838
pc6,-0.1080103,0.2908622,-0.018917,-0.09529753,-0.07310972,-0.1045916,-0.2390185,0.1673215,0.1579189,0.1274365,...,0.1266681,-0.011382,0.6694164,0.09624796,0.1092507,-0.1015218,-0.05799614,-0.018917,0.159653,0.1598275
pc7,-0.6547685,0.2940172,-0.04193,0.1300132,0.05645368,0.02688902,-0.05731664,-0.1321884,-0.1620277,-0.2030116,...,-0.2055428,-0.03907142,0.1196287,-0.0004173815,-0.1523377,0.005452228,0.05217237,-0.04193,-0.06771111,-0.06742924
pc8,-0.3055793,0.1671012,0.057635,-0.4412104,-0.2675007,0.1507929,0.3211007,0.06525064,0.06456262,0.025249,...,0.02677276,0.06168516,0.02197511,0.4098294,-0.0692263,0.1366727,-0.1378303,0.057635,0.04538111,0.04640581
pc9,-0.03876454,0.04290555,0.012261,0.05359664,0.05215498,-0.8930721,0.1932243,0.008513223,0.007079717,0.001444599,...,0.002225848,0.01218125,-0.0006824888,0.03432695,5.984318e-05,0.3749307,0.03531984,0.012261,0.008564763,0.008223005
pc10,-0.01977771,0.08348601,0.027176,0.07484208,0.07399735,-0.1561367,0.6291361,0.014285,0.01338205,0.007852281,...,0.007619947,0.02731158,0.001758577,0.006380542,0.002084742,-0.7358427,0.0565985,0.027176,0.01332239,0.01841587


### Write the principal components to file

In [19]:
# Save the principal-component scores as CSV; the component count is part of
# the file name.
principalDf_file = INT_DIR + 'fd002_components_' + str(num_components) + '.csv'
# to_csv opens and closes the target itself when given a path, so the former
# surrounding open(..., 'wb') only created a second, unused handle on the
# same file and has been removed.
principalDf.to_csv(principalDf_file, header=True, index=False)

print('Writing to ', principalDf_file)

Writing to  ../data/interim/fd002_components_24.csv
