# Feature Engineering

# Download data from kaggle

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm

In [2]:
pip install kaggle



In [3]:
from google.colab import files

files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"snehasathish","key":"b72b4c47e42ce8bf94a6f214142dfb59"}'}

In [0]:
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [5]:
!kaggle competitions download -c LANL-Earthquake-Prediction

Downloading sample_submission.csv to /content
  0% 0.00/33.3k [00:00<?, ?B/s]
100% 33.3k/33.3k [00:00<00:00, 32.2MB/s]
Downloading test.zip to /content
 97% 235M/242M [00:01<00:00, 137MB/s]
100% 242M/242M [00:01<00:00, 138MB/s]
Downloading train.csv.zip to /content
100% 2.03G/2.03G [00:34<00:00, 25.2MB/s]
100% 2.03G/2.03G [00:34<00:00, 63.6MB/s]


# Load Train Data

In [6]:
!unzip train.csv.zip

Archive:  train.csv.zip
  inflating: train.csv               


In [7]:
signal = pd.read_csv('train.csv', dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})
signal.shape

(629145480, 2)

In [8]:
rows = 150000
segments = int(np.floor(signal.shape[0] / rows))
segments

4194

In [0]:
train_X = pd.DataFrame(index=range(segments), dtype=np.float64)
train_y = pd.DataFrame(index=range(segments), dtype=np.float64, columns=['time_to_failure'])

# Aggregated Features

In [0]:
def statistical_features(seg_id, seg, seg_data):
  seg_data.loc[seg_id, 'mean'] = seg['acoustic_data'].mean()
  seg_data.loc[seg_id, 'abs_mean'] = np.abs(seg['acoustic_data']).mean()
  seg_data.loc[seg_id, 'mean_first_10000'] = seg['acoustic_data'][:10000].mean()
  seg_data.loc[seg_id, 'mean_first_50000'] = seg['acoustic_data'][:50000].mean()
  seg_data.loc[seg_id, 'mean_last_10000'] = seg['acoustic_data'][:-10000].mean()
  seg_data.loc[seg_id, 'mean_last_50000'] = seg['acoustic_data'][:-50000].mean()
  seg_data.loc[seg_id, 'std'] = seg['acoustic_data'].std()
  seg_data.loc[seg_id, 'abs_std'] = np.abs(seg['acoustic_data']).std()
  seg_data.loc[seg_id, 'std_first_10000'] = seg['acoustic_data'][:10000].std()
  seg_data.loc[seg_id, 'std_first_50000'] = seg['acoustic_data'][:50000].std()
  seg_data.loc[seg_id, 'std_last_10000'] = seg['acoustic_data'][:-10000].std()
  seg_data.loc[seg_id, 'std_last_50000'] = seg['acoustic_data'][:-50000].std()
  seg_data.loc[seg_id, 'median'] = seg['acoustic_data'].median()  
  seg_data.loc[seg_id, 'abs_median'] = np.abs(seg['acoustic_data']).median()
  seg_data.loc[seg_id, 'median_first_10000'] = seg['acoustic_data'][:10000].median()
  seg_data.loc[seg_id, 'median_first_50000'] = seg['acoustic_data'][:50000].median()
  seg_data.loc[seg_id, 'median_last_10000'] = seg['acoustic_data'][:-10000].median()
  seg_data.loc[seg_id, 'median_last_50000'] = seg['acoustic_data'][:-50000].median()
  seg_data.loc[seg_id, 'max'] = seg['acoustic_data'].max()  
  seg_data.loc[seg_id, 'abs_max'] = np.abs(seg['acoustic_data']).max()
  seg_data.loc[seg_id, 'max_first_10000'] = seg['acoustic_data'][:10000].max()
  seg_data.loc[seg_id, 'max_first_50000'] = seg['acoustic_data'][:50000].max()
  seg_data.loc[seg_id, 'max_last_10000'] = seg['acoustic_data'][:-10000].max()
  seg_data.loc[seg_id, 'max_last_50000'] = seg['acoustic_data'][:-50000].max()
  seg_data.loc[seg_id, 'min'] = seg['acoustic_data'].min()  
  seg_data.loc[seg_id, 'abs_min'] = np.abs(seg['acoustic_data']).min()
  seg_data.loc[seg_id, 'min_first_10000'] = seg['acoustic_data'][:10000].min()
  seg_data.loc[seg_id, 'min_first_50000'] = seg['acoustic_data'][:50000].min()
  seg_data.loc[seg_id, 'min_last_10000'] = seg['acoustic_data'][:-10000].min()
  seg_data.loc[seg_id, 'min_last_50000'] = seg['acoustic_data'][:-50000].min()
  seg_data.loc[seg_id, 'max_min'] =  seg['acoustic_data'].max()/seg['acoustic_data'].min()
  seg_data.loc[seg_id, 'Quant5'] = np.quantile(seg['acoustic_data'], 0.05)  
  seg_data.loc[seg_id, 'Quant25'] = np.quantile(seg['acoustic_data'], 0.25)
  seg_data.loc[seg_id, 'Quant75'] = np.quantile(seg['acoustic_data'], 0.75)
  seg_data.loc[seg_id, 'Quant95'] = np.quantile(seg['acoustic_data'], 0.95)
  seg_data.loc[seg_id, 'Quant99'] = np.quantile(seg['acoustic_data'], 0.99)
  seg_data.loc[seg_id, 'IQR'] = ((np.quantile(seg['acoustic_data'], 0.25)) -(np.quantile(seg['acoustic_data'], 0.25)))
  seg_data.loc[seg_id, 'Abs_Quant5'] = np.quantile(np.abs(seg['acoustic_data']), 0.05)  
  seg_data.loc[seg_id, 'Abs_Quant25'] = np.quantile(np.abs(seg['acoustic_data']), 0.25)
  seg_data.loc[seg_id, 'Abs_Quant75'] = np.quantile(np.abs(seg['acoustic_data']), 0.75)
  seg_data.loc[seg_id, 'Abs_Quant95'] = np.quantile(np.abs(seg['acoustic_data']), 0.95)
  seg_data.loc[seg_id, 'Abs_Quant99'] = np.quantile(np.abs(seg['acoustic_data']), 0.99)
  seg_data.loc[seg_id, 'Abs_IQR'] = ((np.quantile(np.abs(seg['acoustic_data']), 0.75)) -(np.quantile(np.abs(seg['acoustic_data']), 0.25)))
  seg_data.loc[seg_id, 'Sum'] = seg['acoustic_data'][:10000].sum()
  seg_data.loc[seg_id, 'Abs_sum'] = np.abs(seg['acoustic_data']).sum()
  seg_data.loc[seg_id, 'MAD'] = seg['acoustic_data'].mad()
  seg_data.loc[seg_id, 'Kurtosis'] = seg['acoustic_data'].kurt()
  seg_data.loc[seg_id, 'skew'] = seg['acoustic_data'].skew()

In [11]:
for seg_id in tqdm(range(segments)):
    seg = signal.iloc[seg_id*rows:seg_id*rows+rows]
    statistical_features(seg_id, seg, train_X)
    train_y.loc[seg_id, 'time_to_failure'] = seg['time_to_failure'].values[-1]

HBox(children=(IntProgress(value=0, max=4194), HTML(value='')))




In [0]:
print(train_X.shape)
train_X.head()

(4194, 48)


Unnamed: 0,mean,std,max,min,abs_mean,mean_first_10000,mean_first_50000,mean_last_10000,mean_last_50000,abs_std,...,Abs_Quant25,Abs_Quant75,Abs_Quant95,Abs_Quant99,Abs_IQR,Sum,Abs_sum,MAD,Kurtosis,skew
0,4.884113,5.101106,104.0,-98.0,5.576567,5.182,4.9621,4.899007,5.01594,4.333325,...,3.0,7.0,12.0,20.0,4.0,51820.0,836485.0,3.263401,33.662481,-0.024061
1,4.725767,6.588824,181.0,-154.0,5.734167,4.7772,4.6984,4.712293,4.69448,5.732777,...,3.0,7.0,12.0,24.0,4.0,47772.0,860125.0,3.574302,98.758517,0.390561
2,4.906393,6.967397,140.0,-106.0,6.152647,4.6814,4.7061,4.886771,4.81588,5.895945,...,3.0,8.0,14.0,30.0,5.0,46814.0,922897.0,3.948411,33.555211,0.217391
3,4.90224,6.922305,197.0,-199.0,5.93396,5.0364,4.84364,4.882936,4.83663,6.061214,...,3.0,8.0,13.0,26.0,5.0,50364.0,890094.0,3.647117,116.548172,0.757278
4,4.90872,7.30111,145.0,-126.0,6.110587,4.9405,4.89116,4.923021,4.94855,6.329485,...,3.0,8.0,13.0,32.0,5.0,49405.0,916588.0,3.826052,52.977905,0.064531


In [0]:
train_X.to_csv('statistical_features.csv', header=True, index=False) 
files.download('statistical_features.csv')

In [0]:
del train_X

In [13]:
print(train_y.shape)
train_y.head()

(4194, 1)


Unnamed: 0,time_to_failure
0,1.430797
1,1.391499
2,1.353196
3,1.313798
4,1.2744


In [0]:
train_y.to_csv('output.csv', header=True, index=False) 
files.download('output.csv')

In [0]:
del train_y

#  RollingWindow_features

In [0]:
train_X = pd.DataFrame(index=range(segments), dtype=np.float64)

In [0]:
def rollingwindow_features(seg_id, seg, seg_data):
  
  Rolling_mean_50 = seg['acoustic_data'].rolling(50).mean().dropna().values
    
  seg_data.loc[seg_id, 'MRoll_mean_50'] = Rolling_mean_50.mean()
  seg_data.loc[seg_id, 'MRoll_std_50'] = Rolling_mean_50.std()
  seg_data.loc[seg_id, 'MRoll_max_50'] = Rolling_mean_50.max()
  seg_data.loc[seg_id, 'MRoll_min_50'] = Rolling_mean_50.min()
  seg_data.loc[seg_id, 'MRoll_5quantile_50'] = np.quantile(Rolling_mean_50,0.05)
  seg_data.loc[seg_id, 'MRoll_25quantile_50'] = np.quantile(Rolling_mean_50,0.25)
  seg_data.loc[seg_id, 'MRoll_75quantile_50'] = np.quantile(Rolling_mean_50,0.75)
  seg_data.loc[seg_id, 'MRoll_95quantile_50'] = np.quantile(Rolling_mean_50,0.95)
  seg_data.loc[seg_id, 'MRoll_99quantile_50'] = np.quantile(Rolling_mean_50,0.99)
  seg_data.loc[seg_id, 'Abs_MRoll_5quantile_50'] = np.quantile(np.abs(Rolling_mean_50),0.05)
  seg_data.loc[seg_id, 'Abs_MRoll_25quantile_50'] = np.quantile(np.abs(Rolling_mean_50),0.25)
  seg_data.loc[seg_id, 'Abs_MRoll_75quantile_50'] = np.quantile(np.abs(Rolling_mean_50),0.75)
  seg_data.loc[seg_id, 'Abs_MRoll_95quantile_50'] = np.quantile(np.abs(Rolling_mean_50),0.95)
  seg_data.loc[seg_id, 'Abs_MRoll_99quantile_50'] = np.quantile(np.abs(Rolling_mean_50),0.99)
  
  Rolling_std_50 = seg['acoustic_data'].rolling(50).std().dropna().values
  
  seg_data.loc[seg_id, 'SRoll_mean_50'] = Rolling_std_50.mean()
  seg_data.loc[seg_id, 'SRoll_std_50'] = Rolling_std_50.std()
  seg_data.loc[seg_id, 'SRoll_max_50'] = Rolling_std_50.max()
  seg_data.loc[seg_id, 'SRoll_min_50'] = Rolling_std_50.min()
  seg_data.loc[seg_id, 'SRoll_5quantile_50'] = np.quantile(Rolling_std_50,0.05)
  seg_data.loc[seg_id, 'SRoll_25quantile_50'] = np.quantile(Rolling_std_50,0.25)
  seg_data.loc[seg_id, 'SRoll_75quantile_50'] = np.quantile(Rolling_std_50,0.75)
  seg_data.loc[seg_id, 'SRoll_95quantile_50'] = np.quantile(Rolling_std_50,0.95)
  seg_data.loc[seg_id, 'SRoll_99quantile_50'] = np.quantile(Rolling_std_50,0.99)
  seg_data.loc[seg_id, 'Abs_SRoll_5quantile_50'] = np.quantile(np.abs(Rolling_std_50),0.05)
  seg_data.loc[seg_id, 'Abs_SRoll_25quantile_50'] = np.quantile(np.abs(Rolling_std_50),0.25)
  seg_data.loc[seg_id, 'Abs_SRoll_75quantile_50'] = np.quantile(np.abs(Rolling_std_50),0.75)
  seg_data.loc[seg_id, 'Abs_SRoll_95quantile_50'] = np.quantile(np.abs(Rolling_std_50),0.95)
  seg_data.loc[seg_id, 'Abs_SRoll_99quantile_50'] = np.quantile(np.abs(Rolling_std_50),0.99)
  
  Rolling_mean_100 = seg['acoustic_data'].rolling(100).mean().dropna().values
    
  seg_data.loc[seg_id, 'MRoll_mean_100'] = Rolling_mean_100.mean()
  seg_data.loc[seg_id, 'MRoll_std_100'] = Rolling_mean_100.std()
  seg_data.loc[seg_id, 'MRoll_max_100'] = Rolling_mean_100.max()
  seg_data.loc[seg_id, 'MRoll_min_100'] = Rolling_mean_100.min()
  seg_data.loc[seg_id, 'MRoll_5quantile_100'] = np.quantile(Rolling_mean_100,0.05)
  seg_data.loc[seg_id, 'MRoll_25quantile_100'] = np.quantile(Rolling_mean_100,0.25)
  seg_data.loc[seg_id, 'MRoll_75quantile_100'] = np.quantile(Rolling_mean_100,0.75)
  seg_data.loc[seg_id, 'MRoll_95quantile_100'] = np.quantile(Rolling_mean_100,0.95)
  seg_data.loc[seg_id, 'MRoll_99quantile_100'] = np.quantile(Rolling_mean_100,0.99)
  seg_data.loc[seg_id, 'Abs_MRoll_5quantile_100'] = np.quantile(np.abs(Rolling_mean_100),0.05)
  seg_data.loc[seg_id, 'Abs_MRoll_25quantile_100'] = np.quantile(np.abs(Rolling_mean_100),0.25)
  seg_data.loc[seg_id, 'Abs_MRoll_75quantile_100'] = np.quantile(np.abs(Rolling_mean_100),0.75)
  seg_data.loc[seg_id, 'Abs_MRoll_95quantile_100'] = np.quantile(np.abs(Rolling_mean_100),0.95)
  seg_data.loc[seg_id, 'Abs_MRoll_99quantile_100'] = np.quantile(np.abs(Rolling_mean_100),0.99)
  
  Rolling_std_100 = seg['acoustic_data'].rolling(100).std().dropna().values
  
  seg_data.loc[seg_id, 'SRoll_mean_100'] = Rolling_std_100.mean()
  seg_data.loc[seg_id, 'SRoll_std_100'] = Rolling_std_100.std()
  seg_data.loc[seg_id, 'SRoll_max_100'] = Rolling_std_100.max()
  seg_data.loc[seg_id, 'SRoll_min_100'] = Rolling_std_100.min()
  seg_data.loc[seg_id, 'SRoll_5quantile_100'] = np.quantile(Rolling_std_100,0.05)
  seg_data.loc[seg_id, 'SRoll_25quantile_100'] = np.quantile(Rolling_std_100,0.25)
  seg_data.loc[seg_id, 'SRoll_75quantile_100'] = np.quantile(Rolling_std_100,0.75)
  seg_data.loc[seg_id, 'SRoll_95quantile_100'] = np.quantile(Rolling_std_100,0.95)
  seg_data.loc[seg_id, 'SRoll_99quantile_100'] = np.quantile(Rolling_std_100,0.99)
  seg_data.loc[seg_id, 'Abs_SRoll_5quantile_100'] = np.quantile(np.abs(Rolling_std_100),0.05)
  seg_data.loc[seg_id, 'Abs_SRoll_25quantile_100'] = np.quantile(np.abs(Rolling_std_100),0.25)
  seg_data.loc[seg_id, 'Abs_SRoll_75quantile_100'] = np.quantile(np.abs(Rolling_std_100),0.75)
  seg_data.loc[seg_id, 'Abs_SRoll_95quantile_100'] = np.quantile(np.abs(Rolling_std_100),0.95)
  seg_data.loc[seg_id, 'Abs_SRoll_99quantile_100'] = np.quantile(np.abs(Rolling_std_100),0.99)

In [0]:
for seg_id in tqdm(range(segments)):
    seg = signal.iloc[seg_id*rows:seg_id*rows+rows]
    rollingwindow_features(seg_id, seg, train_X)

HBox(children=(IntProgress(value=0, max=4194), HTML(value='')))




In [0]:
print(train_X.shape)
train_X.head()

(4194, 56)


Unnamed: 0,MRoll_mean_50,MRoll_std_50,MRoll_max_50,MRoll_min_50,MRoll_5quantile_50,MRoll_25quantile_50,MRoll_75quantile_50,MRoll_95quantile_50,MRoll_99quantile_50,Abs_MRoll_5quantile_50,...,SRoll_5quantile_100,SRoll_25quantile_100,SRoll_75quantile_100,SRoll_95quantile_100,SRoll_99quantile_100,Abs_SRoll_5quantile_100,Abs_SRoll_25quantile_100,Abs_SRoll_75quantile_100,Abs_SRoll_95quantile_100,Abs_SRoll_99quantile_100
0,4.883969,0.606039,12.82,-3.1,3.96,4.52,5.26,5.8,6.36,3.96,...,2.475639,2.786312,4.018895,8.195903,16.948797,2.475639,2.786312,4.018895,8.195903,16.948797
1,4.725729,0.764507,28.26,-13.3,3.82,4.38,5.08,5.64,6.32,3.82,...,2.475965,2.783265,4.115246,9.829922,23.45727,2.475965,2.783265,4.115246,9.829922,23.45727
2,4.906072,0.811309,17.36,-7.72,3.88,4.52,5.28,5.94,7.14,3.88,...,2.538591,2.873406,4.476651,13.485267,28.598375,2.538591,2.873406,4.476651,13.485267,28.598375
3,4.902059,0.959834,27.72,-18.86,3.92,4.54,5.28,5.88,6.84,3.94,...,2.496442,2.800054,4.173643,10.36749,26.380301,2.496442,2.800054,4.173643,10.36749,26.380301
4,4.908958,0.903958,19.98,-8.82,3.94,4.54,5.26,5.92,7.5,3.94,...,2.491521,2.802668,4.151475,12.41382,34.718196,2.491521,2.802668,4.151475,12.41382,34.718196


In [0]:
train_X.to_csv('rollingwindow_features.csv', header=True, index=False) 
files.download('rollingwindow_features.csv')

In [0]:
del train_X

# Fourier Transform Features

In [0]:
train_X = pd.DataFrame(index=range(segments), dtype=np.float64)

In [0]:
def fourier_features(seg_id, seg, seg_data):
  
  real_FFT = np.real(np.fft.fft(seg['acoustic_data']))
  
  seg_data.loc[seg_id,'real_FFT_mean'] = real_FFT.mean()
  seg_data.loc[seg_id,'real_FFT_std'] = real_FFT.std()
  seg_data.loc[seg_id,'real_FFT_max'] = real_FFT.max()
  seg_data.loc[seg_id,'real_FFT_min'] = real_FFT.min()
  seg_data.loc[seg_id,'real_FFT_5quantile'] = np.quantile(real_FFT,0.05)
  seg_data.loc[seg_id,'real_FFT_25quantile'] = np.quantile(real_FFT,0.25)
  seg_data.loc[seg_id,'real_FFT_75quantile'] = np.quantile(real_FFT,0.75)
  seg_data.loc[seg_id,'real_FFT_95quantile'] = np.quantile(real_FFT,0.95)
  seg_data.loc[seg_id,'real_FFT_99quantile'] = np.quantile(real_FFT,0.99)
  seg_data.loc[seg_id,'Abs_real_FFT_5quantile'] = np.quantile(np.abs(real_FFT),0.05)
  seg_data.loc[seg_id,'Abs_real_FFT_25quantile'] = np.quantile(np.abs(real_FFT),0.25)
  seg_data.loc[seg_id,'Abs_real_FFT_75quantile'] = np.quantile(np.abs(real_FFT),0.75)
  seg_data.loc[seg_id,'Abs_real_FFT_95quantile'] = np.quantile(np.abs(real_FFT),0.95)
  seg_data.loc[seg_id,'Abs_real_FFT_99quantile'] = np.quantile(np.abs(real_FFT),0.99)
  
  img_FFT = np.imag(np.fft.fft(seg['acoustic_data']))
  
  seg_data.loc[seg_id,'img_FFT_mean'] = img_FFT.mean()
  seg_data.loc[seg_id,'img_FFT_std'] = img_FFT.std()
  seg_data.loc[seg_id,'img_FFT_max'] = img_FFT.max()
  seg_data.loc[seg_id,'img_FFT_min'] = img_FFT.min()
  seg_data.loc[seg_id,'img_FFT_5quantile'] = np.quantile(img_FFT,0.05)
  seg_data.loc[seg_id,'img_FFT_25quantile'] = np.quantile(img_FFT,0.25)
  seg_data.loc[seg_id,'img_FFT_75quantile'] = np.quantile(img_FFT,0.75)
  seg_data.loc[seg_id,'img_FFT_95quantile'] = np.quantile(img_FFT,0.95)
  seg_data.loc[seg_id,'img_FFT_99quantile'] = np.quantile(img_FFT,0.99)
  seg_data.loc[seg_id,'Abs_img_FFT_5quantile'] = np.quantile(np.abs(img_FFT),0.05)
  seg_data.loc[seg_id,'Abs_img_FFT_25quantile'] = np.quantile(np.abs(img_FFT),0.25)
  seg_data.loc[seg_id,'Abs_img_FFT_75quantile'] = np.quantile(np.abs(img_FFT),0.75)
  seg_data.loc[seg_id,'Abs_img_FFT_95quantile'] = np.quantile(np.abs(img_FFT),0.95)
  seg_data.loc[seg_id,'Abs_img_FFT_99quantile'] = np.quantile(np.abs(img_FFT),0.99)


In [0]:
for seg_id in tqdm(range(segments)):
    seg = signal.iloc[seg_id*rows:seg_id*rows+rows]
    fourier_features(seg_id, seg, train_X)

HBox(children=(IntProgress(value=0, max=4194), HTML(value='')))




In [0]:
print(train_X.shape)
train_X.head()

(4194, 28)


Unnamed: 0,real_FFT_mean,real_FFT_std,real_FFT_max,real_FFT_min,real_FFT_5quantile,real_FFT_25quantile,real_FFT_75quantile,real_FFT_95quantile,real_FFT_99quantile,Abs_real_FFT_5quantile,...,img_FFT_5quantile,img_FFT_25quantile,img_FFT_75quantile,img_FFT_95quantile,img_FFT_99quantile,Abs_img_FFT_5quantile,Abs_img_FFT_25quantile,Abs_img_FFT_75quantile,Abs_img_FFT_95quantile,Abs_img_FFT_99quantile
0,12.0,2349.811482,732617.0,-20121.154171,-1622.831836,-479.454912,495.000015,1620.809807,4435.602455,43.066466,...,-1605.470156,-482.489889,482.489889,1605.470156,4405.696651,44.356456,221.172141,902.446488,2541.286199,6026.272473
1,5.0,2566.032248,708865.0,-31056.675076,-1866.865973,-505.679986,497.538637,1880.188004,5963.507327,44.059513,...,-1859.559966,-498.116853,498.116853,1859.559966,6167.335386,43.699724,225.440886,962.006407,3352.459228,8313.225811
2,5.0,2683.549049,735959.0,-27654.557067,-1939.964988,-501.23831,500.494469,1963.027088,6442.726128,44.454268,...,-1944.064062,-504.606183,504.606183,1944.064062,6505.967037,43.934691,231.641902,963.181454,3673.027915,8757.168892
3,5.0,2685.788525,735336.0,-25622.393604,-1933.184665,-495.443867,506.367685,1878.801921,6211.90047,43.595092,...,-1889.301741,-499.063608,499.063608,1889.301741,6304.333597,44.196261,227.584444,957.089563,3515.577331,8450.62756
4,12.0,2761.715771,736308.0,-26271.075117,-1983.873136,-493.043931,503.817756,2005.173493,7291.262605,44.142367,...,-1965.867577,-500.220786,500.220786,1965.867577,7236.530838,43.879064,225.525678,958.857782,3996.647033,9482.175503


In [0]:
train_X.to_csv('fourier_features.csv', header=True, index=False) 
files.download('fourier_features.csv')

# Load Test data

In [0]:
!unzip test.zip

Archive:  test.zip
  inflating: seg_430e66.csv          
  inflating: seg_d1a281.csv          
  inflating: seg_05a1b0.csv          
  inflating: seg_f8dd7e.csv          
  inflating: seg_b9bdd7.csv          
  inflating: seg_24c1c9.csv          
  inflating: seg_c5abaa.csv          
  inflating: seg_6262c4.csv          
  inflating: seg_734a88.csv          
  inflating: seg_94a133.csv          
  inflating: seg_d0c280.csv          
  inflating: seg_d36737.csv          
  inflating: seg_f80e44.csv          
  inflating: seg_07c815.csv          
  inflating: seg_7c9433.csv          
  inflating: seg_211486.csv          
  inflating: seg_78ded2.csv          
  inflating: seg_f11f77.csv          
  inflating: seg_b3883e.csv          
  inflating: seg_3db0a8.csv          
  inflating: seg_81f798.csv          
  inflating: seg_0a45a1.csv          
  inflating: seg_dc188b.csv          
  inflating: seg_4a9e8d.csv          
  inflating: seg_32fc4e.csv          
  inflating: seg_7b2994.csv    

In [0]:
seg_id = pd.read_csv('sample_submission.csv')
seg_id.head()

Unnamed: 0,seg_id,time_to_failure
0,seg_00030f,0
1,seg_0012b5,0
2,seg_00184e,0
3,seg_003339,0
4,seg_0042cc,0


In [0]:
file_names = []

for ids in range(seg_id.shape[0]):
  file_names.append(seg_id['seg_id'][ids] + '.csv')
  
file_names[0:5]

['seg_00030f.csv',
 'seg_0012b5.csv',
 'seg_00184e.csv',
 'seg_003339.csv',
 'seg_0042cc.csv']

In [0]:
len(file_names)

2624

In [0]:
test1 = pd.read_csv(file_names[0])
test1.head()

Unnamed: 0,acoustic_data
0,4
1,0
2,-2
3,0
4,2


# Statistical Features

In [0]:
test_X = pd.DataFrame(index=range(len(file_names)), dtype=np.float64)

In [0]:
for seg_id in tqdm(range(len(file_names))):
    seg = pd.read_csv(file_names[seg_id])
    statistical_features(seg_id,seg, test_X)

HBox(children=(IntProgress(value=0, max=2624), HTML(value='')))




In [0]:
print(test_X.shape)
test_X.head()

(2624, 48)


Unnamed: 0,mean,abs_mean,mean_first_10000,mean_first_50000,mean_last_10000,mean_last_50000,std,abs_std,std_first_10000,std_first_50000,...,Abs_Quant25,Abs_Quant75,Abs_Quant95,Abs_Quant99,Abs_IQR,Sum,Abs_sum,MAD,Kurtosis,skew
0,4.49178,5.224607,4.3842,4.46644,4.5078,4.48968,4.89369,4.102161,5.226846,5.350451,...,3.0,7.0,11.0,19.0,4.0,43842.0,783691.0,3.248521,28.837568,0.327908
1,4.171153,5.19834,4.0635,4.01786,4.1742,4.13451,5.922839,5.045369,3.523253,6.249515,...,2.0,7.0,12.0,24.0,5.0,40635.0,779751.0,3.429208,56.218955,0.295708
2,4.61026,5.597193,4.2452,4.55518,4.601907,4.63849,6.94699,6.179525,3.950119,9.793473,...,3.0,7.0,12.0,25.0,4.0,42452.0,839579.0,3.461984,162.118284,0.428688
3,4.531473,4.961487,4.3834,4.49052,4.536093,4.5526,4.114147,3.583863,4.001275,3.664088,...,3.0,7.0,10.0,16.0,4.0,43834.0,744223.0,2.678503,41.241827,0.061889
4,4.12834,5.0709,4.4902,4.2302,4.117429,4.13989,5.797164,4.993617,5.214578,5.321133,...,2.0,7.0,11.0,22.0,5.0,44902.0,760635.0,3.283856,79.539708,0.073898


In [0]:
test_X.to_csv('test_statistical_features.csv', header=True, index=False) 
files.download('test_statistical_features.csv')

In [0]:
del test_X

# Rolling Window Features

In [0]:
test_X = pd.DataFrame(index=range(len(file_names)), dtype=np.float64)

In [0]:
for seg_id in tqdm(range(len(file_names))):
    seg = pd.read_csv(file_names[seg_id])
    rollingwindow_features(seg_id,seg, test_X)

HBox(children=(IntProgress(value=0, max=2624), HTML(value='')))




In [0]:
print(test_X.shape)
test_X.head()

(2624, 56)


Unnamed: 0,MRoll_mean_50,MRoll_std_50,MRoll_max_50,MRoll_min_50,MRoll_5quantile_50,MRoll_25quantile_50,MRoll_75quantile_50,MRoll_95quantile_50,MRoll_99quantile_50,Abs_MRoll_5quantile_50,...,SRoll_5quantile_100,SRoll_25quantile_100,SRoll_75quantile_100,SRoll_95quantile_100,SRoll_99quantile_100,Abs_SRoll_5quantile_100,Abs_SRoll_25quantile_100,Abs_SRoll_75quantile_100,Abs_SRoll_95quantile_100,Abs_SRoll_99quantile_100
0,4.491821,0.595202,14.28,-2.62,3.6,4.14,4.84,5.38,5.96,3.6,...,2.514985,2.833619,4.026478,8.362223,16.026427,2.514985,2.833619,4.026478,8.362223,16.026427
1,4.171347,0.720321,18.42,-9.92,3.26,3.82,4.52,5.08,5.94,3.26,...,2.475659,2.780815,3.915509,9.954726,24.173153,2.475659,2.780815,3.915509,9.954726,24.173153
2,4.610326,0.753897,33.16,-12.6,3.68,4.26,4.96,5.5,6.2,3.68,...,2.475639,2.743219,3.729977,8.870357,26.885607,2.475639,2.743219,3.729977,8.870357,26.885607
3,4.531519,0.529305,12.74,-3.22,3.74,4.22,4.84,5.32,5.74,3.74,...,2.380476,2.599048,3.053397,6.106885,14.290249,2.380476,2.599048,3.053397,6.106885,14.290249
4,4.128184,0.676384,20.8,-8.88,3.22,3.76,4.48,5.04,5.72,3.22,...,2.44007,2.708964,3.657012,9.545553,21.857599,2.44007,2.708964,3.657012,9.545553,21.857599


In [0]:
test_X.to_csv('test_rollingwindow_features.csv', header=True, index=False) 
files.download('test_rollingwindow_features.csv')

In [0]:
del test_X

# Fourier Features

In [0]:
test_X = pd.DataFrame(index=range(len(file_names)), dtype=np.float64)

In [0]:
for seg_id in tqdm(range(len(file_names))):
    seg = pd.read_csv(file_names[seg_id])
    fourier_features(seg_id,seg, test_X)

HBox(children=(IntProgress(value=0, max=2624), HTML(value='')))




In [0]:
print(test_X.shape)
test_X.head()

(2624, 28)


Unnamed: 0,real_FFT_mean,real_FFT_std,real_FFT_max,real_FFT_min,real_FFT_5quantile,real_FFT_25quantile,real_FFT_75quantile,real_FFT_95quantile,real_FFT_99quantile,Abs_real_FFT_5quantile,...,img_FFT_5quantile,img_FFT_25quantile,img_FFT_75quantile,img_FFT_95quantile,img_FFT_99quantile,Abs_img_FFT_5quantile,Abs_img_FFT_25quantile,Abs_img_FFT_75quantile,Abs_img_FFT_95quantile,Abs_img_FFT_99quantile
0,4.0,2198.344036,673767.0,-14758.442559,-1678.489341,-485.801935,486.20139,1661.823673,4470.876941,43.399849,...,-1655.008891,-490.942334,490.942334,1655.008891,4366.437426,42.795036,224.132322,921.270665,2630.114727,5707.384923
1,5.0,2289.922379,625673.0,-22626.387706,-1811.536525,-499.019587,497.452948,1834.63664,5462.314638,44.67438,...,-1826.025863,-499.703161,499.703161,1826.025863,5507.859783,45.15663,228.293443,953.010116,3171.770604,7237.97501
2,8.0,2611.055629,691539.0,-23593.939294,-1938.837969,-490.811968,506.433391,1927.431862,6617.537834,43.898088,...,-1907.575533,-502.357239,502.357239,1907.575533,6682.716341,45.425604,232.309455,959.293358,3615.795524,8976.339212
3,2.0,2085.543454,679721.0,-11908.537959,-1480.047712,-481.757014,478.459079,1490.191774,3403.298285,42.740339,...,-1487.046177,-480.409197,480.409197,1487.046177,3421.404809,43.252146,219.937573,887.44113,2116.247775,4610.570182
4,5.0,2243.929923,619251.0,-24048.05587,-1696.502599,-486.496051,491.031805,1690.514869,5081.064421,43.817842,...,-1703.798814,-491.703463,491.703463,1703.798814,5211.634169,43.529178,224.773642,927.654752,2862.623827,7182.467858


In [0]:
test_X.to_csv('test_fourier_features.csv', header=True, index=False) 
files.download('test_fourier_features.csv')

In [0]:
del test_X