### Project II

#### Dataset I 
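Features are extracted by applying the one-dimensional discrete Fourier transform (DFT), together with summary
statistics, quantiles, and linear-trend coefficients, to the raw sensor data to obtain training and test data with 441
features.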
Reference Source Code: https://www.kaggle.com/ajcostarino/ingv-volcanic-eruption-prediction-lgbm-baseline

In [11]:
#Importing essential libraries
from sklearn.datasets import fetch_openml

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os 
import cv2
import random 
import math
import pandas as pd

from datetime import datetime

from sklearn import metrics
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten

import tensorflow as tf
from tensorflow import keras

In [5]:
import numpy as np

from sklearn.linear_model import LinearRegression
import scipy.stats as spstats

# Basic statistics will calculate distribution values for each sensor
def basic_statistics(t_X, x, s, sensor, postfix=''):
    """Computes basic statistics of one sensor signal for the feature set.

    One column per statistic is written into row ``s`` of ``t_X``; columns
    are named ``{sensor}_{statistic}{postfix}``.

    Args:
        t_X (pandas.DataFrame): The feature set being built (modified in place).
        x (pandas.Series): The signal values.
        s (int): The integer number of the segment, used as the row label.
        sensor (str): The sensor name, used as the column-name prefix.
        postfix (str): The postfix string value appended to each column name.
    Return:
        t_X (pandas.DataFrame): The feature set being built.
    """
    mean = x.mean()
    features = {
        'sum': x.sum(),
        'mean': mean,
        'std': x.std(),
        'var': x.var(),
        'max': x.max(),
        'min': x.min(),
        'median': x.median(),
        'skew': x.skew(),
        # Series.mad() was removed in pandas 2.0; this is the same quantity
        # (mean absolute deviation around the mean) computed explicitly.
        'mad': (x - mean).abs().mean(),
        'kurtosis': x.kurtosis(),
    }
    # Dict order is the column order the original code produced.
    for name, value in features.items():
        t_X.loc[s, f'{sensor}_{name}{postfix}'] = value

    return t_X


#Quantiles will calculate quantiles for each sensor
def quantiles(t_X, x, s, sensor, postfix=''):
    """Calculates quantile features of one sensor signal for the feature set.

    Writes eight quantiles of the signal, the same eight quantiles of its
    absolute value, and the interquartile range of both into row ``s``.

    Args:
        t_X (pandas.DataFrame): The feature set being built (modified in place).
        x (pandas.Series): The signal values.
        s (int): The integer number of the segment, used as the row label.
        sensor (str): The sensor name, used as the column-name prefix.
        postfix (str): The postfix string value appended to each column name.
    Return:
        t_X (pandas.DataFrame): The feature set being built.
    """
    levels = (
        ('q999', 0.999),
        ('q99', 0.99),
        ('q95', 0.95),
        ('q87', 0.87),
        ('q13', 0.13),
        ('q05', 0.05),
        ('q01', 0.01),
        ('q001', 0.001),
    )
    x_abs = np.abs(x)

    # Raw-signal quantiles first, then the absolute-value ones, matching the
    # original column order.
    for tag, values in (('', x), ('_abs', x_abs)):
        for name, level in levels:
            t_X.loc[s, f'{sensor}_{name}{tag}{postfix}'] = np.quantile(values, level)

    # The IQR columns previously ignored `postfix`, so calls with different
    # postfixes overwrote each other; names are unchanged for postfix=''.
    t_X.loc[s, f'{sensor}_iqr{postfix}'] = np.subtract(*np.percentile(x, [75, 25]))
    t_X.loc[s, f'{sensor}_iqr_abs{postfix}'] = np.subtract(*np.percentile(x_abs, [75, 25]))

    return t_X

#Linear regression builds a linear regression model for each sensor and returns the coefficients
def __linear_regression(arr, abs_v=False):
    """Fits a line to the signal against its sample index and returns the slope.

    Args:
        arr (array-like): The signal values, one per sample.
        abs_v (bool): When true, the fit is done on the absolute values.
    Return:
        The first (and only) coefficient of the fitted LinearRegression model.
    """
    targets = np.abs(arr) if abs_v else arr
    # The regressor is the sample position 0..len(arr)-1 as a single column.
    positions = np.arange(len(arr)).reshape(-1, 1)
    model = LinearRegression().fit(positions, targets)
    return model.coef_[0]


def __classic_sta_lta(x, length_sta, length_lta):
    """Computes the ratio of a short-window to a long-window mean of x squared.

    For each sample, the squared signal is averaged over the trailing
    ``length_sta`` samples (STA) and the trailing ``length_lta`` samples (LTA),
    and STA / LTA is returned. The first ``length_lta - 1`` STA values are
    forced to zero, so if ``length_lta`` exceeds the signal length the whole
    result is zero.

    Args:
        x (array-like): The signal values.
        length_sta (int): Number of samples in the short window.
        length_lta (int): Number of samples in the long window.
    Return:
        numpy.ndarray: The float64 STA/LTA ratio, one value per sample.
    """
    sta = np.cumsum(x ** 2)
    # Convert to float. The `np.float` alias was removed in NumPy 1.24;
    # float64 is the type it used to resolve to.
    sta = np.require(sta, dtype=np.float64)
    # Copy for LTA
    lta = sta.copy()
    # Compute the STA and the LTA as differences of the running sum
    sta[length_sta:] = sta[length_sta:] - sta[:-length_sta]
    sta /= length_sta
    lta[length_lta:] = lta[length_lta:] - lta[:-length_lta]
    lta /= length_lta
    # Pad zeros where the long window is not yet full
    sta[:length_lta - 1] = 0
    # Avoid division by zero by setting zero values to tiny float
    dtiny = np.finfo(0.0).tiny
    lta[lta < dtiny] = dtiny
    return sta / lta


def linear_regression(t_X, x, s, sensor, postfix=''):
    """Stores the linear-trend slope of the signal and of its absolute value.

    Args:
        t_X (pandas.DataFrame): The feature set being built (modified in place).
        x (pandas.Series): The signal values.
        s (int): The integer number of the segment, used as the row label.
        sensor (str): The sensor name, used as the column-name prefix.
        postfix (str): The postfix string value appended to each column name.
    Return:
        t_X (pandas.DataFrame): The feature set being built.
    """
    for tag, use_abs in (('', False), ('_abs', True)):
        t_X.loc[s, f'{sensor}_lr_coef{tag}{postfix}'] = __linear_regression(x, use_abs)
    return t_X


def classic_sta_lta(t_X, x, sensor, s, postfix=''):
    """Stores the mean STA/LTA ratio of the signal for four window pairs.

    Note that ``sensor`` comes before ``s`` here, unlike the other feature
    builders in this file; the order is kept so existing callers still work.

    Args:
        t_X (pandas.DataFrame): The feature set being built (modified in place).
        x (pandas.Series): The signal values.
        sensor (str): The sensor name, used as the column-name prefix.
        s (int): The integer number of the segment, used as the row label.
        postfix (str): The postfix string value appended to each column name
            (added for consistency with the other feature builders; the
            default leaves the column names unchanged).
    Return:
        t_X (pandas.DataFrame): The feature set being built.
    """
    # (short window, long window) lengths in samples, in column order 1..4.
    windows = ((500, 10000), (5000, 100000), (3333, 6666), (10000, 25000))
    for number, (length_sta, length_lta) in enumerate(windows, start=1):
        ratio = __classic_sta_lta(x, length_sta, length_lta)
        t_X.loc[s, f'{sensor}_classic_sta_lta{number}_mean{postfix}'] = ratio.mean()
    return t_X

#Fast-Fourier transforms
def fft(t_X, x, s, sensor, postfix=''):
    """Generates basic statistics over the fft of the signal.

    Args:
        t_X (pandas.DataFrame): The feature set being built (modified in place).
        x (pandas.Series): The signal values.
        s (int): The integer number of the segment, used as the row label.
        sensor (str): The sensor name, used as the column-name prefix.
        postfix (str): The postfix string value appended to each column name.
    Return:
        t_X (pandas.DataFrame): The feature set being built.
    """
    spectrum = np.fft.fft(x)

    # NOTE(review): this column name carries neither the sensor nor the
    # postfix, so every call for the same row writes the same 'fft_A0' cell
    # and only the last call's value is kept. Confirm whether that is
    # intended before renaming it: it changes the feature count.
    t_X.loc[s, 'fft_A0'] = abs(spectrum[0])

    # Same seven statistics for the real part, then for the imaginary part.
    for label, part in (('real', np.real(spectrum)), ('imag', np.imag(spectrum))):
        stats = {
            'mean': part.mean(),
            'std': part.std(),
            'max': part.max(),
            'min': part.min(),
            'median': np.median(part),
            'skew': spstats.skew(part),
            'kurtosis': spstats.kurtosis(part),
        }
        for stat, value in stats.items():
            t_X.loc[s, f'{sensor}_fft_{label}_{stat}{postfix}'] = value

    return t_X

#### Data Preprocessing and Feature Extraction for training data 

In [6]:
# Load the segment index; each segment_id becomes one row of the feature table.
train = pd.read_csv('C:/Datasets/predict-volcanic-eruptions-ingv-oe/train.csv') #path for the train.csv
train_set = pd.DataFrame({'segment_id': train.segment_id}).set_index('segment_id')

# Feature builders applied, in this order, to every sensor of every segment.
feature_steps = (basic_statistics, quantiles, linear_regression, fft)

j = 0  # NOTE(review): never used in this cell
for seg in train.segment_id:
    signals = pd.read_csv(f'C:/Datasets/predict-volcanic-eruptions-ingv-oe/train/{seg}.csv')
    for i in range(1, 11):
        sensor_id = f'sensor_{i}'
        # Missing readings are replaced by 0 before any feature is computed.
        filled = signals[sensor_id].fillna(0)
        for step in feature_steps:
            train_set = step(train_set, filled, seg, sensor_id, postfix='')

# Attach the remaining columns of train.csv to the features, keyed by segment_id.
train_set = train_set.reset_index().merge(train, on=['segment_id'], how='left').set_index('segment_id')

In [7]:
# Show the (rows, columns) shape of the merged training table.
train_set.shape

(4431, 442)

In [8]:
# Separate the merged table into the feature matrix and the regression target.
X_train = train_set.drop(columns=['time_to_eruption'])
y_train = train_set['time_to_eruption']

In [9]:
# Report the shapes of the training feature matrix and target.
print('X_train : ', X_train.shape)
print('y_train : ', y_train.shape)

X_train :  (4431, 441)
y_train :  (4431,)


In [10]:
import pickle

# Save the training features and target to the working directory.
# `with` closes each file even if pickle.dump raises.
with open("X_train_T1.pickle", "wb") as pickle_out:
    pickle.dump(X_train, pickle_out)

with open("y_train_T1.pickle", "wb") as pickle_out:
    pickle.dump(y_train, pickle_out)

#### Data Preprocessing and Feature Extraction for testing data

In [19]:
# The test segment ids are taken from the sample submission file.
test = pd.read_csv('C:/Datasets/predict-volcanic-eruptions-ingv-oe/sample_submission.csv') #path for sample_submission.csv
test_set = pd.DataFrame({'segment_id': test.segment_id}).set_index('segment_id')

# Feature builders applied, in this order, to every sensor of every segment.
feature_steps = (basic_statistics, quantiles, linear_regression, fft)

for seg in test.segment_id:
    signals = pd.read_csv(f'C:/Datasets/predict-volcanic-eruptions-ingv-oe/test/{seg}.csv')
    for i in range(1, 11):
        sensor_id = f'sensor_{i}'
        # Missing readings are replaced by 0 before any feature is computed.
        filled = signals[sensor_id].fillna(0)
        for step in feature_steps:
            test_set = step(test_set, filled, seg, sensor_id, postfix='')

In [20]:
# Show the (rows, columns) shape of the test feature table.
test_set.shape

(4520, 441)

In [25]:
# Target extraction is disabled for the test features; X_test is just
# another name for test_set (no copy is made).
#y_test = test_set['time_to_eruption']
X_test=test_set

In [36]:
# Report the shape of the test feature matrix (the y_test print is disabled).
print('X_test : ', X_test.shape)
#print('y_test : ', y_test.shape)

X_test :  (4520, 441)


In [28]:
import pickle

# Save the test features to the working directory.
# `with` closes the file even if pickle.dump raises.
with open("X_test_T1.pickle", "wb") as pickle_out:
    pickle.dump(X_test, pickle_out)

#pickle_out = open("y_test_T1.pickle","wb")
#pickle.dump(y_test, pickle_out)
#pickle_out.close()