# [LANL competition](https://www.kaggle.com/c/LANL-Earthquake-Prediction)
- [introduction](https://www.kaggle.com/c/LANL-Earthquake-Prediction/discussion/77525)
- [Benchmark analysis](https://www.kaggle.com/inversion/basic-feature-benchmark/notebook)
- [good EDA and discussion + comments](https://www.kaggle.com/allunia/shaking-earth/comments)

In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR
from sklearn.metrics import mean_absolute_error

In [None]:
# Show the absolute path of the current working directory.
os.path.abspath('.')

In [None]:
# Tag interpolated into the HDF and submission file names used in this notebook.
version='v2'

In [None]:
# Data directory; every train/test/submission path in this notebook is joined onto it.
root = 'D:\\LANL\\all' # windows
#root = '/media/ben/data/kaggle/LANL/' # linux

# Load Training Data

In [None]:
# # load 9GB csv
# train = pd.read_csv(os.path.join(root,'train.csv'), dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

In [None]:
# Load the training data from a previously saved HDF file if possible; otherwise
# stream the CSV in chunks, tag every row with a failure-sequence id ('seq'),
# concatenate the chunks and save the result as HDF.
# NOTE(review): the file read here ('trainx_...') is not the file written at the
# end of this cell ('train_...'), so the saved copy is never picked up. Confirm
# whether that is deliberate (forcing a rebuild) before aligning the names.
try:
    train = pd.read_hdf(os.path.join(root,'trainx_{}.hdf'.format(version)),'mydata')
except (OSError, KeyError):
    # Only a missing/unreadable file (OSError) or a missing key (KeyError) means
    # "no usable cache"; anything else (e.g. KeyboardInterrupt) should surface.
    chunksize = 10 ** 6
    chunks = list()
    sequenceNumber = 0 # running failure-sequence id, carried across chunks
    last_ttf = None # time_to_failure of the last row of the previous chunk

    for chunk in tqdm(pd.read_csv(os.path.join(root,'train.csv'),
                             dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64},
                             chunksize=chunksize)):

        ttf = chunk['time_to_failure']
        # A positive step in time_to_failure marks the first row after a failure (reset).
        # np.array(...) takes a writable copy so the first element can be patched below.
        reset = np.array(ttf.diff() > 0)
        if last_ttf is not None:
            # diff() is NaN on the first row of a chunk, so a reset that lands exactly
            # on a chunk boundary must be detected against the previous chunk's last row.
            reset[0] = ttf.values[0] > last_ttf
        # Sequence id = id carried in from the previous chunk + resets seen so far.
        chunk['seq'] = (sequenceNumber + reset.cumsum()).astype(float)
        chunks.append(chunk)
        sequenceNumber = chunk['seq'].values[-1]
        last_ttf = ttf.values[-1]

    # concat chunks & save
    train = pd.concat(chunks)
    print('Out:{}'.format(len(train)))
    train.to_hdf(os.path.join(root,'train_{}.hdf'.format(version)),'mydata',mode='w')

In [None]:
# Show 15 decimals so the very small time_to_failure values are not rounded away on display.
pd.set_option('display.precision', 15)
train.head()

# Failures
There are several failures in the data set, but the time-to-failure (T2F) only ever reaches very small decimals rather than exactly zero. Let's use the time difference between consecutive rows to identify where the T2F jumps back up, and mark these points as failures.

In [None]:
# Make sure the sequence id column of every loaded chunk is stored as float.
for chunk in chunks:
    seq_as_float = chunk['seq'].astype(float)  # astype already returns a new Series
    chunk['seq'] = seq_as_float
    

In [None]:
# Drop the current frame before it is rebuilt from `chunks` in the next cell.
del train

In [None]:
# Rebuild the full training frame from the chunks, report its length and save it as HDF.
hdf_name = 'train_{}.hdf'.format(version)
train = pd.concat(chunks)
print('Out:{}'.format(len(train)))
train.to_hdf(os.path.join(root, hdf_name), 'mydata', mode='w')

In [None]:
# NOTE(review): this loop was left without a body, which is a syntax error that
# prevents the whole file from being parsed. A placeholder keeps it runnable;
# presumably it was meant to visit each failure sequence id (0..16) -- confirm.
for s in range(17):
    pass  # TODO: per-sequence work was never filled in

In [None]:
# Plot the sequence id over rows 5,600,000-5,700,000 to eyeball how it changes in that window.
train['seq'][5600000:5700000].astype(float).plot()

In [None]:
# Count rows whose sequence id is missing.
train['seq'].isnull().sum()

In [None]:
# Show the rows labelled with sequence id 0.
train.loc[train['seq']==0]

In [None]:
# Split `train` into 10000 pieces and binarise each piece's 'tdiff' column (1 where tdiff > 0).
# NOTE(review): nothing is appended to `chunks`, so it stays empty after this cell, and the
# binarised column is assigned on the split pieces only -- whether that reaches `train`
# depends on pandas view/copy behaviour; verify.
# NOTE(review): no visible cell in this notebook creates a 'tdiff' column on `train`.
chunks = list()
for chunk in np.array_split(train,10000):
    chunk['tdiff'] = (chunk['tdiff']>0).astype(int).copy()

In [None]:
# Label every row with the failure sequence it belongs to ('failSeq', short for
# Failure Sequence) and collect the list of sequence ids in `sequences`.
# A row with tdiff > 0 is the first row of a new sequence, so the running count of
# such rows is the sequence id. This replaces positional writes into a hard-coded
# column number (3), which silently hit the wrong column whenever the frame had a
# different number of columns, and which used index labels as row positions.
is_reset = (train['tdiff'] > 0).values
train['failSeq'] = is_reset.cumsum().astype(float)

# Row positions where each sequence starts/ends; sequence k covers [starts[k], ends[k]).
reset_rows = np.flatnonzero(is_reset)
starts = np.concatenate(([0], reset_rows))
ends = np.concatenate((reset_rows, [len(train)]))

# Also works when there is no reset at all (a single sequence 0).
sequences = list(range(len(starts)))
for seq_num, (start, end) in enumerate(zip(starts, ends)):
    print('Sequence:{}, start={}, end={}'.format(seq_num,start,end))

In [None]:
# just check that's all working
#train[5656570:5656580]

# Feature Engineering

In [None]:
# Bar chart of the number of rows in each failure sequence, in millions, ordered by sequence id.
(train['failSeq'].value_counts()/1000000).sort_index().plot.bar();

In [None]:
# Running sum of acoustic_data within each failure sequence (restarts at every new failSeq).
train['cumsum'] = train.groupby(['failSeq'])['acoustic_data'].cumsum()

In [None]:
# Trailing rolling mean of acoustic_data for each window size, computed one failure
# sequence at a time so a window never spans two sequences. min_periods=1 means the
# first rows of a sequence average over however many rows are available so far.
# A single groupby(['failSeq']).rolling(...) was tried instead but uses lots of memory.
ranges = [1000,10000]
for r in ranges:
    label = 'n{}k_mean'.format(r // 1000)
    train[label] = np.nan

    for seq in sequences:
        print(label,seq)
        in_seq = train['failSeq']==seq
        signal = train.loc[in_seq,'acoustic_data']
        train.loc[in_seq,label] = signal.rolling(window=r,min_periods=1).mean()

# Training Data Prep

1. There are 150,000 rows in each test segment. 
2. The training data is reduced from 6 million rows to one row of summary features per 150,000 rows (~5000 summary rows).

In [None]:
# Display the list of failure-sequence ids.
sequences

In [None]:
# Create a 4x2 grid of axes; nothing is drawn on it in this cell.
fig, axs = plt.subplots(4,2,figsize=(25, 4))


#axs[0].set_xlim(0,n+1);

In [None]:
# Build the training set: one row of summary statistics per consecutive block of
# `rows` samples. The target is the time_to_failure at the last sample of the block.
# Any trailing partial block is dropped.
rows = 150_000
segments = train.shape[0] // rows

# Feature name -> reduction applied to the block's acoustic_data.
stats = {'ave': np.mean, 'std': np.std, 'max': np.max, 'min': np.min}

X_train = pd.DataFrame(index=range(segments), dtype=np.float64,
                       columns=list(stats))
y_train = pd.DataFrame(index=range(segments), dtype=np.float64,
                       columns=['time_to_failure'])

for segment in tqdm(range(segments)):
    first_row = segment * rows
    seg = train.iloc[first_row:first_row + rows]
    x = seg['acoustic_data'].values
    y = seg['time_to_failure'].values[-1]

    y_train.loc[segment, 'time_to_failure'] = y

    for feature, reducer in stats.items():
        X_train.loc[segment, feature] = reducer(x)

In [None]:
# Fit the scaler on the training features (fit() returns the fitted scaler itself)
# and keep it so the test features can be scaled the same way.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Train, Predict & Score

In [None]:
# Fit a NuSVR with default settings on the scaled features, then predict on the
# same training rows (so y_pred is an in-sample prediction).
svm = NuSVR()
train_target = y_train['time_to_failure'].values  # the frame's only column, as a 1-D array
svm.fit(X_train_scaled, train_target)
y_pred = svm.predict(X_train_scaled)

In [None]:
# Scatter of predicted vs. actual time_to_failure, with a y = x reference line.
plt.figure(figsize=(6, 6))
plt.scatter(y_train.values.flatten(), y_pred)
plt.xlim(0, 20)
plt.ylim(0, 20)
plt.xlabel('actual', fontsize=12)
plt.ylabel('predicted', fontsize=12)
# plot() takes the x coordinates and the y coordinates as two separate sequences.
# The earlier nested (x, y) pairs were read as 2-D arrays and drew the diagonal twice.
plt.plot([0, 20], [0, 20])
plt.show()

In [None]:
# Mean absolute error between the actual targets and y_pred, printed to 3 decimals.
actual = y_train.values.flatten()
score = mean_absolute_error(actual, y_pred)
print(f'Score: {score:0.3f}')

# Predict on Test Data and Write Submission

In [None]:
# Load the sample submission, indexed by segment id, to use as the output template.
sample_path = os.path.join(root, 'sample_submission.csv')
submission = pd.read_csv(sample_path, index_col='seg_id')
submission.head()

In [None]:
# Empty float feature table for the test set: same columns as X_train, one row per seg_id.
X_test = pd.DataFrame(columns=X_train.columns, dtype=np.float64, index=submission.index)

In [None]:
# Fill X_test with the same four summary statistics, reading each test segment
# from <root>/test/<seg_id>.csv.
test_dir = os.path.join(root, 'test')
for seg_id in tqdm(X_test.index):
    seg = pd.read_csv(os.path.join(test_dir, seg_id) + '.csv')
    x = seg['acoustic_data'].values

    X_test.loc[seg_id, 'min'] = x.min()
    X_test.loc[seg_id, 'max'] = x.max()
    X_test.loc[seg_id, 'std'] = x.std()
    X_test.loc[seg_id, 'ave'] = x.mean()

In [None]:
# Scale the test features with the scaler fitted on the training features,
# then store the model's predictions in the submission frame.
X_test_scaled = scaler.transform(X_test)
submission['time_to_failure'] = svm.predict(X_test_scaled)

In [None]:
# Write the predictions to <root>/<version>_submission.csv (seg_id index included).
submission.to_csv(os.path.join(root,'{}_submission.csv'.format(version)))

# END