In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR
from sklearn.metrics import mean_absolute_error

In [None]:
# Directory holding train.csv, sample_submission.csv and the test/ folder.
# Set the LANL_DATA_DIR environment variable to point elsewhere; when it is
# unset the original local path is used, so existing setups keep working.
root = os.environ.get('LANL_DATA_DIR', 'D:\\LANL\\all')

# Load Training Data

In [None]:
# Read the full training CSV (noted by the author as roughly 9 GB on disk).
# Explicit dtypes keep the signal as int16 and the target as float64.
train_csv_path = os.path.join(root, 'train.csv')
train_dtypes = {'acoustic_data': np.int16, 'time_to_failure': np.float64}
train = pd.read_csv(train_csv_path, dtype=train_dtypes)

In [None]:
# Cache the parsed frame as HDF5 under the key 'mydata', overwriting any
# existing file (mode='w'). `key` is passed by keyword because pandas 2.1
# deprecated positional arguments after the path (keyword-only in pandas 3.0).
train.to_hdf(os.path.join(root, 'train.hdf'), key='mydata', mode='w')

In [None]:
# Show 15 decimal places so small differences between consecutive
# time_to_failure values are visible in the rendered table.
pd.set_option('display.precision', 15)
train.head()

In [None]:
# Sanity check: list any rows whose time_to_failure is negative.
train.loc[train['time_to_failure'].lt(0.)]

# Training Data Prep

In [None]:
# Overlay the first `n` rows of the acoustic signal (left axis) and the
# matching time_to_failure (right axis). Twin axes each start their own colour
# cycle, so explicit colours are set to keep the two series distinguishable.
fig, ax1 = plt.subplots(figsize=(16, 8))
n = 4500
preview = train.iloc[:n]

preview['acoustic_data'].plot(ax=ax1, color='tab:blue')
ax1.set_xlabel('row index')
ax1.set_ylabel('acoustic_data', color='tab:blue')

ax2 = ax1.twinx()
preview['time_to_failure'].plot(ax=ax2, color='tab:red')
ax2.set_ylabel('time_to_failure', color='tab:red')

ax1.set_title(f'First {n} training rows');

In [None]:
# Collapse the raw signal into one feature row per window of `rows` consecutive
# samples: mean, standard deviation, max and min of acoustic_data. The label of
# each window is the time_to_failure at its last sample.
rows = 150_000  # window length; presumably the length of one test segment (confirm)
segments = train.shape[0] // rows  # a trailing partial window is discarded

feature_names = ['ave', 'std', 'max', 'min']

# Pull both columns out once so the loop slices plain NumPy arrays instead of
# building a DataFrame slice and doing scalar .loc writes on every iteration.
acoustic_values = train['acoustic_data'].values
ttf_values = train['time_to_failure'].values

feature_rows = []
targets = []
for segment in tqdm(range(segments)):
    start = segment * rows
    x = acoustic_values[start:start + rows]
    feature_rows.append([x.mean(), x.std(), x.max(), x.min()])
    targets.append(ttf_values[start + rows - 1])

# Build each frame in one shot; the reshape keeps the (segments, n_columns)
# shape valid even when there are zero complete windows.
X_train = pd.DataFrame(
    np.asarray(feature_rows, dtype=np.float64).reshape(segments, len(feature_names)),
    index=range(segments),
    columns=feature_names,
)
y_train = pd.DataFrame(
    np.asarray(targets, dtype=np.float64).reshape(segments, 1),
    index=range(segments),
    columns=['time_to_failure'],
)

In [None]:
# Standardise the features. The fitted scaler is kept so the identical
# transform can be applied to the test features later.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Train, Predict & Score

In [None]:
# Fit a NuSVR with default hyper-parameters on the scaled training features,
# then predict on those same rows (in-sample predictions).
svm = NuSVR()
svm.fit(X_train_scaled, y_train.values.ravel())
y_pred = svm.predict(X_train_scaled)

In [None]:
# Scatter of in-sample predictions against the true time_to_failure.
plt.figure(figsize=(6, 6))
plt.scatter(y_train.values.flatten(), y_pred)
plt.xlim(0, 20)
plt.ylim(0, 20)
plt.xlabel('actual', fontsize=12)
plt.ylabel('predicted', fontsize=12)
# Reference diagonal (predicted == actual). x and y are given as flat
# coordinate lists so exactly one line from (0, 0) to (20, 20) is drawn; an
# explicit colour keeps it distinct from the scatter points.
plt.plot([0, 20], [0, 20], color='tab:orange')
plt.show()

In [None]:
# Mean absolute error between the training targets and y_pred.
actual_ttf = y_train['time_to_failure'].to_numpy()
score = mean_absolute_error(actual_ttf, y_pred)
print(f'Score: {score:0.3f}')

# Predict on Test Data and Write Submission

In [None]:
# Load the sample submission; its seg_id column becomes the index and is
# used below to enumerate the test segments.
sample_submission_path = os.path.join(root, 'sample_submission.csv')
submission = pd.read_csv(sample_submission_path, index_col='seg_id')
submission.head()

In [None]:
# Empty float64 feature table: one row per seg_id in the submission index,
# with the same columns as the training features.
X_test = pd.DataFrame(
    index=submission.index,
    columns=X_train.columns,
    dtype=np.float64,
)

In [None]:
# Compute the same four summary statistics for every test segment, reading
# each segment from <root>/test/<seg_id>.csv.
test_dir = os.path.join(root, 'test')
for seg_id in tqdm(X_test.index):
    seg = pd.read_csv(os.path.join(test_dir, seg_id + '.csv'))
    x = seg['acoustic_data'].values

    segment_stats = {
        'ave': x.mean(),
        'std': x.std(),
        'max': x.max(),
        'min': x.min(),
    }
    for column, value in segment_stats.items():
        X_test.loc[seg_id, column] = value

  5%|▍         | 131/2624 [00:20<05:34,  7.45it/s]

In [None]:
# Scale the test features with the scaler fitted on the training data, then
# store the model's predictions in the submission frame.
X_test_scaled = scaler.transform(X_test)
test_predictions = svm.predict(X_test_scaled)
submission['time_to_failure'] = test_predictions

In [None]:
# Write the predictions to submission.csv in the current working directory;
# the seg_id index is written as the first column.
submission.to_csv('submission.csv', index=True)