In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, max_error, mean_absolute_percentage_error
from scipy.stats import kurtosis, skew
import shap

In [None]:
# Read the training signal with explicit dtypes (int16 signal, float64 target).
train = pd.read_csv(
    '../input/LANL-Earthquake-Prediction/train.csv',
    dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64},
)
# Display floats with 15 digits of precision.
pd.set_option('display.precision', 15)
train.head()

The input dataset is divided into segments of 150 000 samples each (the same length as the segments of the challenge's test set), which yields 4194 segments.

In [None]:
rows = 150_000
# Number of complete segments; a trailing partial segment is dropped.
segments = train.shape[0] // rows

feature_names = ['ave', 'std', 'max', 'min', 'skew', 'kurtosis']

# Pull the two columns out once so the loop only slices NumPy arrays.
acoustic = train['acoustic_data'].values
time_to_failure = train['time_to_failure'].values

feature_rows = []
targets = []
for segment in range(segments):
    start = segment * rows
    x = acoustic[start:start + rows]
    # The target of a segment is the time to failure at its last sample.
    targets.append(time_to_failure[start + rows - 1])
    feature_rows.append({
        'ave': x.mean(),
        'std': x.std(),
        'max': x.max(),
        'min': x.min(),
        'skew': skew(x),
        'kurtosis': kurtosis(x),
    })

# Build both frames in one shot instead of writing cell by cell with .loc.
X_train = pd.DataFrame(feature_rows, index=range(segments), columns=feature_names, dtype=np.float64)
y_train = pd.DataFrame({'time_to_failure': targets}, index=range(segments), dtype=np.float64)

In [None]:
# Preview the first rows of the per-segment feature table.
X_train.head()

Next, standardize the training features (zero mean and unit variance per feature).

In [None]:
# Fit the scaler on the training features and standardize them; the fitted
# scaler is reused later for the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
# 1-D copy of the target column.
y_train_flatten = y_train.to_numpy().flatten()

In [None]:
def plot(y_train_flatten, y_pred, limit=20):
    """Scatter predicted against actual values with a y = x reference line.

    Parameters
    ----------
    y_train_flatten : array-like
        Actual target values (x axis).
    y_pred : array-like
        Predicted values (y axis), same length as ``y_train_flatten``.
    limit : float, default 20
        Upper bound of both axes; the reference diagonal runs from
        (0, 0) to (limit, limit).
    """
    plt.figure(figsize=(6, 6))
    plt.scatter(y_train_flatten, y_pred)
    plt.xlim(0, limit)
    plt.ylim(0, limit)
    plt.xlabel('actual', fontsize=12)
    plt.ylabel('predicted', fontsize=12)
    # Single diagonal: x and y are flat coordinate lists. Passing lists of
    # tuples makes matplotlib treat them as 2-D data and draw two overlapping
    # lines. The explicit colour keeps the line distinct from the points.
    plt.plot([0, limit], [0, limit], color='tab:orange')
    plt.show()

In [None]:
def score(y_train_flatten, y_pred):
    """Print max error, MAE and MAPE of the predictions against the targets.

    Parameters
    ----------
    y_train_flatten : array-like
        Actual target values.
    y_pred : array-like
        Predicted values, same length as ``y_train_flatten``.
    """
    # Named `max_err` rather than `max` so the built-in max() is not shadowed.
    max_err = max_error(y_train_flatten, y_pred)
    mae = mean_absolute_error(y_train_flatten, y_pred)
    mape = mean_absolute_percentage_error(y_train_flatten, y_pred)
    print(f'Max Error: {max_err:0.3f}')
    print(f'Mean Absolute Error: {mae:0.3f}')
    print(f'Mean Absolute Percentage Error: {mape:0.3f}')

# CatBoost with Root Mean Squared Error

In [None]:
# CatBoost regressor with default settings, fitted on the standardized features.
m_rmse = CatBoostRegressor()
# Reuse the flattened targets computed with the scaler instead of flattening again.
m_rmse.fit(X_train_scaled, y_train_flatten, silent=True)
# NOTE: these are in-sample predictions (same rows the model was fitted on),
# so the scores derived from them are optimistic.
y_pred_m_rmse = m_rmse.predict(X_train_scaled)

In [None]:
# Actual vs. predicted time to failure on the training segments.
plot(y_train_flatten, y_pred_m_rmse)

In [None]:
# Print the error metrics of the training-set predictions.
score(y_train_flatten, y_pred_m_rmse)

## Interpretation of the model

In [None]:
explainer = shap.Explainer(m_rmse)
# The model was fitted on the standardized features, so it must be explained
# on that same representation; feeding the raw features would attribute
# predictions for inputs the model never saw. The DataFrame wrapper keeps the
# feature names for the SHAP plots (displayed feature values are standardized).
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
shap_values = explainer(X_train_scaled_df)

In [None]:
# Bar plot summarizing the SHAP values per feature.
shap.plots.bar(shap_values)

The plot above shows the **importance** of each feature in scoring a segment.

In [None]:
# Beeswarm plot: one point per segment and feature.
shap.plots.beeswarm(shap_values)

The plot above shows how the calculated features in the dataset impact the model’s output.
Blue points correspond to low values of the feature, red points to high values.

In [None]:
# Load SHAP's JavaScript so the force plot can render in the notebook.
shap.initjs()
# Row of the explanation object to inspect in detail.
segment_to_explain = 1651
shap.plots.force(shap_values[segment_to_explain])

The graph above shows how the different features impact the score calculated for segment 1651. The magnitude of each arrow is the contribution (SHAP value) of that feature for this segment; the first graph of this section shows these magnitudes averaged over all segments.

# Submission of the model

In [None]:
# Sample submission, indexed by segment id; its index lists the test segments.
submission = pd.read_csv(
    '../input/LANL-Earthquake-Prediction/sample_submission.csv',
    index_col='seg_id',
)

In [None]:
# Empty float64 feature table: one row per test segment, same columns as X_train.
X_test = pd.DataFrame(
    index=submission.index,
    columns=X_train.columns,
    dtype=np.float64,
)

In [None]:
# Compute the same six summary statistics for every test segment file.
test_rows = []
for segment_id in X_test.index:
    filename = "../input/LANL-Earthquake-Prediction/test/{0}.csv".format(segment_id)
    segment = pd.read_csv(filename)
    x = segment['acoustic_data'].values
    test_rows.append({
        'ave': x.mean(),
        'std': x.std(),
        'max': x.max(),
        'min': x.min(),
        'skew': skew(x),
        'kurtosis': kurtosis(x),
    })

# Build the frame once instead of writing each cell with .loc; the index and
# the column order are kept so the features line up with the fitted scaler.
X_test = pd.DataFrame(test_rows, index=X_test.index, columns=X_train.columns, dtype=np.float64)

In [None]:
# Standardize the test features with the scaler fitted on the training set.
X_test_scaled = scaler.transform(X_test)
# Predict the time to failure of each test segment and write the submission file.
test_predictions = m_rmse.predict(X_test_scaled)
submission['time_to_failure'] = test_predictions
submission.to_csv('submission.csv')