# LANL Earthquake Prediction - predict upcoming laboratory earthquakes

# Models

In [17]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

  from numpy.core.umath_tests import inner1d


#  Load Data

In [3]:
path = r"C:\Users\Friend\AI\AI_datasets\LANL\lanl"

In [4]:
statistical_features =  pd.read_csv(os.path.join(path,'statistical_features.csv'))
print(statistical_features.shape)
statistical_features.head()

(8387, 48)


Unnamed: 0,mean,abs_mean,mean_first_10000,mean_first_50000,mean_last_10000,mean_last_50000,std,abs_std,std_first_10000,std_first_50000,...,Abs_Quant25,Abs_Quant75,Abs_Quant95,Abs_Quant99,Abs_IQR,Sum,Abs_sum,MAD,Kurtosis,skew
0,4.884113,5.576567,5.182,4.9621,4.899007,5.01594,5.101106,4.333325,11.207151,6.488552,...,3.0,7.0,12.0,20.0,4.0,51820.0,836485.0,3.263401,33.662481,-0.024061
1,4.725767,5.734167,4.7772,4.6984,4.712293,4.69448,6.588824,5.732777,3.97675,7.305233,...,3.0,7.0,12.0,24.0,4.0,47772.0,860125.0,3.574302,98.758517,0.390561
2,4.906393,6.152647,4.6814,4.7061,4.886771,4.81588,6.967397,5.895945,8.454717,6.104836,...,3.0,8.0,14.0,30.0,5.0,46814.0,922897.0,3.948411,33.555211,0.217391
3,4.90224,5.93396,5.0364,4.84364,4.882936,4.83663,6.922305,6.061214,6.866177,6.238109,...,3.0,8.0,13.0,26.0,5.0,50364.0,890094.0,3.647117,116.548172,0.757278
4,4.90872,6.110587,4.9405,4.89116,4.923021,4.94855,7.30111,6.329485,5.164594,5.32383,...,3.0,8.0,13.0,32.0,5.0,49405.0,916588.0,3.826052,52.977905,0.064531


In [5]:
rollingwindow_features =  pd.read_csv(os.path.join(path,'rollingwindow_features.csv'))
print(rollingwindow_features.shape)
rollingwindow_features.head()

(8387, 56)


Unnamed: 0,MRoll_mean_50,MRoll_std_50,MRoll_max_50,MRoll_min_50,MRoll_5quantile_50,MRoll_25quantile_50,MRoll_75quantile_50,MRoll_95quantile_50,MRoll_99quantile_50,Abs_MRoll_5quantile_50,...,SRoll_5quantile_100,SRoll_25quantile_100,SRoll_75quantile_100,SRoll_95quantile_100,SRoll_99quantile_100,Abs_SRoll_5quantile_100,Abs_SRoll_25quantile_100,Abs_SRoll_75quantile_100,Abs_SRoll_95quantile_100,Abs_SRoll_99quantile_100
0,4.883969,0.606039,12.82,-3.1,3.96,4.52,5.26,5.8,6.36,3.96,...,2.475639,2.786312,4.018895,8.195903,16.948797,2.475639,2.786312,4.018895,8.195903,16.948797
1,4.725729,0.764507,28.26,-13.3,3.82,4.38,5.08,5.64,6.32,3.82,...,2.475965,2.783265,4.115246,9.829922,23.45727,2.475965,2.783265,4.115246,9.829922,23.45727
2,4.906072,0.811309,17.36,-7.72,3.88,4.52,5.28,5.94,7.14,3.88,...,2.538591,2.873406,4.476651,13.485267,28.598375,2.538591,2.873406,4.476651,13.485267,28.598375
3,4.902059,0.959834,27.72,-18.86,3.92,4.54,5.28,5.88,6.84,3.94,...,2.496442,2.800054,4.173643,10.36749,26.380301,2.496442,2.800054,4.173643,10.36749,26.380301
4,4.908958,0.903958,19.98,-8.82,3.94,4.54,5.26,5.92,7.5,3.94,...,2.491521,2.802668,4.151475,12.41382,34.718196,2.491521,2.802668,4.151475,12.41382,34.718196


In [6]:
fourier_features =  pd.read_csv(os.path.join(path,'fourier_features.csv'))
print(fourier_features.shape)
fourier_features.head()

(8387, 28)


Unnamed: 0,real_FFT_mean,real_FFT_std,real_FFT_max,real_FFT_min,real_FFT_5quantile,real_FFT_25quantile,real_FFT_75quantile,real_FFT_95quantile,real_FFT_99quantile,Abs_real_FFT_5quantile,...,img_FFT_5quantile,img_FFT_25quantile,img_FFT_75quantile,img_FFT_95quantile,img_FFT_99quantile,Abs_img_FFT_5quantile,Abs_img_FFT_25quantile,Abs_img_FFT_75quantile,Abs_img_FFT_95quantile,Abs_img_FFT_99quantile
0,12.0,2349.811482,732617.0,-20121.154171,-1622.831836,-479.454912,495.000015,1620.809807,4435.602455,43.066466,...,-1605.470156,-482.489889,482.489889,1605.470156,4405.696651,44.356456,221.172141,902.446488,2541.286199,6026.272473
1,5.0,2566.032248,708865.0,-31056.675076,-1866.865973,-505.679986,497.538637,1880.188004,5963.507327,44.059513,...,-1859.559966,-498.116853,498.116853,1859.559966,6167.335386,43.699724,225.440886,962.006407,3352.459228,8313.225811
2,5.0,2683.549049,735959.0,-27654.557067,-1939.964988,-501.23831,500.494469,1963.027088,6442.726128,44.454268,...,-1944.064062,-504.606183,504.606183,1944.064062,6505.967037,43.934691,231.641902,963.181454,3673.027915,8757.168892
3,5.0,2685.788525,735336.0,-25622.393604,-1933.184665,-495.443867,506.367685,1878.801921,6211.90047,43.595092,...,-1889.301741,-499.063608,499.063608,1889.301741,6304.333597,44.196261,227.584444,957.089563,3515.577331,8450.62756
4,12.0,2761.715771,736308.0,-26271.075117,-1983.873136,-493.043931,503.817756,2005.173493,7291.262605,44.142367,...,-1965.867577,-500.220786,500.220786,1965.867577,7236.530838,43.879064,225.525678,958.857782,3996.647033,9482.175503


In [7]:
temp = statistical_features.join(rollingwindow_features)
data = temp.join(fourier_features)
print(data.shape)
data.head()

(8387, 132)


Unnamed: 0,mean,abs_mean,mean_first_10000,mean_first_50000,mean_last_10000,mean_last_50000,std,abs_std,std_first_10000,std_first_50000,...,img_FFT_5quantile,img_FFT_25quantile,img_FFT_75quantile,img_FFT_95quantile,img_FFT_99quantile,Abs_img_FFT_5quantile,Abs_img_FFT_25quantile,Abs_img_FFT_75quantile,Abs_img_FFT_95quantile,Abs_img_FFT_99quantile
0,4.884113,5.576567,5.182,4.9621,4.899007,5.01594,5.101106,4.333325,11.207151,6.488552,...,-1605.470156,-482.489889,482.489889,1605.470156,4405.696651,44.356456,221.172141,902.446488,2541.286199,6026.272473
1,4.725767,5.734167,4.7772,4.6984,4.712293,4.69448,6.588824,5.732777,3.97675,7.305233,...,-1859.559966,-498.116853,498.116853,1859.559966,6167.335386,43.699724,225.440886,962.006407,3352.459228,8313.225811
2,4.906393,6.152647,4.6814,4.7061,4.886771,4.81588,6.967397,5.895945,8.454717,6.104836,...,-1944.064062,-504.606183,504.606183,1944.064062,6505.967037,43.934691,231.641902,963.181454,3673.027915,8757.168892
3,4.90224,5.93396,5.0364,4.84364,4.882936,4.83663,6.922305,6.061214,6.866177,6.238109,...,-1889.301741,-499.063608,499.063608,1889.301741,6304.333597,44.196261,227.584444,957.089563,3515.577331,8450.62756
4,4.90872,6.110587,4.9405,4.89116,4.923021,4.94855,7.30111,6.329485,5.164594,5.32383,...,-1965.867577,-500.220786,500.220786,1965.867577,7236.530838,43.879064,225.525678,958.857782,3996.647033,9482.175503


In [9]:
y =  pd.read_csv(os.path.join(path,'output.csv'))
print(y.shape)
y.head()

(8387, 1)


Unnamed: 0,time_to_failure
0,1.430797
1,1.391499
2,1.353196
3,1.313798
4,1.2744


In [13]:
test_statistical_features =  pd.read_csv(os.path.join(path,'test_statistical_features.csv'))
print(test_statistical_features.shape)
test_statistical_features.head()

(2624, 48)


Unnamed: 0,mean,abs_mean,mean_first_10000,mean_first_50000,mean_last_10000,mean_last_50000,std,abs_std,std_first_10000,std_first_50000,...,Abs_Quant25,Abs_Quant75,Abs_Quant95,Abs_Quant99,Abs_IQR,Sum,Abs_sum,MAD,Kurtosis,skew
0,4.49178,5.224607,4.3842,4.46644,4.5078,4.48968,4.89369,4.102161,5.226846,5.350451,...,3.0,7.0,11.0,19.0,4.0,43842.0,783691.0,3.248521,28.837568,0.327908
1,4.171153,5.19834,4.0635,4.01786,4.1742,4.13451,5.922839,5.045369,3.523253,6.249515,...,2.0,7.0,12.0,24.0,5.0,40635.0,779751.0,3.429208,56.218955,0.295708
2,4.61026,5.597193,4.2452,4.55518,4.601907,4.63849,6.94699,6.179525,3.950119,9.793473,...,3.0,7.0,12.0,25.0,4.0,42452.0,839579.0,3.461984,162.118284,0.428688
3,4.531473,4.961487,4.3834,4.49052,4.536093,4.5526,4.114147,3.583863,4.001275,3.664088,...,3.0,7.0,10.0,16.0,4.0,43834.0,744223.0,2.678503,41.241827,0.061889
4,4.12834,5.0709,4.4902,4.2302,4.117429,4.13989,5.797164,4.993617,5.214578,5.321133,...,2.0,7.0,11.0,22.0,5.0,44902.0,760635.0,3.283856,79.539708,0.073898


In [14]:
test_rollingwindow_features =  pd.read_csv(os.path.join(path,'test_rollingwindow_features.csv'))
print(test_rollingwindow_features.shape)
test_rollingwindow_features.head()

(2624, 56)


Unnamed: 0,MRoll_mean_50,MRoll_std_50,MRoll_max_50,MRoll_min_50,MRoll_5quantile_50,MRoll_25quantile_50,MRoll_75quantile_50,MRoll_95quantile_50,MRoll_99quantile_50,Abs_MRoll_5quantile_50,...,SRoll_5quantile_100,SRoll_25quantile_100,SRoll_75quantile_100,SRoll_95quantile_100,SRoll_99quantile_100,Abs_SRoll_5quantile_100,Abs_SRoll_25quantile_100,Abs_SRoll_75quantile_100,Abs_SRoll_95quantile_100,Abs_SRoll_99quantile_100
0,4.491821,0.595202,14.28,-2.62,3.6,4.14,4.84,5.38,5.96,3.6,...,2.514985,2.833619,4.026478,8.362223,16.026427,2.514985,2.833619,4.026478,8.362223,16.026427
1,4.171347,0.720321,18.42,-9.92,3.26,3.82,4.52,5.08,5.94,3.26,...,2.475659,2.780815,3.915509,9.954726,24.173153,2.475659,2.780815,3.915509,9.954726,24.173153
2,4.610326,0.753897,33.16,-12.6,3.68,4.26,4.96,5.5,6.2,3.68,...,2.475639,2.743219,3.729977,8.870357,26.885607,2.475639,2.743219,3.729977,8.870357,26.885607
3,4.531519,0.529305,12.74,-3.22,3.74,4.22,4.84,5.32,5.74,3.74,...,2.380476,2.599048,3.053397,6.106885,14.290249,2.380476,2.599048,3.053397,6.106885,14.290249
4,4.128184,0.676384,20.8,-8.88,3.22,3.76,4.48,5.04,5.72,3.22,...,2.44007,2.708964,3.657012,9.545553,21.857599,2.44007,2.708964,3.657012,9.545553,21.857599


In [15]:
test_fourier_features =  pd.read_csv(os.path.join(path,'test_fourier_features.csv'))
print(test_fourier_features.shape)
test_fourier_features.head()

(2624, 28)


Unnamed: 0,real_FFT_mean,real_FFT_std,real_FFT_max,real_FFT_min,real_FFT_5quantile,real_FFT_25quantile,real_FFT_75quantile,real_FFT_95quantile,real_FFT_99quantile,Abs_real_FFT_5quantile,...,img_FFT_5quantile,img_FFT_25quantile,img_FFT_75quantile,img_FFT_95quantile,img_FFT_99quantile,Abs_img_FFT_5quantile,Abs_img_FFT_25quantile,Abs_img_FFT_75quantile,Abs_img_FFT_95quantile,Abs_img_FFT_99quantile
0,4.0,2198.344036,673767.0,-14758.442559,-1678.489341,-485.801935,486.20139,1661.823673,4470.876941,43.399849,...,-1655.008891,-490.942334,490.942334,1655.008891,4366.437426,42.795036,224.132322,921.270665,2630.114727,5707.384923
1,5.0,2289.922379,625673.0,-22626.387706,-1811.536525,-499.019587,497.452948,1834.63664,5462.314638,44.67438,...,-1826.025863,-499.703161,499.703161,1826.025863,5507.859783,45.15663,228.293443,953.010116,3171.770604,7237.97501
2,8.0,2611.055629,691539.0,-23593.939294,-1938.837969,-490.811968,506.433391,1927.431862,6617.537834,43.898088,...,-1907.575533,-502.357239,502.357239,1907.575533,6682.716341,45.425604,232.309455,959.293358,3615.795524,8976.339212
3,2.0,2085.543454,679721.0,-11908.537959,-1480.047712,-481.757014,478.459079,1490.191774,3403.298285,42.740339,...,-1487.046177,-480.409197,480.409197,1487.046177,3421.404809,43.252146,219.937573,887.44113,2116.247775,4610.570182
4,5.0,2243.929923,619251.0,-24048.05587,-1696.502599,-486.496051,491.031805,1690.514869,5081.064421,43.817842,...,-1703.798814,-491.703463,491.703463,1703.798814,5211.634169,43.529178,224.773642,927.654752,2862.623827,7182.467858


In [16]:
ttemp = test_statistical_features.join(test_rollingwindow_features)
test_data = ttemp.join(test_fourier_features)
print(test_data.shape)
test_data.head()

(2624, 132)


Unnamed: 0,mean,abs_mean,mean_first_10000,mean_first_50000,mean_last_10000,mean_last_50000,std,abs_std,std_first_10000,std_first_50000,...,img_FFT_5quantile,img_FFT_25quantile,img_FFT_75quantile,img_FFT_95quantile,img_FFT_99quantile,Abs_img_FFT_5quantile,Abs_img_FFT_25quantile,Abs_img_FFT_75quantile,Abs_img_FFT_95quantile,Abs_img_FFT_99quantile
0,4.49178,5.224607,4.3842,4.46644,4.5078,4.48968,4.89369,4.102161,5.226846,5.350451,...,-1655.008891,-490.942334,490.942334,1655.008891,4366.437426,42.795036,224.132322,921.270665,2630.114727,5707.384923
1,4.171153,5.19834,4.0635,4.01786,4.1742,4.13451,5.922839,5.045369,3.523253,6.249515,...,-1826.025863,-499.703161,499.703161,1826.025863,5507.859783,45.15663,228.293443,953.010116,3171.770604,7237.97501
2,4.61026,5.597193,4.2452,4.55518,4.601907,4.63849,6.94699,6.179525,3.950119,9.793473,...,-1907.575533,-502.357239,502.357239,1907.575533,6682.716341,45.425604,232.309455,959.293358,3615.795524,8976.339212
3,4.531473,4.961487,4.3834,4.49052,4.536093,4.5526,4.114147,3.583863,4.001275,3.664088,...,-1487.046177,-480.409197,480.409197,1487.046177,3421.404809,43.252146,219.937573,887.44113,2116.247775,4610.570182
4,4.12834,5.0709,4.4902,4.2302,4.117429,4.13989,5.797164,4.993617,5.214578,5.321133,...,-1703.798814,-491.703463,491.703463,1703.798814,5211.634169,43.529178,224.773642,927.654752,2862.623827,7182.467858


# Machine Learning Models

# Linear Regression

In [18]:
Grid_Parameters = {'alpha' : [0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000]}

clf_linear = GridSearchCV(SGDRegressor(loss = "squared_loss", penalty = "l2"),Grid_Parameters,cv =5)
clf_linear.fit(data,y)
alpha = clf_linear.best_params_["alpha"]

print(alpha)

10


In [19]:
clf_linear = SGDRegressor(loss = "squared_loss", penalty = "l2", alpha = alpha)
clf_linear.fit(data,y)

y_pred = clf_linear.predict(data)
MAE_Linear = mean_absolute_error(y,y_pred)

# Random Forest

In [38]:
grid_hyperparameter = [{'n_estimators'  : [30,40,50,60,100,200,500,1000],'max_depth':[10,15,20]}]

clf = GridSearchCV(RandomForestRegressor(max_features='sqrt',min_samples_leaf=4,min_samples_split=3), grid_hyperparameter, cv=2)
clf.fit(data,y)

clf_nr = clf.best_estimator_.get_params()['n_estimators']
clf_depthr = clf.best_estimator_.get_params()['max_depth']

print(clf_nr,clf_depthr)

1000 20


In [39]:
clf_RF = RandomForestRegressor(max_features='sqrt',min_samples_leaf=4,min_samples_split=3,n_estimators=clf_nr,max_depth = clf_depthr)
clf_RF.fit(data,y)

y_pred = clf_RF.predict(data)
MAE_RF = mean_absolute_error(y,y_pred)

# SVR

In [24]:
from sklearn.svm import  SVR

grid_hyperparameter = [{'C': [0.1, 0.2, 0.25, 0.5, 1, 1.5, 2]}]

clf_SVR = GridSearchCV(SVR(kernel='rbf', tol=0.01), grid_hyperparameter, cv=5, scoring='neg_mean_absolute_error')
clf_SVR.fit(data,y)
clf_c = clf_SVR.best_estimator_.get_params()['C']

In [25]:
clf_SVR = SVR(kernel='rbf', tol=0.01,C = clf_c)
clf_SVR.fit(data,y)

y_pred = clf_SVR.predict(data)
MAE_SVR = mean_absolute_error(y,y_pred)

# XGBoost

In [27]:
grid_hyperparameter = [{'n_estimators'  : [10,20,30],'max_depth':[5,10]}]

x_model = xgb.XGBRegressor(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=3,
 min_child_weight=3,
 gamma=0,
 subsample=0.8,
 reg_alpha=200, reg_lambda=200,
 colsample_bytree=0.8,nthread=4)

clf_XG = GridSearchCV(x_model, grid_hyperparameter, cv=2)
clf_XG.fit(data,y)

clf_n = clf_XG.best_estimator_.get_params()['n_estimators']
clf_depth = clf_XG.best_estimator_.get_params()['max_depth']

In [28]:
clf_XG =  xgb.XGBRegressor(
 learning_rate =0.1,
 n_estimators=clf_n,
 max_depth=clf_depth,
 min_child_weight=3,
 gamma=0,
 subsample=0.8,
 reg_alpha=200, reg_lambda=200,
 colsample_bytree=0.8,nthread=4)
clf_XG.fit(data,y)

y_pred = clf_XG.predict(data)
MAE_XG = mean_absolute_error(y,y_pred)

# Conclusion

In [40]:
from prettytable import PrettyTable
    
Table = PrettyTable()

Table.field_names  = ["Model","parameter","MAE"]

Table.add_row(["Linear Regression",alpha, MAE_Linear])
Table.add_row(["Random Forest",[clf_nr,clf_depthr],MAE_RF])
Table.add_row(["SVR",clf_c,MAE_SVR])
Table.add_row(["XG",[clf_n,clf_depth],MAE_XG])


print(Table)

+-------------------+------------+------------------------+
|       Model       | parameter  |          MAE           |
+-------------------+------------+------------------------+
| Linear Regression |     10     | 3.9988299777617756e+20 |
|   Random Forest   | [1000, 20] |   1.1440665395512535   |
|        SVR        |    0.1     |   2.9407977588754917   |
|         XG        |  [30, 5]   |   2.044399770580513    |
+-------------------+------------+------------------------+


# Submission File

In [43]:
clf_RF = RandomForestRegressor(max_features='sqrt',min_samples_leaf=4,min_samples_split=3,n_estimators=1000,max_depth = 20)
clf_RF.fit(data,y)

y_pred = clf_RF.predict(test_data)

In [44]:
y_pred[0:5]

array([2.83253445, 5.71063887, 5.28570185, 9.06505927, 7.29267752])

In [45]:
submission = pd.read_csv(os.path.join(path,"sample_submission.csv"))
submission['time_to_failure'] = y_pred
submission.to_csv(os.path.join(path,"result.csv"),index = False)

In [46]:
submission = pd.read_csv(os.path.join(path,"result.csv"))
submission.head()

Unnamed: 0,seg_id,time_to_failure
0,seg_00030f,2.832534
1,seg_0012b5,5.710639
2,seg_00184e,5.285702
3,seg_003339,9.065059
4,seg_0042cc,7.292678
