[Link](https://www.kaggle.com/c/LANL-Earthquake-Prediction/overview)

A simple baseline with good features.

In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
%%time
train = pd.read_csv('/kaggle/input/LANL-Earthquake-Prediction/train.csv', dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

In [None]:
# train.head()

In [None]:
# Build a trend feature with a linear fit; arr is one segment of signal data.
def add_trend_feature(arr, abs_values=False):
    """Return the slope of a straight line fitted to ``arr`` against its sample index.

    Args:
        arr: One segment of signal samples (1-D, supports ``len``).
        abs_values: When true, the line is fitted to ``np.abs(arr)`` instead.

    Returns:
        The first coefficient of the fitted ``LinearRegression`` model.
    """
    values = np.abs(arr) if abs_values else arr
    # The regressor is the sample position 0..n-1, shaped as a single column.
    positions = np.arange(len(values)).reshape(-1, 1)
    model = LinearRegression()
    model.fit(positions, values)
    return model.coef_[0]

# Build features with the STA/LTA algorithm used for automatic detection of
# microseismic signals.
def classic_sta_lta(x, length_sta, length_lta):
    """Compute the classic STA/LTA ratio of a signal.

    The short-term average (STA) and long-term average (LTA) of the squared
    signal are derived from one cumulative sum, and their element-wise ratio
    is returned.

    Args:
        x: 1-D array-like of signal samples (any numeric dtype).
        length_sta: Window length, in samples, of the short-term average.
        length_lta: Window length, in samples, of the long-term average.

    Returns:
        A float64 array of the same length as ``x`` holding STA / LTA. The
        first ``length_lta - 1`` entries are 0.
    """
    # Square in float64. Squaring a narrow integer array (e.g. int16) in its
    # own dtype silently wraps around for |x| > 181 and corrupts the energy.
    energy = np.asarray(x, dtype=np.float64) ** 2
    sta = np.cumsum(energy)
    lta = sta.copy()

    # Turn the running totals into windowed means for STA and LTA.
    sta[length_sta:] = sta[length_sta:] - sta[:-length_sta]
    sta /= length_sta
    lta[length_lta:] = lta[length_lta:] - lta[:-length_lta]
    lta /= length_lta

    # Zero the STA until a full LTA window is available.
    sta[:length_lta - 1] = 0

    # Avoid division by zero by lifting (near-)zero LTA values to the
    # smallest positive normal float.
    dtiny = np.finfo(np.float64).tiny
    lta[lta < dtiny] = dtiny
    return sta / lta

In [None]:
rows = 150000  # number of signal samples taken per training segment
segments = int(np.floor(train.shape[0] / rows))

# One feature dict and one target per segment; the frames are assembled once
# after the loop instead of being enlarged cell by cell.
feature_rows = []
targets = []

# Build the features from consecutive, non-overlapping segments.
for segment in range(segments):
    seg = train.iloc[segment * rows:segment * rows + rows]
    # Work in float64: the training signal is stored as int16, and integer
    # arithmetic in that dtype (squaring inside classic_sta_lta, differences)
    # can wrap around, which would make these features disagree with the ones
    # computed for the test files.
    x = seg['acoustic_data'].values.astype(np.float64)
    x1 = pd.Series(x)
    zc = np.fft.fft(x)  # Fourier transform

    # The target of a segment is the time to failure at its last sample.
    targets.append(seg['time_to_failure'].values[-1])

    realFFT = np.real(zc)
    imagFFT = np.imag(zc)
    abs_x = np.abs(x)

    feats = {
        # Statistics of the real / imaginary parts of the spectrum.
        'Rmean': realFFT.mean(),
        'Rstd': realFFT.std(),
        'Rmax': realFFT.max(),
        'Rmin': realFFT.min(),
        'Imean': imagFFT.mean(),
        'Istd': imagFFT.std(),
        'Imax': imagFFT.max(),
        'Imin': imagFFT.min(),
        'Rmean_last_5000': realFFT[-5000:].mean(),
        'Rstd__last_5000': realFFT[-5000:].std(),
        'Rmax_last_5000': realFFT[-5000:].max(),
        'Rmin_last_5000': realFFT[-5000:].min(),
        'Rmean_last_15000': realFFT[-15000:].mean(),
        'Rstd_last_15000': realFFT[-15000:].std(),
        'Rmax_last_15000': realFFT[-15000:].max(),
        'Rmin_last_15000': realFFT[-15000:].min(),
        # Statistics of the raw signal.
        'mean': x.mean(),
        'stdev': x.std(),
        'variance': np.var(x),
        'max': x.max(),
        'min': x.min(),
        'max-min-diff': x.max() - x.min(),
        'max-mean-diff': x.max() - x.mean(),
        'mean-change-abs': np.mean(np.diff(x)),
        'abs-min': abs_x.min(),
        'abs-max': abs_x.max(),
        'abs_mean': abs_x.mean(),
        'abs_std': abs_x.std(),
        'std-first-50000': x[:50000].std(),
        'std-last-50000': x[-50000:].std(),
        # NOTE(review): named "mean" but computed with min(). Left as is
        # because the test-set loop computes this column the same way;
        # change both places together.
        'mean-first-50000': x[:50000].min(),
        'mean-last-50000': x[-50000:].mean(),
        'max-first-50000': x[:50000].max(),
        'max-last-50000': x[-50000:].max(),
        'min-first-50000': x[:50000].min(),
        'min-last-50000': x[-50000:].min(),
        'q01': np.quantile(x, 0.01),
        'q05': np.quantile(x, 0.05),
        'q10': np.quantile(x, 0.10),
        'q95': np.quantile(x, 0.95),
        'q99': np.quantile(x, 0.99),
        # Linear-trend slopes and STA/LTA means.
        'trend': add_trend_feature(x),
        'abs_trend': add_trend_feature(x, abs_values=True),
        'classic_sta_lta1_mean': classic_sta_lta(x, 500, 10000).mean(),
        'classic_sta_lta2_mean': classic_sta_lta(x, 5000, 100000).mean(),
        'classic_sta_lta3_mean': classic_sta_lta(x, 3333, 6666).mean(),
        'classic_sta_lta4_mean': classic_sta_lta(x, 10000, 25000).mean(),
    }

    # Sliding-window statistics via rolling().
    for w in [10, 50, 100, 1000]:
        rolled = (
            ('roll_std', x1.rolling(w).std().dropna().values,
             (('q01', 0.01), ('q05', 0.05), ('q10', 0.10), ('q95', 0.95), ('q99', 0.99))),
            ('roll_mean', x1.rolling(w).mean().dropna().values,
             (('q01', 0.01), ('q05', 0.05), ('q95', 0.95), ('q99', 0.99))),
            ('roll_abs_mean', x1.abs().rolling(w).mean().dropna().values,
             (('q01', 0.01), ('q05', 0.05), ('q95', 0.95), ('q99', 0.99))),
        )
        for label, values, quantiles in rolled:
            suffix = '_' + label + '_' + str(w)
            feats['ave' + suffix] = values.mean()
            feats['std' + suffix] = values.std()
            feats['max' + suffix] = values.max()
            feats['min' + suffix] = values.min()
            for q_name, q_level in quantiles:
                feats[q_name + suffix] = np.quantile(values, q_level)

    feature_rows.append(feats)

X = pd.DataFrame(feature_rows, index=range(segments), dtype=np.float64)
Y = pd.DataFrame({'time_to_failure': targets}, index=range(segments),
                 dtype=np.float64)

In [None]:
num_folds = 5


def grid_search_cv(model, parames, features, target):
    """Grid-search ``model`` over ``parames`` and return the best parameter set.

    The search uses ``num_folds``-fold cross-validation scored with
    ``neg_mean_absolute_error``. The best score (sign flipped), the elapsed
    wall-clock time and the best parameters are printed.

    Args:
        model: Estimator handed to ``GridSearchCV``.
        parames: Parameter grid handed to ``GridSearchCV``.
        features: Training features passed to ``fit``.
        target: Training target passed to ``fit``.

    Returns:
        ``best_params_`` of the fitted search.
    """
    started = time.time()
    search = GridSearchCV(
        model,
        parames,
        cv=num_folds,
        scoring='neg_mean_absolute_error',
    )
    search.fit(features, target)
    elapsed = time.time() - started

    # The scorer is negated, so the sign is flipped back for the report.
    print("Best CV score: {:.4f}, time: {:.1f}s".format(-search.best_score_, elapsed))
    print(search.best_params_)
    return search.best_params_

In [None]:
# Fit the scaler on the training features and scale them in one step;
# the target frame is flattened to a 1-D array.
scaler = StandardScaler()
train_x = scaler.fit_transform(X)
train_y = Y.values.flatten()

In [None]:
# Train the GBDT model
gbdt = GradientBoostingRegressor(
    learning_rate=0.05,
    min_samples_split=10,
    n_estimators=400,
    max_depth=3,
    max_features=11,
    min_samples_leaf=200,
    subsample=0.3,
    random_state=10,
)
gbdt.fit(train_x, train_y)
# Parameter tuning with grid search
# best_gbdt_params = grid_search_cv(gbdt, parame_gbdt, train_x, train_y)

In [None]:
# Train the XGBoost model
# params_xgb = {'reg_alpha': [0.05, 0.1,0.25,0.5,0.75, 1, 2, 3], 'reg_lambda': [0.05, 0.1,0.25,0.5,0.75, 1, 2, 3]}
xgb_fit = xgb.XGBRegressor(
    learning_rate=0.05,
    seed=0,
    n_estimators=200,
    max_depth=1,
    min_child_weight=7,
    gamma=0.1,
    subsample=0.2,
    colsample_bytree=0.75,
    reg_alpha=2,
    reg_lambda=2,
)
# best_xgb_params = grid_search_cv(xgb_fit, params_xgb, train_x, train_y)
xgb_fit.fit(train_x, train_y)

In [None]:
# Train the LightGBM model
# params_lgb = {'reg_alpha': [0, 0.001, 0.01, 0.03, 0.08, 0.1,0.3, 0.5],'reg_lambda': [0, 0.001, 0.01, 0.03, 0.08, 0.1,0.3, 0.5,0.75]}
lgb_fit = lgb.LGBMRegressor(
    objective='regression',
    n_estimators=200,
    learning_rate=0.05,
    max_depth=1,
    min_child_samples=20,
    reg_alpha=0.5,
    reg_lambda=0.75,
    min_child_weight=0.001,
    num_leaves=47,
    feature_fraction=0.9,
    bagging_fraction=0.6,
)
# best_lgb_params = grid_search_cv(lgb_fit, params_lgb, train_x, train_y)
lgb_fit.fit(train_x, train_y)

In [None]:
# CatBoost model training
# parameters = {'learning_rate': [0.01,0.02,0.1,0.5,1],
#         'l2_leaf_reg':[0.01,0.1,0.5,1],
#         'depth':[2,3,4,5,6,7]
# }
# # m.best_params_{'depth': 2, 'l2_leaf_reg': 1, 'learningf_rate': 0.01}
# cat = CatBoostRegressor(iterations=1000,loss_function='MAE')
# # cat.fit(X,Y.values.flatten())
# m = GridSearchCV(cat, param_grid=parameters, cv=3)
# m.fit(X, Y.values.flatten(), silent=True)

In [None]:
# Add the features for the test set.
# NOTE(review): the visible notebook never created X_testx and never imported
# tqdm, so this cell failed with NameError. The segment ids are read here from
# the competition's sample submission (assumed to sit next to train.csv and to
# have a 'seg_id' column; confirm the path), and the progress-bar wrapper is
# dropped.
test_seg_ids = pd.read_csv(
    '/kaggle/input/LANL-Earthquake-Prediction/sample_submission.csv',
    index_col='seg_id',
).index

# One feature dict per test segment; the frame is assembled once after the loop.
test_rows = []
for seg_id in test_seg_ids:
    seg = pd.read_csv('/kaggle/input/LANL-Earthquake-Prediction/test/' + seg_id + '.csv')
    # float64 keeps every statistic independent of the integer dtype pandas
    # happens to pick for the column.
    x = seg['acoustic_data'].values.astype(np.float64)
    x1 = pd.Series(x)
    zc = np.fft.fft(x)  # Fourier transform

    realFFT = np.real(zc)
    imagFFT = np.imag(zc)
    abs_x = np.abs(x)

    feats = {
        # Statistics of the real / imaginary parts of the spectrum.
        'Rmean': realFFT.mean(),
        'Rstd': realFFT.std(),
        'Rmax': realFFT.max(),
        'Rmin': realFFT.min(),
        'Imean': imagFFT.mean(),
        'Istd': imagFFT.std(),
        'Imax': imagFFT.max(),
        'Imin': imagFFT.min(),
        'Rmean_last_5000': realFFT[-5000:].mean(),
        'Rstd__last_5000': realFFT[-5000:].std(),
        'Rmax_last_5000': realFFT[-5000:].max(),
        'Rmin_last_5000': realFFT[-5000:].min(),
        'Rmean_last_15000': realFFT[-15000:].mean(),
        'Rstd_last_15000': realFFT[-15000:].std(),
        'Rmax_last_15000': realFFT[-15000:].max(),
        'Rmin_last_15000': realFFT[-15000:].min(),
        # Statistics of the raw signal.
        'mean': x.mean(),
        'stdev': x.std(),
        'variance': np.var(x),
        'max': x.max(),
        'min': x.min(),
        'max-min-diff': x.max() - x.min(),
        'max-mean-diff': x.max() - x.mean(),
        'mean-change-abs': np.mean(np.diff(x)),
        'abs-min': abs_x.min(),
        'abs-max': abs_x.max(),
        'abs_mean': abs_x.mean(),
        'abs_std': abs_x.std(),
        'std-first-50000': x[:50000].std(),
        'std-last-50000': x[-50000:].std(),
        # NOTE(review): named "mean" but computed with min(). Left as is
        # because the training loop computes this column the same way;
        # change both places together.
        'mean-first-50000': x[:50000].min(),
        'mean-last-50000': x[-50000:].mean(),
        'max-first-50000': x[:50000].max(),
        'max-last-50000': x[-50000:].max(),
        'min-first-50000': x[:50000].min(),
        'min-last-50000': x[-50000:].min(),
        'q01': np.quantile(x, 0.01),
        'q05': np.quantile(x, 0.05),
        'q10': np.quantile(x, 0.10),
        'q95': np.quantile(x, 0.95),
        'q99': np.quantile(x, 0.99),
        # Linear-trend slopes and STA/LTA means.
        'trend': add_trend_feature(x),
        'abs_trend': add_trend_feature(x, abs_values=True),
        'classic_sta_lta1_mean': classic_sta_lta(x, 500, 10000).mean(),
        'classic_sta_lta2_mean': classic_sta_lta(x, 5000, 100000).mean(),
        'classic_sta_lta3_mean': classic_sta_lta(x, 3333, 6666).mean(),
        'classic_sta_lta4_mean': classic_sta_lta(x, 10000, 25000).mean(),
    }

    # Sliding-window statistics via rolling().
    for w in [10, 50, 100, 1000]:
        rolled = (
            ('roll_std', x1.rolling(w).std().dropna().values,
             (('q01', 0.01), ('q05', 0.05), ('q10', 0.10), ('q95', 0.95), ('q99', 0.99))),
            ('roll_mean', x1.rolling(w).mean().dropna().values,
             (('q01', 0.01), ('q05', 0.05), ('q95', 0.95), ('q99', 0.99))),
            ('roll_abs_mean', x1.abs().rolling(w).mean().dropna().values,
             (('q01', 0.01), ('q05', 0.05), ('q95', 0.95), ('q99', 0.99))),
        )
        for label, values, quantiles in rolled:
            suffix = '_' + label + '_' + str(w)
            feats['ave' + suffix] = values.mean()
            feats['std' + suffix] = values.std()
            feats['max' + suffix] = values.max()
            feats['min' + suffix] = values.min()
            for q_name, q_level in quantiles:
                feats[q_name + suffix] = np.quantile(values, q_level)

    test_rows.append(feats)

X_testx = pd.DataFrame(test_rows, index=test_seg_ids, dtype=np.float64)
# Use exactly the column order the scaler was fitted on; a missing column
# raises KeyError here instead of being scaled against the wrong statistics.
X_testx = X_testx[X.columns]

X_test_scaled = pd.DataFrame(scaler.transform(X_testx), columns=X_testx.columns)

In [None]:
# Predict on the scaled test features with each fitted model.
# All three models were fitted on the plain ndarray train_x, so each one is
# given the same plain ndarray here (previously only XGBoost was); this keeps
# the calls consistent and avoids feature-name mismatches between fit and
# predict.
test_matrix = X_test_scaled.values
pred_test1 = gbdt.predict(test_matrix)
pred_test2 = xgb_fit.predict(test_matrix)
pred_test3 = lgb_fit.predict(test_matrix)