In [2]:
%matplotlib inline
import numpy as np
import scipy as sp
import scipy.signal as signal
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import seaborn as sns
import warnings
import sys, os

warnings.filterwarnings('ignore')
random_state = 6
np.random.seed(random_state)

In [3]:
# filename = 'seismogram_data_62.csv'
filename = 'seismogram_data_62_new.csv'
df = pd.read_csv(filename)

In [4]:
df.describe()

Unnamed: 0,mfccs_0,mfccs_1,mfccs_2,mfccs_3,mfccs_4,mfccs_5,mfccs_6,mfccs_7,mfccs_8,mfccs_9,...,centiroid,max_amplitude,mean_amplitude,moment,variation,skew,var,autocr,kurto,target
count,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,...,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0,22216.0
mean,841.535354,72.717304,-1.120552,52.234151,2.483228,34.444831,7.67164,22.622343,12.085533,15.528755,...,971.44996,239865400.0,296083.1,0.0,20.183022,-0.027121,555637100000.0,9.640448e+16,6.574073,0.450801
std,191.235047,38.199436,39.099088,19.40694,24.244137,12.98745,15.622192,9.220608,10.365557,7.056197,...,1035.548479,1618594000.0,4228556.0,0.0,4024.576118,1.39349,26259370000000.0,3.678673e+18,197.093498,0.497585
min,480.726115,-87.232311,-198.422667,-32.555981,-88.160009,-41.837174,-48.987699,-26.721758,-35.454891,-23.973469,...,3.588867,4490.561,41.40245,0.0,-427348.3125,-148.525772,0.0,1098498.0,-3.0,0.0
25%,699.059195,44.646795,-28.657048,39.179819,-16.10059,25.82346,-3.387524,16.973561,5.998598,11.365791,...,198.914251,585058.0,1836.427,0.0,-0.326106,-0.084359,36995.06,10651730000.0,-0.067151,0.0
50%,823.266826,73.334659,2.453667,53.082876,3.731802,34.055244,9.972462,22.04225,14.128136,15.497889,...,550.956333,1193249.0,3733.953,0.0,0.20473,-0.002923,148552.3,44483880000.0,0.197497,0.0
75%,959.873963,99.011031,28.34615,64.883821,22.434741,42.256966,20.350147,27.610037,19.563121,19.315831,...,1465.813517,7911418.0,20248.87,0.0,0.90134,0.064298,3477907.0,1656055000000.0,2.471488,1.0
max,2065.265843,233.043814,112.815778,132.919902,61.106265,118.624193,58.102843,71.468831,53.939397,65.529958,...,6894.203309,71428600000.0,543755100.0,0.0,287691.59375,67.194862,3695033000000000.0,4.833089e+20,27424.24321,1.0


In [5]:
Y = df['target']
X = df.drop(['target', 'moment'], axis=1)

print(np.shape(X), np.shape(Y))

X, Y = shuffle(X, Y, random_state = random_state)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state=random_state)
mms = StandardScaler()
X_train = mms.fit_transform(X_train)
X_test = mms.fit_transform(X_test)

(22216, 60) (22216,)


## LightGBM

In [16]:
import lightgbm as lgb
lgbm_model = lgb.LGBMClassifier(num_leaves=300, min_child_samples=5)
lgbm_model.fit(X_train, Y_train)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,
        max_bin=255, max_depth=-1, min_child_samples=5, min_child_weight=5,
        min_split_gain=0.0, n_estimators=10, n_jobs=-1, num_leaves=300,
        objective=None, random_state=0, reg_alpha=0.0, reg_lambda=0.0,
        silent=True, subsample=1.0, subsample_for_bin=50000,
        subsample_freq=1)

In [17]:
Y_pred = lgbm_model.predict(X_test)

print('1. The accuracy of the model is {}\n'.format(accuracy_score(Y_test, Y_pred)))
print('2. Classification report \n {} \n'.format(classification_report(Y_test, Y_pred)))
print('3. Confusion matrix \n {} \n'.format(confusion_matrix(Y_pred, Y_test)))
print('4. Roc_Auc score \n {}'.format(roc_auc_score(Y_pred, Y_test)))

1. The accuracy of the model is 0.5

2. Classification report 
              precision    recall  f1-score   support

        0.0       0.53      0.72      0.61      3020
        1.0       0.41      0.23      0.30      2534

avg / total       0.48      0.50      0.47      5554
 

3. Confusion matrix 
 [[2184 1941]
 [ 836  593]] 

4. Roc_Auc score 
 0.4722150264011705


## xgboost

In [61]:
import xgboost as xgb 
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [62]:
Y_pred = xgb_model.predict(X_test)

print('1. The accuracy of the model is {}\n'.format(accuracy_score(Y_test, Y_pred)))
print('2. Classification report \n {} \n'.format(classification_report(Y_test, Y_pred)))
print('3. Confusion matrix \n {} \n'.format(confusion_matrix(Y_pred, Y_test)))
print('4. Roc_Auc score \n {}'.format(roc_auc_score(Y_pred, Y_test)))

1. The accuracy of the model is 0.6070820565202588

2. Classification report 
              precision    recall  f1-score   support

        0.0       0.64      0.66      0.65      3221
        1.0       0.57      0.54      0.56      2653

avg / total       0.61      0.61      0.61      5874
 

3. Confusion matrix 
 [[2124 1211]
 [1097 1442]] 

4. Roc_Auc score 
 0.6024108465656892


## Random forest classifier

In [38]:
rf_model = RandomForestClassifier(n_estimators = 200, criterion='entropy', max_features='log2', class_weight='balanced')

rf_model.fit(X_train, Y_train)
Y_pred = rf_model.predict(X_test)

print('1. The accuracy of the model is {}\n'.format(accuracy_score(Y_test, Y_pred)))
print('2. Classification report \n {} \n'.format(classification_report(Y_test, Y_pred)))
print('3. Confusion matrix \n {} \n'.format(confusion_matrix(Y_pred, Y_test)))
print('4. Roc_Auc score \n {}'.format(roc_auc_score(Y_pred, Y_test)))

1. The accuracy of the model is 0.7799783939503061

2. Classification report 
              precision    recall  f1-score   support

        0.0       0.79      0.81      0.80      3020
        1.0       0.77      0.74      0.76      2534

avg / total       0.78      0.78      0.78      5554
 

3. Confusion matrix 
 [[2448  650]
 [ 572 1884]] 

4. Roc_Auc score 
 0.7786440973792066


## Gaussian Naive Bayes

In [59]:
from sklearn.naive_bayes import GaussianNB
gb_model = GaussianNB()

gb_model.fit(X_train, Y_train)
Y_pred = gb_model.predict(X_test)

print('1. The accuracy of the model is {}\n'.format(accuracy_score(Y_test, Y_pred)))
print('2. Classification report \n {} \n'.format(classification_report(Y_test, Y_pred)))
print('3. Confusion matrix \n {} \n'.format(confusion_matrix(Y_pred, Y_test)))
print('4. Roc_Auc score \n {}'.format(roc_auc_score(Y_pred, Y_test)))


1. The accuracy of the model is 0.5030643513789581

2. Classification report 
              precision    recall  f1-score   support

        0.0       0.80      0.12      0.22      3221
        1.0       0.48      0.96      0.64      2653

avg / total       0.65      0.50      0.41      5874
 

3. Confusion matrix 
 [[ 401   99]
 [2820 2554]] 

4. Roc_Auc score 
 0.638625604763677


## Support Vector Machine

In [60]:
from sklearn import svm
svm_model = svm.SVC()

svm_model.fit(X_train, Y_train)
Y_pred = svm_model.predict(X_test)

print('1. The accuracy of the model is {}\n'.format(accuracy_score(Y_test, Y_pred)))
print('2. Classification report \n {} \n'.format(classification_report(Y_test, Y_pred)))
print('3. Confusion matrix \n {} \n'.format(confusion_matrix(Y_pred, Y_test)))
print('4. Roc_Auc score \n {}'.format(roc_auc_score(Y_pred, Y_test)))

1. The accuracy of the model is 0.7774940415389854

2. Classification report 
              precision    recall  f1-score   support

        0.0       0.80      0.78      0.79      3221
        1.0       0.75      0.77      0.76      2653

avg / total       0.78      0.78      0.78      5874
 

3. Confusion matrix 
 [[2528  614]
 [ 693 2039]] 

4. Roc_Auc score 
 0.7754613730005695
