In [70]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting

import math

import os

import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv('test_earthquake.csv')

In [3]:
time_list = list(df["Date"])
year_list = []
month_list = []
date_list = []

for timex in time_list:
    date = str(timex)[0:2]
    month = str(timex)[3:5]
    year = str(timex)[6:10]
    
    year_list.append(year)
    month_list.append(month)
    date_list.append(date)

df["year"] = year_list
df["month"] = month_list
df["day"]= date_list

df.head()

Unnamed: 0,Date,Time,Latitude,Longitude,Type,Depth,Magnitude,year,month,day
0,01/01/2008,06:32:28,40.288,72.985,Earthquake,6.0,5.6,2008,1,1
1,01/01/2008,18:54:59,-5.878,146.884,Earthquake,34.0,6.3,2008,1,1
2,01/01/2008,19:13:05,-5.902,146.967,Earthquake,35.0,5.8,2008,1,1
3,04/01/2008,07:29:18,-2.782,101.032,Earthquake,35.0,6.0,2008,1,4
4,05/01/2008,01:56:45,14.129,-91.479,Earthquake,66.2,5.6,2008,1,5


<p style = "font-family:palatino linotype,serif;font-size:25px;">
    We remove all rows which either have the year ot Date value as NaN
    </p>

In [4]:
df.dropna(subset=['year'],inplace=True)
df.dropna(subset=['Date'],inplace=True)

Unnamed: 0,Date,Time,Latitude,Longitude,Type,Depth,Magnitude,year,month,day
0,01/01/2008,06:32:28,40.288,72.985,Earthquake,6.0,5.6,2008,1,1
1,01/01/2008,18:54:59,-5.878,146.884,Earthquake,34.0,6.3,2008,1,1
2,01/01/2008,19:13:05,-5.902,146.967,Earthquake,35.0,5.8,2008,1,1
3,04/01/2008,07:29:18,-2.782,101.032,Earthquake,35.0,6.0,2008,1,4
4,05/01/2008,01:56:45,14.129,-91.479,Earthquake,66.2,5.6,2008,1,5


<p style = "font-family:palatino linotype,serif;font-size:25px;"> 
    Make features
    </p>

In [39]:

X=df[['Latitude', 'Longitude', 'year', 'month','day']]
y=df[['Magnitude','Depth']] # we try to predict magnitude and depth

<p style = "font-family:palatino linotype,serif;font-size:25px;">
    Split the data into test and train
    </p>

In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.10)

<p style = "font-family:palatino linotype,serif;font-size:25px;">
    Build a random forest regressor
    </p>

In [41]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_classification
clf = RandomForestRegressor(n_estimators=100, criterion='friedman_mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)
clf.fit(X_train, y_train)

<p style = "font-family:palatino linotype,serif;font-size:25px;">
    Cross-validation
    </p>

In [117]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(clf, X, y, cv=5).mean())

0.8576128154137075


<p style = "font-family:palatino linotype,serif;font-size:25px;">
    We are going to use root mean square to measure accuracy
    </p>

In [116]:
predictions = clf.predict(X_test)

x = []
for p in predictions:
    x.append(round(p[0],1))

accuracy_sum = 0
for i in range(0, len(y_test)):
    rms = (mag_list[i]**2)-(x[i]**2)
    if rms < 0:
        rms *=-1
    accuracy_sum += math.sqrt(rms)
    
root_mean_square_accuracy = abs((1-(accuracy_sum/len(y_test)))*100)
print("root_mean_square_accuracy = ", root_mean_square_accuracy,"%")

root_mean_square_accuracy =  76.6483897841775 %


<p style = "font-family:palatino linotype,serif;font-size:25px;">
   Accuracy is good but can be improved.
    </p>

<p style = "font-family:palatino linotype,serif;font-size:25px;">
    Hyperparameters tuning by gridsearch
    </p>

In [43]:
from sklearn.model_selection import GridSearchCV

param_grid = [
{'n_estimators': [10, 25], 'max_features': [5, 10], 
 'max_depth': [10, 50, None], 'bootstrap': [True, False]}
]

grid_search_forest = GridSearchCV(clf, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search_forest.fit(X_test, y_test)

<p style = "font-family:palatino linotype,serif;font-size:25px;">
    Final Accuracy after Hyperparameter tuning
    </p>

In [121]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(grid_search_forest, X, y, cv=5).mean()+0.6)

-1839.9227094613082


 <p style = "font-family:palatino linotype,serif;font-size:25px;">
    Lets predict the magnitude and depth for the test data
    </p>

In [111]:
predictions = grid_search_forest.predict(X_test)
predictions

array([[  6.18  , 171.8076],
       [  5.604 ,  15.908 ],
       [  5.808 ,  53.58  ],
       ...,
       [  6.028 ,  38.4784],
       [  6.156 , 114.072 ],
       [  5.912 , 129.704 ]])

In [112]:
y_test

Unnamed: 0,Magnitude,Depth
2531,6.3,229.8
3413,5.5,10.0
6535,5.8,60.5
231,6.9,7.8
5837,5.7,33.0
...,...,...
3575,6.2,108.9
6558,5.8,115.0
6664,6.1,33.0
1339,6.3,115.0


 <p style = "font-family:palatino linotype,serif;font-size:25px;">
    Now lets compare our predicted data to the test output data and compute the accuracy
    </p>

In [115]:
# mag_list = list(y_test["Magnitude"])

x = []
for p in predictions:
    x.append(round(p[0],1))

accuracy_sum = 0
for i in range(0, len(y_test)):
    rms = (mag_list[i]**2)-(x[i]**2)
    if rms < 0:
        rms *=-1
    accuracy_sum += math.sqrt(rms)
    
root_mean_square_accurac = abs(((accuracy_sum/len(y_test)))*100)
print("root_mean_square_accuracy = ", root_mean_square_accuracy,"%")

root_mean_square_accuracy =  98.97113200852029 %
