In [1]:
# --- Data handling ---------------------------------------------------------
# Note the aliases: pandas is bound to `panda` and numpy to `numpy`
# everywhere in this notebook.
import pandas as panda
import numpy as numpy
# Seed numpy's global random generator. Several estimators further down also
# receive an explicit random_state of their own.
numpy.random.seed(42)
from pandas.api.types import is_numeric_dtype
import csv as csv

# --- scikit-learn: estimators, model selection, metrics, preprocessing -----
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import learning_curve

# --- Plotting / time / geo helpers -----------------------------------------
# NOTE(review): csv, pytz, seaborn, geopandas, LinearSVC, cross_val_score and
# the bokeh figure/output_file/show helpers are not referenced by any live
# code in the visible cells; datetime, matplotlib, is_numeric_dtype,
# train_test_split, accuracy_score and learning_curve appear only in
# commented-out code. Confirm against the rest of the notebook before pruning.
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook

from datetime import datetime
import pytz

import seaborn as sns
import matplotlib.pyplot as plt

import geopandas as gpd

output_notebook() # inline display of bokeh graphs

## Loading the data ##


In [2]:
# Read quakes.csv (resolved relative to the working directory) into a
# DataFrame and report how many rows it holds.
completeDataset = panda.read_csv('quakes.csv')
nrRows = completeDataset.shape[0]
print('total data size: ', nrRows)

total data size:  1276


### Remove nulls ###

In [3]:
# Discard every row that contains at least one missing value, then refresh the
# row counter.
# NOTE(review): this looks at all columns, so rows are also dropped for nulls
# in columns that the later feature selection does not keep. Consider
# dropna(subset=[...]) restricted to the columns actually used.
completeDataset = completeDataset.dropna(axis=0, how='any')
nrRows = completeDataset.shape[0]
print('total data size: ', nrRows)

total data size:  896


### Format timestamp ###

In [4]:
# def convert_date_to_number(date):
#     date = str(date)
#     date = date.replace('T',' ').replace('Z','+00:00')
#     return int(datetime.fromisoformat(date).timestamp())

# completeDataset['timestamp'] = completeDataset.time.apply(convert_date_to_number)

### Take only the necessary fields ###

In [5]:
# Work on an independent copy holding only latitude, longitude, depth and the
# magnitude column. A wider selection that also takes 'timestamp', 'rms',
# 'magError' and 'time' is left disabled.
selected_columns = ['latitude', 'longitude', 'depth', 'mag']
reduced_dataset = completeDataset.loc[:, selected_columns].copy()

# Last expression of the cell: preview the first rows.
reduced_dataset.head()


Unnamed: 0,latitude,longitude,depth,mag
0,-28.8317,-68.8227,82.7,4.5
1,-28.5433,-68.803,130.4,4.6
2,-28.4109,-67.489,101.51,5.5
3,-25.8261,-69.3778,108.01,4.3
4,-25.8184,-69.3011,135.12,4.3


In [6]:
# The classifiers below need discrete class labels, so each magnitude is
# encoded as an integer number of tenths (4.5 -> 45). Predictions are divided
# by 10 again when they are printed. Rounding to whole magnitudes is not used,
# since that would throw away the decimal.
#
# The labels are derived from the untouched completeDataset column rather than
# from reduced_dataset.mag itself, so re-running this cell cannot multiply the
# labels by 10 a second time. round() + astype(int) guards against products
# that are not exactly integral in binary floating point, which scikit-learn
# would reject as a continuous target.
# NOTE(review): assumes magnitudes carry a single decimal, as in the rows shown
# by head(); values with more decimals would be rounded to the nearest tenth.
# TODO confirm.
reduced_dataset['mag'] = (completeDataset['mag'] * 10).round().astype(int)

## Extract X_train, Y_train, X_test and Y_test ##

In [7]:
# Separate the target column from the feature columns.
reduced_dataset_mag = reduced_dataset['mag']
reduced_dataset_no_mag = reduced_dataset.drop(columns='mag')

# Hold out the first row as the single sample to predict and train on every
# other row. The split is positional and taken from the frame itself, not from
# the global nrRows counter: tail(nrRows - 1) returns *all* rows whenever
# nrRows is stale or larger than the frame, which would leak the held-out row
# into the training set.
# A random train_test_split (test_size=0.25, random_state=42) is left disabled.
X_test = reduced_dataset_no_mag.iloc[:1]
Y_test = reduced_dataset_mag.iloc[:1]
X_train = reduced_dataset_no_mag.iloc[1:]
Y_train = reduced_dataset_mag.iloc[1:]

print('Training data size: ', len(X_train))
print('Validation data size: ', len(X_test))
# Last expression of the cell: preview the training features.
X_train.head()

Training data size:  895
Validation data size:  1


Unnamed: 0,latitude,longitude,depth
1,-28.5433,-68.803,130.4
2,-28.4109,-67.489,101.51
3,-25.8261,-69.3778,108.01
4,-25.8184,-69.3011,135.12
5,-28.6372,-67.1752,131.87


### Remove outliers ###

In [8]:
# def remove_outliers(df):
#     low = .0
#     high = 1.0
#     quant_df = df.quantile([low, high])
#     for name in list(df.columns):
#       if is_numeric_dtype(df[name]):
#        df = df[(df[name] > quant_df.loc[low, name]) 
#                & (df[name] < quant_df.loc[high, name])]
#     return df

# X_train = remove_outliers(X_train)

## Random Forest Classifier (RFC), balanced class weights ##

In [9]:
# Random forest with 300 trees, 'balanced' class weights and two parallel jobs.
rf = RandomForestClassifier(
    n_estimators=300,
    class_weight='balanced',
    n_jobs=2,
    random_state=42,
)
rf.fit(X_train, Y_train)

# Labels were scaled by 10 earlier in the notebook; divide by 10 to print the
# prediction as a magnitude.
# Scoring via rf.score / accuracy_score is left disabled here.
pred = rf.predict(X_test)
print(pred / 10)

[4.6]


## MLP ##

### Without scaling ###

In [10]:
# Multi-layer perceptron with six hidden layers of 50 units each, trained with
# plain SGD on the raw (unscaled) features.
mlp = MLPClassifier(
    hidden_layer_sizes=(50,) * 6,
    solver='sgd',
    alpha=0.0001,
    max_iter=500,
    tol=1e-9,
    random_state=42,
    verbose=0,
)
mlp.fit(X_train, Y_train)

# Labels were scaled by 10 earlier in the notebook; divide by 10 to print the
# prediction as a magnitude.
# Scoring via mlp.score / accuracy_score is left disabled here.
pred = mlp.predict(X_test)
print(pred / 10)

[0.]


### With scaling ###

In [11]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to the training and the held-out features.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

In [12]:
# Train the existing mlp estimator again, this time on the standardised
# features, and predict the held-out row. fit() returns the estimator itself,
# so the two calls can be chained.
# Scoring via mlp.score is left disabled here.
pred = mlp.fit(X_train_scaled, Y_train).predict(X_test_scaled)
print(pred / 10)

[0.]




### Errors Plot ###

In [13]:
# train_sizes, train_scores, validation_scores = learning_curve(
#     estimator = mlp,
#     X = X_train,
#     y = Y_train,
#     train_sizes = [1, 100, 500, 2000, 5000, 6152],
#     cv = 2,
#     scoring = 'neg_mean_squared_error'
# )
# train_scores_mean = -train_scores.mean(axis = 1)
# validation_scores_mean = -validation_scores.mean(axis = 1)
# plt.style.use('seaborn')
# plt.plot(train_sizes, train_scores_mean, label = 'Training error')
# plt.plot(train_sizes, validation_scores_mean, label = 'Validation error')
# plt.ylabel('RFC', fontsize = 14)
# plt.xlabel('Training set size', fontsize = 14)
# plt.legend()
# plt.ylim(0,0.5)

## Logistic Regression ##

In [14]:
# Logistic regression with the liblinear solver on the unscaled features.
#
# multi_class='auto' is no longer passed explicitly: 'auto' has been the
# default since scikit-learn 0.22, and recent releases deprecate the parameter
# (FutureWarning now, removal announced), so spelling it out only makes the
# cell fragile without changing the fitted model.
# NOTE(review): recent scikit-learn releases may also warn about using
# liblinear on a multiclass target. Confirm against the installed version.
# Alternative settings noted by the author but not enabled:
#   C=0.1, penalty='l1', tol=1e-6
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train, Y_train)

# Labels were scaled by 10 earlier in the notebook; divide by 10 to print the
# prediction as a magnitude.
# Scoring via logreg.score is left disabled here.
logistic_regression_result = logreg.predict(X_test)
print(logistic_regression_result / 10)

[4.4]


## Support Vector Machines ##

In [15]:
# Support vector classifier with the default kernel, C=0.1 and gamma=0.1,
# trained on the unscaled features.
svc = SVC(gamma=0.1, C=0.1)
svc.fit(X_train, Y_train)

# Labels were scaled by 10 earlier in the notebook; divide by 10 to print the
# prediction as a magnitude.
# Scoring via svc.score is left disabled here.
pred = svc.predict(X_test)
print(pred / 10)

[4.4]


## Random Forests ##

In [16]:
# Second random forest configuration: 1000 entropy-split trees limited to
# depth 11, built without bootstrapping (each tree sees the full training set)
# and using all available cores.
#
# max_features is 'sqrt' instead of the original 'auto'. For a classifier
# 'auto' meant sqrt(n_features), so the model is unchanged, but 'auto' was
# deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards.
random_forest = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    max_depth=11,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=False,
    oob_score=False,
    n_jobs=-1,
    random_state=50,
    verbose=0
)

random_forest.fit(X_train, Y_train)

# Labels were scaled by 10 earlier in the notebook; divide by 10 to print the
# prediction as a magnitude.
# Scoring via random_forest.score is left disabled here.
pred = random_forest.predict(X_test)
print(pred / 10)

[4.6]
