In [1]:
import pandas as panda

import csv

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# from sklearn.model_selection import learning_curve

from datetime import datetime
from os import environ

## Loading the data ##


In [2]:
# The CSV to analyse is named by the `inputFileName` environment variable
# (a KeyError is raised here if it is not set).
# Example of a file used during development: 'quakes_radius300.csv'.
inputFileName = environ['inputFileName']
completeDataset = panda.read_csv(inputFileName)

# Row count of the raw file, before any cleaning.
nrRows = completeDataset.shape[0]
print('total data size: ', nrRows)

total data size:  2000


### Remove nulls ###

In [3]:
# Discard every row that has a missing value in any column, then refresh the
# row count that later cells use to size the training split.
completeDataset = completeDataset.dropna(axis=0, how='any')
nrRows = completeDataset.shape[0]
print('total data size: ', nrRows)

total data size:  2000


### Format timestamp ###

In [4]:
# def convert_date_to_number(date):
#     date = str(date)
#     date = date.replace('T',' ').replace('Z','+00:00')
#     return int(datetime.fromisoformat(date).timestamp())

# completeDataset['timestamp'] = completeDataset.time.apply(convert_date_to_number)

### Take only the necessary fields ###

In [None]:
# Keep only the three predictors plus the target (`mag`). A wider selection
# (timestamp, rms, magError, time) was tried earlier and is not used here.
selected_columns = ['latitude', 'longitude', 'depth', 'mag']

# .copy() so that later edits to reduced_dataset never touch completeDataset.
reduced_dataset = completeDataset.loc[:, selected_columns].copy()

reduced_dataset.head()


Unnamed: 0,latitude,longitude,depth,mag
0,-8.6473,-74.3805,155.59,4.5
1,-58.0753,-25.7163,69.39,5.0
2,-21.1131,-69.0743,132.21,4.4
3,-18.2029,-69.5807,152.75,5.1
4,13.2549,-87.7317,170.43,4.6


In [None]:
# Encode the magnitude as an integer class label: scale by 100 and round, so
# the classifiers below see a discrete target without losing the decimal
# digits. Every prediction is divided by 100 again before it is reported.
#
# The labels are computed from the untouched `completeDataset['mag']` column
# (reduced_dataset is a row-for-row copy of it, so the indexes line up) instead
# of from `reduced_dataset.mag` itself. That keeps this cell idempotent:
# re-running it no longer multiplies already-scaled labels by 100 a second time.
reduced_dataset['mag'] = (completeDataset['mag'] * 100).round().astype('int64')

## Extract X_train, Y_train, X_test and Y_test ##

In [None]:
# Separate the target column from the predictor columns.
reduced_dataset_mag = reduced_dataset['mag']
reduced_dataset_no_mag = reduced_dataset.drop(columns=['mag'])

# The first row is held out as the single sample to predict; the remaining
# nrRows - 1 rows form the training set.
# (A random 75/25 train_test_split with random_state=42 was used previously.)
n_train_rows = nrRows - 1

X_test = reduced_dataset_no_mag.head(1)
Y_test = reduced_dataset_mag.head(1)
X_train = reduced_dataset_no_mag.tail(n_train_rows)
Y_train = reduced_dataset_mag.tail(n_train_rows)

print('Training data size: ', len(X_train))
print('Validation data size: ', len(X_test))
X_train.head()

Training data size:  1999
Validation data size:  1


Unnamed: 0,latitude,longitude,depth
1,-58.0753,-25.7163,69.39
2,-21.1131,-69.0743,132.21
3,-18.2029,-69.5807,152.75
4,13.2549,-87.7317,170.43
5,-35.869,-103.8833,10.0


### Remove outliers ###

In [None]:
# def remove_outliers(df):
#     low = .0
#     high = 1.0
#     quant_df = df.quantile([low, high])
#     for name in list(df.columns):
#       if is_numeric_dtype(df[name]):
#        df = df[(df[name] > quant_df.loc[low, name]) 
#                & (df[name] < quant_df.loc[high, name])]
#     return df

# X_train = remove_outliers(X_train)

## Random Forest Classifier (RFC) ##

In [None]:
# Random forest with balanced class weights, trained on the raw features.
rf = RandomForestClassifier(
    n_estimators=300,
    class_weight='balanced',
    n_jobs=2,
    random_state=42,
)
rf.fit(X_train, Y_train)

# `pred` collects one prediction array per model, in the order the models are
# run. Labels were scaled by 100 for training, so divide to get magnitudes back.
rf_prediction = rf.predict(X_test) / 100
pred = [rf_prediction]
print(pred[0])
# (Accuracy scoring via rf.score / accuracy_score is left disabled.)

[4.2]


## MLP ##

### Without scaling ###

In [None]:
# Multi-layer perceptron: six hidden layers of 50 units, trained with SGD on
# the raw (unscaled) features.
mlp = MLPClassifier(
    hidden_layer_sizes=(50, 50, 50, 50, 50, 50),
    solver='sgd',
    alpha=1e-4,
    max_iter=500,
    tol=1e-9,
    random_state=42,
    verbose=0,
)
mlp.fit(X_train, Y_train)

# Undo the x100 label scaling before storing the prediction.
mlp_prediction = mlp.predict(X_test) / 100
pred.append(mlp_prediction)
print(pred[1])
# (Accuracy scoring via mlp.score / accuracy_score is left disabled.)

### With scaling ###

In [None]:
# Standardise the features. The scaler's statistics are learned from the
# training rows only and then reused, unchanged, on the held-out row.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

In [None]:
# Refit the same MLP object, this time on the standardised features, and
# predict the held-out row from its standardised form.
mlp.fit(X_train_scaled, Y_train)
mlp_scaled_prediction = mlp.predict(X_test_scaled) / 100  # undo x100 label scaling
pred.append(mlp_scaled_prediction)
print(pred[2])
# (Accuracy scoring via mlp.score is left disabled.)

### Errors Plot ###

In [None]:
# train_sizes, train_scores, validation_scores = learning_curve(
#     estimator = mlp,
#     X = X_train,
#     y = Y_train,
#     train_sizes = [1, 100, 500, 2000, 5000, 6152],
#     cv = 2,
#     scoring = 'neg_mean_squared_error'
# )
# train_scores_mean = -train_scores.mean(axis = 1)
# validation_scores_mean = -validation_scores.mean(axis = 1)
# plt.style.use('seaborn')
# plt.plot(train_sizes, train_scores_mean, label = 'Training error')
# plt.plot(train_sizes, validation_scores_mean, label = 'Validation error')
# plt.ylabel('RFC', fontsize = 14)
# plt.xlabel('Training set size', fontsize = 14)
# plt.legend()
# plt.ylim(0,0.5)

## Logistic Regression ##

In [None]:
# Logistic regression with the liblinear solver on the raw (unscaled) features.
# `multi_class` is deliberately not passed: with liblinear, 'auto' resolved to
# the same one-vs-rest scheme the estimator uses when the argument is omitted,
# and passing it explicitly raises a FutureWarning on scikit-learn >= 1.5,
# where the parameter is deprecated.
# Settings tried earlier and not used here: C=0.1, penalty='l1', tol=1e-6.
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train, Y_train)
# Labels were scaled by 100 for training, so divide to get magnitudes back.
pred.append(logreg.predict(X_test)/100)
print(pred[3])
# (Accuracy scoring via logreg.score is left disabled.)

## Support Vector Machines ##

In [None]:
# Support vector classifier on the raw features (default kernel, C and gamma
# both fixed at 0.1).
svc = SVC(C=0.1, gamma=0.1)
svc.fit(X_train, Y_train)

# Undo the x100 label scaling before storing the prediction.
svc_prediction = svc.predict(X_test) / 100
pred.append(svc_prediction)
print(pred[4])
# (Accuracy scoring via svc.score is left disabled.)

## Random Forests ##

In [None]:
# Second random forest configuration: 1000 entropy-criterion trees capped at
# depth 11, each grown on the full training set (bootstrap=False).
random_forest = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    max_depth=11,
    min_samples_split=2,
    min_samples_leaf=1,
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # now raises an error. For classifiers it meant 'sqrt', so this keeps the
    # same model on old versions and makes the cell run on current ones.
    max_features='sqrt',
    bootstrap=False,
    oob_score=False,
    n_jobs=-1,
    random_state=50,
    verbose=0
)

random_forest.fit(X_train, Y_train)
# Labels were scaled by 100 for training, so divide to get magnitudes back.
pred.append(random_forest.predict(X_test)/100)
print(pred[5])
# (Accuracy scoring via random_forest.score is left disabled.)

## Write Results ##

In [None]:
# Write the collected predictions to predictions.csv: a "Prediction" header
# followed by one row per model, in the order the models were run.
# newline='' is what the csv module asks for on files handed to csv.writer;
# without it, platforms that translate line endings (e.g. Windows) end up with
# a blank line after every row.
with open('predictions.csv', 'w', newline='') as csvfile:
    w = csv.writer(csvfile)
    w.writerow(["Prediction"])
    # Each entry of `pred` is the array returned by one model's predict() call
    # (already divided by 100); writerows emits each entry as one CSV row.
    w.writerows(pred)