In [1]:
import pandas as panda

import csv
import re

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# from sklearn.model_selection import learning_curve

from datetime import datetime
from os import environ

## Loading the data ##


In [2]:
# The CSV to analyse is named by the `inputFileName` environment variable
# (for example 'quakes_radius1000.csv'); a missing variable raises KeyError.
inputFileName = environ['inputFileName']
completeDataset = panda.read_csv(inputFileName)
nrRows = completeDataset.shape[0]
print('total data size: ', nrRows)

total data size:  2000


### Remove nulls ###

In [3]:
# Discard every row that contains at least one missing value and refresh the
# row count printed below.
completeDataset = completeDataset.dropna(axis=0, how='any')
nrRows = completeDataset.shape[0]
print('total data size: ', nrRows)

total data size:  2000


### Format timestamp ###

In [4]:
# Disabled step: would add a numeric `timestamp` column (whole seconds, via
# datetime.timestamp()) parsed from the ISO-8601 strings in the `time` column.
# It is kept for reference only; the feature selection further down does not
# use `timestamp`.
# def convert_date_to_number(date):
#     date = str(date)
#     date = date.replace('T',' ').replace('Z','+00:00')
#     return int(datetime.fromisoformat(date).timestamp())

# completeDataset['timestamp'] = completeDataset.time.apply(convert_date_to_number)

### Take only the necessary fields ###

In [5]:
# Keep the three spatial features plus the target (`mag`). A wider selection
# that also included timestamp, rms, magError and time is not used at present.
feature_and_target_columns = ['latitude', 'longitude', 'depth', 'mag']
reduced_dataset = completeDataset.loc[:, feature_and_target_columns].copy()

reduced_dataset.head()


Unnamed: 0,latitude,longitude,depth,mag
0,-27.6846,-67.9473,132.73,4.6
1,-30.63,-71.4315,44.75,4.2
2,-24.034,-66.9969,168.16,4.8
3,-24.0112,-66.93,164.35,5.4
4,-24.1014,-66.7737,213.97,4.2


In [6]:
# Express the magnitude in hundredths and round it to a whole number, so the
# target is a discrete value for which accuracy can be computed. Predictions
# are divided by 100 again where they are produced.
reduced_dataset['mag'] = (reduced_dataset['mag'] * 100).apply(round)

## Extract X_train, Y_train, X_test and Y_test ##

In [7]:
# Hold out the first row as the single sample to predict and train on every
# other row.
reduced_dataset_mag = reduced_dataset['mag']
reduced_dataset_no_mag = reduced_dataset.drop("mag", axis=1)

# Split by position on the frames themselves instead of relying on the global
# `nrRows` computed in an earlier cell: if that count were stale, the training
# rows would overlap the held-out row or some rows would be dropped silently.
X_test = reduced_dataset_no_mag.iloc[:1]
Y_test = reduced_dataset_mag.iloc[:1]
X_train = reduced_dataset_no_mag.iloc[1:]
Y_train = reduced_dataset_mag.iloc[1:]

print('Training data size: ', len(X_train))
print('Validation data size: ', len(X_test))
X_train.head()

Training data size:  1999
Validation data size:  1


Unnamed: 0,latitude,longitude,depth
1,-30.63,-71.4315,44.75
2,-24.034,-66.9969,168.16
3,-24.0112,-66.93,164.35
4,-24.1014,-66.7737,213.97
5,-24.2461,-67.3871,207.1


### Remove outliers ###

In [8]:
# Disabled step: would keep, for every numeric column, only the rows strictly
# between that column's `low` and `high` quantiles.
# NOTE(review): before re-enabling, `is_numeric_dtype` must be imported (it is
# not among the notebook's imports), and with low=.0 / high=1.0 the strict
# comparisons would still drop the rows holding each column's min and max.
# def remove_outliers(df):
#     low = .0
#     high = 1.0
#     quant_df = df.quantile([low, high])
#     for name in list(df.columns):
#       if is_numeric_dtype(df[name]):
#        df = df[(df[name] > quant_df.loc[low, name]) 
#                & (df[name] < quant_df.loc[high, name])]
#     return df

# X_train = remove_outliers(X_train)

In [9]:
# One slot per model output, pre-filled with -1 as a "not computed" marker.
# Slots 0-4 are filled by the model cells below; slots 5 and 6 are not used
# by any cell in this notebook.
pred = [-1] * 7

## RFC ##

In [10]:
# Random forest trained on the raw (unscaled) features. The target is the
# magnitude in hundredths, so the predicted label is divided by 100 to get
# back to magnitude units.
random_forest = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    max_depth=11,
    min_samples_split=2,
    min_samples_leaf=1,
    # 'auto' was removed in scikit-learn 1.3 and raises there; for classifiers
    # it was an alias of 'sqrt', so this keeps the same behaviour.
    max_features='sqrt',
    bootstrap=False,
    oob_score=False,
    n_jobs=-1,
    random_state=50,
    verbose=0
)

random_forest.fit(X_train, Y_train)
pred[0] = random_forest.predict(X_test) / 100
print(pred[0])

[4.4]


## MLP ##

### Without scaling ###

In [11]:
# Multi-layer perceptron with six hidden layers of 50 units each, trained with
# SGD on the raw (unscaled) features.
mlp = MLPClassifier(
    hidden_layer_sizes=(50,) * 6,
    solver='sgd',
    alpha=0.0001,
    max_iter=500,
    tol=1e-9,
    random_state=42,
    verbose=0,
)
mlp.fit(X_train, Y_train)

# Labels are magnitudes in hundredths; convert back before storing.
pred[1] = mlp.predict(X_test) / 100
print(pred[1])

[4.3]


### With scaling ###

In [12]:
# Fit the scaler on the training rows only, then apply that same
# transformation to both the training rows and the held-out row.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Train the same MLP configuration again, this time on the scaled features.
# fit() is called on the existing `mlp` object, so after this cell `mlp`
# refers to the model trained on scaled data.
mlp.fit(X_train_scaled, Y_train)
scaled_mlp_prediction = mlp.predict(X_test_scaled) / 100
pred[2] = scaled_mlp_prediction
print(pred[2])

[4.2]




### Errors Plot ###

In [14]:
# Disabled step: would plot training vs. validation error of `mlp` against
# training-set size using sklearn's learning_curve.
# NOTE(review): before re-enabling, restore the commented-out `learning_curve`
# import and import matplotlib as `plt` (neither is imported in this notebook).
# The absolute `train_sizes` go up to 6152, which is more than the 2000 rows
# reported for the run recorded here - confirm they fit the dataset in use.
# train_sizes, train_scores, validation_scores = learning_curve(
#     estimator = mlp,
#     X = X_train,
#     y = Y_train,
#     train_sizes = [1, 100, 500, 2000, 5000, 6152],
#     cv = 2,
#     scoring = 'neg_mean_squared_error'
# )
# train_scores_mean = -train_scores.mean(axis = 1)
# validation_scores_mean = -validation_scores.mean(axis = 1)
# plt.style.use('seaborn')
# plt.plot(train_sizes, train_scores_mean, label = 'Training error')
# plt.plot(train_sizes, validation_scores_mean, label = 'Validation error')
# plt.ylabel('RFC', fontsize = 14)
# plt.xlabel('Training set size', fontsize = 14)
# plt.legend()
# plt.ylim(0,0.5)

## Logistic Regression ##

In [15]:
# Logistic regression on the raw (unscaled) features.
# `multi_class` is deliberately not passed: the parameter is deprecated since
# scikit-learn 1.5 and scheduled for removal, and with the liblinear solver
# the previous 'auto' setting already resolved to one-vs-rest, so omitting it
# does not change the fitted model.
logreg = LogisticRegression(solver='liblinear')  # untried alternative: C=0.1, penalty='l1', tol=1e-6
logreg.fit(X_train, Y_train)

# Labels are magnitudes in hundredths; convert back before storing.
pred[3] = logreg.predict(X_test) / 100
print(pred[3])

[4.3]


## Support Vector Machines ##

In [16]:
# Support vector classifier on the raw (unscaled) features with C=0.1 and
# gamma=0.1. fit() returns the estimator itself, so it can be chained.
svc = SVC(gamma=0.1, C=0.1).fit(X_train, Y_train)

# Labels are magnitudes in hundredths; convert back before storing.
pred[4] = svc.predict(X_test) / 100
print(pred[4])

[4.3]


## Write Results ##

In [17]:
# Append one result row to predictions.csv with the columns:
#   radius, real magnitude, RFC, MLP, MLP (scaled), logistic regression, SVC

# The radius is taken from the input file name. The dot before "csv" is
# escaped so it only matches a literal '.', and a non-matching name fails with
# an explicit message before the output file is touched.
radius_match = re.search(r'quakes_radius(.*?)\.csv', inputFileName)
if radius_match is None:
    raise ValueError(
        "inputFileName %r does not match 'quakes_radius<radius>.csv'" % inputFileName
    )
radius = radius_match.group(1)

# Y_test keeps the index labels of the source frame, and label 0 no longer
# exists if the original first row was removed by dropna(). Select the single
# held-out value by position rather than by label.
real_magnitude = Y_test.iloc[0] / 100

# newline='' is required by the csv module; without it extra blank lines
# appear between rows on platforms that translate line endings.
with open('predictions.csv', 'a', newline='') as csvfile:
    w = csv.writer(csvfile)
    w.writerow([radius, real_magnitude] + [pred[i][0] for i in range(5)])