In [7]:
#import libraries
import pandas as pd
import numpy as np
import datetime as dt
import os

In [8]:
# Load the raw catalogs.
# Paths are built with os.path.join so the notebook runs on any OS; the previous
# hard-coded backslash separators only resolved on Windows.
# 'time' is read as a plain string and parsed explicitly in a later cell.
# NOTE: lineterminator='\n' leaves a trailing '\r' on the last column when the
# file uses CRLF line endings; the 'aftershock' column is cleaned up further down.
usgs = pd.read_csv(os.path.join('datasets', 'usgs_data.csv'), sep=',', lineterminator='\n', dtype={'time': str})
etas = pd.read_csv(os.path.join('datasets', 'modified_etas.csv'), sep=',', lineterminator='\n')

In [9]:
# Keep only USGS events dated strictly after 1960-01-01 and strictly before 2023-01-01.
# The parse -> strftime round trip reduces every timestamp to a 'YYYY-MM-DD'
# string; values that cannot be parsed become missing (errors='coerce').
usgs['time'] = pd.to_datetime(usgs['time'], errors='coerce').dt.strftime('%Y-%m-%d')
# Parse the day strings once and reuse the result for both bounds.
event_day = pd.to_datetime(usgs['time'])
in_window = (event_day > pd.Timestamp('1960-01-01')) & (event_day < pd.Timestamp('2023-01-01'))
# Missing dates compare False against both bounds, so those rows are dropped.
# .copy() makes the filtered frame independent of the original, so the column
# assignments that follow do not trigger SettingWithCopyWarning.
usgs = usgs[in_window].copy()

In [10]:
# Coerce coordinates and magnitude to numeric; entries that cannot be parsed become NaN.
for numeric_col in ('longitude', 'latitude', 'mag'):
    usgs[numeric_col] = pd.to_numeric(usgs[numeric_col], errors='coerce')
# Convert the 'time' column to datetime values.
usgs['time'] = pd.to_datetime(usgs['time'])

In [11]:
# Order the events by 'time' (ascending), then preview the first rows.
usgs = usgs.sort_values('time')
usgs.head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,...,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
8524,1960-01-02,35.556333,-121.351,6.0,4.04,ml,9.0,293.0,1.331,0.65,...,2016-01-28T19:43:17.710Z,"24km W of Cambria, CA",earthquake,3.27,31.61,0.161,11.0,reviewed,ci,ci
8523,1960-01-03,31.5635,-116.397667,6.0,3.77,ml,2.0,359.0,1.137,0.34,...,2016-01-28T19:43:18.210Z,"23km SE of Maneadero, B.C., MX",earthquake,29.12,31.61,0.052,8.0,reviewed,ci,ci
8522,1960-01-03,31.097167,-116.079667,6.0,3.99,ml,4.0,353.0,1.657,1.24,...,2016-01-28T20:00:38.470Z,"83km SE of Maneadero, B.C., MX",earthquake,77.36,31.61,0.131,5.0,reviewed,ci,ci
8521,1960-01-05,34.014833,-117.710667,6.22,3.03,ml,9.0,139.0,0.2793,0.56,...,2016-01-28T19:47:36.260Z,"2km W of Chino, CA",earthquake,2.2,2.85,0.215,11.0,reviewed,ci,ci
8520,1960-01-07,32.221167,-115.904833,6.0,3.64,ml,3.0,340.0,0.7939,0.83,...,2016-01-28T19:43:06.290Z,"50km SW of Progreso, B.C., MX",earthquake,10.65,31.61,0.112,5.0,reviewed,ci,ci


In [12]:
# Restrict to the study box: -123 < longitude < -113 and 29 < latitude < 39
# (all bounds exclusive). Rows with NaN coordinates fail every comparison and drop out.
lon, lat = usgs['longitude'], usgs['latitude']
inside_box = (lon > -123) & (lon < -113) & (lat < 39) & (lat > 29)
usgs = usgs[inside_box]

In [13]:
# Give the ETAS frame lower-case headers that line up with the USGS column names.
etas_column_names = ['date', 'time', 'year', 'longitude', 'latitude', 'mag', 'z', 'aftershock']
etas.columns = etas_column_names
etas.head()

Unnamed: 0,date,time,year,longitude,latitude,mag,z,aftershock
0,1960/01/02,0:09:42.00,1960.006741,-121.7122,37.3552,4.68,8.3275,b\r
1,1960/01/03,0:13:21.00,1960.009279,-118.3268,34.3443,3.73,7.591,2.0\r
2,1960/01/03,0:14:04.00,1960.009778,-117.4833,33.7307,3.53,6.5357,b\r
3,1960/01/05,0:23:16.00,1960.016158,-116.7325,33.7002,3.61,6.4911,b\r
4,1960/01/06,0:25:14.00,1960.017534,-116.341,33.939,3.67,9.3259,b\r


In [14]:
# Keep only ETAS events dated strictly after 1960-01-01 and strictly before 2023-01-01.
# Unparseable dates become NaT (errors='coerce'); the strftime round trip keeps
# only the calendar day before the column is converted back to datetime.
etas['date'] = pd.to_datetime(
    pd.to_datetime(etas['date'], errors='coerce').dt.strftime('%Y-%m-%d')
)
# 'date' is already datetime here, so it is compared directly instead of being
# re-parsed for each bound.
in_window = (etas['date'] > pd.Timestamp('1960-01-01')) & (etas['date'] < pd.Timestamp('2023-01-01'))
# NaT compares False against both bounds, so rows without a valid date are dropped.
# .copy() makes the filtered frame independent of the original, so the column
# assignments that follow do not trigger SettingWithCopyWarning.
etas = etas[in_window].copy()

In [15]:
# Coerce coordinates and magnitude to numeric; entries that cannot be parsed become NaN.
for numeric_col in ('longitude', 'latitude', 'mag'):
    etas[numeric_col] = pd.to_numeric(etas[numeric_col], errors='coerce')

In [16]:
# Restrict to the study box: -123 < longitude < -113 and 29 < latitude < 39
# (all bounds exclusive). Rows with NaN coordinates fail every comparison and drop out.
lon, lat = etas['longitude'], etas['latitude']
inside_box = (lon > -123) & (lon < -113) & (lat < 39) & (lat > 29)
etas = etas[inside_box]

In [17]:
# Align the two catalogs: ETAS loses its 'time' column, while the USGS 'time'
# column is renamed to 'date' and the USGS rows are sorted by it in ascending order.
etas = etas.drop(columns=['time'])
usgs = usgs.rename(columns={'time': 'date'}).sort_values(by='date', ascending=True)

In [18]:
# Replace each frame's index with a fresh 0..n-1 range, discarding the old index
# instead of keeping it as a column.
usgs, etas = (frame.reset_index(drop=True) for frame in (usgs, etas))

In [19]:
# Remove the carriage-return characters from the 'aftershock' values
# (the earlier preview shows entries such as 'b\r'), then preview the result.
aftershock_labels = etas['aftershock']
etas['aftershock'] = aftershock_labels.str.replace('\r', '')
etas.head()

Unnamed: 0,date,year,longitude,latitude,mag,z,aftershock
0,1960-01-02,1960.006741,-121.7122,37.3552,4.68,8.3275,b
1,1960-01-03,1960.009279,-118.3268,34.3443,3.73,7.591,2.0
2,1960-01-03,1960.009778,-117.4833,33.7307,3.53,6.5357,b
3,1960-01-05,1960.016158,-116.7325,33.7002,3.61,6.4911,b
4,1960-01-06,1960.017534,-116.341,33.939,3.67,9.3259,b


In [20]:
# Write both cleaned catalogs to CSV, creating the output folder if it is missing.
os.makedirs('datasets', exist_ok=True)
for frame, csv_path in ((usgs, 'datasets/USGS.csv'), (etas, 'datasets/ModifiedETAS.csv')):
    frame.to_csv(csv_path)