## Imports

In [61]:
import sqlite3
import pandas as pd

## Make Dataframes

In [62]:
# Converts csvs to DataFrames
moons_df = pd.read_csv('../datasets/full_moons.csv')
plate_df = pd.read_csv('../datasets/plate_boundaries.csv')
quake_df = pd.read_csv('../datasets/significant_earthquakes.csv')

In [63]:
quake_df

Unnamed: 0.1,Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,...,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,0,1900-10-09T12:25:00.000Z,57.0900,-153.4800,,7.86,mw,,,,...,2022-05-09T14:44:17.838Z,"16 km SW of Old Harbor, Alaska",earthquake,,,,,reviewed,ushis,pt
1,1,1901-03-03T07:45:00.000Z,36.0000,-120.5000,,6.40,ms,,,,...,2018-06-04T20:43:44.000Z,"12 km NNW of Parkfield, California",earthquake,,,,,reviewed,ushis,ell
2,2,1901-07-26T22:20:00.000Z,40.8000,-115.7000,,5.00,fa,,,,...,2018-06-04T20:43:44.000Z,"6 km SE of Elko, Nevada",earthquake,,,,,reviewed,ushis,sjg
3,3,1901-12-30T22:34:00.000Z,52.0000,-160.0000,,7.00,ms,,,,...,2018-06-04T20:43:44.000Z,south of Alaska,earthquake,,,,,reviewed,ushis,abe
4,4,1902-01-01T05:20:30.000Z,52.3800,-167.4500,,7.00,ms,,,,...,2018-06-04T20:43:44.000Z,"113 km ESE of Nikolski, Alaska",earthquake,,,,,reviewed,ushis,abe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97390,97390,2023-06-04T00:25:10.260Z,-6.8207,130.3819,114.921,5.20,mww,106.0,32.0,2.332,...,2023-06-24T03:40:07.142Z,Banda Sea,earthquake,7.79,5.094,0.093,11.0,reviewed,us,us
97391,97391,2023-06-03T02:48:12.487Z,-15.8418,-71.9907,10.000,5.00,mb,50.0,146.0,2.968,...,2023-06-23T01:20:28.040Z,"17 km SE of Huambo, Peru",earthquake,6.71,1.880,0.068,69.0,reviewed,us,us
97392,97392,2023-06-03T02:27:39.252Z,-15.8595,-71.7954,10.000,5.10,mww,68.0,96.0,2.858,...,2023-06-23T00:54:59.239Z,,earthquake,8.39,1.773,0.068,21.0,reviewed,us,us
97393,97393,2023-06-03T01:49:15.167Z,-15.6705,-71.6050,10.000,5.40,mww,72.0,96.0,2.945,...,2023-06-23T00:45:06.323Z,southern Peru,earthquake,10.65,1.769,0.047,44.0,reviewed,us,us


## Date/Time to Hours

In [64]:
# This is the value that all dates are compared to 
baseline = pd.to_datetime('1900-01-01 00:00:00')

For the Moons DataFrame

In [65]:
# Combines 'Date' and 'Time' columns into 'DateTime' 
moons_df['DateTime'] = pd.to_datetime(moons_df['Date'] + ' ' + moons_df['Time'], format="%d %B %Y %I:%M:%S %p")

In [66]:
# Converts time elapsed since baseline into hours, then rounds
moons_df['time'] = (moons_df['DateTime'] - baseline).dt.total_seconds() / 3600
moons_df['time'] = moons_df['time'].round(decimals=2)

For the Quakes DataFrame

In [67]:
# Converts time elapsed since baseline into hours, then rounds to int
quake_df['time'] = pd.to_datetime(quake_df['time'], format="%Y-%m-%dT%H:%M:%S.%fZ")
quake_df['time'] = (quake_df['time'] - baseline).dt.total_seconds() / 3600
quake_df['time'] = quake_df['time'].round(decimals=0).astype(int)

## Column Cleaning

For the Moons DataFrame

In [68]:
# Drops unneeded columns
moons_df.drop(columns=['Day', 'Date', 'Time', 'Flag', 'DateTime'], inplace=True)

In [69]:
# Creates empty DataFrame to store interpolated new moon rows at midpoints 
new_moons_df = pd.DataFrame(columns=moons_df.columns)

# Loops through the rows of 'moons_df'
for i in range(len(moons_df) - 1):
    # Gets the mean of each pair of full moon rows and adds it to 'new_moons_df'
    avg_time = (moons_df['time'].iloc[i] + moons_df['time'].iloc[i+1]) / 2
    row = pd.DataFrame([avg_time], columns=['time'])
    new_moons_df = pd.concat([new_moons_df, row], ignore_index=True)

# Combines the full moon rows with the new moon rows
moons_df = pd.concat([moons_df, new_moons_df]).sort_values('time').reset_index(drop=True)

In [70]:
# Rounds to int now that calculations are complete 
moons_df['time'] = moons_df['time'].round(decimals=0).astype(int)

In [71]:
# Changes index to id 
moons_df.reset_index(level=0, inplace=True)
moons_df.rename(columns={'index': 'moon_ID'}, inplace=True)
moons_df.set_index('moon_ID', inplace=True)

For the Plates DataFrame

In [72]:
# Changes index to id 
plate_df.reset_index(level=0, inplace=True)
plate_df.rename(columns={'index': 'point_ID'}, inplace=True)
plate_df.set_index('point_ID', inplace=True)

For the Quakes DataFrame

In [73]:
# Selects earthquake-derived seismic activity only 
type_mask = quake_df['type'] == 'earthquake'
quake_df = quake_df[type_mask]

# Selects quakes measured with moment magnitude scale only
mag_type_mask = quake_df['magType'] == 'mw'
quake_df = quake_df[mag_type_mask]

In [74]:
# Drops unneeded columns
quake_df.drop(columns=['Unnamed: 0','magType', 'nst', 'dmin','rms', 'net', 'id', 'updated', 'place', 'type', 'horizontalError', 'depthError', 'magError', 'magNst', 'status', 'locationSource', 'magSource', 'gap'], inplace=True)

In [75]:
# Changes index to id 
quake_df.reset_index(level=0, inplace=True)
quake_df.rename(columns={'index': 'quake_ID'}, inplace=True)
quake_df.set_index('quake_ID', inplace=True)

## Checking for null values

In [76]:
print("MOONS DF", "\n", moons_df.isna().sum(), "\n") 
print("QUAKE DF", "\n", quake_df.isna().sum(), "\n")
print("PLATE DF", "\n", plate_df.isna().sum())

MOONS DF 
 time    0
dtype: int64 

QUAKE DF 
 time         0
latitude     0
longitude    0
depth        7
mag          0
dtype: int64 

PLATE DF 
 plate    0
lat      0
lon      0
dtype: int64


In [77]:
#fill null values in 'depth' column of quake_df with 0 
quake_df['depth'].fillna(0, inplace=True)

## Remove irrelevant data and join dfs


In [78]:
quake_df

Unnamed: 0_level_0,time,latitude,longitude,depth,mag
quake_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,6756,57.090000,-153.480000,0.00,7.86
16,37306,41.758000,23.249000,15.00,7.02
17,37306,41.802000,23.108000,15.00,6.84
18,39285,52.763000,160.277000,30.00,7.70
19,39279,51.424000,161.638000,15.00,7.50
...,...,...,...,...,...
96941,1081579,40.377500,-125.562833,10.00,5.58
97003,1081354,40.196000,-121.099833,6.06,5.16
97006,1081343,40.204167,-121.109500,5.85,5.48
97073,1081579,40.377500,-125.562833,10.00,5.58


In [79]:
quake_df.drop(columns=['latitude', 'longitude', 'depth'], inplace=True)

In [80]:
quake_df

Unnamed: 0_level_0,time,mag
quake_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,6756,7.86
16,37306,7.02
17,37306,6.84
18,39285,7.70
19,39279,7.50
...,...,...
96941,1081579,5.58
97003,1081354,5.16
97006,1081343,5.48
97073,1081579,5.58


In [81]:
quake_df_bigmag = quake_df[quake_df['mag'] >= 6.0]

In [92]:
quake_df_bigmag

Unnamed: 0_level_0,time,mag
quake_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,6756,7.86
16,37306,7.02
17,37306,6.84
18,39285,7.70
19,39279,7.50
...,...,...
89776,1047570,6.40
89931,1049523,6.00
93081,1065215,6.00
93861,1069172,6.20


In [91]:
moons_df

Unnamed: 0_level_0,time
moon_ID,Unnamed: 1_level_1
0,356
1,713
2,1071
3,1428
4,1785
...,...
3730,1322140
3731,1322494
3732,1322848
3733,1323203


In [104]:
merged_df = pd.merge(quake_df_bigmag, moons_df, on='time',how='left')

In [105]:
merged_df

Unnamed: 0,time,mag
0,6756,7.86
1,37306,7.02
2,37306,6.84
3,39285,7.70
4,39279,7.50
...,...,...
8167,1047570,6.40
8168,1049523,6.00
8169,1065215,6.00
8170,1069172,6.20
