## Imports

In [2]:
import numpy as np 
import pandas as pd 
import sqlite3 

## Make Dataframes

In [3]:
# Converts csvs to DataFrames
moons_df = pd.read_csv('../datasets/full_moons.csv')
plate_df = pd.read_csv('../datasets/plate_boundaries.csv')
quake_df = pd.read_csv('../datasets/significant_earthquakes.csv')

## Date/Time to Hours

In [4]:
# This is the value that all dates are compared to 
baseline = pd.to_datetime('1900-01-01 00:00:00')

For the Moons DataFrame

In [5]:
# Combines 'Date' and 'Time' columns into 'DateTime' 
moons_df['DateTime'] = pd.to_datetime(moons_df['Date'] + ' ' + moons_df['Time'], format="%d %B %Y %I:%M:%S %p")

In [6]:
# Converts time elapsed since baseline into hours, then rounds
moons_df['time'] = (moons_df['DateTime'] - baseline).dt.total_seconds() / 3600
moons_df['time'] = moons_df['time'].round(decimals=2)

For the Quakes DataFrame

In [7]:
# Converts time elapsed since baseline into hours, then rounds to int
quake_df['time'] = pd.to_datetime(quake_df['time'], format="%Y-%m-%dT%H:%M:%S.%fZ")
quake_df['time'] = (quake_df['time'] - baseline).dt.total_seconds() / 3600
quake_df['time'] = quake_df['time'].round(decimals=0).astype(int)

## Column Cleaning

For the Moons DataFrame

In [8]:
# Drops unneeded columns
moons_df.drop(columns=['Day', 'Date', 'Time', 'Flag', 'DateTime'], inplace=True)

In [9]:
# Creates empty DataFrame to store interpolated new moon rows at midpoints 
new_moons_df = pd.DataFrame(columns=moons_df.columns)

# Loops through the rows of 'moons_df'
for i in range(len(moons_df) - 1):
    # Gets the mean of each pair of full moon rows and adds it to 'new_moons_df'
    avg_time = (moons_df['time'].iloc[i] + moons_df['time'].iloc[i+1]) / 2
    row = pd.DataFrame([avg_time], columns=['time'])
    new_moons_df = pd.concat([new_moons_df, row], ignore_index=True)

# Combines the full moon rows with the new moon rows
moons_df = pd.concat([moons_df, new_moons_df]).sort_values('time').reset_index(drop=True)

In [10]:
# Rounds to int now that calculations are complete 
moons_df['time'] = moons_df['time'].round(decimals=0).astype(int)

In [11]:
# Changes index to id 
moons_df.reset_index(level=0, inplace=True)
moons_df.rename(columns={'index': 'moon_ID'}, inplace=True)
moons_df.set_index('moon_ID', inplace=True)

For the Plates DataFrame

In [12]:
# Changes index to id 
plate_df.reset_index(level=0, inplace=True)
plate_df.rename(columns={'index': 'point_ID'}, inplace=True)
plate_df.set_index('point_ID', inplace=True)

For the Quakes DataFrame

In [13]:
# Selects earthquake-derived seismic activity only 
type_mask = quake_df['type'] == 'earthquake'
quake_df = quake_df[type_mask]

# Selects quakes measured with moment magnitude scale only
mag_type_mask = quake_df['magType'] == 'mw'
quake_df = quake_df[mag_type_mask]

In [14]:
# Drops unneeded columns
quake_df.drop(columns=['Unnamed: 0','magType', 'nst', 'dmin','rms', 'net', 'id', 'updated', 'place', 'type', 'horizontalError', 'depthError', 'magError', 'magNst', 'status', 'locationSource', 'magSource', 'gap'], inplace=True)

In [15]:
# Changes index to id 
quake_df.reset_index(level=0, inplace=True)
quake_df.rename(columns={'index': 'quake_ID'}, inplace=True)
quake_df.set_index('quake_ID', inplace=True)

## Checking for null values

In [16]:
# Prints amount of null values in each column 
print("MOONS DF", "\n", moons_df.isna().sum(), "\n") 
print("QUAKE DF", "\n", quake_df.isna().sum(), "\n")
print("PLATE DF", "\n", plate_df.isna().sum())

MOONS DF 
 time    0
dtype: int64 

QUAKE DF 
 time         0
latitude     0
longitude    0
depth        7
mag          0
dtype: int64 

PLATE DF 
 plate    0
lat      0
lon      0
dtype: int64


In [17]:
# Fill null values in 'depth' column of quake_df with 0 
quake_df['depth'].fillna(0, inplace=True)

## Aftershock/Preshock Remover

In [28]:
def identify_shocks(quake_df):
    # Sort the dataframe by time, latitude, longitude, and magnitude
    quake_df.sort_values(['time', 'latitude', 'longitude', 'mag'], inplace=True)
    
    quake_df['shock_type'] = np.nan
    quake_df['hours_diff'] = np.nan

    for i in range(len(quake_df)):
        # Get the current earthquake
        cur_quake = quake_df.iloc[i]

        # Get all earthquakes within the given range of the current one
        in_range = quake_df[
            ((quake_df.time >= cur_quake.time - 2190) & (quake_df.time <= cur_quake.time + 2190)) & 
            (abs(quake_df.latitude - cur_quake.latitude) <= 17) & 
            (abs(quake_df.longitude - cur_quake.longitude) <= 17)
        ]

        # Get the quake with maximum magnitude within range
        max_mag_quake = in_range[in_range.mag == in_range.mag.max()]

        # If there's more than one quake with maximum magnitude, take the earliest one
        if len(max_mag_quake) > 1:
            max_mag_quake = max_mag_quake[max_mag_quake.time == max_mag_quake.time.min()]

        max_mag_quake_id = max_mag_quake.index[0]

        # If the current quake is the main quake
        if cur_quake.name == max_mag_quake_id:
            quake_df.at[cur_quake.name, 'shock_type'] = 'M'
            quake_df.at[cur_quake.name, 'hours_diff'] = 0
        else:
            quake_df.at[cur_quake.name, 'hours_diff'] = cur_quake.time - max_mag_quake.time.values[0]
            # If the current quake occurred before the main quake
            if quake_df.at[cur_quake.name, 'hours_diff'] < 0:
                quake_df.at[cur_quake.name, 'shock_type'] = 'P'
            else:  # If the current quake occurred after the main quake
                quake_df.at[cur_quake.name, 'shock_type'] = 'A'

    return quake_df

In [44]:
identify_shocks(quake_df)

Unnamed: 0_level_0,time,latitude,longitude,depth,mag,shock_type,hours_diff
quake_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,6756,57.090000,-153.480000,0.00,7.86,M,0.0
16,37306,41.758000,23.249000,15.00,7.02,M,0.0
17,37306,41.802000,23.108000,15.00,6.84,A,0.0
19,39279,51.424000,161.638000,15.00,7.50,P,-6.0
18,39285,52.763000,160.277000,30.00,7.70,M,0.0
...,...,...,...,...,...,...,...
96776,1081579,40.377500,-125.562833,10.00,5.58,M,0.0
96941,1081579,40.377500,-125.562833,10.00,5.58,A,0.0
97073,1081579,40.377500,-125.562833,10.00,5.58,A,0.0
97187,1081579,40.377500,-125.562833,10.00,5.58,A,0.0


In [41]:
preshock_mask = quake_df["shock_type"] == 'P'
mainshock_mask = quake_df["shock_type"] == 'M'
aftershock_mask = quake_df["shock_type"] == 'A'

In [42]:
preshocks = quake_df[preshock_mask]
mainshocks = quake_df[mainshock_mask]
aftershocks = quake_df[aftershock_mask]

In [43]:
print(len(preshocks))
print(len(mainshocks))
print(len(aftershocks))

8880
3706
11674


In [45]:
quake_df["mag"].min

<bound method NDFrame._add_numeric_operations.<locals>.min of quake_ID
0        7.86
16       7.02
17       6.84
19       7.50
18       7.70
         ... 
96776    5.58
96941    5.58
97073    5.58
97187    5.58
96606    5.54
Name: mag, Length: 24260, dtype: float64>

## Store as SQL

In [46]:
# Creates and connects to repo SQL database
conn = sqlite3.connect('../datasets/database.db')

In [47]:
# Creates SQL tables from each DataFrame
moons_table_name = 'moons_table'
moons_df.to_sql(moons_table_name, conn, if_exists='replace', index=True)

plate_table_name = 'plate_table'
plate_df.to_sql(plate_table_name, conn, if_exists='replace', index=True)

quake_table_name = 'quake_table'
quake_df.to_sql(quake_table_name, conn, if_exists='replace', index=True)

24260