## Imports

In [2]:
import sqlite3
import pandas as pd

## Make DataFrames

In [3]:
# Reads each source csv into its own DataFrame (moons, plate boundaries, quakes)
moons_df, plate_df, quake_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/full_moons.csv',
        '../datasets/plate_boundaries.csv',
        '../datasets/significant_earthquakes.csv',
    )
)

## Date/Time to Hours

In [4]:
# Reference instant: every date in the notebook is expressed as time elapsed since this moment
baseline = pd.Timestamp('1900-01-01 00:00:00')

For the Moons DataFrame

In [5]:
# Joins 'Date' and 'Time' into one text value per row, then parses it into 'DateTime'
moon_timestamp_text = moons_df['Date'] + ' ' + moons_df['Time']
moons_df['DateTime'] = pd.to_datetime(moon_timestamp_text, format="%d %B %Y %I:%M:%S %p")

In [6]:
# Hours elapsed between baseline and each full moon, kept to two decimal places
# (rounding to whole hours is deferred until later calculations are done)
moon_elapsed = moons_df['DateTime'] - baseline
moons_df['time'] = (moon_elapsed.dt.total_seconds() / 3600).round(2)

For the Quakes DataFrame

In [7]:
# Parses the quake timestamp strings, measures the hours elapsed since baseline,
# and stores the result in 'time' as whole hours (int)
quake_timestamps = pd.to_datetime(quake_df['time'], format="%Y-%m-%dT%H:%M:%S.%fZ")
quake_hours = (quake_timestamps - baseline).dt.total_seconds() / 3600
quake_df['time'] = quake_hours.round(decimals=0).astype(int)

## Column Cleaning

For the Moons DataFrame

In [8]:
# Removes the columns that are no longer needed once 'time' has been derived
unneeded_moon_columns = ['Day', 'Date', 'Time', 'Flag', 'DateTime']
moons_df = moons_df.drop(columns=unneeded_moon_columns)

In [9]:
# Interpolates a new moon at the midpoint between each pair of adjacent full moon rows.
# Each row is paired with the row that follows it, so this relies on 'moons_df'
# already being in chronological order.
full_moon_times = moons_df['time'].to_numpy()
midpoint_times = (full_moon_times[:-1] + full_moon_times[1:]) / 2

# Builds every midpoint row in one step. Concatenating onto an initially empty
# DataFrame inside a loop re-copies the frame on each pass and depends on pandas'
# deprecated treatment of empty frames in concat when deciding the result dtype.
# Columns other than 'time' (if any) are left as NaN for the interpolated rows.
new_moons_df = pd.DataFrame({'time': midpoint_times}, columns=moons_df.columns)

# Combines the full moon rows with the new moon rows
moons_df = pd.concat([moons_df, new_moons_df]).sort_values('time').reset_index(drop=True)

In [10]:
# With the midpoint calculations finished, stores 'time' as whole hours (int)
moons_df = moons_df.assign(time=lambda frame: frame['time'].round().astype(int))

In [11]:
# Turns the current index into a named id index called 'moon_ID'
moons_df = (
    moons_df
    .reset_index(level=0)
    .rename(columns={'index': 'moon_ID'})
    .set_index('moon_ID')
)

For the Plates DataFrame

In [12]:
# Turns the current index into a named id index called 'point_ID'
plate_df = (
    plate_df
    .reset_index(level=0)
    .rename(columns={'index': 'point_ID'})
    .set_index('point_ID')
)

For the Quakes DataFrame

In [13]:
# Keeps only rows that are both earthquake-derived seismic activity and
# measured with the moment magnitude scale ('magType' exactly equal to 'mw')
is_earthquake = quake_df['type'] == 'earthquake'
is_moment_magnitude = quake_df['magType'] == 'mw'
quake_df = quake_df[is_earthquake & is_moment_magnitude]

In [14]:
# Removes every column that is not needed for the stored quake table
unneeded_quake_columns = [
    'Unnamed: 0', 'magType', 'nst', 'dmin', 'rms', 'net',
    'id', 'updated', 'place', 'type',
    'horizontalError', 'depthError', 'magError', 'magNst',
    'status', 'locationSource', 'magSource', 'gap',
]
quake_df = quake_df.drop(columns=unneeded_quake_columns)

In [15]:
# Turns the current index into a named id index called 'quake_ID'.
# The index values are whatever row labels survived the earlier filtering.
quake_df = (
    quake_df
    .reset_index(level=0)
    .rename(columns={'index': 'quake_ID'})
    .set_index('quake_ID')
)

## Checking for null values

In [42]:
# Prints the per-column null counts of each DataFrame under a labelled heading
for frame_label, frame in (("MOONS DF", moons_df), ("QUAKE DF", quake_df)):
    print(frame_label, "\n", frame.isna().sum(), "\n")
print("PLATE DF", "\n", plate_df.isna().sum())

MOONS DF 
 time    0
dtype: int64 

QUAKE DF 
 time         0
latitude     0
longitude    0
depth        0
mag          0
dtype: int64 

PLATE DF 
 plate    0
lat      0
lon      0
dtype: int64


In [43]:
# Fills null values in the 'depth' column of quake_df with 0.
# The filled column is assigned back to the frame: calling fillna(inplace=True) on the
# column selected via quake_df['depth'] is a chained in-place operation, which pandas
# warns about and which does not update quake_df when Copy-on-Write is enabled.
quake_df['depth'] = quake_df['depth'].fillna(0)

## Store as SQL

In [15]:
# Opens a connection to the repo's SQLite database file (sqlite3 creates the file if it is missing)
conn = sqlite3.connect(database='../datasets/database.db')

In [16]:
# Writes each DataFrame to its own SQL table, replacing any existing table of the
# same name and storing the DataFrame index as a column
moons_table_name, plate_table_name, quake_table_name = 'moons_table', 'plate_table', 'quake_table'
sql_write_options = dict(con=conn, if_exists='replace', index=True)

moons_df.to_sql(moons_table_name, **sql_write_options)
plate_df.to_sql(plate_table_name, **sql_write_options)
quake_df.to_sql(quake_table_name, **sql_write_options)

24260