In [12]:
# Sam Brown
# sam_brown@mines.edu
# June 9, 2025
# Goal: Form and save a dataframe that will be used to train a neural net (GZ stations)

import sys
sys.path.append("/Users/sambrown04/Documents/SURF/whillans-surf/notebooks/SURF")

import my_lib.funcs
import Stations

# tide calcs (move to library eventually)
import Tides
import util.coordinate_transforms
import pyTMD
import datetime
import time
import scipy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# gz stations: 2011 to 2013
#
# Builds `avg_dat`: one row per slip event that has at least one gz station,
# holding per-event feature means, the tide height / tide derivative at the
# event start minute, and the minutes since / until the neighbouring events.

# Load in raw data
events_list2011 = my_lib.funcs.load_evt("/Users/sambrown04/Documents/SURF/Events/2011_2011Events2stas")
events_list2012 = my_lib.funcs.load_evt("/Users/sambrown04/Documents/SURF/Events/2012_2012Events2stas")
events_list2013 = my_lib.funcs.load_evt("/Users/sambrown04/Documents/SURF/Events/2013_2013Events2stas")

# Use preprocessing function to get some of the features
pre_11 = my_lib.funcs.extract_event_features(events_list2011)
pre_12 = my_lib.funcs.extract_event_features(events_list2012)
pre_13 = my_lib.funcs.extract_event_features(events_list2013)

# Merge into one large list of DataFrames
tot_dat = pre_11 + pre_12 + pre_13

# Keep only gz stations, and remember which events actually had any so they can
# be filtered explicitly later (instead of assuming they are the first N in time).
has_gz = []
for i, event in enumerate(tot_dat):
    gz_rows = event[event['station'].str.contains('gz')]
    has_gz.append(len(gz_rows) > 0)
    if has_gz[-1]:
        tot_dat[i] = gz_rows
    else:
        # No gz rows for this event: keep a 1-row placeholder in which every
        # column except 'start_time' is NaN.
        placeholder = event.iloc[0].copy()
        placeholder[placeholder.index.difference(['start_time'])] = np.nan
        tot_dat[i] = pd.DataFrame([placeholder])

# Collect per-event averages as records and build the frame once; columns that
# are not in the records (tide_height, tide_change, ...) start out as NaN.
avg_columns = ['pre_slip_area', 'total_delta', 'start_time', 'slip_severity',
               'tide_height', 'tide_change', 'form_factor', 'mins_since']
avg_records = [
    {
        "pre_slip_area": event['pre-slip_area'].mean(),
        "total_delta": event['total_delta'].mean(),
        "start_time": event.iloc[0].loc['start_time'],  # same for all stations, so take the first
        "slip_severity": event['slip_severity'].mean(),
    }
    for event in tot_dat
]
avg_dat = pd.DataFrame(avg_records, columns=avg_columns)

# Remove events without gz stations (their features are all NaN placeholders).
# Row order still matches tot_dat here, so the flags line up positionally.
avg_dat = avg_dat.loc[np.array(has_gz, dtype=bool)].copy()

# Organize by time
avg_dat['start_time'] = pd.to_datetime(avg_dat['start_time'], errors='coerce')
avg_dat = avg_dat.sort_values('start_time')

# Calculate minutes since last event
avg_dat['mins_since'] = avg_dat['start_time'].diff().dt.total_seconds() / 60

# Retrieve Tide Height. We will use the average coors of all gz stations for the tide model. Code for how average coordinates are retrieved in severity_classification
x_cor = -168955.1491394913
y_cor = -599694.5432784811

tide_df = my_lib.funcs.get_tide_height(1100, x_cor, y_cor, "2011-01-01 00:00:00") # tide height is in centimeters (1100 days = 3 years)

# Truncate both time columns to minute-resolution strings so they can be matched.
# .dt.strftime leaves NaT (possible after errors='coerce') as a missing value
# instead of raising the way a per-element strftime call does.
minute_format = "%Y-%m-%d %H:%M"
tide_df['time'] = tide_df['time'].apply(lambda x: x.strftime(minute_format))
avg_dat['start_time'] = avg_dat['start_time'].dt.strftime(minute_format)


def _first_value_at(table, when, value_col):
    """Return `value_col` from the first row of `table` whose 'time' equals `when`, or NaN if none match."""
    matches = table.loc[table['time'] == when, value_col]
    return matches.iloc[0] if len(matches) else np.nan


# Put corresponding tide height into our main dataframe
avg_dat['tide_height'] = [
    _first_value_at(tide_df, start, 'tide_height') for start in avg_dat['start_time']
]

# Insert tide derivatives into our dataset. The derivative is computed after
# tide_df['time'] has been converted to strings, exactly as before.
tide_d = my_lib.funcs.tide_derivative(tide_df)
avg_dat['tide_change'] = [
    _first_value_at(tide_d, start, 'tide_deriv') for start in avg_dat['start_time']
]

# Add new feature to be minutes until next event to predict how long until the next event
avg_dat['mins_until'] = avg_dat['mins_since'].shift(-1)

Elapsed time: 28.791157007217407 seconds


In [5]:
# Turn time column back to datetime.
# The earlier feature-building cell formatted tide_df['time'] as
# "%Y-%m-%d %H:%M" strings for matching against event start times; this parses
# those strings back into datetimes (so the values carry minute resolution).
tide_df['time'] = pd.to_datetime(tide_df['time'])

In [6]:
# Get form factors from the tide series.
# The result is used further down as a table with 'dates' and 'form_factors' columns.
# NOTE(review): the saved output of this cell shows a line from a
# scipy.optimize.curve_fit call — presumably a warning raised inside
# form_factor_calc; confirm the fit behaves as expected.
form_fac = my_lib.funcs.form_factor_calc(tide_df)
# TODO: decide how the form factor should be represented in our dataset

  popt, pcov = scipy.optimize.curve_fit(sines, seconds_tide, tide_window, p0=initial_guess)


In [8]:
# Attach a daily-average form factor to every event in avg_dat.

# Ensure the same type
avg_dat['start_time'] = pd.to_datetime(avg_dat['start_time'])
form_fac['dates'] = pd.to_datetime(form_fac['dates'])

# Ensure sorted
avg_dat = avg_dat.sort_values('start_time')
form_fac = form_fac.sort_values('dates')

# Add date-only column to form_fac
form_fac['date_only'] = form_fac['dates'].dt.date

# Average the form factor per calendar date, then look each event's date up in
# that table. Events whose date has no form-factor rows get NaN, which is what
# taking the mean of an empty selection produced before.
daily_form_factor = form_fac.groupby('date_only')['form_factors'].mean()
avg_dat['form_factor'] = avg_dat['start_time'].dt.date.map(daily_form_factor)

In [9]:
# Flag high-tide events: h_event is 1 where tide_height is above zero, otherwise 0.
# (An l_event column was also wanted, but only h_event is created here.)
above_zero_tide = avg_dat['tide_height'].gt(0)
avg_dat['h_event'] = above_zero_tide.astype(int)
# Maybe add a previous-event column in the future to capture the pattern; for now mins_since should capture skipped events



In [10]:
# TODO: find a way to encode whether each event is a high-tide or low-tide event,
# and whether the previous event was skipped.
# Preview the first 40 rows of the feature table.
avg_dat.head(40)

Unnamed: 0,pre_slip_area,total_delta,start_time,slip_severity,tide_height,tide_change,form_factor,mins_since,mins_until,h_event
122,107.805138,0.348783,2011-01-11 08:05:00,8.194637e-07,4.578164,-0.001927,1.647468,,665.0,1
67,126.125599,0.312254,2011-01-11 19:10:00,6.707494e-07,-19.514214,-0.24613,1.647468,665.0,815.0,0
135,138.865041,0.3348,2011-01-12 08:45:00,6.132234e-07,19.337788,-0.079343,1.702534,815.0,760.0,1
238,90.95051,0.30365,2011-01-12 21:25:00,9.796908e-07,-53.653778,-0.15199,1.702534,760.0,790.0,0
474,128.884896,0.324098,2011-01-13 10:35:00,7.178703e-07,25.038245,-0.115554,2.101911,790.0,1385.0,1
371,120.203366,0.473986,2011-01-14 09:40:00,1.042288e-06,53.094581,-0.159486,2.778525,1385.0,605.0,1
535,157.434854,0.334752,2011-01-14 19:45:00,9.561945e-07,-36.566615,-0.277287,2.778525,605.0,850.0,0
158,101.864204,0.359634,2011-01-15 09:55:00,5.087065e-07,70.904331,-0.129913,4.023792,850.0,600.0,1
516,191.433275,0.330781,2011-01-15 19:55:00,8.102387e-07,-44.46517,-0.280397,4.023792,600.0,920.0,0
71,120.716482,0.365788,2011-01-16 11:15:00,5.827073e-07,77.532816,-0.124508,6.6543,920.0,1525.0,1


In [15]:
# Write the feature table to disk as CSV, leaving out the index column.
# Note the output file name carries no extension.
output_name = 'averages_events_2011-13'
avg_dat.to_csv(output_name, index=False)