In [11]:
# Sam Brown
# sam_brown@mines.edu
# June19
# Goal: Preprocess data and create dataframe for analysis of long-term slip patterns

import sys
sys.path.append("../")

import my_lib.funcs
import Stations

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# NOTE: use the GZ05 station coordinates when retrieving tide data.

In [12]:
# Load the per-year event files, concatenate them, and preprocess.
#
# Every yearly file lives in the same directory and follows the same
# "<year>_<year>Events2stas" naming pattern, so the location is stated once and
# the years are a range instead of ten copy-pasted calls.
# 2008 and 2009 were commented out in the original cell and are still not loaded.
EVENTS_DIR = "/Users/sambrown04/Documents/SURF/Events"
EVENT_YEARS = range(2010, 2020)  # 2010 .. 2019 inclusive

# One load per year, kept in chronological order
yearly_events = [
    my_lib.funcs.load_evt(f"{EVENTS_DIR}/{year}_{year}Events2stas")
    for year in EVENT_YEARS
]

# One large list: fold the yearly results together with `+`, exactly as the
# original `df_2010 + df_2011 + ...` expression did (a new object is created at
# every step, so the per-year results themselves are never modified).
all_dfs = yearly_events[0]
for later_year_events in yearly_events[1:]:
    all_dfs = all_dfs + later_year_events

# Preprocess: the result is iterated event-by-event in the next cell
clean_df = my_lib.funcs.extract_event_features(all_dfs)

# Per-station reference statistics used to standardize event features
standards = pd.read_csv('../station_standards.csv')

In [24]:
# Build one row per event.
#
# For every usable station record in an event, three measurements are z-scored
# against that station's reference mean / standard deviation from `standards`;
# the z-scores are then averaged across the event's stations. The event's
# start time is taken from the row labelled 0 of the event frame.
#
# The remaining columns (tide_deriv, form_fac, time_since, high_t_evt) are
# created empty here and filled by later cells; tide height is added by a merge.

NET_COLUMNS = ['tide_deriv', 'form_fac', 'time_since', 'slip_size_standardized',
               'high_t_evt', 'start_time', 'sev_stds', 'pre-s_stds']

# Stations whose records are skipped (compared on the first 4 characters of the name)
EXCLUDED_STATIONS = {'slw1', 'ws04', 'ws05'}

# output column -> (event column, standards mean column, standards sd column)
Z_SCORE_SPEC = {
    'slip_size_standardized': ('total_delta', 'slip_size', 'slip_size_sd'),
    'sev_stds': ('slip_severity', 'slip_severity', 'slip_severity_sd'),
    'pre-s_stds': ('pre-slip_area', 'pre-slip_area', 'pre-slip_area_sd'),
}

# Index the reference table once instead of boolean-filtering it for every row.
# drop_duplicates keeps the first row per station, matching the previous `.iloc[0]`.
standards_by_station = standards.drop_duplicates('Station').set_index('Station')

event_records = []
for event in clean_df:

    # z-scores collected per output column for this event
    z_scores = {out_col: [] for out_col in Z_SCORE_SPEC}

    for _, row in event.iterrows():
        station = row['station'][:4]
        if station in EXCLUDED_STATIONS:
            continue

        # Fail with a readable message rather than an opaque positional IndexError
        if station not in standards_by_station.index:
            raise ValueError(f"Station not found in standards: '{station}'")

        for out_col, (evt_col, mean_col, sd_col) in Z_SCORE_SPEC.items():
            station_mean = standards_by_station.at[station, mean_col]
            station_sd = standards_by_station.at[station, sd_col]
            z_scores[out_col].append((row[evt_col] - station_mean) / station_sd)

    record = {'start_time': event.at[0, 'start_time']}
    for out_col, values in z_scores.items():
        # An event recorded only at excluded stations has nothing to average:
        # keep the event (so the timeline stays intact) with NaN features
        # instead of dividing by zero.
        record[out_col] = sum(values) / len(values) if values else np.nan
    event_records.append(record)

# Build the frame in one go; columns without a value in the records come out as NaN
net_df = pd.DataFrame(event_records, columns=NET_COLUMNS)

In [26]:
# Load tide data and attach tide height and tide derivative to every event.
# NOTE: previously the averaged coordinates of the gz stations were used and
# produced missing values; GZ05 is used now.
#   averaged gz coordinates (source code in severity_class nb):
#   x = -168955.1491394913, y = -599694.5432784811

# GZ05 coordinates
x_cor = -155992.359664
y_cor = -604863.615642

# 4380 = 12 * 365; per the original note this requests 12 years of data.
# NOTE(review): if the unit is days, three leap days are not covered, so the
# series would end shortly before 2019-12-31 - confirm against get_tide_height.
tide_span = 4380
tide_df = my_lib.funcs.get_tide_height(tide_span, x_cor, y_cor, "2008-01-01 00:00:00")

# Order events chronologically and compute minutes since the previous event.
# start_time is left at full precision in net_df, so re-running this cell
# reproduces the same time_since values (truncating it in place, as before,
# made a second run compute the gaps from minute-truncated times).
net_df = net_df.sort_values('start_time')
net_df['time_since'] = net_df['start_time'].diff().dt.total_seconds() / 60

# Truncate both time axes to whole minutes so they can be matched exactly.
# Assumes timestamps are tz-naive (or share one timezone) - TODO confirm.
tide_df['time'] = pd.to_datetime(tide_df['time']).dt.floor('min')
events_by_minute = net_df.assign(start_time=net_df['start_time'].dt.floor('min'))

# Left-merge tide height onto the events; the merge also yields a fresh 0..n-1
# index in chronological order. The helper 'time' key column is dropped again.
merged_df = (
    pd.merge(events_by_minute, tide_df[['time', 'tide_height']],
             left_on='start_time', right_on='time', how='left')
    .drop(columns=['time'])
)

# Tide derivative at each event's (minute-truncated) start time.
# A keyed lookup replaces scanning the whole tide table once per event;
# the first derivative row per timestamp is used and unmatched events stay NaN.
tide_d = my_lib.funcs.tide_derivative(tide_df)
deriv_by_time = tide_d.drop_duplicates('time').set_index('time')['tide_deriv']
merged_df['tide_deriv'] = merged_df['start_time'].map(deriv_by_time)

Elapsed time: 44.83895397186279 seconds


In [33]:
# Form factor calculation
# Computes form factors from the (minute-truncated) tide series. The result is
# used further down as a table with a 'dates' column and a 'form_factors' column.
# NOTE(review): the saved output of this cell shows a scipy.optimize.curve_fit
# call, presumably a warning raised inside the helper - confirm the fits converge.

form_fac = my_lib.funcs.form_factor_calc(tide_df)

  popt, pcov = scipy.optimize.curve_fit(sines, seconds_tide, tide_window, p0=initial_guess)


In [34]:
# Attach to every event the mean form factor of the calendar day it started on,
# then flag events that occurred at positive tide height.

# Add date-only column to form_fac
form_fac['date_only'] = form_fac['dates'].dt.date

# Mean form factor per calendar day, computed once instead of re-filtering
# form_fac for every event in merged_df
daily_form_fac = form_fac.groupby('date_only')['form_factors'].mean()

# Look each event's date up in the daily means; days without any form-factor
# samples yield NaN (the same result as the mean of an empty selection)
merged_df['form_fac'] = merged_df['start_time'].dt.date.map(daily_form_fac)

# Encode high tide vs low tide event: 1 when tide height is above zero, else 0.
# NOTE: a missing tide_height (NaN) compares False and is therefore encoded as 0.
merged_df['high_t_evt'] = (merged_df['tide_height'] > 0).astype(int)

In [55]:
# Add features describing the tide during the interval between consecutive events:
# the form factor fitted over that interval plus the diurnal and semidiurnal
# amplitudes returned by the same fit.

# Create the output columns up front so they exist (as NaN) for the first event,
# which has no preceding interval, and so re-running the cell starts clean.
INTER_EVENT_COLUMNS = ['inter_form_Fac', 'A_diurn', 'A_semidiurn']
for col in INTER_EVENT_COLUMNS:
    merged_df[col] = np.nan

start_times = merged_df['start_time']
row_labels = merged_df.index

# Walk the events by position, starting at the second one
for pos in range(1, len(merged_df)):

    # Interval between the previous event and this one, in minutes
    prev_start = start_times.iloc[pos - 1]
    cur_start = start_times.iloc[pos]
    gap_minutes = (cur_start - prev_start).total_seconds() / 60.0

    # Fit over the window that starts at the previous event.
    # The result is bound to `window_form_fac`, not `form_fac`, so the form-factor
    # table created by an earlier cell is not overwritten by a scalar.
    # The two phase values returned by the fit are not stored.
    window_form_fac, amp_diurnal, amp_semidiurnal, _phi1, _phi2 = (
        my_lib.funcs.form_factor_window(tide_df, prev_start, gap_minutes)
    )

    # Insert the features into the data set for that row
    label = row_labels[pos]
    merged_df.loc[label, 'inter_form_Fac'] = window_form_fac
    merged_df.loc[label, 'A_diurn'] = amp_diurnal
    merged_df.loc[label, 'A_semidiurn'] = amp_semidiurnal

In [56]:
# Export to csv file
# Writes the final per-event feature table without the index column.
# NOTE(review): the destination is an absolute, user-specific path, and the file
# name says "10-18" although the load cell also reads the 2019 events - confirm.

merged_df.to_csv("/Users/sambrown04/Documents/SURF/Preproc_data/10-18.csv", index = False)