In [1]:
# Sam Brown
# sam_brown@mines.edu
# Jul 25
# Goal: Preprocess the position data so we can use the inter event movement as a feature for our models

import sys
sys.path.append("/Users/sambrown/Documents/SURF/whillans-surf/notebooks/SURF")

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.linear_model import LinearRegression

import my_lib.funcs
from InterEvt import InterEvt


In [2]:
# Load the per-year event files so the gaps between consecutive events can be
# analysed. The yearly results are joined with `+` and iterated below, so
# load_evt presumably returns a list of per-event DataFrames — TODO confirm.
event_files = [
    "/Users/sambrown/Documents/SURF/Events/2010_2010Events2stas",
    "/Users/sambrown/Documents/SURF/Events/2011_2011Events2stas",
    "/Users/sambrown/Documents/SURF/Events/2012_2012Events2stas",
    "/Users/sambrown/Documents/SURF/Events/2013_2013Events2stas",
    "/Users/sambrown/Documents/SURF/Events/2014_2014Events2stas",
    "/Users/sambrown/Documents/SURF/Events/2015_2015Events2stas",
    "/Users/sambrown/Documents/SURF/Events/2016_2016Events2stas",
    "/Users/sambrown/Documents/SURF/Events/2017_2017Events2stas",
    "/Users/sambrown/Documents/SURF/Events/2018_2018Events2stas",
    "/Users/sambrown/Documents/SURF/Events/2019_2019Events2stas",
]
(
    df_2010, df_2011, df_2012, df_2013, df_2014,
    df_2015, df_2016, df_2017, df_2018, df_2019,
) = [my_lib.funcs.load_evt(path) for path in event_files]

# Join the yearly collections into one, in chronological order of the files.
# NOTE(review): df_2019 is loaded above but is not part of the combined
# collection — confirm that leaving 2019 out is intentional.
all_dfs = df_2010
for yearly_events in (df_2011, df_2012, df_2013, df_2014,
                      df_2015, df_2016, df_2017, df_2018):
    all_dfs = all_dfs + yearly_events

# Order the events by the first timestamp of each event
dfs_sorted = sorted(all_dfs, key=lambda evt: evt['time'].iloc[0])

# Preprocess the time-ordered events
clean_df = my_lib.funcs.extract_event_features(dfs_sorted)

In [6]:
# For every event (in time order) record its first and last timestamp and the
# stations listed for it in the matching clean_df entry.
# Note: despite its name, events_dict is a list holding one dict per event.
events_dict = [
    {
        'start_time': evt_df['time'].iloc[0],
        'end_time': evt_df['time'].iloc[-1],
        'stations': clean_df[idx]['station'].tolist(),
    }
    for idx, evt_df in enumerate(dfs_sorted)
]

In [7]:
# Load the inter-event positional data, grouped by year.
# Layout read here: <input_dir>/<year>/*.csv, one DataFrame kept per CSV file.
# Result: dfs_by_year maps year (int) -> list of DataFrames for that year.

input_dir = "/Users/sambrown/Documents/SURF/POS Data"
dfs_by_year = {}
years_to_include = [2010,2011,2012,2013,2014,2015,2016,2017,2018]

# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and therefore the column order of frames built from these lists) reproducible.
for year_folder in sorted(os.listdir(input_dir)):
    year_path = os.path.join(input_dir, year_folder)

    # Only directories whose name is a year are of interest; a stray file with
    # a numeric name would otherwise make the inner listdir fail.
    if not year_folder.isdigit() or not os.path.isdir(year_path):
        continue

    year = int(year_folder)

    # Process only desired years
    if year not in years_to_include:
        continue

    dfs_by_year[year] = []

    # Load every CSV in that year folder
    for file in sorted(os.listdir(year_path)):
        if not file.endswith('.csv'):
            continue

        file_path = os.path.join(year_path, file)
        try:
            dfs_by_year[year].append(pd.read_csv(file_path))
        except Exception as e:
            # Best effort: report an unreadable file and keep loading the rest
            print(f"Failed to load {file_path}: {e}")

In [8]:
# Convert the 'time' column of every loaded position DataFrame to pandas
# datetimes. The frames stored in dfs_by_year are modified in place.
for yearly_frames in dfs_by_year.values():
    for df in yearly_frames:
        parsed_time = pd.to_datetime(df['time'])
        df['time'] = parsed_time

In [10]:
# Build one DataFrame per inter-event interval. The interval for event i runs
# from the end of event i-1 to the start of event i. Each frame holds, for every
# position DataFrame of that year, the columns <station>_x, <station>_y and
# <station>_z taken from the rows whose time falls inside the interval.
#
# events_dfs[k] and end_times[k] always describe the same interval: both are
# appended together, only after the interval has actually been built.

events_dfs = []
end_times = []
print("start")

time_format = '%Y-%m-%d %H:%M:%S'

# The first event has no predecessor, so start at the second one
for i in range(1, len(events_dict)):
    print(f"Loading Event {i}")

    # Interval between the previous event's end and this event's start
    end_interval = datetime.strptime(events_dict[i]['start_time'], time_format)
    start_interval = datetime.strptime(events_dict[i-1]['end_time'], time_format)

    # NOTE(review): only the year in which the interval ends is searched, so an
    # interval that starts in the previous year misses that year's samples.
    year = end_interval.year

    if year not in dfs_by_year:
        print(f"Missing data for year {year}")
        continue

    # Collect the per-station pieces and concatenate once at the end instead of
    # re-copying a growing frame on every iteration.
    station_frames = []
    for df in dfs_by_year[year]:
        if df.empty:
            continue  # no rows, hence no station name to read

        station = df['station'].iloc[0]

        # Select data during the interval (both bounds inclusive)
        mask = (df['time'] >= start_interval) & (df['time'] <= end_interval)
        filtered_df = df.loc[mask]

        # Reset the index so the stations line up row by row when joined
        station_frames.append(pd.DataFrame({
            f'{station}_x': filtered_df['x'].reset_index(drop=True),
            f'{station}_y': filtered_df['y'].reset_index(drop=True),
            f'{station}_z': filtered_df['elevation'].reset_index(drop=True),
        }))

    if station_frames:
        interval_df = pd.concat(station_frames, axis=1)
    else:
        interval_df = pd.DataFrame()

    events_dfs.append(interval_df)
    end_times.append(end_interval)
    print(f"Event {i} DataFrame shape: {interval_df.shape}")
    

    
    

start
Loading Event 1
Event 1 DataFrame shape: (1522, 46)
Loading Event 2
Event 2 DataFrame shape: (3632, 46)
Loading Event 3
Event 3 DataFrame shape: (4982, 46)
Loading Event 4
Event 4 DataFrame shape: (1222, 46)
Loading Event 5
Event 5 DataFrame shape: (3152, 46)
Loading Event 6
Event 6 DataFrame shape: (1682, 46)
Loading Event 7
Event 7 DataFrame shape: (2902, 46)
Loading Event 8
Event 8 DataFrame shape: (2262, 46)
Loading Event 9
Event 9 DataFrame shape: (2582, 46)
Loading Event 10
Event 10 DataFrame shape: (5342, 46)
Loading Event 11
Event 11 DataFrame shape: (3782, 46)
Loading Event 12
Event 12 DataFrame shape: (1442, 46)
Loading Event 13
Event 13 DataFrame shape: (3182, 46)
Loading Event 14
Event 14 DataFrame shape: (1562, 46)
Loading Event 15
Event 15 DataFrame shape: (3422, 46)
Loading Event 16
Event 16 DataFrame shape: (1612, 46)
Loading Event 17
Event 17 DataFrame shape: (3432, 46)
Loading Event 18
Event 18 DataFrame shape: (1632, 46)
Loading Event 19
Event 19 DataFrame shap

In [12]:
# Station codes considered for standardization, grouped by their name prefix.
stas = (
    # la01 - la18
    ["la01", "la02", "la03", "la04", "la05", "la06",
     "la07", "la08", "la09", "la10", "la11", "la12",
     "la13", "la14", "la15", "la16", "la17", "la18"]
    # ws
    + ["ws04", "ws05"]
    # gz01 - gz20
    + ["gz01", "gz02", "gz03", "gz04", "gz05",
       "gz06", "gz07", "gz08", "gz09", "gz10",
       "gz11", "gz12", "gz13", "gz14", "gz15",
       "gz16", "gz17", "gz18", "gz19", "gz20"]
    # mg01 - mg07
    + ["mg01", "mg02", "mg03", "mg04", "mg05", "mg06", "mg07"]
    # slw
    + ["slw1"]
)

In [13]:
# Create one InterEvt object per station code, keyed by that code. Every object
# is given the full list of inter-event DataFrames; a progress line is printed
# before each construction.
intsta_objects = {}
for i, name in enumerate(stas, start=1):
    print(f"[{i}/{len(stas)}] Creating station object for: {name}")
    station_obj = InterEvt(name, events_dfs)
    intsta_objects[name] = station_obj

[1/48] Creating station object for: la01
[2/48] Creating station object for: la02
[3/48] Creating station object for: la03
[4/48] Creating station object for: la04
[5/48] Creating station object for: la05
[6/48] Creating station object for: la06
[7/48] Creating station object for: la07
[8/48] Creating station object for: la08
[9/48] Creating station object for: la09
[10/48] Creating station object for: la10
[11/48] Creating station object for: la11
[12/48] Creating station object for: la12
[13/48] Creating station object for: la13
[14/48] Creating station object for: la14
[15/48] Creating station object for: la15
[16/48] Creating station object for: la16
[17/48] Creating station object for: la17
[18/48] Creating station object for: la18
[19/48] Creating station object for: ws04
[20/48] Creating station object for: ws05
[21/48] Creating station object for: gz01
[22/48] Creating station object for: gz02
[23/48] Creating station object for: gz03
[24/48] Creating station object for: gz04
[

In [14]:
# Per-station reference table used for standardization: one row per InterEvt
# object, holding its name and its avg_/std_ attributes for displacement, slope
# and r2 (presumably the mean and standard deviation over all inter-event
# intervals — confirm against the InterEvt class).
#
# The table is built once from a list of records rather than grown row by row,
# which avoids re-allocating the frame on every insert and lets pandas infer
# proper numeric dtypes for the statistic columns.
summary_columns = ['Station', 'displacement', 'displacement_sd', 'slope', 'slope_sd', 'r2', 'r2_sd']

inter_df = pd.DataFrame(
    [
        {
            "Station": station.name,
            "displacement": station.avg_disp,
            "displacement_sd": station.std_disp,
            "slope": station.avg_slope,
            "slope_sd": station.std_slope,
            "r2": station.avg_r2,
            "r2_sd": station.std_r2,
        }
        for station in intsta_objects.values()
    ],
    columns=summary_columns,
)

In [18]:
# Save the per-station standardization table (row index omitted) to the file
# 'inter_standards' in the current working directory.
inter_df.to_csv(path_or_buf='inter_standards', index=False)

In [16]:
# Sanity check: number of events, inter-event frames and interval end times
print(*map(len, (events_dict, events_dfs, end_times)))

4282 4281 4281


In [19]:
# Build the per-interval feature table. For every inter-event interval, each
# station whose x column has no missing values contributes three metrics:
#   - displacement: last x value minus first x value
#   - slope and r2 of a linear fit of x against the sample index
# Each metric is standardized with that station's mean/sd from inter_df, and the
# standardized values are averaged over the contributing stations. Intervals
# with no contributing station keep NaN in all three feature columns.

# Rows are matched to end_times by position, so the two lists must line up.
assert len(end_times) == len(events_dfs), (
    f"end_times ({len(end_times)}) and events_dfs ({len(events_dfs)}) differ in length"
)

x_suffix = '_x'
feature_rows = []

for end_time, event in zip(end_times, events_dfs):

    disp_std_list = []
    r2_std_list = []
    slope_std_list = []

    n_samples = event.shape[0]
    # The regressor is the sample index, identical for every column of the event
    X = np.arange(n_samples).reshape(-1, 1)

    for col in event.columns:
        # Only the <station>_x columns are used; nothing can be measured on an
        # interval without samples.
        if n_samples == 0 or not col.endswith(x_suffix):
            continue

        # Use a station only if it has a value for every sample of the interval
        if not event[col].notna().all():
            continue

        station = col[:-len(x_suffix)]
        row_sch = inter_df[inter_df['Station'] == station]
        if row_sch.empty:
            continue  # no reference statistics for this station

        # Calculate metrics
        displacement = event[col].iloc[-1] - event[col].iloc[0]
        y = event[col].values.reshape(-1, 1)

        reg = LinearRegression().fit(X, y)
        slope = reg.coef_[0][0]
        r2 = reg.score(X, y)

        # Standardization values for this station
        disp_mean = row_sch['displacement'].iloc[0]
        disp_sd = row_sch['displacement_sd'].iloc[0]
        slope_mean = row_sch['slope'].iloc[0]
        slope_sd = row_sch['slope_sd'].iloc[0]
        r2_mean = row_sch['r2'].iloc[0]
        r2_sd = row_sch['r2_sd'].iloc[0]

        # Standardize
        disp_std_list.append((displacement - disp_mean) / disp_sd)
        slope_std_list.append((slope - slope_mean) / slope_sd)
        r2_std_list.append((r2 - r2_mean) / r2_sd)

    # Aggregate over the contributing stations
    has_data = bool(disp_std_list)
    feature_rows.append({
        'time': end_time,
        'disp_standardized': np.mean(disp_std_list) if has_data else np.nan,
        'r2_standardized': np.mean(r2_std_list) if has_data else np.nan,
        'slope_standardized': np.mean(slope_std_list) if has_data else np.nan,
    })

# Create the frame once from the collected rows instead of writing row by row
inter_dat = pd.DataFrame(
    feature_rows,
    columns=['time', 'disp_standardized', 'r2_standardized', 'slope_standardized'],
)

    

In [23]:
# Show the last 40 rows of the feature table
inter_dat.iloc[-40:]

Unnamed: 0,time,disp_standardized,r2_standardized,slope_standardized
4241,2018-11-28 17:59:45,-0.525129,-0.035359,-0.224722
4242,2018-11-29 17:24:45,0.525579,0.074251,-0.218031
4243,2018-11-30 06:04:45,-0.839051,-0.255495,0.036507
4244,2018-11-30 18:49:45,-0.79203,-0.161562,-0.240237
4245,2018-12-01 16:54:45,0.296539,-0.313527,-0.236292
4246,2018-12-02 06:34:45,-0.583565,0.081207,0.017611
4247,2018-12-02 18:19:45,-0.814634,-0.448677,-0.376092
4248,2018-12-03 08:54:45,-0.216748,0.044556,-0.154586
4249,2018-12-04 11:49:45,1.090032,0.718417,-0.016715
4250,2018-12-05 16:04:45,1.222411,0.790417,0.053533


In [22]:
# Save the per-interval feature table as CSV. The row index is written too,
# as an unnamed first column (pandas default).
inter_dat.to_csv(path_or_buf="/Users/sambrown/Documents/SURF/inter_df")