In [29]:
#Purisa Jasmine Simmons
#Viren Abhyankar
#February 2020

#Overview: Trying to generate a PSD plot (framework for future plots).
#Based on Method V of this paper: https://journals.ametsoc.org/doi/pdf/10.1175/2010JTECHO724.1

#First, parse the data from the .CSV file.
#This data comes from a Scripps buoy that recalibrates every 30 minutes;
#all of the vertical accelerations are contained in the 'IMU A2' column.

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import os
os.environ['PROJ_LIB'] = 'C:/Users/USERNAME/Anaconda3/Lib/site-packages/mpl_toolkits/basemap'
from mpl_toolkits.basemap import Basemap
from sklearn import preprocessing

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns

import pandas as pd
import numpy as np

from scipy import stats
from scipy import constants
from scipy import signal #added
from scipy.interpolate import CubicSpline
from scipy.interpolate import interp1d
from scipy.integrate import simps
from scipy.integrate import cumtrapz

import os
import datetime
import pytz
import re

import peakutils
import statsmodels.api as sm

import requests


from plotly import tools #added all the plotly's
import plotly.offline
import plotly.graph_objs as go

import math #added
import re   #added
import statistics #added
import statsmodels.api as sm


# For the definition of the abstract base class IMU_Base
import abc

import sys

import requests

# %matplotlib notebook
%matplotlib inline

In [43]:
# Plot styling.
# seaborn's theme call rewrites matplotlib's rcParams (font.size included), so
# the explicit font size must be applied AFTER it or it is silently discarded.
# A single theme call is enough: an earlier style="white" call would be fully
# overridden by this one.
sns.set(style="whitegrid", color_codes=True)
plt.rc("font", size=14)

# Smartfin ride IDs to download and analyse.
ride_ids = ['15692']


#%% Fin ID scraper (currently disabled)
# Input fin ID, get all ride IDs
# base URL to which we'll append given fin IDs
# fin_url_base = 'http://surf.smartfin.org/fin/'

# Look for the following text in the HTML contents in fcn below
# str_id_ride = 'rideId = \'' # backslash allows us to look for single quote
# str_id_date = 'var date = \'' # backslash allows us to look for single quote

#%% Ride ID scraper


# Input ride ID, get ocean and motion CSVs
# Base URL to which we'll append given ride IDs
ride_url_base = 'https://surf.smartfin.org/ride/'

# Marker text searched for in the ride page's HTML; the CSV path is read at
# fixed offsets from the position where this marker is found.
str_id_csv = 'img id="temperatureChart" class="chart" src="'

def get_csv_from_ride_id(rid):
    """Download one ride's Ocean and Motion CSVs and resample them to 33 ms bins.

    The ride's web page is fetched from ``ride_url_base + str(rid)``; the CSV
    path prefix is cut out of the HTML at fixed offsets from the position of
    the ``str_id_csv`` marker, and both CSVs are then read with pandas.

    Parameters
    ----------
    rid : str or int
        Ride ID; it is converted with ``str()`` and appended to the base URL.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(ocean, motion)``, each indexed by the 'UTC' column and resampled to
        33 ms bins using the mean of each bin. Two empty DataFrames are
        returned whenever no usable data is available: the marker is missing
        from the page, the extracted path does not contain "media", the page
        mentions "Calibration", or the ocean CSV has fewer than two rows.

    Raises
    ------
    requests.RequestException
        If the ride page cannot be fetched (including a timeout).
    Exception
        Any error raised by ``pd.read_csv`` while downloading/parsing the CSVs
        propagates to the caller.
    """
    # step 1: build URL for this individual ride
    ride_url = ride_url_base + str(rid)

    # step 2: query smartfin website to retrieve the ride's webpage in HTML.
    # A timeout keeps a stalled server from hanging the notebook forever.
    html_contents = requests.get(ride_url, timeout=30).text

    # Find CSV file location in the html page by the csv marker text
    loc_csv_id = html_contents.find(str_id_csv)
    if loc_csv_id == -1 or len(html_contents) <= loc_csv_id + 59:
        # Marker not found (find() returned -1) or page too short to hold the
        # link: slicing at the fixed offsets would only produce garbage.
        return pd.DataFrame(), pd.DataFrame()

    # step 3: the slice offsets differ based on whether the user logged in
    # with Facebook or Google (the character at +59 tells them apart)
    offset_googleOAuth = [46, 114]
    offset_facebkOAuth = [46, 112]
    if html_contents[loc_csv_id+59] == 'f':  # Facebook login
        off0, off1 = offset_facebkOAuth
    else:  # Google login
        off0, off1 = offset_googleOAuth

    # step 4: use the marker position and offsets to cut out the CSV path prefix
    csv_id_longstr = html_contents[loc_csv_id+off0:loc_csv_id+off1]

    # Other junk URLs can exist and break everything, and calibration rides
    # are not wanted: hand back empty frames for both cases.
    if "media" not in csv_id_longstr or "Calibration" in html_contents:
        return pd.DataFrame(), pd.DataFrame()

    # step 5: full urls to get the csv files
    ocean_csv_url = f'https://surf.smartfin.org/{csv_id_longstr}Ocean.CSV'
    motion_csv_url = f'https://surf.smartfin.org/{csv_id_longstr}Motion.CSV'

    print("ocean_csv_url: ", ocean_csv_url)
    print("motion_csv_url: ", motion_csv_url)
    print('\n\n')

    # step 6: download both CSVs; the first column is parsed as datetimes
    ocean_df_small = pd.read_csv(ocean_csv_url, parse_dates = [0])
    motion_df_small = pd.read_csv(motion_csv_url, parse_dates = [0])

    # Fewer than two ocean rows cannot be resampled into anything useful.
    # Return empty frames explicitly instead of falling off the end of the
    # function (which would return None and break tuple unpacking in callers).
    if len(ocean_df_small) <= 1:
        return pd.DataFrame(), pd.DataFrame()

    # step 7
    # 7a. add elapsed column: seconds since the first reading
    # (.iloc[0] is positional, so it does not depend on the index labels)
    elapsed_timedelta = ocean_df_small['UTC'] - ocean_df_small['UTC'].iloc[0]
    ocean_df_small['elapsed'] = elapsed_timedelta/np.timedelta64(1, 's')

    print("motion_df_small raw: ", motion_df_small)
    print('\n\n')
    print("ocean_df_small raw: ", ocean_df_small)
    print('\n\n')

    # 7b. make the index of each df the timestamp
    ocean_df_small = ocean_df_small.set_index('UTC')
    motion_df_small = motion_df_small.set_index('UTC')

    print("ocean_df_small length pre upsample: ", len(ocean_df_small))
    print("motion_df_small length pre upsample: ", len(motion_df_small))

    # 7c. resample data to 33ms intervals (30 Hz)
    # May need to change this sampling interval:
    sample_interval = '33ms'

    ocean_df_small_resample = ocean_df_small.resample(sample_interval).mean()
    motion_df_small_resample = motion_df_small.resample(sample_interval).mean()

    print('ocean_df_resample length: ', len(ocean_df_small_resample))
    print('motion_df_resample length: ', len(motion_df_small_resample))
    print('\n\n')

    return ocean_df_small_resample, motion_df_small_resample
    
    
    
    
    
# actual script

appended_ocean_list = []   # resampled ocean DataFrames, one per usable ride
appended_motion_list = []  # resampled motion DataFrames, one per usable ride
appended_multiIndex = []   # ride IDs, used as the outer index level of the combined frames

count_good_fins = 0        # number of rides that produced non-empty data

# Loop over ride_ids and fetch each ride's CSVs
for rid in ride_ids:
    print("rid: ", rid)
    try:
        # get the given ride's resampled CSV data from its ride ID
        new_ocean_df, new_motion_df = get_csv_from_ride_id(rid)
    except Exception as err:
        # Best effort: one bad ride must not stop the others, but report which
        # ride failed and why. Catching Exception (not a bare except) keeps
        # Ctrl-C / kernel interrupt working.
        print(f"Ride {rid} threw an exception: {err!r}")
        continue

    # Keep only non-empty results (calibration rides, for example, come back empty)
    if not new_ocean_df.empty:
        appended_multiIndex.append(str(rid))
        appended_ocean_list.append(new_ocean_df)
        appended_motion_list.append(new_motion_df)

        count_good_fins += 1

#%% Build the "Master" DataFrame
if not appended_ocean_list:
    # pd.concat would raise a ValueError on an empty list anyway; fail with a
    # message that points at the actual cause instead.
    raise ValueError(f"No usable ride data was downloaded for ride_ids={ride_ids}")

# keys for each different dataframe in the big dataframes (one key per ride)
df_keys = tuple(appended_multiIndex)

# concatenate all dataframes in each list into one big dataframe
ocean_df = pd.concat(appended_ocean_list, keys = df_keys, names=['ride_id'])
motion_df = pd.concat(appended_motion_list, keys = df_keys, names = ['ride_id'])


## Here, maybe just use info from the motion_df and don't worry about ocean_df data for now.
## If you do want ocean_df data, look at how Phil was getting it from the
## "July 10th and 11th Calibration" jupyter notebook file.
# We can also check to see if the surfboard was recording "in-water-freq" or
# "out-of-water-freq" based on how many NaN values we see.


# 7d. clear NaN values from dataframes
# Drop the latitude and longitude columns since most of their values are NaN:
print('motion df length pre na drop: ', len(motion_df))
motion_df_dropped = motion_df.drop(columns=['Latitude', 'Longitude'])

# Drop every row that still contains a NaN in any remaining column:
motion_df_dropped = motion_df_dropped.dropna(axis=0, how='any')
print('motion_df_dropped length post na drop: ', len(motion_df_dropped))
print('\n\n')

# finished clean dataframes
print('motion_df_dropped: ', motion_df_dropped)
print('ocean_df: ', ocean_df)

rid:  15692
ocean_csv_url:  https://surf.smartfin.org/media/201811/google_105349665704999793400_0006667E229D_181109191556_Ocean.CSV
motion_csv_url:  https://surf.smartfin.org/media/201811/google_105349665704999793400_0006667E229D_181109191556_Motion.CSV



motion_df_small raw:                                     UTC        Time  IMU A1  IMU A2  IMU A3  \
0     2018-11-09 19:16:03.806000+00:00  1414742884     NaN     NaN     NaN   
1     2018-11-09 19:16:03.809000+00:00  1414742887   493.0    48.0   110.0   
2     2018-11-09 19:16:04.061000+00:00  1414743138   513.0    89.0    62.0   
3     2018-11-09 19:16:04.312000+00:00  1414743387   494.0    92.0    80.0   
4     2018-11-09 19:16:04.565000+00:00  1414743639   421.0   205.0  -104.0   
...                                ...         ...     ...     ...     ...   
22552 2018-11-09 20:38:14.334000+00:00  1419643689   501.0   -11.0    99.0   
22553 2018-11-09 20:38:14.500000+00:00  1419643854     NaN     NaN     NaN   
22554 2018-11-09 20

[21645 rows x 10 columns]
ocean_df:                                                    Time  Temperature 1  \
ride_id UTC                                                             
15692   2018-11-09 19:16:03.360000+00:00  1.414742e+09          375.0   
        2018-11-09 19:16:03.393000+00:00           NaN            NaN   
        2018-11-09 19:16:03.426000+00:00           NaN            NaN   
        2018-11-09 19:16:03.459000+00:00           NaN            NaN   
        2018-11-09 19:16:03.492000+00:00           NaN            NaN   
...                                                ...            ...   
        2018-11-09 20:38:13.296000+00:00           NaN            NaN   
        2018-11-09 20:38:13.329000+00:00           NaN            NaN   
        2018-11-09 20:38:13.362000+00:00           NaN            NaN   
        2018-11-09 20:38:13.395000+00:00           NaN            NaN   
        2018-11-09 20:38:13.428000+00:00  1.419643e+09          446.0   

             

In [47]:
## Load the raw Motion CSV straight from its URL, keeping every column.
# The link is the 'motion_csv_url' value printed by get_csv_from_ride_id for this ride.
# NOTE: this rebinds motion_df to the raw, un-resampled frame with a default integer index.
MOTION_CSV_URL = 'https://surf.smartfin.org/media/201811/google_105349665704999793400_0006667E229D_181109191556_Motion.CSV'

motion_df = pd.read_csv(MOTION_CSV_URL)

# Quick sanity check: first rows and total row count
print(motion_df.head())
print(len(motion_df))


                              UTC        Time  IMU A1  IMU A2  IMU A3  IMU G1  \
0  2018-11-09T19:16:03.8060+00:00  1414742884     NaN     NaN     NaN     NaN   
1  2018-11-09T19:16:03.8090+00:00  1414742887   493.0    48.0   110.0    75.0   
2  2018-11-09T19:16:04.0610+00:00  1414743138   513.0    89.0    62.0    34.0   
3  2018-11-09T19:16:04.3120+00:00  1414743387   494.0    92.0    80.0    69.0   
4  2018-11-09T19:16:04.5650+00:00  1414743639   421.0   205.0  -104.0   192.0   

   IMU G2  IMU G3  IMU M1  IMU M2  IMU M3   Latitude   Longitude  
0     NaN     NaN     NaN     NaN     NaN  3285871.0 -11725690.0  
1  -124.0   -86.0  -309.0   209.0    39.0        NaN         NaN  
2   -36.0   -92.0  -320.0   194.0    38.0        NaN         NaN  
3   -63.0   -42.0  -329.0   189.0    49.0        NaN         NaN  
4   -92.0   -37.0  -330.0   180.0    64.0        NaN         NaN  
22557


In [None]:
# Keep an independent snapshot of the raw motion data before later cells work on motion_df.
saved_copy_motion_df = motion_df.copy()  # DataFrame.copy() is a deep copy by default

In [5]:
# Build an evenly spaced time axis (in seconds) for the motion samples.
utc_time = motion_df[['UTC']].to_numpy()  # 2-D array holding the single 'UTC' column

# First and last timestamp entries of the recording
START, END = utc_time[0], utc_time[-1]

# Row position of the first occurrence of each of those timestamps
start_index = np.where(utc_time == START)[0][0]
end_index = np.where(utc_time == END)[0][0]

# Number of sample intervals between the two positions
length = end_index - start_index

# NOTE(review): the spacing below assumes the START..END span lasts exactly
# 60 minutes; it is not derived from the timestamps themselves. Confirm.
mins_per_measure = 60 / length
sec_per_measure = 60 * mins_per_measure

# One entry per interval, i.e. one fewer than the number of rows from
# start_index to end_index inclusive.
time_array = [sec_per_measure * i for i in range(0, length)]

In [10]:
# Function to parse time string and calculate half hour
def add_half_hour(time_str):
    """Return the "HH:MM" time that is 30 minutes after ``time_str``.

    ``time_str`` is read as two hour digits, one separator character and two
    minute digits. An hour of 23 rolls over to 00 when the minutes carry.
    """
    def two_digits(value):
        # Left-pad single-digit values with a zero
        return '0' + str(value) if value < 10 else str(value)

    hrs, mins = int(time_str[:2]), int(time_str[3:5])

    if mins >= 30:
        # Minutes overflow into the next hour
        mins -= 30
        hrs = 0 if hrs == 23 else hrs + 1
    else:
        mins += 30

    return two_digits(hrs) + ':' + two_digits(mins)

# Parse int and compare functions in case we need them in the future

def parse_int(time_str):
    """Convert an "HH:MM" string into the number of minutes since 00:00."""
    hours, minutes = int(time_str[:2]), int(time_str[3:5])
    return hours * 60 + minutes

def is_less_than_eq(str1, str2):
    """Return True when "HH:MM" time ``str1`` is at or before ``str2``.

    An hour of '23' in ``str1`` paired with an hour of '00' in ``str2`` is
    treated as a wrap past midnight and always counts as "before".
    Otherwise the two times are compared as minutes since 00:00.
    """
    wraps_midnight = str1[:2] == '23' and str2[:2] == '00'
    return wraps_midnight or parse_int(str1) <= parse_int(str2)

In [11]:
%%time

### Goal: create an array of arrays of IMU A2 data, one inner array per half-hour session ###
# FIXME(review): this cell does not run as written.
#   * id_parser, height_parser and date_parser are used below but are never
#     assigned in the visible notebook; only UTC, Time and IMU A2 are selected.
#   * add_half_hour() slices its argument as an "HH:MM" string, but the printed
#     head of motion_df shows integer 'Time' values. Calling it on such a value
#     is consistent with the recorded "IndexError: invalid index to scalar
#     variable." (the call sits outside the try block, so it is not caught).
#   The loop appears to expect a frame that also carries ID / Date / Height
#   columns and an "HH:MM" time column -- confirm the intended input.
surf_sessions = []   # completed sessions: each is a list of IMU A2 samples
height_labels = []   # height at the start of each accepted session
utc_labels = []      # UTC value at the start of each accepted session
date_labels = []     # date at the start of each accepted session

## Step 1: select the columns to iterate over ##
# (the original plan listed 5 columns - ID, Time, IMU A2, Date, Height - but only
#  UTC, Time and IMU A2 are actually selected here)
df_parser = motion_df[['UTC', 'Time', 'IMU A2']]

# Convert to numpy arrays (2-D, one column each, hence the [i][0] indexing below)
time_parser = df_parser[['Time']].to_numpy()
utc_parser = df_parser[['UTC']].to_numpy()
imuA2_parser = df_parser[['IMU A2']].to_numpy()

## Step 2: loop through every data point ##
working_array = []       # samples of the session currently being collected
skip_check = '00:00'     # start time already checked and rejected; rows with this time are skipped
end_index_check = None   # row index where the current session ends; None when no session is open

for start_index in range(0, len(time_parser)):
    ## Goal 1: Append IMU data if we are in the half-hour segment ##
    if (end_index_check != None):
        # Check if we've reached the end of the half hour before appending
        if (start_index==end_index_check):
            # Session complete: store it and reset. The sample at the end index
            # itself is not stored in any session.
            surf_sessions.append(working_array)
            working_array = []
            end_index_check = None
        else:
            working_array.append(imuA2_parser[start_index][0])
        continue
        
    ## Goal 2: Check if height of current and half-hour times are equal ##
    start = time_parser[start_index][0]
    
    # Skip through a minute if it has already been checked and invalidated
    if (start==skip_check):
        continue
        
    # Set end point (start time plus 30 minutes)
    end = add_half_hour(start)
    
    # Check if ride is still valid for half hour time
    try:
        # End index is where the time is equal to end and we are in the same ride as we started
        # ([0][0] takes the first matching row; it raises IndexError when nothing matches)
        end_index = np.where((time_parser==end) & (id_parser==id_parser[start_index]))[0][0]
    except IndexError:
        # No row half an hour later in the same ride: remember this start time so
        # the remaining rows with the same time are skipped
        skip_check = start
        continue
    else:
        # (occurs after try block) skip if the heights don't match
        if (height_parser[start_index]!=height_parser[end_index-1]):
            skip_check = start
        else:
            # add to array if heights do match: open a new session at this row
            working_array.append(imuA2_parser[start_index][0])
            height_labels.append(height_parser[start_index][0])
            utc_labels.append(utc_parser[start_index][0])
            date_labels.append(date_parser[start_index][0])
            
            end_index_check = end_index
            skip_check = '00:00'

            
            print("Session: ", len(surf_sessions)+1, ", Date: ", date_parser[start_index][0])
            print("At ", start, " and ", end)
            print("Start: ", height_parser[start_index], " and End: ", height_parser[end_index-1], "\n")

# NOTE: a session still open when the loop ends is never appended to surf_sessions,
# so the two counts below can differ by one.
print("Total sessions: ", len(surf_sessions))
print("Total heights: ", len(height_labels))

IndexError: invalid index to scalar variable.