### Web scrape valid surf rides from the Smartfin website: 

In [1]:
#Imports 
import requests
import lxml.html as lh
import pandas as pd
import numpy as np
print("Done.")

Done.


### Scrape verified surf sessions into a dataframe (can add more later):

In [2]:
# Start by just looking at my surf sessions in SD (near CDIP buoy): 

# 15218 - First VIRB filmed session - Oct. 24, 2018
# 15669 - Second VIRB filmed session - Nov. 7, 2018
# 15692 - Third VIRB filmed session - Nov. 9, 2018
# 15686 - Fourth VIRB filmed session - Nov. 11, 2018

ride_ids = [15218, 15669, 15692, 15686]

In [3]:
#Get urls for surf sessions in Scripps beach area that most likely correspond to surfing time intervals: 
url_45_60 = 'https://surf.smartfin.org/advanced_search/?northEastLat=32.6&northEastLon=-117.24&southWestLat=32.9&southWestLon=-117.33&dateTimeBegin=&dateTimeEnd=&timeZone=PDT&durationMin=45&durationMax=60&sensorTypeWave=true#searchResultsContainer'
url_61_80 = 'https://surf.smartfin.org/advanced_search/?northEastLat=32.6&northEastLon=-117.24&southWestLat=32.9&southWestLon=-117.33&dateTimeBegin=&dateTimeEnd=&timeZone=PDT&durationMin=61&durationMax=80&sensorTypeWave=true#searchResultsContainer'
url_81_100 = 'https://surf.smartfin.org/advanced_search/?northEastLat=32.6&northEastLon=-117.24&southWestLat=32.9&southWestLon=-117.33&dateTimeBegin=&dateTimeEnd=&timeZone=PDT&durationMin=81&durationMax=100&sensorTypeWave=true#searchResultsContainer'
url_101_130 = 'https://surf.smartfin.org/advanced_search/?northEastLat=32.6&northEastLon=-117.24&southWestLat=32.9&southWestLon=-117.33&dateTimeBegin=&dateTimeEnd=&timeZone=PDT&durationMin=101&durationMax=130&sensorTypeWave=true#searchResultsContainer'

urls = [url_45_60]
#urls = [url_45_60, url_61_80, url_81_100, url_101_130]

In [4]:
# Define a function that will parse rows from data tables on CDIP's website:
def parse_rows(row_number):
    name_string = ""
    name_list = []
    for t in tr_elements[row_number]:
        name = t.text_content()
        for i in name: 
            name_string += i
    #print(name_string)

    # Create a string from the values
    name_string = name_string.split(" ")
    
    # Remove all spaces from the list
    for i in name_string: 
        if len(i) > 0:
            name_list.append(i)
      
    # Ensure that time stays with 'Date (UTC)'' header
    name_list[0] = name_list[0] + " " +  name_list[1]
    name_list.pop(1)
    
    return name_list

In [5]:
# Iterate over each url and get all ride_ids located at that url: 
ride_ids = []
for u in urls: 
    
    # Create a handle, page, to handle the contents of the website
    page = requests.get(u)

    # Store the contents of the website under doc
    doc = lh.fromstring(page.content)

    # Parse data stored between <tr>..</tr> of HTML
    tr_elements = doc.xpath('//tr')

    # Retrieve all of the ride ids: 
    for j in range(1, len(tr_elements)):
        data = parse_rows(j)
        data = str(data[0]).strip("\n")
        ride_ids.append(data)

print(len(ride_ids))
print(ride_ids[:10])

265
[' 16345', ' 16318', ' 16300', ' 16289', ' 16278', ' 16211', ' 16197', ' 16194', ' 16191', ' 16169']


In [6]:
#%% Fin ID scraper
# Input fin ID, get all ride IDs
# base URL to which we'll append given fin IDs
fin_url_base = 'http://surf.smartfin.org/fin/'

# Look for the following text in the HTML contents in fcn below
str_id_ride = 'rideId = \'' # backslash allows us to look for single quote
str_id_date = 'var date = \'' # backslash allows us to look for single quote

#%% Ride ID scraper
# Input ride ID, get ocean and motion CSVs
# Base URL to which we'll append given ride IDs
ride_url_base = 'https://surf.smartfin.org/ride/'

# Look for the following text in the HTML contents in fcn below
str_id_csv = 'img id="temperatureChart" class="chart" src="' 

def get_csv_from_ride_id(rid):
    # Build URL for each individual ride
    ride_url = ride_url_base+str(rid)
    #print(ride_url)
    
    # Get contents of ride_url
    html_contents = requests.get(ride_url).text
    
    # Find CSV identifier 
    loc_csv_id = html_contents.find(str_id_csv)
    
    # Different based on whether user logged in with FB or Google
    offset_googleOAuth = [46, 114]
    offset_facebkOAuth = [46, 112]
    if html_contents[loc_csv_id+59] == 'f': # Facebook login
        off0 = offset_facebkOAuth[0]
        off1 = offset_facebkOAuth[1]
    else: # Google login
        off0 = offset_googleOAuth[0]
        off1 = offset_googleOAuth[1]
        
    csv_id_longstr = html_contents[loc_csv_id+off0:loc_csv_id+off1]
        
    # Stitch together full URL for CSV
    if ("media" in csv_id_longstr) & ("Calibration" not in html_contents): # other junk URLs can exist and break everything
        
        ocean_csv_url = 'https://surf.smartfin.org/'+csv_id_longstr+'Ocean.CSV'
        motion_csv_url = 'https://surf.smartfin.org/'+csv_id_longstr+'Motion.CSV'
        
        # Go to ocean_csv_url and grab contents (theoretically, a CSV)
        ocean_df_small = pd.read_csv(ocean_csv_url, parse_dates = [0])
        elapsed_timedelta = (ocean_df_small['UTC']-ocean_df_small['UTC'][0])
        ocean_df_small['elapsed'] = elapsed_timedelta/np.timedelta64(1, 's')
        
        motion_df_small = pd.read_csv(motion_csv_url, parse_dates = [0])
        
        # Reindex on timestamp if there are at least a few rows
        if len(ocean_df_small) > 1:
            ocean_df_small.set_index('UTC', drop = True, append = False, inplace = True)
            motion_df_small.set_index('UTC', drop = True, append = False, inplace = True)            
            
            #May need to change this sampling interval:
            sample_interval = '33ms'
            
            ocean_df_small_resample = ocean_df_small.resample(sample_interval).mean()
            motion_df_small_resample = motion_df_small.resample(sample_interval).mean()
            
            # No need to save many extra rows with no fix
            motion_df_small = motion_df_small[~np.isnan(motion_df_small.Latitude)]
            
            return ocean_df_small_resample, motion_df_small_resample

    else:
        ocean_df_small_resample = pd.DataFrame() # empty DF just so something is returned
        motion_df_small_resample = pd.DataFrame() 
        return ocean_df_small_resample, motion_df_small_resample

In [7]:
appended_ocean_list = [] # list of DataFrames from original CSVs
appended_motion_list = []
appended_multiIndex = [] # fin_id & ride_id used to identify each DataFrame

print("Once the counter gets to 1100 it will be done printing.")

## Nested loops (for each fin ID, find all ride IDs, then build a DataFrame from all ride CSVs)
## (Here, ride IDS are either ocean or motion dataframes)
count_good_fins = 0
    
# Loop over ride_ids and find CSVs
counter = 0
for rid in ride_ids:
    counter += 1
    if counter % 10 == 0:
        print(counter)
    try:
        new_ocean_df, new_motion_df = get_csv_from_ride_id(rid) # get given ride's CSV from its ride ID using function above
        if not new_ocean_df.empty: # Calibration rides, for example
            # Append only if DF isn't empty. There may be a better way to control empty DFs which are created above
            appended_multiIndex.append(str(rid)) # build list to be multiIndex of future DataFrame
            appended_ocean_list.append(new_ocean_df)
            appended_motion_list.append(new_motion_df)
            #print("Ride data has been uploaded.")
            #print("Ride: ", rid, "data has been uploaded.")
            count_good_fins += 1
     
    except: 
        print("Ride threw an exception!")
        #print("Ride ", rid, "threw an exception!")    

#%% Build the "Master" DataFrame

# appended_ocean_df.summary()
df_keys = tuple(appended_multiIndex) # keys gotta be a tuple, a list which data in it cannot be changed
ocean_df = pd.concat(appended_ocean_list, keys = df_keys, names=['ride_id'])
motion_df = pd.concat(appended_motion_list, keys = df_keys, names = ['ride_id'])


##Here, maybe just use info from the motion_df and don't worry about ocean_df data for now.
##If you do want ocean_df data, look at how Phil was getting it from "July 10th and 11th Calibration" jupyter notebook file.
#print(motion_df)
print("Done.")

Once the counter gets to 1100 it will be done printing.
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
Done.


In [8]:
# View data in a python dataframe: 
print(len(motion_df))
motion_df[0:10]

# I think that we're resampling each surf session at too small of a rate? 
# Why do we have so little data? 

#Sampling interval of 33ms: 3,290,682 initial data points, 438,099 final data points (adding .033 to UTC time)
#Sampling interval of 20ms: 5,429,605 initial data points, 438,099 final data points (adding .020 to UTC time)

764242


Unnamed: 0_level_0,Unnamed: 1_level_0,Time,IMU A1,IMU A2,IMU A3,IMU G1,IMU G2,IMU G3,IMU M1,IMU M2,IMU M3,Latitude,Longitude
ride_id,UTC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
16345,2019-11-19 15:02:50.061000+00:00,3888630000.0,502.0,41.0,60.0,-18.0,-10.0,-1.0,32.0,-110.0,-468.0,,
16345,2019-11-19 15:02:50.094000+00:00,,,,,,,,,,,,
16345,2019-11-19 15:02:50.127000+00:00,,,,,,,,,,,,
16345,2019-11-19 15:02:50.160000+00:00,,,,,,,,,,,,
16345,2019-11-19 15:02:50.193000+00:00,,,,,,,,,,,,
16345,2019-11-19 15:02:50.226000+00:00,,,,,,,,,,,,
16345,2019-11-19 15:02:50.259000+00:00,,,,,,,,,,,,
16345,2019-11-19 15:02:50.292000+00:00,,,,,,,,,,,,
16345,2019-11-19 15:02:50.325000+00:00,3888631000.0,501.0,37.0,63.0,-18.0,-10.0,0.0,39.0,-115.0,-461.0,,
16345,2019-11-19 15:02:50.358000+00:00,,,,,,,,,,,,


In [9]:
motion_df.shape

(764242, 12)

In [10]:
# Drop latitude/longitude columns and remove NaN rows: 
motion_df = motion_df.drop(["Latitude", "Longitude"], axis=1)
motion_df = motion_df.dropna(axis=0, how='any')
print(len(motion_df))
motion_df[0:20]

106036


Unnamed: 0_level_0,Unnamed: 1_level_0,Time,IMU A1,IMU A2,IMU A3,IMU G1,IMU G2,IMU G3,IMU M1,IMU M2,IMU M3
ride_id,UTC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
16345,2019-11-19 15:02:50.061000+00:00,3888630000.0,502.0,41.0,60.0,-18.0,-10.0,-1.0,32.0,-110.0,-468.0
16345,2019-11-19 15:02:50.325000+00:00,3888631000.0,501.0,37.0,63.0,-18.0,-10.0,0.0,39.0,-115.0,-461.0
16345,2019-11-19 15:02:50.589000+00:00,3888631000.0,503.0,36.0,59.0,-18.0,-9.0,0.0,38.0,-110.0,-462.0
16345,2019-11-19 15:02:50.820000+00:00,3888631000.0,500.0,35.0,62.0,-19.0,-9.0,3.0,37.0,-111.0,-457.0
16345,2019-11-19 15:02:51.084000+00:00,3888631000.0,501.0,33.0,61.0,-18.0,-10.0,0.0,38.0,-116.0,-460.0
16345,2019-11-19 15:02:51.315000+00:00,3888632000.0,500.0,34.0,63.0,-18.0,-10.0,0.0,30.0,-108.0,-460.0
16345,2019-11-19 15:02:51.579000+00:00,3888632000.0,501.0,34.0,61.0,-19.0,-9.0,3.0,39.0,-115.0,-453.0
16345,2019-11-19 15:02:51.843000+00:00,3888632000.0,502.0,35.0,62.0,-19.0,-11.0,11.0,41.0,-113.0,-465.0
16345,2019-11-19 15:02:52.074000+00:00,3888632000.0,502.0,23.0,61.0,-19.0,-12.0,16.0,42.0,-118.0,-454.0
16345,2019-11-19 15:02:52.338000+00:00,3888633000.0,503.0,21.0,61.0,-20.0,-10.0,1.0,43.0,-111.0,-457.0


### Scrape data from same-day CDIP and add it as an additional column to the dataframe:


In [11]:
# Format should be appending year, month
# Example: Sept. 2019 -> 201909
def create_url_string(year, month):
    url='https://cdip.ucsd.edu/themes/cdip?tz=UTC&numcolorbands=10&palette=cdip_classic&zoom=auto&ll_fmt=dm&high=6.096&r=999&un=1&pb=1&d2=p70&u2=s:201:st:1:v:parameter:dt:'
    if 2014 <= int(year) <= 2019:
        year = year
    if 1 <= int(month) <= 12 and len(month) == 2:
        month = month
    url += year
    url += month
    return url 

In [12]:
# For each row in the dataframe, parse the index ('UTC' column) to get the year, month information. 
motion_df.columns

dates = []
times = []
for row in motion_df.index:
    time = str(row[1])
    date = time.split(" ")
    time = date[1]
    date = date[0]
    time = time.split(".")
    time = time[0]
    time = time.split(":")
    time = time[0] + ":" + time[1]
    times.append(time)
    dates.append(date)
print(dates[0:10], times[0:10])

motion_df["Date"] = dates
motion_df["Time"] = times

['2019-11-19', '2019-11-19', '2019-11-19', '2019-11-19', '2019-11-19', '2019-11-19', '2019-11-19', '2019-11-19', '2019-11-19', '2019-11-19'] ['15:02', '15:02', '15:02', '15:02', '15:02', '15:02', '15:02', '15:02', '15:02', '15:02']


In [24]:
motion_df[-10:-1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,IMU A1,IMU A2,IMU A3,IMU G1,IMU G2,IMU G3,IMU M1,IMU M2,IMU M3,Date,Hs(ft),Tp(s),Dp(deg)
ride_id,UTC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
15686,2018-11-11 21:28:44.016000+00:00,21:28,1.340753,-9.672575,-1.398214,0.731707,2.560976,0.121951,-273.0,327.0,155.0,2018-11-11,0.98,11.76,287.0
15686,2018-11-11 21:28:44.214000+00:00,21:28,1.340753,-9.691728,-1.474828,0.731707,2.560976,0.121951,-269.0,321.0,149.0,2018-11-11,0.98,11.76,287.0
15686,2018-11-11 21:28:44.412000+00:00,21:28,1.340753,-9.672575,-1.398214,0.731707,2.560976,0.121951,-267.0,315.0,157.0,2018-11-11,0.98,11.76,287.0
15686,2018-11-11 21:28:44.610000+00:00,21:28,1.359907,-9.672575,-1.474828,0.731707,2.560976,0.121951,-279.0,317.0,147.0,2018-11-11,0.98,11.76,287.0
15686,2018-11-11 21:28:44.808000+00:00,21:28,1.340753,-9.672575,-1.455675,0.731707,2.682927,0.121951,-265.0,327.0,155.0,2018-11-11,0.98,11.76,287.0
15686,2018-11-11 21:28:45.006000+00:00,21:28,1.321599,-9.672575,-1.417367,0.731707,2.682927,0.121951,-273.0,323.0,147.0,2018-11-11,0.98,11.76,287.0
15686,2018-11-11 21:28:45.204000+00:00,21:28,1.321599,-9.672575,-1.398214,0.731707,2.439024,0.121951,-270.0,320.0,156.0,2018-11-11,0.98,11.76,287.0
15686,2018-11-11 21:28:45.402000+00:00,21:28,1.340753,-9.672575,-1.455675,0.731707,2.560976,0.121951,-272.0,330.0,146.0,2018-11-11,0.98,11.76,287.0
15686,2018-11-11 21:28:45.633000+00:00,21:28,1.340753,-9.672575,-1.455675,0.609756,2.560976,0.121951,-272.0,322.0,160.0,2018-11-11,0.98,11.76,287.0


In [14]:
# Define a function that will find the closest time on the CDIP table to the surf ride's time: 
def find_closest_single_time(date, time_utc):
    time_list = []
    
    # Iterate over each date
    for x in df_data['Date(UTC)']:
        a = x.split(" ")
        if date == a[0]:
            a = str(a[1]).split(':')
            y = str(time_utc).split(":")
            #print(a)
            #print(y)
            
            # Find the closest timestamp (less than 15 minutes away)
            time1 = int(a[0])*60 + int(a[1])
            time2 = int(y[0])*60 + int(y[1])
            
            if abs(time1 - time2) <= 15:
                time_string = str(a[0]) + ":" + str(a[1])
                time_list.append(time_string)
           
    return time_list

### Create the output labels from CDIP data for each Smartfin timestamp: 


In [15]:
# Create the output labels from CDIP data for each Smartfin timestamp: 
old_url = 'old_url'
old_date = 'old_date'
old_time = 'xx:xx'

wave_heights = []
wave_periods = []
wave_directions = []
for d, time in zip(dates,times):
    di = d.split("-")
    year = di[0]
    month = di[1]
    url = create_url_string(year, month)
    
    #We need to load a new table since we have a new surf ride session:
    if (url != old_url): 
        old_url = url
        # Create a handle, page, to handle the contents of the website
        page = requests.get(url)

        # Store the contents of the website under doc
        doc = lh.fromstring(page.content)

        # Parse data stored between <tr>..</tr> of HTML
        tr_elements = doc.xpath('//tr')

        # Need to drop the first 3 rows since they aren't in the table
        tr_elements = tr_elements[3:]
        
        # Parse the first row as the header
        tr_elements = doc.xpath('//tr')

        # Create empty list
        headers = []
        i = 0

        # For each row, store each first element (header) and an empty list
        for t in tr_elements[3]:
            i+=1
            name=t.text_content()
            #print(name)
            headers.append(name)

        #print(headers)

        # Create a Pandas dataframe: 
        data_list = []

        #Since out first row is the header, data is stored on the second row onwards
        for j in range(4, len(tr_elements)):
            data = parse_rows(j)
            data_list.append(data)

        df_data = pd.DataFrame(data_list, columns=headers)
        #print(df_data[:10])
        
        #Find the closest time in the CDIP data table that corresponds to the Smartfin data: 
        date = d #YYYY-MM-DD format
        print("date, time ", date, time)
        time_list = find_closest_single_time(date, time)
        print("time list ", time_list)
        
        # Now compute the average significant wave height for that time period:
        length = df_data['Date(UTC)'].size
        wave_height_list = []
        wave_period_list = []
        wave_direction_list = []
        for time in time_list: 
            date_data = date + " " + time
            for i in range(0, length):
                if df_data['Date(UTC)'][i] == date_data:
                    print("height ", df_data['Hs(ft)'][i])
                    wave_height_list.append(float(df_data['Hs(ft)'][i]))
                    wave_period_list.append(float(df_data['Tp(s)'][i]))
                    wave_direction_list.append(float(df_data['Dp(deg)'][i]))
                    
        # Throws error when date doesn't exist on CDIP data (ex: 2018-01-18)
        if len(wave_height_list) == 0 or len(wave_period_list) == 0 or len(wave_direction_list) == 0: 
            avg_wave_height = np.nan
            avg_wave_period = np.nan
            avg_wave_direction = np.nan
            
        else: 
            #print(wave_height_list)
            avg_wave_height = sum(wave_height_list)/len(wave_height_list)
            avg_wave_period = sum(wave_period_list)/len(wave_period_list)
            avg_wave_direction = sum(wave_direction_list)/len(wave_direction_list)
            #print(avg_wave_height)
            #print(avg_wave_period)
            #print(avg_wave_direction) 
            
    # Same date/month but maybe a different day or time:
    elif old_date != date or old_time[3] != time[3]:
        date = d #YYYY-MM-DD format
        #print(date, time)
        time_list = find_closest_single_time(date, time)
        #print(time_list)
        
        old_date = date
        old_time = time
        
        # Now compute the average significant wave height for that time period:
        length = df_data['Date(UTC)'].size
        wave_height_list = []
        wave_period_list = []
        wave_direction_list = []
        for time in time_list: 
            date_data = date + " " + time
            for i in range(0, length):
                if df_data['Date(UTC)'][i] == date_data:
                    #print(df_data['Hs(ft)'][i])
                    wave_height_list.append(float(df_data['Hs(ft)'][i]))
                    wave_period_list.append(float(df_data['Tp(s)'][i]))
                    wave_direction_list.append(float(df_data['Dp(deg)'][i]))
                    
      
        # Throws error when date doesn't exist on CDIP data (ex: 2018-01-18)
        if len(wave_height_list) == 0 or len(wave_period_list) == 0 or len(wave_direction_list) == 0: 
            avg_wave_height = np.nan
            avg_wave_period = np.nan
            avg_wave_direction = np.nan
            
        else: 
            #print(wave_height_list)
            avg_wave_height = sum(wave_height_list)/len(wave_height_list)
            avg_wave_period = sum(wave_period_list)/len(wave_period_list)
            avg_wave_direction = sum(wave_direction_list)/len(wave_direction_list)
            #print(avg_wave_height)
            #print(avg_wave_period)
            #print(avg_wave_direction)    

    # If they have the same url, date, and time then they will have the same value computed. 
    wave_heights.append(avg_wave_height)
    wave_periods.append(avg_wave_period)
    wave_directions.append(avg_wave_direction)

date, time  2019-11-19 15:02
time list  ['15:02']
height  1.74
date, time  2019-10-24 17:37
time list  ['17:32']
height  2.00
date, time  2019-02-26 21:06
time list  ['21:02']
height  0.92
date, time  2018-11-11 20:34
time list  ['20:30']
height  1.02


In [16]:
print(len(dates))
print(len(wave_heights))
print(len(wave_periods))
print(len(wave_directions))


# Check to make sure different wave heights were appended throughout the dataframe: 
motion_df["Hs(ft)"] = wave_heights
motion_df["Tp(s)"] = wave_periods
motion_df["Dp(deg)"] = wave_directions

for x in range(0, 60000, 10000):
    print(motion_df.iloc[[x]]['Hs(ft)'])
    print(motion_df.iloc[[x]]['Tp(s)'])
    print(motion_df.iloc[[x]]['Dp(deg)'])

motion_df[0:5]

106036
106036
106036
106036
ride_id  UTC                             
 16345   2019-11-19 15:02:50.061000+00:00    1.74
Name: Hs(ft), dtype: float64
ride_id  UTC                             
 16345   2019-11-19 15:02:50.061000+00:00    10.53
Name: Tp(s), dtype: float64
ride_id  UTC                             
 16345   2019-11-19 15:02:50.061000+00:00    285.0
Name: Dp(deg), dtype: float64
ride_id  UTC                             
 16345   2019-11-19 15:43:24.933000+00:00    1.77
Name: Hs(ft), dtype: float64
ride_id  UTC                             
 16345   2019-11-19 15:43:24.933000+00:00    11.76
Name: Tp(s), dtype: float64
ride_id  UTC                             
 16345   2019-11-19 15:43:24.933000+00:00    282.0
Name: Dp(deg), dtype: float64
ride_id  UTC                             
 16318   2019-11-09 21:07:19.788000+00:00    1.97
Name: Hs(ft), dtype: float64
ride_id  UTC                             
 16318   2019-11-09 21:07:19.788000+00:00    5.26
Name: Tp(s), dtype: float64
r

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,IMU A1,IMU A2,IMU A3,IMU G1,IMU G2,IMU G3,IMU M1,IMU M2,IMU M3,Date,Hs(ft),Tp(s),Dp(deg)
ride_id,UTC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
16345,2019-11-19 15:02:50.061000+00:00,15:02,502.0,41.0,60.0,-18.0,-10.0,-1.0,32.0,-110.0,-468.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:50.325000+00:00,15:02,501.0,37.0,63.0,-18.0,-10.0,0.0,39.0,-115.0,-461.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:50.589000+00:00,15:02,503.0,36.0,59.0,-18.0,-9.0,0.0,38.0,-110.0,-462.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:50.820000+00:00,15:02,500.0,35.0,62.0,-19.0,-9.0,3.0,37.0,-111.0,-457.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:51.084000+00:00,15:02,501.0,33.0,61.0,-18.0,-10.0,0.0,38.0,-116.0,-460.0,2019-11-19,1.74,10.53,285.0


In [17]:
# Now drop all of the nan values that I introduced earlier: 
motion_df = motion_df.dropna(axis=0, how='any')
print(len(motion_df))
motion_df[0:5]

106036


Unnamed: 0_level_0,Unnamed: 1_level_0,Time,IMU A1,IMU A2,IMU A3,IMU G1,IMU G2,IMU G3,IMU M1,IMU M2,IMU M3,Date,Hs(ft),Tp(s),Dp(deg)
ride_id,UTC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
16345,2019-11-19 15:02:50.061000+00:00,15:02,502.0,41.0,60.0,-18.0,-10.0,-1.0,32.0,-110.0,-468.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:50.325000+00:00,15:02,501.0,37.0,63.0,-18.0,-10.0,0.0,39.0,-115.0,-461.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:50.589000+00:00,15:02,503.0,36.0,59.0,-18.0,-9.0,0.0,38.0,-110.0,-462.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:50.820000+00:00,15:02,500.0,35.0,62.0,-19.0,-9.0,3.0,37.0,-111.0,-457.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:51.084000+00:00,15:02,501.0,33.0,61.0,-18.0,-10.0,0.0,38.0,-116.0,-460.0,2019-11-19,1.74,10.53,285.0


In [18]:
# We're collecting IMU data at 3-4 Hz here. 
# We're using 4 surf sessions and we have 60,000 data points. 
# If we only calculated Hs once per minute then we would have 330 data points. 

# Instead of looking at significant wave height, we could look at wave direction 
# or we could look at Hs, dir, and period and see which one we predict best. 



In [19]:
# Convert raw IMU values to real values: 
g_const = 512 #Raw acceleration constant 512 = 1g (accelerometer's measured force due to gravity)
gravity = 9.80665 #Approximate measurement for gravity

# Correct the IMU Acceleration columns into units of meters
# Dividing by 512 is equivalent to muliplying by 4 to correct the bit shifting by 2 places and dividing by 2048 to convert bits to G's
# Multiplying by the 9.81 afterwards is simply to convert G's into m/s^2
motion_df['IMU A1'] = motion_df['IMU A1'].apply(lambda x: x / g_const * gravity)
motion_df['IMU A2'] = motion_df['IMU A2'].apply(lambda x: x / g_const * gravity)
motion_df['IMU A3'] = motion_df['IMU A3'].apply(lambda x: x / g_const * gravity)

In [20]:
# Gyroscopic Rotation converts to deg/s
gyro_const = 8.2 # Raw gyrscope constant 8.2 bits per degree
motion_df['IMU G1'] = motion_df['IMU G1'].apply(lambda x: x / gyro_const)
motion_df['IMU G2'] = motion_df['IMU G2'].apply(lambda x: x / gyro_const)
motion_df['IMU G3'] = motion_df['IMU G3'].apply(lambda x: x / gyro_const)

In [21]:
# Magnetometer values
# Offset variables help in recentering the magnetic data in order to define direction and use trig functions
'''
M1_offset_var = 219.786
M2_offset_var = 180
M3_offset_var = 280

motion_df['IMU M1'] = motion_df['IMU M1'].apply(lambda x: x - M1_offset_var)
motion_df['IMU M2'] = motion_df['IMU M2'].apply(lambda x: x - M2_offset_var)
motion_df['IMU M3'] = motion_df['IMU M3'].apply(lambda x: x - M3_offset_var)
'''

"\nM1_offset_var = 219.786\nM2_offset_var = 180\nM3_offset_var = 280\n\nmotion_df['IMU M1'] = motion_df['IMU M1'].apply(lambda x: x - M1_offset_var)\nmotion_df['IMU M2'] = motion_df['IMU M2'].apply(lambda x: x - M2_offset_var)\nmotion_df['IMU M3'] = motion_df['IMU M3'].apply(lambda x: x - M3_offset_var)\n"

In [22]:
print(len(motion_df))
motion_df[:20]

106036


Unnamed: 0_level_0,Unnamed: 1_level_0,Time,IMU A1,IMU A2,IMU A3,IMU G1,IMU G2,IMU G3,IMU M1,IMU M2,IMU M3,Date,Hs(ft),Tp(s),Dp(deg)
ride_id,UTC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
16345,2019-11-19 15:02:50.061000+00:00,15:02,9.615114,0.785298,1.149217,-2.195122,-1.219512,-0.121951,32.0,-110.0,-468.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:50.325000+00:00,15:02,9.59596,0.708684,1.206678,-2.195122,-1.219512,0.0,39.0,-115.0,-461.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:50.589000+00:00,15:02,9.634267,0.68953,1.130063,-2.195122,-1.097561,0.0,38.0,-110.0,-462.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:50.820000+00:00,15:02,9.576807,0.670376,1.187524,-2.317073,-1.097561,0.365854,37.0,-111.0,-457.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:51.084000+00:00,15:02,9.59596,0.632069,1.16837,-2.195122,-1.219512,0.0,38.0,-116.0,-460.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:51.315000+00:00,15:02,9.576807,0.651223,1.206678,-2.195122,-1.219512,0.0,30.0,-108.0,-460.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:51.579000+00:00,15:02,9.59596,0.651223,1.16837,-2.317073,-1.097561,0.365854,39.0,-115.0,-453.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:51.843000+00:00,15:02,9.615114,0.670376,1.187524,-2.317073,-1.341463,1.341463,41.0,-113.0,-465.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:52.074000+00:00,15:02,9.615114,0.440533,1.16837,-2.317073,-1.463415,1.95122,42.0,-118.0,-454.0,2019-11-19,1.74,10.53,285.0
16345,2019-11-19 15:02:52.338000+00:00,15:02,9.634267,0.402226,1.16837,-2.439024,-1.219512,0.121951,43.0,-111.0,-457.0,2019-11-19,1.74,10.53,285.0


In [23]:
# After converting all to real values, try to export data to .CSV so everyone else doesn't have to webscrape it:
#motion_df.to_csv('CSE258_A2_Data.csv')