In [44]:
import geopandas as gpd
import shapely
import pandas as pd
import numpy as np

import plotly.graph_objs as go
import matplotlib.pyplot as plt
import folium
from folium.plugins import MeasureControl

from datetime import datetime, timedelta

import glob
import zipfile
from bs4 import BeautifulSoup
import re

import os
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [126]:
class SFMCGliderData():
    """
    A tool for reading and processing manually extracted glider data from the SFMC.
    
    """
    
    def __init__(self, folder_path:str):
        self.sci_data = self.process_data(folder_path=folder_path)
        self.timeseries = self.plot_timeseries(data=self.sci_data)
        file_name = self.save_plot_as_html(plot=self.timeseries, file_name="glider_sci_data_timeseries.html")

    def grab_txt_files(self, folder_path:str):
        folder_path = os.path.join(folder_path, "*.txt")
        return glob.glob(folder_path, recursive=False)
    
    def load_individual_file(self, filepath:str, sep:str=","):
        return pd.read_csv(filepath, sep=sep)

    def load_all_files(self, folder_path:str):
        files = self.grab_txt_files(folder_path=folder_path)
        for file in files:
            if 'global_data' not in locals():
                global_data = self.load_individual_file(file, sep=" ")
            else:
                data = self.load_individual_file(file, sep=" ")
                global_data = self.merge_sci_data(data1=global_data, data2=data)
        
        global_data = global_data[global_data.columns.sort_values()]
        return global_data        


    def convert_data_types(self, data:pd.DataFrame):
        data_types= {"time": int,
                     "m_depth": float, 
                     "sci_seaowl_fdom_scaled": float, 
                     "sci_rbrctd_salinity_00": float,
                     "sci_oxy4_oxygen": float, 
                     "sci_rbrctd_pressure_00": float,
                     "sci_seaowl_chl_sig": float,
                     "sci_rbrctd_temperature_00": float,
                     "sci_oxy4_saturation": float,
                     "sci_rbrctd_conductivity_00": float
                }
        for column in data_types:
            if column in data.columns:
                data[column] = data[column].astype(data_types[column])
                
        return data
    

    def drop_unwanted_columns(self, data:pd.DataFrame):
        columns_to_drop = data.filter(regex="Unnamed").columns
        return data.drop(columns=columns_to_drop)

    def get_units(self, data:pd.DataFrame):
        params = data.columns
        units = data.iloc[0].values
        units_params = dict(zip(params,units))
        self.units_params = units_params
        return units_params

    def drop_units_row(self, data:pd.DataFrame):
        return data.drop(index=0)

    def timestamp_to_datetime(self, timestamp:pd.Series):
        return timestamp.apply(lambda x : datetime.fromtimestamp(x))

    def rename_columns(self):
        pass
    
    def merge_sci_data(self, data1:pd.DataFrame, data2:pd.DataFrame):
        return pd.merge(data1, data2.drop(columns=["m_depth"]), on="time", how="outer")
    
    def process_data(self, folder_path):
        data = self.load_all_files(folder_path=folder_path)
        data = self.drop_unwanted_columns(data=data)
        units = self.get_units(data=data)
        data = self.drop_units_row(data=data)

        data = self.convert_data_types(data=data)
        date_time = self.timestamp_to_datetime(timestamp=data['time'])
        data.insert(0, column="date_time", value=date_time)
        data = data.set_index("date_time").sort_index(ascending=False)        

        return data



    def plot_timeseries(self, data:pd.DataFrame):
        traces = []
        parameters = data.drop(columns="time").columns
        
        for parameter in parameters:
            trace = go.Scatter(x=data.index, y=data[parameter],
                                mode='lines+markers', name=parameter)
            traces.append(trace)

        layout = go.Layout(
            xaxis=dict(
                rangeslider=dict(visible=True),
                rangeselector=dict(
                    buttons=list([
                        dict(count=1, label="1 mês", step="month", stepmode="backward"),
                        dict(count=6, label="6 meses", step="month", stepmode="backward"),
                        dict(count=1, label="1 ano", step="year", stepmode="backward"),
                        dict(count=1, label="Início do ano", step="year", stepmode="todate"),
                        dict(label="Todo o período", step="all")
                    ])
                ),
                title="Datahora",
                title_font=dict(size=14),
                showgrid=True,
                gridcolor="lightgrey",
                range=[data.index[0] - timedelta(days=30), data.index[0] + timedelta(hours=12)]
            ),
            yaxis=dict(
                title_font=dict(size=14),
                showgrid=True,
                gridcolor="lightgrey")
            # ),
            # updatemenus=[
            #     {
            #         'buttons': [
            #             {'label': parameter, 'method': 'update', 'args': [{'visible': [i == j for i in range(len(parameters))]}]} for j, parameter in enumerate(parameters)
            #         ],
            #         'direction': 'down',
            #         'showactive': True,
            #         'x': 0.1,
            #         'xanchor': 'left',
            #         'y': 1.5,
            #         'yanchor': 'top',
            #     }
            # ],
            # font={
            #     'family': 'Arial',
            #     'size': 15,
            # },
        )

        fig = go.Figure(data=traces, layout=layout)

        fig.update_layout(
            margin=dict(r=40, t=100),
        )

        return fig

    def save_plot_as_html(self, plot, file_name:str):
        plot.write_html(file_name)

    def open_timeseries_in_webbrowser(self, html_file_path):
        webbrowser.open("file://" + html_file_path, new=2)
            

In [130]:
os.path.abspath("glider_sci_data_timeseries.html")

'/home/thiagocaminha/PNBOIA/glider/notebooks/glider_sci_data_timeseries.html'

In [127]:
type(gd.timeseries)

plotly.graph_objs._figure.Figure

In [128]:
gd = SFMCGliderData(folder_path="/home/thiagocaminha/PNBOIA/glider/data/sfmc_data/")
gd.open_timeseries_in_webbrowser(html_file_path=file_name)

TypeError: open_timeseries_in_webbrowser() missing 1 required positional argument: 'html_file_path'

In [117]:
import webbrowser

# Specify the path to the generated HTML file
html_file_path = '/home/thiagocaminha/PNBOIA/glider/notebooks/glider_sci_data_timeseries.html'

# Open the HTML file in the default web browser
webbrowser.open('file://' + html_file_path, new=2)


True

In [91]:
data.columns

Index(['Unnamed: 6_x', 'Unnamed: 6_y', 'm_depth', 'sci_oxy4_oxygen',
       'sci_oxy4_saturation', 'sci_rbrctd_conductivity_00',
       'sci_rbrctd_pressure_00', 'sci_rbrctd_salinity_00',
       'sci_rbrctd_temperature_00', 'sci_seaowl_chl_sig',
       'sci_seaowl_fdom_scaled', 'time'],
      dtype='object')

In [92]:
data = gd.load_all_files(folder_path="/home/thiagocaminha/PNBOIA/glider/data/sfmc_data/")
data = gd.drop_unwanted_columns(data=data)
units = gd.get_units(data=data)
data = gd.drop_units_row(data=data)

data = gd.convert_data_types(data=data)
date_time = gd.timestamp_to_datetime(timestamp=data['time'])
data.insert(0, column="date_time", value=date_time)
data = data.set_index("date_time").sort_index(ascending=False)

In [94]:
data.columns

Index(['m_depth', 'sci_oxy4_oxygen', 'sci_oxy4_saturation',
       'sci_rbrctd_conductivity_00', 'sci_rbrctd_pressure_00',
       'sci_rbrctd_salinity_00', 'sci_rbrctd_temperature_00',
       'sci_seaowl_chl_sig', 'sci_seaowl_fdom_scaled', 'time'],
      dtype='object')

In [95]:
gd.plot_timeseries(data=data, parameter="sci_seaowl_fdom_scaled")

In [25]:
class KMZParser:
    def __init__(self, filepath:str):
        
        # file handling
        self.kml = self.convert_to_kml(filepath=filepath)
        self.soup = self.parse_kml_as_soup(kml=self.kml)
        self.folders = self.parse_folders(soup=self.soup)
        self.folders_names = self.parse_all_folders_names(folders=self.folders)
        
        # strings handling
        self._gps_time_string_pattern = r"Time of GPS Position: (\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"
        self._glider_track_range_string_pattern = r"Range: ([-+]?\d*\.\d+|\d+|NaN)[A-Za-z/]+.*?Speed: ([-+]?\d*\.\d+|\d+|NaN)[A-Za-z/]+ @ (\d)"
        self._depth_current_avg_string_pattern = r"Speed: ([-+]?\d*\.\d+|\d+|NaN)[A-Za-z/]+ @ (\d)"

        
        # surfacings coords
        self.surfacings_coords = self.parse_surfacings_coordinates(folders=self.folders)
        self.surfacings_coords_cols_names = ["folder_name", "gps_date_time", "longitude", "latitude" ]
        self.surfacings_coords_df = self.generate_coordinates_dataframe(coordinates=self.surfacings_coords, columns_names=self.surfacings_coords_cols_names)

        # surface movements coords
        self.surface_movements_coords_cols_names = ["folder_name", "gps_date_time", "longitude", "latitude" ]
        self.surface_movements_coords = self.parse_surface_movements_coordinates(folders=self.folders)
        self.surface_movements_coords_df = self.generate_coordinates_dataframe(coordinates=self.surface_movements_coords,
                                                                    columns_names=self.surface_movements_coords_cols_names)
        
        # glider track
        self.glider_track_coords_cols_names = ["folder_name", "range", "speed", "degree","start_longitude", "start_latitude", "end_longitude", "end_latitude"]
        self.glider_track_coords = self.parse_glider_tracks_coordinates(folders=self.folders)
        self.glider_track_coords_df = self.generate_coordinates_dataframe(coordinates=self.glider_track_coords,
                                                                    columns_names=self.glider_track_coords_cols_names)
        

        # depth current avg vectors
        self.depth_current_avg_coords_cols_names = ["folder_name", "speed", "degree","start_longitude", "start_latitude", "end_longitude", "end_latitude"]
        self.depth_current_avg_coords = self.parse_depth_current_coordinates(folders=self.folders)
        self.depth_current_avg_coords_df = self.generate_coordinates_dataframe(coordinates=self.depth_current_avg_coords,
                                                                    columns_names=self.depth_current_avg_coords_cols_names)
        

        # interactive map
        self.interactive_map = self.plot_map(surfacings_data=self.surfacings_coords_df,
                surface_movements_data=self.surface_movements_coords_df,
                glider_tracks_data=self.glider_track_coords_df,
                depth_avg_currents_data=self.depth_current_avg_coords_df)


    def convert_to_kml(self, filepath:str):
        with zipfile.ZipFile(filepath, 'r') as kmz:
            return kmz.open(kmz.filelist[0].filename, 'r').read()

    def parse_kml_as_soup(self, kml:bytes):
        return BeautifulSoup(kml, 'html.parser')

    def parse_folders(self, soup:BeautifulSoup):
        return soup.find_all("folder")

    def parse_all_folders_names(self, folders):
        folders_names = []
        for folder in self.folders:
            folders_names.append(folder.find("name").text)
        return folders_names

    def get_folder_index(self, folder_name:str):
        return self.folders_names.index(folder_name)

    def generate_coordinates_dataframe(self, coordinates:list, columns_names:list):
        return pd.DataFrame(columns=columns_names, data=coordinates)

    def generate_coordinates_geodataframe(self, coordinates:list, columns_names:list):
        return pd.DataFrame(columns=columns_names, data=coordinates) 

    def parse_find_all(self, folder, child_name:str):
        parsed = folder.find_all(child_name)
        if not parsed:
            raise ValueError(f"No matches for {child_name} found. Aborting parsing.")
        return parsed

    def parse_find(self, parent, child_name:str):
        parsed = parent.find(child_name)
        if not parsed:
            raise ValueError(f"No matches for {child_name} found. Aborting parsing.")
        return parsed

    def parse_surfacings_coordinates(self, folders, folder_name:str="Surfacings"):

        index = self.get_folder_index(folder_name=folder_name)
        folder_name = self.folders_names[index]
        folder = self.folders[index]
        
        coordinates = []
        
        placemarks = self.parse_find_all(folder=folder, child_name="placemark")

        for placemark in placemarks:
            coordinates_text = self.parse_find(parent=placemark, child_name="coordinates").text
            try:
                gps_time_text = placemark.find("description").text
                gps_time = re.search(self._gps_time_string_pattern, gps_time_text).group(1)
            except:
                gps_time = np.nan

            coordinates_list = [tuple(map(float, coord.split(","))) for coord in coordinates_text.strip().split()]
            
            for coord in coordinates_list:
                coordinates.append((folder_name, gps_time, coord[0], coord[1]))
    
        return coordinates

    def parse_surface_movements_coordinates(self, folders, folder_name:str="Surface Movements"):
        
        index = self.get_folder_index(folder_name=folder_name)
        folder_name = self.folders_names[index]
        folder = self.folders[index]
        
        coordinates = []

        placemarks = self.parse_find_all(folder=folder, child_name="placemark")
        
        for placemark in placemarks:
            coordinates_text = self.parse_find(parent=placemark, child_name="coordinates").text
            try:
                gps_time_text = placemark.find("description").text
                gps_time = re.search(self._gps_time_string_pattern, gps_time_text).group(1)
            except:
                gps_time = np.nan

            coordinates_list = [tuple(map(float, coord.split(","))) for coord in coordinates_text.strip().split()]
            
            for coord in coordinates_list:
                coordinates.append((folder_name, gps_time, coord[0], coord[1]))
        
        return coordinates

    def parse_glider_tracks_coordinates(self, folders, folder_name:str="Glider Tracks"):
        
        index = self.get_folder_index(folder_name=folder_name)
        folder_name = self.folders_names[index]
        folder = self.folders[index]
        
        coordinates = []

        placemarks = self.parse_find_all(folder=folder, child_name="placemark")
        
        for placemark in placemarks:
            coordinates_text = self.parse_find(parent=placemark, child_name="coordinates").text
            track_info_text = placemark.find("description").text
            track_info = re.search(self._glider_track_range_string_pattern, track_info_text)
            range = track_info.group(1)
            speed = track_info.group(2)
            deg = track_info.group(3)

            coordinates_list = [tuple(map(float, coord.split(","))) for coord in coordinates_text.strip().split()]

            coordinates.append((folder_name,
                            range,
                            speed,
                            deg,
                            coordinates_list[0][0],
                            coordinates_list[0][1],
                            coordinates_list[1][0],
                            coordinates_list[1][1]))
        
        return coordinates

    def parse_depth_current_coordinates(self, folders, folder_name:str="Depth Averaged Current Vectors"):
        
        index = self.get_folder_index(folder_name=folder_name)
        folder_name = self.folders_names[index]
        folder = self.folders[index]
        
        coordinates = []

        placemarks = self.parse_find_all(folder=folder, child_name="placemark")
        
        for placemark in placemarks:
            coordinates_text = self.parse_find(parent=placemark, child_name="coordinates").text
            track_info_text = placemark.find("description").text
            track_info = re.search(self._depth_current_avg_string_pattern, track_info_text)
            speed = track_info.group(1)
            deg = track_info.group(2)

            coordinates_list = [tuple(map(float, coord.split(","))) for coord in coordinates_text.strip().split()]

            coordinates.append((folder_name,
                            speed,
                            deg,
                            coordinates_list[0][0],
                            coordinates_list[0][1],
                            coordinates_list[1][0],
                            coordinates_list[1][1]))
        
        return coordinates

    def plot_map(self, 
                surfacings_data:pd.DataFrame,
                surface_movements_data:pd.DataFrame,
                glider_tracks_data:pd.DataFrame,
                depth_avg_currents_data:pd.DataFrame,
                zoom_start=10,
                center=None):
        
        # map_center = [data['latitude'].iloc[0], data['longitude'].iloc[0]]
        map = folium.Map(zoom_start=zoom_start, control_scale=True, location=(-22.92830339525606, -43.137900250593106))

        tile_layer = folium.TileLayer(
            tiles='https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}',
            attr='Esri World Imagery',
            name='Bathymetry',
            overlay=True,
            control=True
        ).add_to(map)

        surfacings_layer = folium.FeatureGroup(name='Surfacings', overlay=True).add_to(map)
        surface_movements_layer = folium.FeatureGroup(name='Surface Movements', overlay=True).add_to(map)
        glider_tracks_layer = folium.FeatureGroup(name='Glider Tracks', overlay=True).add_to(map)
        depth_currents = folium.FeatureGroup(name='Depth Avg Currents', overlay=True).add_to(map)


        for idx, row in surfacings_data.iterrows():
            coord = [row['latitude'],row['longitude']] 
            # marker = folium.CircleMarker(coord,
            #                     radius=3,
            #                     fill_color='cornflowerblue',
            #                     color=None,
            #                     fill_opacity=1,
            #                     fill=True,z_index=1000).add_to(surfacings_layer)

            text = f'''<div style="font-family: sans-serif; font-size: 12px;">
                <b>unit_1094</b><br>
                <b>Datahora:</b> {row.gps_date_time}Z<br>
                <b>Latitude:</b> {round(row['latitude'],6)}<br>
                <b>Longitude:</b> {round(row['longitude'], 6)}
            </div>'''

            iframe = folium.IFrame(text, width=220, height=75)
            popup = folium.Popup(iframe, max_width=300, popup_class='folium.features.LatLngPopup')
        
            icon_image="https://i.imgur.com/BJqEyd0.png"
            icon_size=(25,20)
            marker = folium.Marker(coord,
                                popup=popup,
                                icon=folium.CustomIcon(icon_image=icon_image, icon_size=icon_size)
                                ).add_to(surfacings_layer)

            if idx < len(surfacings_data) - 1:
                next_location = [surfacings_data['latitude'].iloc[idx + 1], surfacings_data['longitude'].iloc[idx + 1]]
                lines = folium.PolyLine(locations=[coord, next_location],
                                        color='white',
                                        dash_array='4, 4',
                                        weight=1,z_index=1000).add_to(surfacings_layer)

        

        for idx, row in surface_movements_data.iterrows():
            coord = [row['latitude'],row['longitude']] 
            # marker = folium.CircleMarker(coord,
            #                     radius=3,
            #                     fill_color='red',
            #                     color=None,
            #                     fill_opacity=1,
            #                     fill=True,z_index=1000).add_to(surface_movements_layer)
            
            text = f'''<div style="font-family: sans-serif; font-size: 12px;">
                <b>unit_1094</b><br>
                <b>Datahora:</b> {row.gps_date_time}Z<br>
                <b>Latitude:</b> {round(row['latitude'],6)}<br>
                <b>Longitude:</b> {round(row['longitude'], 6)}
            </div>'''

            iframe = folium.IFrame(text, width=220, height=75)
            popup = folium.Popup(iframe, max_width=300, popup_class='folium.features.LatLngPopup')

        
            icon_image="https://i.imgur.com/BJqEyd0.png"
            icon_size=(25,20)
            marker = folium.Marker(coord,
                                popup=popup,
                                icon=folium.CustomIcon(icon_image=icon_image, icon_size=icon_size)
                                ).add_to(surface_movements_layer)

            # if idx < len(surface_movements_data) - 1:
            #     next_location = [surface_movements_data['latitude'].iloc[idx + 1], surface_movements_data['longitude'].iloc[idx + 1]]
            #     lines = folium.PolyLine(locations=[coord, next_location],
            #                             color='white',
            #                             dash_array='4, 4',
            #                             weight=1,z_index=1000).add_to(surface_movements_layer)


        for idx, row in glider_tracks_data.iterrows():
            lines = folium.PolyLine(locations=[[row["start_latitude"], row["start_longitude"]], [row["end_latitude"],row["end_longitude"]]],
                                        color='white',
                                        dash_array='4, 4',
                                        weight=1,z_index=1000).add_to(glider_tracks_layer)

        for idx, row in depth_avg_currents_data.iterrows():
            lines = folium.PolyLine(locations=[[row["start_latitude"], row["start_longitude"]], [row["end_latitude"],row["end_longitude"]]],
                                        color='green',
                                        weight=3,z_index=1000).add_to(depth_currents)
        # surfacings_layer = folium.FeatureGroup(name='Surfacings', overlay=True).add_to(map)
        folium.LayerControl().add_to(map)

        MeasureControl(primary_length_unit='meters',
                        primary_area_unit='sqmeters').add_to(map)

        return map


In [26]:
filepath="../data/sfmc_data/DLD12332275641507894295.kmz"

p = KMZParser(filepath=filepath)

In [28]:
p.depth_current_avg_coords_df

Unnamed: 0,folder_name,speed,degree,start_longitude,start_latitude,end_longitude,end_latitude
0,Depth Averaged Current Vectors,0.1,1,-43.085583,-22.96935,-43.077555,-22.970616
1,Depth Averaged Current Vectors,0.12,1,-43.08565,-22.969033,-43.08335,-22.960369
2,Depth Averaged Current Vectors,0.08,3,-43.085733,-22.96895,-43.090681,-22.964568
3,Depth Averaged Current Vectors,0.07,3,-43.08585,-22.968833,-43.087509,-22.963414
4,Depth Averaged Current Vectors,0.16,3,-43.08625,-22.968717,-43.095904,-22.960883
5,Depth Averaged Current Vectors,0.14,3,-43.086867,-22.968417,-43.095908,-22.961693
6,Depth Averaged Current Vectors,0.21,3,-43.087017,-22.9685,-43.098257,-22.956388
7,Depth Averaged Current Vectors,0.19,3,-43.087883,-22.968617,-43.098009,-22.957896
8,Depth Averaged Current Vectors,0.17,3,-43.088417,-22.96885,-43.098331,-22.959328
9,Depth Averaged Current Vectors,0.18,3,-43.0897,-22.969167,-43.099225,-22.958441


In [27]:
p.interactive_map

In [27]:
p.glider_track_coords_df

Unnamed: 0,folder_name,range,speed,degree,start_longitude,start_latitude,end_longitude,end_latitude
0,Glider Tracks,0.00,0.00,0,-70.610417,41.640317,-70.610417,41.640317
1,Glider Tracks,3.96,0.00,2,-70.610417,41.640317,-70.610433,41.640283
2,Glider Tracks,7711.61,97.81,1,-70.610433,41.640283,-43.217317,-22.865650
3,Glider Tracks,19.75,0.00,2,-43.217317,-22.865650,-43.217250,-22.865483
4,Glider Tracks,19.57,0.00,1,-43.217250,-22.865483,-43.217150,-22.865633
...,...,...,...,...,...,...,...,...
94,Glider Tracks,9.41,25.03,3,-43.104400,-22.964983,-43.132833,-22.884500
95,Glider Tracks,3.71,0.00,1,-43.132833,-22.884500,-43.132833,-22.884533
96,Glider Tracks,34.05,0.01,1,-43.132833,-22.884533,-43.132767,-22.884833
97,Glider Tracks,8.16,0.00,1,-43.132767,-22.884833,-43.132733,-22.884900


In [464]:
p.surfacings_coords_df

Unnamed: 0,folder_name,gps_date_time,longitude,latitude
0,Surfacings,2023-07-20 15:33:28,-43.085583,-22.96935
1,Surfacings,2023-07-20 15:47:03,-43.08565,-22.969033
2,Surfacings,2023-07-20 15:56:39,-43.085733,-22.96895
3,Surfacings,2023-07-20 16:06:28,-43.08585,-22.968833
4,Surfacings,2023-07-20 16:17:44,-43.08625,-22.968717
5,Surfacings,2023-07-20 16:30:45,-43.086867,-22.968417
6,Surfacings,2023-07-20 16:36:24,-43.087017,-22.9685
7,Surfacings,2023-07-20 16:58:43,-43.087883,-22.968617
8,Surfacings,2023-07-20 17:18:52,-43.088417,-22.96885
9,Surfacings,2023-07-20 17:43:52,-43.0897,-22.969167


In [443]:
p.glider_track_coords_df

Index(['folder_name', 'range', 'speed', 'degree', 'start_longitude',
       'start_latitude', 'end_longitude', 'end_latitude'],
      dtype='object')

In [444]:
gdf = gpd.GeoDataFrame(
    p.surfacings_coords_df, geometry=gpd.points_from_xy(p.surfacings_coords_df.longitude, p.surfacings_coords_df.latitude), crs="EPSG:4326"
)

In [456]:
shapely.geometry.LineString(p.glider_track_coords_df[["start_longitude", "start_latitude"]])

KeyError: 0

In [449]:
gdf = gpd.GeoDataFrame(
    p.glider_track_coords_df, 
    geometry=gpd.points_from_xy((p.glider_track_coords_df.start_longitude, p.glider_track_coords_df.start_latitude),
                                    (p.glider_track_coords_df.start_longitude, p.glider_track_coords_df.start_latitude)
    ), crs="EPSG:4326"
)

TypeError: only size-1 arrays can be converted to Python scalars

In [448]:
gdf

Unnamed: 0,folder_name,range,speed,degree,start_longitude,start_latitude,end_longitude,end_latitude,geometry
0,Glider Tracks,0.00,0.00,0,-70.610417,41.640317,-70.610417,41.640317,POINT (-70.61042 41.64032)
1,Glider Tracks,3.96,0.00,2,-70.610417,41.640317,-70.610433,41.640283,POINT (-70.61042 41.64032)
2,Glider Tracks,7711.61,97.81,1,-70.610433,41.640283,-43.217317,-22.865650,POINT (-70.61043 41.64028)
3,Glider Tracks,19.75,0.00,2,-43.217317,-22.865650,-43.217250,-22.865483,POINT (-43.21732 -22.86565)
4,Glider Tracks,19.57,0.00,1,-43.217250,-22.865483,-43.217150,-22.865633,POINT (-43.21725 -22.86548)
...,...,...,...,...,...,...,...,...,...
94,Glider Tracks,9.41,25.03,3,-43.104400,-22.964983,-43.132833,-22.884500,POINT (-43.10440 -22.96498)
95,Glider Tracks,3.71,0.00,1,-43.132833,-22.884500,-43.132833,-22.884533,POINT (-43.13283 -22.88450)
96,Glider Tracks,34.05,0.01,1,-43.132833,-22.884533,-43.132767,-22.884833,POINT (-43.13283 -22.88453)
97,Glider Tracks,8.16,0.00,1,-43.132767,-22.884833,-43.132733,-22.884900,POINT (-43.13277 -22.88483)


In [404]:
text = "<b>Start: 22°51.934'S 43°13.024'W<br/>End: 22°51.934'S 43°13.024'W<br/>Range: 0.00m<br/>Speed: 100000m/s @ 0&deg;<br/></b>"

In [408]:
pattern = r"Range: (\d+\.\d{2}[A-Za-z/]+|NaN[A-Za-z/]).*Speed: (\d+\.\d{2}[A-Za-z/]+)"
pattern2 = r"Range: ([\d.]+[A-Za-z/]+).*Speed: ([\d.]+[A-Za-z/]+) @ (\d+&deg)"
pattern3 = r"Range: (\d+\.\d{2}[A-Za-z/]+|NaN).*Speed: (\d+\.\d{2}[A-Za-z/]+|NaN) @ (\d+&deg;)"
pattern4 = 

In [409]:
match = re.search(pattern4,text)

In [411]:
match.group(3)

'0'

In [304]:
p.glider_track_coords_df

Unnamed: 0,folder_name,gps_date_time,start_longitude,start_latitude,end_longitude,end_latitude
0,Glider Tracks,,-70.610417,41.640317,-70.610417,41.640317
1,Glider Tracks,,-70.610417,41.640317,-70.610433,41.640283
2,Glider Tracks,,-70.610433,41.640283,-43.217317,-22.865650
3,Glider Tracks,,-43.217317,-22.865650,-43.217250,-22.865483
4,Glider Tracks,,-43.217250,-22.865483,-43.217150,-22.865633
...,...,...,...,...,...,...
94,Glider Tracks,,-43.104400,-22.964983,-43.132833,-22.884500
95,Glider Tracks,,-43.132833,-22.884500,-43.132833,-22.884533
96,Glider Tracks,,-43.132833,-22.884533,-43.132767,-22.884833
97,Glider Tracks,,-43.132767,-22.884833,-43.132733,-22.884900


In [289]:
a, b = p.parse_glider_tracks_coordinates(folders=p.folders)

-70.61041666666667,41.64031666666666 -70.61041666666667,41.64031666666666
-70.61041666666667,41.64031666666666 -70.61043333333333,41.640283333333336
-70.61043333333333,41.640283333333336 -43.21731666666666,-22.86565
-43.21731666666666,-22.86565 -43.21725,-22.865483333333334
-43.21725,-22.865483333333334 -43.21715000000001,-22.865633333333335
-43.21715000000001,-22.865633333333335 -43.217283333333334,-22.865366666666667
-43.217283333333334,-22.865366666666667 -43.217299999999994,-22.8653
-43.217299999999994,-22.8653 -43.217299999999994,-22.8653
-43.217299999999994,-22.8653 -43.217233333333326,-22.86538333333333
-43.217233333333326,-22.86538333333333 -43.21721666666667,-22.865366666666667
-43.21721666666667,-22.865366666666667 -43.21721666666667,-22.865366666666667
-43.21721666666667,-22.865366666666667 -43.21711666666667,-22.865583333333333
-43.21711666666667,-22.865583333333333 -43.21708333333333,-22.865483333333334
-43.21708333333333,-22.865483333333334 -43.217066666666675,-22.8653333

In [264]:
a

[('Glider Tracks', nan, -70.61041666666667, 41.64031666666666),
 ('Glider Tracks', nan, -70.61041666666667, 41.64031666666666),
 ('Glider Tracks', nan, -70.61041666666667, 41.64031666666666),
 ('Glider Tracks', nan, -70.61043333333333, 41.640283333333336),
 ('Glider Tracks', nan, -70.61043333333333, 41.640283333333336),
 ('Glider Tracks', nan, -43.21731666666666, -22.86565),
 ('Glider Tracks', nan, -43.21731666666666, -22.86565),
 ('Glider Tracks', nan, -43.21725, -22.865483333333334),
 ('Glider Tracks', nan, -43.21725, -22.865483333333334),
 ('Glider Tracks', nan, -43.21715000000001, -22.865633333333335),
 ('Glider Tracks', nan, -43.21715000000001, -22.865633333333335),
 ('Glider Tracks', nan, -43.217283333333334, -22.865366666666667),
 ('Glider Tracks', nan, -43.217283333333334, -22.865366666666667),
 ('Glider Tracks', nan, -43.217299999999994, -22.8653),
 ('Glider Tracks', nan, -43.217299999999994, -22.8653),
 ('Glider Tracks', nan, -43.217299999999994, -22.8653),
 ('Glider Tracks',

In [113]:
coordinates = []
gps_time_string_pattern = r"Time of GPS Position: (\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"

folders = soup.find_all("folder")
for folder in folders:
    folder_name = folder.find("name").text
    placemarks = folder.find_all("placemark")
    
    for placemark in placemarks:
        coordinates_text = placemark.find("coordinates").text
        try:
            gps_time_text = placemark.find("description").text
            gps_time = re.search(gps_time_string_pattern, gps_time_text).group(1)
        except:
            gps_time = np.nan

        coordinates_list = [tuple(map(float, coord.split(","))) for coord in coordinates_text.strip().split()]
        
        for coord in coordinates_list:
            coordinates.append((folder_name, gps_time, coord[0], coord[1]))

In [119]:
folders[-1]

<folder>
<name>Depth Averaged Current Vectors</name>
<placemark>
<name>unit_1094 Depth Averaged Current Vector</name>
<visibility>1</visibility>
<description>&lt;b&gt;Speed: 0.10m/s @ 100&amp;deg;&lt;/b&gt;</description>
<styleurl>#DepthAveragedCurrentVectors</styleurl>
<linestring>
<coordinates>-43.08558333333334,-22.969350000000002 -43.077555255974175,-22.97061557319618</coordinates>
</linestring>
</placemark>
<placemark>
<name>unit_1094 Depth Averaged Current Vector</name>
<visibility>1</visibility>
<description>&lt;b&gt;Speed: 0.12m/s @ 14&amp;deg;&lt;/b&gt;</description>
<styleurl>#DepthAveragedCurrentVectors</styleurl>
<linestring>
<coordinates>-43.08565,-22.969033333333332 -43.08335032414431,-22.960369376190815</coordinates>
</linestring>
</placemark>
<placemark>
<name>unit_1094 Depth Averaged Current Vector</name>
<visibility>1</visibility>
<description>&lt;b&gt;Speed: 0.08m/s @ 314&amp;deg;&lt;/b&gt;</description>
<styleurl>#DepthAveragedCurrentVectors</styleurl>
<linestring>


In [115]:
pd.DataFrame(coordinates)

Unnamed: 0,0,1,2,3
0,Surfacings,2023-07-20 15:33:28,-43.085583,-22.969350
1,Surfacings,2023-07-20 15:47:03,-43.085650,-22.969033
2,Surfacings,2023-07-20 15:56:39,-43.085733,-22.968950
3,Surfacings,2023-07-20 16:06:28,-43.085850,-22.968833
4,Surfacings,2023-07-20 16:17:44,-43.086250,-22.968717
...,...,...,...,...
328,Depth Averaged Current Vectors,,-43.098331,-22.959328
329,Depth Averaged Current Vectors,,-43.089700,-22.969167
330,Depth Averaged Current Vectors,,-43.099225,-22.958441
331,Depth Averaged Current Vectors,,-43.090667,-22.969350


In [91]:
text = folders[0].find_all("placemark")[0].find("description").text

In [96]:
pattern = r"Time of GPS Position: (\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"
match = re.search(pattern, text)

In [100]:
match.group(1)

'2023-07-20 15:33:28'

In [None]:
steps = soup.find_all

In [41]:
gdf = gpd.read_file(kml,driver='libkml')


TypeError: startswith first arg must be bytes or a tuple of bytes, not str

In [40]:
gpd.read_file?

[0;31mSignature:[0m
[0mgpd[0m[0;34m.[0m[0mread_file[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mfilename[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbbox[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmask[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrows[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mengine[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Returns a GeoDataFrame from a file or URL.

.. versionadded:: 0.7.0 mask, rows

Parameters
----------
filename : str, path object or file-like object
    Either the absolute or relative path to the file or URL to
    be opened, or any object with a read() method (such as an open file
    or StringIO)
bbox : tuple | GeoDataFrame or GeoSeries | shapely Geometry, default None
    Filter features by given bo

In [None]:
filepath = '../data/sfmc_data/DLD12332275641507894295.kmz'

# Create a temporary directory to extract the KML file
with zipfile.ZipFile(filepath, 'r') as kmz:
    # Extract the KML file from the KMZ archive
    kmz.extractall('temp_dir')

# Read the extracted KML file using geopandas
kml_file = 'temp_dir/doc.kml'
gdf = gpd.read_file(kml_file,driver='libkml')
