https://towardsdatascience.com/how-tracking-apps-analyse-your-gps-data-a-hands-on-tutorial-in-python-756d4db6715d
https://towardsdatascience.com/how-to-make-your-pandas-loop-71-803-times-faster-805030df4f06
https://realpython.com/fast-flexible-pandas/

In [1]:
!pip install haversine



In [2]:
import gpxpy
import matplotlib.pyplot as plt
import datetime
from geopy import distance
from math import sqrt, floor
import numpy as np
import pandas as pd
import haversine
import os
import re
from os import listdir
from os.path import isfile, join
idx = pd.IndexSlice

In [3]:
df_input = pd.read_csv('gpx_analysis.csv', header=0, index_col=[0,1])

df_input['date'] = pd.to_datetime(df_input['date'], utc=True)
df_input['date'] = df_input['date'].dt.tz_localize(tz=None)

In [4]:
path = os.path.join(os.path.abspath(''), 'tracks')
allfiles = [f for f in listdir(path) if isfile(join(path, f))]

# Lijst met unieke files:
set_from_df = set([x[0] for x in df_input.index.values])
set_from_allfiles = set(allfiles)

newfiles = set_from_allfiles.difference(set_from_df)
newfiles = list(newfiles)

# Probleemgevallen verwijderen:
# while '20120704_192600.gpx' in allfiles: allfiles.remove('20120704_192600.gpx')
while '20170429_123929.gpx' in newfiles: newfiles.remove('20170429_123929.gpx')
while '.DS_Store' in newfiles: newfiles.remove('.DS_Store')

print("New files:")
print(newfiles)

New files:
['20200212_191853.gpx', '20200208_102811.gpx']


In [5]:
if not newfiles:
    print("No new files found.")
    
    df_final = df_input.copy()
    
# if lists are not equal    
else:
    print("New files found!")
    
    sections = [1000,(1000*1.60934),3000,5000,(5000*1.60934),10000,15000,(10000*1.60934),20000,21097.5,25000,30000,40000,42195]

    df_final = pd.DataFrame(columns=['time', 'distance', 'minutes_per_kilometer'])

    for file in newfiles:

        print('File:', file)

        path = os.path.join(os.path.abspath(''), 'tracks', file)

        gpx_file = open(path, 'r')
        gpx = gpxpy.parse(gpx_file)

        df = pd.DataFrame(columns=['lon', 'lat', 'alt', 'time'])

        # -----------------------------------------------------
        # Inlezen van alle files geeft problemen, omdat de bestanden verschillen van elkaar.

        if re.search('^Running', file) is None: # Als de bestandsnaam niet begint met 'Running', dan is het een Endomondo file

            if len(gpx.tracks[0].segments) == 1: # Soms komt het voor dat een bestand maar uit 1 segment bestaat, ook bij Endomondo.

                segment = gpx.tracks[0].segments[0]

                data = segment.points

                for point in data:
                    df = df.append({'lon': point.longitude, 'lat' : point.latitude, 'alt' : point.elevation, 'time' : point.time}, ignore_index=True)

            else: 
                for segment in gpx.tracks[0].segments[:-1]: # alle segments, behalve de laatste.

                    data = segment.points

                    for point in data:
                        df = df.append({'lon': point.longitude, 'lat' : point.latitude, 'alt' : point.elevation, 'time' : point.time}, ignore_index=True)

        else: # Als de bestandsnaam wel met 'Running' begint, dan is het een oud bestand. Daarvan moeten we alle segments inlezen.

            for segment in gpx.tracks[0].segments: # all segments

                data = segment.points

                for point in data:
                    df = df.append({'lon': point.longitude, 'lat' : point.latitude, 'alt' : point.elevation, 'time' : point.time}, ignore_index=True)

        # -----------------------------------------------------

        df = df.sort_values(by=['time'])
        df = df.reset_index()

        df = df.fillna(method='ffill')
        df = df.fillna(method='bfill')

        # Create a column with values that are 'shifted' one forwards, so we can create calculations for differences.
        df['lon-start'] = df['lon']
        df['lon-start'].iloc[-1] = np.nan
        df['lon-start'] = np.roll(df['lon-start'], 1)
        df['lat-start'] = df['lat']
        df['lat-start'].iloc[-1] = np.nan
        df['lat-start'] = np.roll(df['lat-start'], 1)
        df['alt-start'] = df['alt']
        df['alt-start'].iloc[-1] = np.nan
        df['alt-start'] = np.roll(df['alt-start'], 1)
        df['time-start'] = df['time']
        df['time-start'].iloc[-1] = np.nan
        df['time-start'] = np.roll(df['time-start'], 1)
        df = df.fillna(method='bfill')

        df['time'] = pd.to_datetime(df['time'], utc=True)
        df['time'] = df['time'].dt.tz_localize(tz=None)
        df['time-start'] = pd.to_datetime(df['time-start'], utc=True)
        df['time-start'] = df['time-start'].dt.tz_localize(tz=None)

        #Calculate distances and time deltas
        df['distance_dis_2d'] = df.apply(lambda x: distance.distance((x['lat-start'], x['lon-start']), (x['lat'], x['lon'])).m, axis = 1)
        df['alt_dif'] = df.apply(lambda x: x['alt-start'] - x['alt'], axis=1)
        df['distance_dis_3d'] = df.apply(lambda x: sqrt(x['distance_dis_2d']**2 + (x['alt_dif'])**2), axis=1)
        df['time_delta'] = df.apply(lambda x: (x['time'] - x['time-start']).total_seconds(), axis=1)

        df_selected = df.loc[:, ['distance_dis_3d','time_delta']]

        df_selected['distance_cumsum'] = df_selected['distance_dis_3d'].cumsum()
        df_selected['time_cumsum'] = df_selected['time_delta'].cumsum()

        # Here we loop over sections
        for section in sections:

            if df['distance_dis_3d'].sum() < section: # If the total distance of the workout is smaller then the section we're looking for we can skip this iteration.
                continue

            print(' - Section:', section)
            df_output = pd.DataFrame(columns=['date', 'section', 'filename', 'time', 'distance', 'minutes_per_kilometer', 'total_distance', 'total_time'])

            for i in range(len(df_selected.index)):

                df_section = df_selected[(df_selected['distance_cumsum'] - df_selected['distance_cumsum'].iat[i]) >= section]
                if(len(df_section.index) != 0):
                    time = df_section['time_cumsum'].iat[0] - df_selected['time_cumsum'].iat[i]
                    distance_i = df_section['distance_cumsum'].iat[0] - df_selected['distance_cumsum'].iat[i]
                    minutes_per_kilometer = (time/60)/(distance_i/1000)
                    df_output = df_output.append({'date': df['time'].min(), 'section': section, 'filename': file, 'time': time, 'distance': distance_i, 'minutes_per_kilometer': minutes_per_kilometer, 'total_distance': df['distance_dis_3d'].sum(), 'total_time': df['time_delta'].sum()}, ignore_index=True)

            if df_output.empty == True:
                print('Error?!')
            else:
                s_best = df_output.loc[df_output['minutes_per_kilometer'].idxmin()]
                df_final = df_final.append(s_best)

    df_final['start_index_best_section'] = df_final.index
    df_final = df_final.set_index(['filename','section'])

    df_final = df_final.append(df_input)

    df_final.to_csv('gpx_analysis.csv')

print('Finished!')

New files found!
File: 20200212_191853.gpx


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
	To accept the future behavior, pass 'dtype=object'.
	To keep the old behavior, pass 'dtype="datetime64[ns]"'.
  a = asanyarray(a)


 - Section: 1000
 - Section: 1609.34
 - Section: 3000
 - Section: 5000
File: 20200208_102811.gpx
 - Section: 1000
 - Section: 1609.34
 - Section: 3000
 - Section: 5000
 - Section: 8046.7
 - Section: 10000
Finished!


Total distance in kilometers:

In [6]:
df_final.loc[idx[:,1000],'total_distance'].sum() / 1000

2458.431532401525