In [1]:
import geodb.model
from geodb.model import GPSPoint, db_url, GPSTrack, clone_model

In [2]:
import sqlalchemy
from sqlalchemy.orm import sessionmaker

In [3]:
engine = sqlalchemy.create_engine(db_url(), echo=False)
Session = sessionmaker(bind=engine)
session = Session()

In [4]:
import pandas as pd
import numpy as np

# Cleanup orphan points

In [13]:
session.query(GPSPoint).filter(GPSPoint.track_id==None).count()

0

In [12]:
session.query(GPSPoint).filter(GPSPoint.track_id==None).delete()

20947

In [15]:
session.commit()

# Cleanup empty tracks

In [42]:
for track in session.query(GPSTrack):
    if len(track.points) == 0:
        print(f"{track.id}")
        session.delete(track)

15591
15593
15600
15601
15604
15607
15614
15615
15618
15623
15626
15632
15633
15637
15647
15652
15656
15658
15662
15666
15668
15670
15675
15677
15680
15683
15684
15688
15692
15702
15705
15707
15708
15710
15711
15715
15718
15719
15720
15722
15725
15727
15729
15731
15735
15739
15741
15745
15747
15748
15750
15751
15754
15755
15756
15758
15759
15763
15765
15767
15770
15772
15773
15775
15777
15781
15789
15797
2506
2507
2510
12364
12365
12368
12369
12371
12372
12375
12376
12572
12792


In [43]:
session.commit()

# Cleanup identical tracks

In [5]:
def points_are_same(p1, p2):
    return (p1.time == p2.time and
            p1.latitude == p2.latitude and
            p1.longitude == p2.longitude and
            p1.elevation == p2.elevation)

In [6]:
# why do we have multiple points in the same second?
def ultrasort(points):
    return sorted(points, key=lambda p: (p.time, p.latitude, p.longitude, p.elevation))

In [18]:
from collections import defaultdict
length_query = """
select length, array_agg(track_id) as track_ids,
array_agg(start) as starts,
array_agg(stop) as stops
from (
select track_id, count(*) as length, min(time) as start, max(time) as stop 
from point
group by track_id) x
group by length
order by length
"""
length_df = pd.read_sql_query(length_query, engine)
length_df

Unnamed: 0,length,track_ids,starts,stops
0,1,"[2850, 4790, 3936, 5963, 9399, 4543, 3189, 400...","[2019-11-27 10:28:48+00:00, 2017-08-06 20:12:0...","[2019-11-27 10:28:48+00:00, 2017-08-06 20:12:0..."
1,2,[12955],[2019-12-09 01:36:10+00:00],[2019-12-09 01:37:09+00:00]
2,3,[12556],[2019-11-25 07:28:34+00:00],[2019-11-25 07:29:19+00:00]
3,4,"[12715, 2486, 12696, 12966]","[2019-11-30 07:17:26+00:00, 2019-11-24 10:05:4...","[2019-11-30 07:17:50+00:00, 2019-11-24 10:35:4..."
4,6,[12550],[2019-11-25 04:04:22+00:00],[2019-11-25 04:05:29+00:00]
...,...,...,...,...
251,2146,[2492],[2019-11-29 22:55:15+00:00],[2019-11-30 08:34:43+00:00]
252,2419,[12530],[2019-11-23 18:08:00+00:00],[2019-11-24 07:58:46+00:00]
253,2601,[2499],[2019-12-05 23:34:00+00:00],[2019-12-06 12:44:36+00:00]
254,3449,[12542],[2019-11-24 12:50:22+00:00],[2019-11-24 23:34:30+00:00]


In [19]:
dupes = []
for row in length_df.itertuples():
    track_ids = row.track_ids
    starts = row.starts
    stops = row.stops
    by_times = defaultdict(list)
    for (start, stop, track_id) in zip(starts, stops, track_ids):
        by_times[(start, stop)].append(track_id)
    # print(f"{len(track_ids)} tracks with {len(by_times)} unique time bounds")
    for bounds, tracks_ids in by_times.items():
        if len(tracks_ids) > 1:
            tracks_ids.sort()
            uniques = [session.query(GPSTrack).get(tracks_ids[0])]
            for track_id in tracks_ids[1:]:
                t = session.query(GPSTrack).get(track_id)
                for u in uniques:
                    
                    for p1, p2 in zip(ultrasort(u.points), ultrasort(t.points)):
                        if not points_are_same(p1, p2):
                            break # not the same as this unique
                    else:
                        # all points matched, so this is a duplicate
                        print(f"{row.length}: {track_id} is a duplicate of {u.id}")
                        print(f"{u.id}\t{u.name}\t{u.filename}\t{u.source}\t{u.parent}")
                        print(f"{t.id}\t{t.name}\t{t.filename}\t{t.source}\t{t.parent}")
                        print()
                        dupes.append(t)
                        break
                else:
                    # no u matched
                    uniques.append(t)


In [20]:
[t.id for t in dupes]

[]

In [21]:
for t in dupes:
    session.delete(t)

In [22]:
session.execute("delete from track where id = 1417")

<sqlalchemy.engine.result.ResultProxy at 0x7fe813b983d0>

In [23]:
session.commit()

In [24]:
query = """
select * from track where id in (15100, 15099)
"""
pd.read_sql_query(query, engine)

Unnamed: 0,id,name,comment,description,source,type,filename,properties,raw,parent_id


In [25]:
query = """
select * from track where id in (2492, 12385)
"""
pd.read_sql_query(query, engine)

Unnamed: 0,id,name,comment,description,source,type,filename,properties,raw,parent_id
0,2492,Aneel Nazareth (9931878),,Aneel Nazareth (9931878),,,inreach.gpx,{},,


In [27]:
a = session.query(GPSTrack).get(2492)
b = session.query(GPSTrack).get(12385)

In [28]:
len(a.points), str(a.start.time), str(a.end.time)

(2146, '2019-11-29 22:55:15+00:00', '2019-11-30 08:34:43+00:00')

In [29]:
len(b.points), str(b.start.time), str(b.end.time)

AttributeError: 'NoneType' object has no attribute 'points'

In [30]:
for p1, p2 in zip(a.points, b.points):
    if not points_are_same(p1, p2):
        print(p1.time, p2.time,
            p1.latitude, p2.latitude,
            p1.longitude, p2.longitude,
            p1.elevation, p2.elevation)

AttributeError: 'NoneType' object has no attribute 'points'