In [1]:
import logging
import time
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd
import dask.dataframe as dd
from distributed import LocalCluster, Client

from matplotlib import pyplot as plt
import hvplot.pandas

import holoviews as hv

In [2]:
# Root directory against which the result CSV paths are resolved; the relative
# path points at the parent of the current working directory.
base_path = Path('../')

In [3]:
# Load one benchmark result table per compared case.  The first CSV column is
# used as the row index in every file.

# The unsorted file ships its own `sort_time_sec` column; it is discarded here.
unsorted = pd.read_csv(
    base_path / 'compared_cases/unsorted/unsorted_results_df_2020-06-16 11:59:03.460061.csv',
    index_col=0,
).drop(columns='sort_time_sec')

sorted_geohash = pd.read_csv(
    base_path / 'compared_cases/sorted_geohash/2020-06-15 23:01:55.622255_geohash_sorted_results.csv',
    index_col=0,
)

sorted_geohash_no_sjoin = pd.read_csv(
    base_path / 'compared_cases/sorted_geohash_no_sjoin/geohash_sorted_no_sjoin_results_2020-06-17 10:50:26.667292.csv',
    index_col=0,
)

spatially_sorted = pd.read_csv(
    base_path / 'compared_cases/spatialsort/spatially_sorted_results_2020-06-16 23:34:00.804790.csv',
    index_col=0,
)

In [4]:
# Attach preprocessing and query timings (in seconds) to every result table,
# then stack the four tables into a single `results` frame.
#
# The preprocessing costs are hard-coded constants, not values read from the
# CSVs.  The conversion factors are kept in the expressions exactly as they
# were originally written so the resulting floats are unchanged.
GEOHASH_SORT_TIME_SEC = 0.04551050259007348 * 60 * 60  # presumably recorded in hours -- TODO confirm
# TODO (carried over from the original note): fix this value once the geohash
# creation step has been timed.
GEOHASH_CREATION_TIME_SEC = 0.44220667448308737 * 60 * 60  # presumably recorded in hours -- TODO confirm
SPATIAL_SORT_TIME_SEC = 1.1824355443318686 * 60  # presumably recorded in minutes -- TODO confirm


def _with_timing_columns(frame, method, sort_time_sec, geohash_time_sec):
    """Return a copy of ``frame`` with the per-method timing columns added.

    Parameters
    ----------
    frame : pandas.DataFrame
        Result table; must contain a ``time_min`` column.
    method : str
        Label written to the ``method`` column of every row.
    sort_time_sec : int or float
        Constant written to the ``sort_time_sec`` column.
    geohash_time_sec : int or float
        Constant written to the ``geohash_time_sec`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``frame`` with ``sort_time_sec``, ``method``,
        ``query_time_sec`` (``time_min * 60``), ``geohash_time_sec`` and
        ``total_time`` (sum of the three ``*_sec`` columns) appended in that
        order.
    """
    timed = frame.copy()
    # Assignment order fixes the column order of the concatenated table.
    timed['sort_time_sec'] = sort_time_sec
    timed['method'] = method
    timed['query_time_sec'] = timed['time_min'] * 60
    timed['geohash_time_sec'] = geohash_time_sec
    timed['total_time'] = (
        timed['sort_time_sec'] + timed['geohash_time_sec'] + timed['query_time_sec']
    )
    return timed


# Integer zeros are passed where no preprocessing happens, so those columns
# keep an integer dtype in the per-method tables.
unsorted = _with_timing_columns(unsorted, 'unsorted', 0, 0)
sorted_geohash = _with_timing_columns(
    sorted_geohash, 'sorted_geohash',
    GEOHASH_SORT_TIME_SEC, GEOHASH_CREATION_TIME_SEC,
)
sorted_geohash_no_sjoin = _with_timing_columns(
    sorted_geohash_no_sjoin, 'sorted_geohash_no_sjoin',
    GEOHASH_SORT_TIME_SEC, GEOHASH_CREATION_TIME_SEC,
)
spatially_sorted = _with_timing_columns(
    spatially_sorted, 'spatially_sorted',
    SPATIAL_SORT_TIME_SEC, 0,
)

# concatenate results
results = pd.concat(
    [unsorted, sorted_geohash, sorted_geohash_no_sjoin, spatially_sorted],
    axis=0,
).reset_index(drop=True)

In [5]:
# Summary table for the unsorted case: keep only the reporting columns, in
# this order, and give them presentation-friendly headings.
display_headings = {
    'num_polygons': '# Polygons',
    'num_points': '# Points',
    'geohash_time_sec': 'Geohash Time (s)',
    'sort_time_sec': 'Sort Time (s)',
    'query_time_sec': 'Query Time (s)',
    'num_result_points': '# Result Points',
    'total_time': 'Total Time (s)',
}
unsorted_display = unsorted[list(display_headings)].rename(columns=display_headings)
unsorted_display

Unnamed: 0,# Polygons,# Points,Geohash Time (s),Sort Time (s),Query Time (s),# Result Points,Total Time (s)
0,1,113944489,0,0,2464.586632,1031,2464.586632
1,10,113944489,0,0,2478.647653,6551,2478.647653
2,100,113944489,0,0,2843.214864,203284,2843.214864


In [6]:
# Summary table for the geohash-sorted case: keep only the reporting columns,
# in this order, and give them presentation-friendly headings.
display_headings = {
    'num_polygons': '# Polygons',
    'num_points': '# Points',
    'geohash_time_sec': 'Geohash Time (s)',
    'sort_time_sec': 'Sort Time (s)',
    'query_time_sec': 'Query Time (s)',
    'num_result_points': '# Result Points',
    'total_time': 'Total Time (s)',
}
sorted_geohash_display = sorted_geohash[list(display_headings)].rename(columns=display_headings)
sorted_geohash_display

Unnamed: 0,# Polygons,# Points,Geohash Time (s),Sort Time (s),Query Time (s),# Result Points,Total Time (s)
0,1,113944489,1591.944028,163.837809,4.4773,1031,1760.259138
1,10,113944489,1591.944028,163.837809,9.065676,6551,1764.847514
2,100,113944489,1591.944028,163.837809,184.535968,203284,1940.317805
3,1000,113944489,1591.944028,163.837809,1603.065409,2403824,3358.847246


In [7]:
# Summary table for the geohash-sorted, no-sjoin case: keep only the reporting
# columns, in this order, and give them presentation-friendly headings.
display_headings = {
    'num_polygons': '# Polygons',
    'num_points': '# Points',
    'geohash_time_sec': 'Geohash Time (s)',
    'sort_time_sec': 'Sort Time (s)',
    'query_time_sec': 'Query Time (s)',
    'num_result_points': '# Result Points',
    'total_time': 'Total Time (s)',
}
sorted_geohash_no_sjoin_display = (
    sorted_geohash_no_sjoin[list(display_headings)].rename(columns=display_headings)
)
sorted_geohash_no_sjoin_display

Unnamed: 0,# Polygons,# Points,Geohash Time (s),Sort Time (s),Query Time (s),# Result Points,Total Time (s)
0,1,113944489,1591.944028,163.837809,3.486654,26413,1759.268491
1,10,113944489,1591.944028,163.837809,3.791607,156625,1759.573444
2,100,113944489,1591.944028,163.837809,5.40204,5269528,1761.183878
3,1000,113944489,1591.944028,163.837809,11.513147,38934176,1767.294984
4,10000,113944489,1591.944028,163.837809,24.692601,90177639,1780.474439


In [8]:
# Summary table for the spatially sorted case: keep only the reporting
# columns, in this order, and give them presentation-friendly headings.
display_headings = {
    'num_polygons': '# Polygons',
    'num_points': '# Points',
    'geohash_time_sec': 'Geohash Time (s)',
    'sort_time_sec': 'Sort Time (s)',
    'query_time_sec': 'Query Time (s)',
    'num_result_points': '# Result Points',
    'total_time': 'Total Time (s)',
}
spatially_sorted_display = spatially_sorted[list(display_headings)].rename(columns=display_headings)
spatially_sorted_display

Unnamed: 0,# Polygons,# Points,Geohash Time (s),Sort Time (s),Query Time (s),# Result Points,Total Time (s)
0,1,113944489,0,70.946133,0.5661,1031,71.512233
1,10,113944489,0,70.946133,3.155046,6551,74.101179
2,100,113944489,0,70.946133,10.313305,203284,81.259438
3,1000,113944489,0,70.946133,21.178246,2403824,92.124378
4,10000,113944489,0,70.946133,57.123616,25877947,128.069749


In [9]:
# Show the combined table ordered by polygon count so the methods can be
# compared at each problem size.
results.sort_values(by='num_polygons')

Unnamed: 0,num_polygons,num_points,num_result_points,time_min,sort_time_sec,method,query_time_sec,geohash_time_sec,total_time
0,1,113944489,1031,41.076444,0.0,unsorted,2464.586632,0.0,2464.586632
12,1,113944489,1031,0.009435,70.946133,spatially_sorted,0.5661,0.0,71.512233
3,1,113944489,1031,0.074622,163.837809,sorted_geohash,4.4773,1591.944028,1760.259138
7,1,113944489,26413,0.058111,163.837809,sorted_geohash_no_sjoin,3.486654,1591.944028,1759.268491
13,10,113944489,6551,0.052584,70.946133,spatially_sorted,3.155046,0.0,74.101179
8,10,113944489,156625,0.063193,163.837809,sorted_geohash_no_sjoin,3.791607,1591.944028,1759.573444
4,10,113944489,6551,0.151095,163.837809,sorted_geohash,9.065676,1591.944028,1764.847514
1,10,113944489,6551,41.310794,0.0,unsorted,2478.647653,0.0,2478.647653
9,100,113944489,5269528,0.090034,163.837809,sorted_geohash_no_sjoin,5.40204,1591.944028,1761.183878
2,100,113944489,203284,47.386914,0.0,unsorted,2843.214864,0.0,2843.214864


In [207]:
# Minute-scale copies of the timing columns.
results['total_time_min'] = results['total_time'] / 60
results['query_time_min'] = results['query_time_sec'] / 60

# Replace the internal method keys with human-readable labels.
method_labels = {
    'unsorted': 'Unsorted',
    'sorted_geohash': 'Sorted Geohash',
    'sorted_geohash_no_sjoin': 'Sorted Geohash No Sjoin',
    'spatially_sorted': 'Spatially Sorted',
}
results['method'] = results['method'].replace(method_labels)

In [273]:
# One fixed colour per method, paired in order of first appearance.
method_color = dict(zip(results.method.unique(), ['red', 'blue', 'orange', 'green']))

# For every method, build a scatter of total time against polygon count
# overlaid with a dashed line through the same points, both in that method's
# colour.
plots = []
for method in results.method.unique():
    method_rows = results[results.method == method]
    colour = method_color[method]

    points = method_rows.hvplot.scatter(
        x='num_polygons',
        y='total_time_min',
        logx=True,
        logy=False,
        xlabel='Number of Polygons',
        ylabel='Total Time (min)',
        label=method,
        title='Total Time vs Number of Polygons for Various Algorithms',
    ).opts(color=colour)

    trend = method_rows.hvplot.line(
        x='num_polygons',
        y='total_time_min',
        line_dash='dashed',
        logx=True,
        label=method,
        logy=False,
    ).opts(color=colour)

    plots.append(points * trend)

In [274]:
# Overlay every per-method plot on one set of axes.  Folding over the list
# (left to right, as the original chained `*` did) avoids hard-coding the
# number of methods.
combined = plots[0]
for method_plot in plots[1:]:
    combined = combined * method_plot
combined.opts(width=750)

In [222]:
results.method.unique()

array(['Unsorted', 'Sorted Geohash', 'Sorted Geohash No Sjoin',
       'Spatially Sorted'], dtype=object)

In [259]:
# One fixed colour per method, paired in order of first appearance.
method_color = dict(zip(results.method.unique(), ['red', 'blue', 'orange', 'green']))

# For every method, build a log-log scatter of query time against polygon
# count overlaid with a dashed line through the same points, both in that
# method's colour.
plots = []
for method in results.method.unique():
    method_rows = results[results.method == method]
    colour = method_color[method]

    points = method_rows.hvplot.scatter(
        x='num_polygons',
        y='query_time_min',
        logx=True,
        logy=True,
        xlabel='Number of Polygons',
        # The plotted column is the query time, so the axis is labelled to
        # match (it previously read 'Total Time (min)').
        ylabel='Query Time (min)',
        label=method,
        title='Query Time vs Number of Polygons for Various Algorithms',
    ).opts(color=colour)

    trend = method_rows.hvplot.line(
        x='num_polygons',
        y='query_time_min',
        line_dash='dashed',
        logx=True,
        label=method,
        logy=True,
    ).opts(color=colour)

    plots.append(points * trend)
    

In [263]:
# Overlay every per-method plot on one set of axes.  Folding over the list
# (left to right, as the original chained `*` did) avoids hard-coding the
# number of methods.
combined = plots[0]
for method_plot in plots[1:]:
    combined = combined * method_plot
combined.opts(width=750)

In [213]:
# Single scatter of query time against polygon count, coloured by method.
# The y-axis label and title describe the plotted column (query time) and do
# not state a method count, so they stay correct if methods are added.
results.hvplot.scatter(
    x='num_polygons',
    y='query_time_min',
    color='method',
    logx=True,
    logy=False,
    xlabel='Number of Polygons',
    ylabel='Query Time (min)',
    title='Query Time vs Number of Polygons for Various Algorithms',
    alpha=1,
)

In [173]:
# Preprocessing cost per row: geohash creation time plus sort time.
results['preprocess_time_sec'] = results['geohash_time_sec'] + results['sort_time_sec']

In [46]:
# Inspect the rows of every method for the single-polygon query.
results.loc[results['num_polygons'] == 1]

Unnamed: 0,num_polygons,num_points,num_result_points,time_min,sort_time_sec,method,query_time_sec,geohash_time_sec,total_time,preprocess_time_sec
0,1,113944489,1031,41.076444,0.0,unsorted,2464.586632,0.0,2464.586632,0.0
3,1,113944489,1031,0.074622,163.837809,sorted_geohash,4.4773,1591.944028,1760.259138,1755.781837
7,1,113944489,26413,0.058111,163.837809,sorted_geohash_no_sjoin,3.486654,1591.944028,1759.268491,1755.781837
12,1,113944489,1031,0.009435,70.946133,spatially_sorted,0.5661,0.0,71.512233,70.946133


In [160]:
# Long-format table of preprocessing components for the bar chart: one row
# per (method, component) taken from the single-polygon rows, with the times
# converted from seconds to minutes.
single_polygon_costs = results.loc[
    results.num_polygons == 1,
    ['method', 'geohash_time_sec', 'sort_time_sec'],
]
component_names = {'geohash_time_sec': 'Geohash Time', 'sort_time_sec': 'Sort Time'}
bar_results = (
    single_polygon_costs
    .rename(columns=component_names)
    .melt(id_vars=['method'])
    .assign(value=lambda long_costs: long_costs['value'] / 60)
)
bar_results

Unnamed: 0,method,variable,value
0,unsorted,Geohash Time,0.0
1,sorted_geohash,Geohash Time,26.5324
2,sorted_geohash_no_sjoin,Geohash Time,26.5324
3,spatially_sorted,Geohash Time,0.0
4,unsorted,Sort Time,0.0
5,sorted_geohash,Sort Time,2.73063
6,sorted_geohash_no_sjoin,Sort Time,2.73063
7,spatially_sorted,Sort Time,1.182436


In [168]:
# Stacked bar chart of preprocessing time: one bar per case, with the
# `variable` values stacked inside each bar.
preprocess_bars = hv.Bars(
    bar_results,
    kdims=[('method', 'Case'), 'variable'],
    vdims=('value', 'Preprocess Time (min)'),
)
preprocess_bars.opts(
    stacked=True,
    width=600,
    show_grid=True,
    title='Preprocess Time',
)