
Testing SpatialDataFrame Ingestion Rates
- 5000 features
- 10000 features
- 50000 features
- 100000 features


In [4]:
import time
import datetime
from arcgis.features import SpatialDataFrame
from arcgis.gis import GIS
import pandas as pd

# from_featureclass Load Time Metrics

In [5]:
fc = r"D:\GIS\random_points.gdb\random_pts"
queries = ["OID < 5001",
           "OID < 10001",
           "OID < 50001",
           "OID < 100001",
           None
           ]
counts = [
    5000,
    10000,
    50000,
    100000,
    200000
]

In [15]:
logs = []
for q in queries:
    start = datetime.datetime.now()
    sdf = SpatialDataFrame.from_featureclass(fc, where_clause=q)
    end = datetime.datetime.now()
    logs.append(["%s (s)" % ((end-start).microseconds * 1e-6), 
                 q,
                 len(sdf)
                ])

In [16]:
pd.DataFrame(logs, columns=['Time (S)', 'Query', 'Size'])

Unnamed: 0,Time (S),Query,Size
0,0.687062 (s),OID < 5001,5000
1,0.920951 (s),OID < 10001,10000
2,0.519927 (s),OID < 50001,50000
3,0.8962199999999999 (s),OID < 100001,100000
4,0.485875 (s),,200000



# SpatialDataFrame Creation Times

In [6]:
def generate_random_df(ncols=4, nrows=10, random_text=False):
    """
    generates an row x col dataframe with randoms data
    """
    import string
    import random
    import numpy as np
    if ncols > len(string.ascii_letters):
        ncols = len(string.ascii_letters)
    col_names = string.ascii_letters[:ncols]
    df = pd.DataFrame(np.random.randint(0, 10000,size=(nrows, len(col_names))),
                      columns=list(col_names))
    if random_text:

        for i in range(random.randint(0, ncols)):
            col = random.choice(list(col_names))
            df[col] = df[col].apply(lambda x: ''.join(random.choice(
                string.ascii_uppercase + string.digits) for _ in range(random.randint(1, 255))))
    return df

In [7]:
def generate_random_geometry(count=10, geom_type="polyline",
                             xmin=-90, xmax=90, ymin=-90, ymax=90,
                             sr=4326):
    import random
    from arcgis.geometry._types import Geometry, SpatialReference
    geom_type = geom_type.lower()
    pts = 1
    if geom_type == "polyline":
        pts = random.randint(2,5)
    elif geom_type == "polygon":
        pts = random.randint(3,5)
    elif geom_type == "point":
        pts = 1
    else:
        geom_type = "polyline"
        pts = random.randint(2,5)
    g = []
    t='arcgis'
    sr = SpatialReference({'wkid' : sr})
    for i in range(count):
        coords = []
        for pt in range(pts):
            coords.append([random.randint(xmin, xmax),
                           random.randint(ymin, ymax)])
        if geom_type == "polyline":
            g.append(Geometry({"paths":[coords],"spatialReference":sr}))
        elif geom_type == "point":
            g.append(Geometry({"x":coords[0][0], "y": coords[0][1],
                               "spatialReference":sr}))
        else: # polygon
            g.append(Geometry({"rings":[coords],"spatialReference":sr}))

    return g

In [17]:
logs = []
columns = ['Time', 'Count', 'GeometryType', 'Method']
for size in [1000, 5000, 10000, 100000, 200000]:
    df = generate_random_df(ncols=3, nrows=5000, random_text=True)
    g = generate_random_geometry(count=5000, geom_type="point")
    start_time = datetime.datetime.now()
    sdf = SpatialDataFrame(data=df, geometry=g)
    end = datetime.datetime.now()
    logs.append(["%s s" % ((end-start_time).microseconds * 1e-6),
                 size,
                 'point',
                 "geoms from list"])

    df = generate_random_df(ncols=3, nrows=size, random_text=True)
    g = generate_random_geometry(count=size)
    df['SHAPE'] = g
    start_time = datetime.datetime.now()
    sdf = SpatialDataFrame(data=df, geometry='SHAPE')
    end = datetime.datetime.now()
    logs.append(["%s s" % ((end-start_time).microseconds * 1e-6),
                 size,
                 'polyline',
                 "geoms already in pd.DataFrame, assign by name"])

    df = generate_random_df(ncols=3, nrows=size, random_text=True)
    g = generate_random_geometry(count=size, geom_type="polygon")
    df['SHAPE'] = g
    start_time = datetime.datetime.now()
    sdf = SpatialDataFrame(data=df)
    end = datetime.datetime.now()
    logs.append(["%s s" % ((end-start_time).microseconds * 1e-6),
                 size,
                 'polygon',
                 "geoms already in pd.DataFrame, no explicit assignment"])

pd.DataFrame(data=logs, columns=columns)

Unnamed: 0,Time,Count,GeometryType,Method
0,0.07921399999999999 s,1000,point,geoms from list
1,0.0040149999999999995 s,1000,polyline,"geoms already in pd.DataFrame, assign by name"
2,0.000971 s,1000,polygon,"geoms already in pd.DataFrame, no explicit ass..."
3,0.005013999999999999 s,5000,point,geoms from list
4,0.007019 s,5000,polyline,"geoms already in pd.DataFrame, assign by name"
5,0.003005 s,5000,polygon,"geoms already in pd.DataFrame, no explicit ass..."
6,0.007019 s,10000,point,geoms from list
7,0.013510999999999999 s,10000,polyline,"geoms already in pd.DataFrame, assign by name"
8,0.0065249999999999996 s,10000,polygon,"geoms already in pd.DataFrame, no explicit ass..."
9,0.010008 s,100000,point,geoms from list


In [8]:
size = 10
df = generate_random_df(ncols=3, nrows=size, random_text=True)
g = generate_random_geometry(count=size)

In [9]:
sdf = SpatialDataFrame(data=df, geometry=g)

In [11]:
sdf.__feature_set__

ValueError: Length of values does not match length of index