In [1]:
import warnings

warnings.filterwarnings("ignore")

import os

import duckdb

from streetscapes import conf

# Convert CSV files to parquet and merging them together

The CSV files of the original Global Streetscapes dataset add up to 64GB in total. Moreover, data is split in several files which can make it a bit cumbersome to work with. Here, we convert the data to Parquet, which reduces file size and makes it easier to load and manipulate the data. 

Additionally, we combine columns from several sources into a single dataset that should serve most usecases.

In [2]:
# Convert all csvs in data dir to parquet
for file in (conf.DATA_DIR / "data").glob("*.csv"):
    print(file.stem)
    duckdb.sql(f"""
        COPY (SELECT * FROM read_csv_auto('{file}', sample_size=-1))
        TO '{file.with_suffix(".parquet")}'
        (FORMAT 'parquet', COMPRESSION 'zstd')
    """)

metadata_common_attributes


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

ghsl


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

places365


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

contextual


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

segmentation


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

simplemaps


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

osm


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

perception


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [3]:
csv_size = sum(file.stat().st_size for file in (conf.DATA_DIR / "data").glob("*.csv") if file.is_file())
parquet_size = sum(file.stat().st_size for file in (conf.DATA_DIR / "data").glob("*.parquet") if file.is_file() and file.name not in ["combined.parquet", "streetscapes.parquet"])

def human_readable(size):
    """Format byte size in human readable format"""
    order_of_magnitude = size.bit_length() // 10  # Dividing by 10 for base-1024 magnitude
    match order_of_magnitude:
        case 3:
            return f"{size / 1024**3:.2f} GB"
        case 2:
            return f"{size / 1024**2:.2f} MB"
        case 1:
            return f"{size / 1024:.2f} KB"
        case _:
            return f"{size} bytes"

print(f"{human_readable(csv_size)=}, {human_readable(parquet_size)=}")

reduction_factor = csv_size/parquet_size
print(f"{reduction_factor=}")

human_readable(csv_size)='13.09 GB', human_readable(parquet_size)='3.39 GB'
reduction_factor=3.861472856397811


 We may want to combine multiple csv files together into a single parquet file. If we use JOIN like above on the full table, we quickly run into memory issues. This is because `duckdb.sql(...)` creates an in-memory database to load the data and keep track of intermediate results. Alternatively, duckdb can create a persistent database on disk using `duckdb.connect('database_filename')`. 

In [4]:
files = [
        "contextual",
        "metadata_common_attributes",
        "segmentation",
        "simplemaps",
        "ghsl",
        "perception",
        "places365",
        "osm",
    ]

with duckdb.connect("duck.db") as con:
    # Load each dataset onto disk from the each file
    for filename in files:
        con.sql(f"CREATE TABLE {filename} AS SELECT * FROM '{conf.DATA_DIR}/data/{filename}.parquet'")

    # Perform the joins.
    for i, filename in enumerate(files[:-1]):
        # Join the tables one by one and store intermediate results in separate tables
        j = i + 1
        target = filename if i==0 else f"step{i}"
        con.sql(f"CREATE TABLE step{j} AS SELECT * FROM {target} JOIN {files[j]} USING (UUID, source, orig_id)")

    # Finally, we can export the joined table to a new parquet file
    con.sql(f"COPY step{j} TO '{conf.DATA_DIR}/data/streetscapes.parquet' (FORMAT 'parquet', COMPRESSION 'zstd')")

# Remove the database from our filesystem
os.remove("duck.db")

# Show the combined file size:
combined_size = (conf.DATA_DIR / "data" / "streetscapes.parquet").stat().st_size
print(f"{human_readable(combined_size)=}")


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

human_readable(combined_size)='1.85 GB'


In [5]:
# Let's inspect the new file to see if the join has worked
duckdb.sql(f"SELECT * FROM '{conf.DATA_DIR}/data/streetscapes.parquet'")

┌──────────────────────────────────────┬───────────┬──────────────────┬─────────┬────────────────────┬─────────────┬─────────────────┬───────────────┬────────────┬────────────────┬─────────┬────────────────────┬─────────────────────┬─────────────────────────┬───────┬───────┬───────┬───────┬────────┬────────┬────────────────────┬─────────────────┬────────────────────┬────────────────────┬────────────────┬────────────────────────┬────────────────────┬────────┬───────────────┬─────────┬──────────┬────────────┬──────────┬──────────┬───────────┬───────────────────┬──────────┬─────────┬─────────────────┬────────────┬──────────┬──────────────┬──────────┬─────────┬───────────┬────────┬─────────┬───────────┬──────────────┬─────────────┬──────────────────────────┬────────────────────────┬──────────┬────────┬───────────┬────────┬─────────┬────────────┬────────┬────────┬────────┬───────────┬───────────┬─────────────┬─────────────┬──────────────┬──────────────┬─────────┬─────────┬─────────────┬────

For some usecases it might be more convenient to select certain columns from different files into a single table. This can be achieved in a similar manner to the previous example. Here, we create a dictionary with the file names and columns we want to select. We also need to specify a column that is common to all files to join on. 

In [6]:
# Create dictionary choosing files and columns
selection = {
    "contextual": ['UUID', 'source', 'orig_id'],
    "osm": ['UUID', 'road_width', 'type_highway'],
    "simplemaps": ['UUID', 'city'],
    "metadata_common_attributes": ['UUID', 'lat', 'lon']
}

with duckdb.connect("duck.db") as con:
    # Load each dataset onto disk from the each file
    for file, columns in selection.items():
        col_str = ', '.join(columns)
        con.sql(f"CREATE TABLE {file} AS SELECT {col_str} FROM '{conf.DATA_DIR}/data/{file}.parquet'")

    # Perform the joins.
    items = list(selection.items())
    for i, (file, columns) in enumerate(items[:-1]):
        # Join the tables one by one and store intermediate results in separate tables
        j = i + 1
        target = file if i==0 else f"step{i}"
        next_file = items[j][0]
        con.sql(f"CREATE TABLE step{j} AS SELECT * FROM {target} JOIN {next_file} USING (UUID)")

    # Finally, we can export the joined table to a new parquet file
    con.sql(f"COPY step{i} TO '{conf.DATA_DIR}/data/combined.parquet' (FORMAT 'parquet', COMPRESSION 'zstd')")

os.remove("duck.db")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [7]:
# Let's inspect the new file to see if the join has worked
duckdb.sql(f"SELECT * FROM '{conf.DATA_DIR}/data/combined.parquet'")

┌──────────────────────────────────────┬───────────┬──────────────────┬────────────┬──────────────┬───────────────────────┐
│                 uuid                 │  source   │     orig_id      │ road_width │ type_highway │         city          │
│               varchar                │  varchar  │      int64       │  varchar   │   varchar    │        varchar        │
├──────────────────────────────────────┼───────────┼──────────────────┼────────────┼──────────────┼───────────────────────┤
│ db2aa7fc-4cd8-4c5a-adfe-9a3094e49b7c │ Mapillary │  690934255468488 │ NULL       │ drive        │ Gravatá               │
│ 3d13f77b-b367-4a9b-9bde-376b32ab48f3 │ Mapillary │  874060836814600 │ NULL       │ drive        │ Orléans               │
│ de872416-ea19-4011-a654-f668756ded4a │ Mapillary │  826229008273339 │ NULL       │ drive        │ Port Orange           │
│ 64cb56f4-7f18-4bff-b481-6f732d26e87e │ Mapillary │  160793732662417 │ NULL       │ walk         │ Melbourne             │
│ 9590d2

We are in touch with the developers of the original Open Streetscapes dataset to add these parquet files to the dataset on huggingface.