In [1]:
# --------------------------------------
import warnings

warnings.filterwarnings("ignore")

# --------------------------------------
import ibis

ibis.options.interactive = True

# --------------------------------------
import humanize

# --------------------------------------
from streetscapes import conf
from streetscapes import info
from streetscapes import logger
from streetscapes import functions as scf

# Converting CSV files to Parquet and merging them together

The CSV files of the original Global Streetscapes dataset add up to 64GB in total. Moreover, data is split in several files, which can make it a bit cumbersome to work with. Here, we convert the data to Parquet, which reduces file size and makes it easier to load and manipulate the data. 

Additionally, we combine columns from several sources into a single dataset that should serve most usecases.

First, let's declare some storage locations.

In [2]:
CSV_DIR = scf.mkdir(conf.DATA_DIR / "csv")
PARQUET_DIR = scf.mkdir(conf.DATA_DIR / "parquet")
MERGED_DIR = scf.mkdir(conf.DATA_DIR / "merged")

Create a DuckDB connection via Ibis. This will be used to manipulate all the data below.

In [3]:
con = ibis.connect("duckdb://")

Show some metadata about the available CSV files.

In [4]:
info.render_info_csv()

- [1mclimate.csv[0m - Contains the Koppen climate zone associated with each image's location.
  The calculation is as accurate as the location of the image given by the source, which also relies on the accuracy of the capturing devices. The accuracy could also be affected by the accuracy of the Koppen climate zone classification API from https://github.com/sco-tt/Climate-Zone-API.
    - [1muuid[0m (string) - Universally Unique IDentifier, unique for every image
    - [1msource[0m (string) - Source of the image, either Mapillary or KartaView
    - [1morig_id[0m (int) - Original ID of the image as specified by Mapillary or KartaView
    - [1mkoppen_geiger_zone[0m (string) - A zone code to identify the Koppen climate zone
    - [1mzone_description[0m (string) - Short description of the climate zone
- [1mcontextual.csv[0m - Contains the eight contextual attributes inferred for each image.
  Please refer to Table 3 in the paper for information on accuracy.
    - [1muuid[0m (

We will select and download a subset of the available CSV files to work with below.

In [5]:
file_names = [
    "simplemaps",
    "perception",
    "osm",
    "places365",
    "segmentation",
    "contextual",
    "metadata_common_attributes",
    "ghsl",
]

scf.download_files_hf([f"{f}.csv" for f in file_names], local_dir=CSV_DIR)

[35mStreetscapes[0m | [36m2025-01-30@15:16:15[0m | [1mDownloading files from HuggingFace Hub...[0m


simplemaps.csv:   0%|          | 0.00/1.60G [00:00<?, ?B/s]

perception.csv:   0%|          | 0.00/940M [00:00<?, ?B/s]

osm.csv:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

places365.csv:   0%|          | 0.00/733M [00:00<?, ?B/s]

segmentation.csv:   0%|          | 0.00/4.32G [00:00<?, ?B/s]

contextual.csv:   0%|          | 0.00/1.16G [00:00<?, ?B/s]

metadata_common_attributes.csv:   0%|          | 0.00/2.38G [00:00<?, ?B/s]

ghsl.csv:   0%|          | 0.00/798M [00:00<?, ?B/s]

In [6]:
# List of CSV file paths
csv_files = list(CSV_DIR.glob("*.csv"))

# Convert all csvs in data dir to parquet
for file_name in csv_files:

    # Compile the Parquet file name.
    parquet_file = PARQUET_DIR / file_name.with_suffix('.parquet').name

    if parquet_file.exists():
        logger.info(f"File '{parquet_file}' exists, skipping.")
        continue

    logger.info(f"Converting '{file_name.name}' into '{parquet_file.name}'")
    con.read_csv(file_name).to_parquet(parquet_file, compression="ZSTD")

logger.info("Done!")

# List of Parquet file paths
parquet_files = list(PARQUET_DIR.glob("*.parquet"))

[35mStreetscapes[0m | [36m2025-01-30@15:34:49[0m | [1mConverting 'metadata_common_attributes.csv' into 'metadata_common_attributes.parquet'[0m


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[35mStreetscapes[0m | [36m2025-01-30@15:35:01[0m | [1mConverting 'ghsl.csv' into 'ghsl.parquet'[0m
[35mStreetscapes[0m | [36m2025-01-30@15:35:04[0m | [1mConverting 'places365.csv' into 'places365.parquet'[0m


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[35mStreetscapes[0m | [36m2025-01-30@15:35:06[0m | [1mConverting 'contextual.csv' into 'contextual.parquet'[0m


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[35mStreetscapes[0m | [36m2025-01-30@15:35:10[0m | [1mConverting 'segmentation.csv' into 'segmentation.parquet'[0m


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[35mStreetscapes[0m | [36m2025-01-30@15:35:33[0m | [1mConverting 'simplemaps.csv' into 'simplemaps.parquet'[0m


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[35mStreetscapes[0m | [36m2025-01-30@15:35:40[0m | [1mConverting 'osm.csv' into 'osm.parquet'[0m


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[35mStreetscapes[0m | [36m2025-01-30@15:35:51[0m | [1mConverting 'perception.csv' into 'perception.parquet'[0m


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[35mStreetscapes[0m | [36m2025-01-30@15:35:56[0m | [1mDone![0m


Verify that the CSV and Parquet files contain the same information

In [7]:
csv_file = con.read_csv(CSV_DIR / "osm.csv")
csv_file.head()

In [8]:
parquet_file = con.read_parquet(PARQUET_DIR / "osm.parquet")
parquet_file.head()

In [9]:
csv_size = sum(file.stat().st_size for file in csv_files if file.is_file())
parquet_size = sum(file.stat().st_size for file in parquet_files if file.is_file())
reduction_factor = csv_size/parquet_size

logger.info(f"Total file size | CSV: {humanize.naturalsize(csv_size)} | Parquet: {humanize.naturalsize(parquet_size)} | Reduction factor: {reduction_factor:2.5f}")

[35mStreetscapes[0m | [36m2025-01-30@15:36:02[0m | [1mTotal file size | CSV: 14.1 GB | Parquet: 3.6 GB | Reduction factor: 3.86157[0m


 We may want to combine multiple csv files together into a single parquet file. If we use JOIN like above on the full table, we quickly run into memory issues. This is because `duckdb.sql(...)` creates an in-memory database to load the data and keep track of intermediate results. Alternatively, duckdb can create a persistent database on disk using `duckdb.connect('database_filename')`. 

In [17]:
# Perform the joins.
logger.info(f"Starting merger with '{parquet_files[0].name}'...")

# Load the first file into a table.
# We are going to use it to perform incremental joins on that table.
joined = con.read_parquet(parquet_files[0]).as_table()
for parquet_file in parquet_files[1:]:

    # Join the next Parquet file on the UUID column.
    logger.info(f"Merging '{parquet_file.name}'...")
    joined = joined.join(con.read_parquet(parquet_file).as_table(), "uuid").as_table()

# Save the final joined table to a compressed Parquet file.
logger.info("Saving merged file...")
merged_full = MERGED_DIR / "streetscapes_full.parquet"
joined.to_parquet(merged_full, compression="ZSTD")
logger.info("Done!")

[35mStreetscapes[0m | [36m2025-01-30@15:49:38[0m | [1mStarting merger with 'places365.parquet'...[0m


[35mStreetscapes[0m | [36m2025-01-30@15:49:38[0m | [1mMerging 'contextual.parquet'...[0m
[35mStreetscapes[0m | [36m2025-01-30@15:49:38[0m | [1mMerging 'segmentation.parquet'...[0m
[35mStreetscapes[0m | [36m2025-01-30@15:49:38[0m | [1mMerging 'metadata_common_attributes.parquet'...[0m
[35mStreetscapes[0m | [36m2025-01-30@15:49:38[0m | [1mMerging 'perception.parquet'...[0m
[35mStreetscapes[0m | [36m2025-01-30@15:49:38[0m | [1mMerging 'ghsl.parquet'...[0m
[35mStreetscapes[0m | [36m2025-01-30@15:49:38[0m | [1mMerging 'simplemaps.parquet'...[0m
[35mStreetscapes[0m | [36m2025-01-30@15:49:38[0m | [1mMerging 'osm.parquet'...[0m
[35mStreetscapes[0m | [36m2025-01-30@15:49:38[0m | [1mSaving merged file...[0m


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[35mStreetscapes[0m | [36m2025-01-30@15:50:24[0m | [1mDone![0m


In [18]:
# Show the merged file size
merged_size = merged_full.stat().st_size
logger.info(f"Merged file size: {humanize.naturalsize(merged_size)}")

[35mStreetscapes[0m | [36m2025-01-30@15:50:50[0m | [1mMerged file size: 2.1 GB[0m


In [19]:
con.read_parquet(merged_full).head()

For some usecases it might be more convenient to select certain columns from different files into a single table. This can be achieved in a similar manner to the previous example. Here, we create a dictionary with the file names and columns we want to select. We also need to specify a column that is common to all files to join on. 

In [20]:
# Create dictionary choosing files and columns
selection = {
    "contextual": ['uuid', 'source', 'orig_id'],
    "osm": ['uuid', 'road_width', 'type_highway'],
    "simplemaps": ['uuid', 'city'],
    "metadata_common_attributes": ['uuid', 'lat', 'lon']
}

# Turn the selection into a list for easier traversal
selection = list(selection.items())

# Load the first file into a table.
# We are going to use it to perform incremental joins on that table.
parquet_file = PARQUET_DIR / f"{selection[0][0]}.parquet"
cols = selection[0][1]
logger.info(f"Starting merger with '{parquet_file.name}'...")
joined = con.read_parquet(parquet_file).select(*cols).as_table()

for file_name, cols in selection[1:]:

    parquet_file = PARQUET_DIR / f"{file_name}.parquet"
    logger.info(f"Merging table '{parquet_file.name}'...")

    joined = joined.join(con.read_parquet(parquet_file).select(*cols).as_table(), "uuid").as_table()

# Save the final joined table to a compressed Parquet file.
logger.info("Saving merged file...")
merged_selection = MERGED_DIR / "streetscapes_selection.parquet"
joined.to_parquet(merged_selection, compression="ZSTD")
logger.info("Done!")

[35mStreetscapes[0m | [36m2025-01-30@15:50:50[0m | [1mStarting merger with 'contextual.parquet'...[0m
[35mStreetscapes[0m | [36m2025-01-30@15:50:50[0m | [1mMerging table 'osm.parquet'...[0m
[35mStreetscapes[0m | [36m2025-01-30@15:50:50[0m | [1mMerging table 'simplemaps.parquet'...[0m
[35mStreetscapes[0m | [36m2025-01-30@15:50:50[0m | [1mMerging table 'metadata_common_attributes.parquet'...[0m
[35mStreetscapes[0m | [36m2025-01-30@15:50:50[0m | [1mSaving merged file...[0m


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[35mStreetscapes[0m | [36m2025-01-30@15:50:55[0m | [1mDone![0m


In [21]:
# Show the merged file size
merged_size = merged_selection.stat().st_size
logger.info(f"Merged file size: {humanize.naturalsize(merged_size)}")

[35mStreetscapes[0m | [36m2025-01-30@15:50:55[0m | [1mMerged file size: 333.0 MB[0m


In [22]:
# Let's inspect the new file to see if the join has worked
con.read_parquet(merged_selection).head()

We are in touch with the developers of the original Open Streetscapes dataset to add these parquet files to the dataset on huggingface.