## Create a subset of the global-streetscapes dataset

In [3]:
# --------------------------------------
import warnings

# Ignore all warnings from this point on. The filter is installed before the
# `streetscapes` import below, so it also applies to warnings raised while
# that package is being imported.
warnings.filterwarnings("ignore")

# --------------------------------------
import streetscapes as scs

### Load dataset

In [4]:
# Directory containing CSV files: the "data" subfolder of the DATA_DIR
# configured in streetscapes. The CSV export at the end of this notebook is
# written here.
data_dir = scs.conf.DATA_DIR / "data"

# Directory containing Parquet files: the "parquet" subfolder of `data_dir`.
# The Parquet export at the end of this notebook is written here.
parquet_dir = data_dir / "parquet"

In [5]:
# City whose subset of the dataset is loaded.
city_name = "Amsterdam"
df_city = scs.load_city_subset(city_name)

[35mStreetscapes[0m | [36m2025-02-19@15:33:45[0m | [1mLoading 'Amsterdam.parquet'...[0m


In [9]:
# Preview the first five rows of the city subset.
df_city.head(n=5)

Unnamed: 0,uuid,source,orig_id,lat,lon,datetime_local,year,month,day,hour,...,city_id,city_lat,city_lon,country,iso2,iso3,admin_name,capital,population,continent
1889009,19c6e51e-fe64-4756-895d-7758889dbbd1,Mapillary,793569314897389,52.367188,4.893492,2020-09-20 12:43:12+02:00,2020,9,20,12,...,1528355309,52.3667,4.8833,Netherlands,NL,NLD,Noord-Holland,primary,862965.0,Europe
1889010,580ce8e7-fbdb-45e6-971b-6d5b6523d0ef,Mapillary,500509417737025,52.373929,4.884667,2020-09-21 18:50:13+02:00,2020,9,21,18,...,1528355309,52.3667,4.8833,Netherlands,NL,NLD,Noord-Holland,primary,862965.0,Europe
1889011,db31e5bc-5180-4320-b267-04ba3da0a980,Mapillary,299548784985576,52.36651,4.882237,2016-08-16 09:28:19.737000+02:00,2016,8,16,9,...,1528355309,52.3667,4.8833,Netherlands,NL,NLD,Noord-Holland,primary,862965.0,Europe
1889012,642298a7-563e-4650-b308-5e4371238b84,Mapillary,300520878202732,52.367038,4.89008,2017-03-02 17:26:02+01:00,2017,3,2,17,...,1528355309,52.3667,4.8833,Netherlands,NL,NLD,Noord-Holland,primary,862965.0,Europe
1889013,32eeddbe-c4f0-4ac1-aff0-a0d19d258ae5,Mapillary,416737606538178,52.369855,4.892274,2022-05-03 15:48:18.941000+02:00,2022,5,3,15,...,1528355309,52.3667,4.8833,Netherlands,NL,NLD,Noord-Holland,primary,862965.0,Europe


### Subset dataset

In this case we select images of Amsterdam that were taken during the day (`lighting_condition == "day"`) with a sideways viewing direction (`view_direction == "side"`).

In [10]:
# List the lighting categories present in the data before filtering on them.
lighting = df_city["lighting_condition"]
lighting.unique()

array(['day', 'dusk/dawn', 'night'], dtype=object)

In [11]:
# Keep only the rows labelled as taken in daylight.
is_daytime = df_city["lighting_condition"].eq("day")
df_day = df_city.loc[is_daytime]
df_day.columns

Index(['uuid', 'source', 'orig_id', 'lat', 'lon', 'datetime_local', 'year',
       'month', 'day', 'hour',
       ...
       'city_id', 'city_lat', 'city_lon', 'country', 'iso2', 'iso3',
       'admin_name', 'capital', 'population', 'continent'],
      dtype='object', length=107)

In [12]:
# From the daytime rows, keep only those with a sideways view direction.
is_side_view = df_day["view_direction"].eq("side")
df_side = df_day.loc[is_side_view]
df_side.columns

Index(['uuid', 'source', 'orig_id', 'lat', 'lon', 'datetime_local', 'year',
       'month', 'day', 'hour',
       ...
       'city_id', 'city_lat', 'city_lon', 'country', 'iso2', 'iso3',
       'admin_name', 'capital', 'population', 'continent'],
      dtype='object', length=107)

### Create dataframe to download images

Keep only the columns needed to download the images (`uuid`, `source`, `orig_id`) and save them to a CSV file and a Parquet file.

In [13]:
# Columns kept for the image-download step.
download_columns = ["uuid", "source", "orig_id"]
df_to_download = df_side.loc[:, download_columns]
df_to_download.head()

Unnamed: 0,uuid,source,orig_id
1889019,87e77845-fe9c-4f16-882a-322274a26898,Mapillary,121662616604750
1889044,432c6ac2-b3e2-4c9c-80d8-042734fda977,Mapillary,1362561054167230
1889051,3885f9ae-6309-41de-9597-2fbb2422ef03,Mapillary,4008620072553616
1889054,4881ef63-b976-4b9c-8984-6a15fe6611f6,Mapillary,896123137843326
1889055,0f0e1a48-b264-41a6-bbf9-e5053cc47302,Mapillary,981213842624869


In [14]:
# Write the download list as CSV into the data directory.
# NOTE(review): with pandas' default `index=True`, the DataFrame index is
# written as an unnamed first column; confirm the downloader expects that.
csv_path = data_dir / "amsterdam_side.csv"
df_to_download.to_csv(csv_path)

In [15]:
# Write the same download list as Parquet into the Parquet directory.
parquet_path = parquet_dir / "amsterdam_side.parquet"
df_to_download.to_parquet(parquet_path)