# Download a subset of Global Streetscapes data

This notebook demostrates how to filter the Global Streetscapes dataset to find the desired subset of data.
As an example, we show how to filter for daytime images from Singapore, and how to prepare the input csv file for download_jpegs.py to download the required jpegs.

In [10]:
import pandas as pd

## Load file with available points

In [12]:
# segmentation.csv
df_seg_all = pd.read_csv(
    "C:/Users/ShiyingLi/Documents/SUE/global-streetscapes/data/segmentation.csv"
)  # update the location of the desired csv file
print("seg all shape: ", df_seg_all.shape)
df_seg_person = df_seg_all[df_seg_all["Person"] > 0]
print("seg person shape: ", df_seg_person.shape)

# uuid, source, orig_id, Person

seg all shape:  (10004539, 69)
seg person shape:  (4908219, 69)


In [14]:
# instances.csv: uuid, source, orig_id, Person, Bicyclist, Motorcyclist, Other-Rider
df_instances_all = pd.read_csv(
    "C:/Users/ShiyingLi/Documents/SUE/global-streetscapes/data/instances.csv"
)  # update the location of the desired csv file

print(df_instances_all.shape)

df_instances_human = df_instances_all[(df_instances_all["Person"]> 0) | (df_instances_all["Bicyclist"]> 0) | (df_instances_all["Motorcyclist"]> 0) | (df_instances_all["Other-Rider"]> 0)]
print(df_instances_human.shape)


In [20]:
# simplemaps.csv: uuid, source, orig_id, Person, Bicyclist, Motorcyclist, Other-Rider
df_simplemaps_all = pd.read_csv(
    "C:/Users/ShiyingLi/Documents/SUE/global-streetscapes/data/simplemaps.csv"
)  # update the location of the desired csv file



(10004457, 68)
(5256997, 68)


## Statistical Analysis of human related image

In [25]:
df_seg_person_sorted = df_seg_person[["uuid", "source", "orig_id", "Person"]].sort_values(by = "Person")
nMapillary = df_seg_person_sorted[df_seg_person_sorted["source"] == "Mapillary"].shape[0]
nKartaView = df_seg_person_sorted[df_seg_person_sorted["source"] == "KartaView"].shape[0]

print(df_seg_person_sorted.shape[0] == nMapillary + nKartaView)
print("Percentage of Mapillary = ", nMapillary /df_seg_person_sorted.shape[0])
print("Percentage of KartaView = ", nKartaView /df_seg_person_sorted.shape[0])

In [30]:
df_instances_human = df_instances_human[["uuid", "source", "orig_id", "Person", "Bicyclist", "Motorcyclist", "Other-Rider"]]
df_instances_human ["PersonCount"] = df_instances_human[["Person", "Bicyclist", "Motorcyclist", "Other-Rider"]].sum(axis=1)
df_instances_human_sorted = df_instances_human [["uuid", "source", "orig_id","PersonCount"]].sort_values(by="PersonCount")

nMapillary = df_instances_human_sorted[df_instances_human_sorted["source"] == "Mapillary"].shape[0]
nKartaView = df_instances_human_sorted[df_instances_human_sorted["source"] == "KartaView"].shape[0]

print(df_instances_human_sorted.shape[0] == nMapillary + nKartaView)
print("Percentage of Mapillary = ", nMapillary /df_instances_human_sorted.shape[0])
print("Percentage of KartaView = ", nKartaView /df_instances_human_sorted.shape[0])

True
Percentage of Mapillary =  0.8816297593473993
Percentage of KartaView =  0.11837024065260071


In [31]:
df_instances_human_sorted["PersonCount"].head

<bound method NDFrame.head of 10004442     1.0
2186349      1.0
2186347      1.0
6097796      1.0
2186344      1.0
            ... 
3163564     49.0
3970271     49.0
2184737     50.0
7289577     51.0
2255366     58.0
Name: PersonCount, Length: 5256997, dtype: float64>

## Filter based on city

In [8]:
df_city = df_all[df_all["city"] == "Singapore"]

# visual confirmation the city name is unique. Otherwise there would be more than one value
df_city["country"].unique()

TypeError: 'TextFileReader' object is not subscriptable

## Filter based on contexual information: lighting condition

In [4]:
# load contextual information
df_contextual = pd.read_csv("data/tables/contextual.csv")

  df_contextual = pd.read_csv("data/global-streetscapes/contextual.csv")


In [None]:
# merge our filtered dataset with contextual data
df_city_merged = df_city.merge(df_contextual, on=["uuid", "source", "orig_id"])
df_city_merged["lighting_condition"].unique()

In [None]:
# filter only the rows during `day`
df_city_merged = df_city_merged[df_city_merged["lighting_condition"] == "day"]
df_city_merged["lighting_condition"].unique()

## Save to csv

In [None]:
# keep the three required columns
df_to_download = df_city_merged[["uuid", "source", "orig_id"]]
# save the file
df_to_download.to_csv("download_imgs/sgp_day.csv")