# Download a subset of Global Streetscapes data

This notebook demostrates how to filter the Global Streetscapes dataset to find the desired subset of data.
As an example, we show how to filter for daytime images from Singapore, and how to prepare the input csv file for download_jpegs.py to download the required jpegs.

In [1]:
import pandas as pd

## Load file with available points

In [2]:
# segmentation.csv
df_seg_all = pd.read_csv(
    "C:/Users/ShiyingLi/Documents/SUE/global-streetscapes/data/segmentation.csv"
)  # update the location of the desired csv file
print("seg all shape: ", df_seg_all.shape)
df_seg_person = df_seg_all[df_seg_all["Person"] > 0]
print("seg person shape: ", df_seg_person.shape)

# uuid, source, orig_id, Person

seg all shape:  (10004539, 69)
seg person shape:  (4908219, 69)


In [3]:
# instances.csv: uuid, source, orig_id, Person, Bicyclist, Motorcyclist, Other-Rider
df_instances_all = pd.read_csv(
    "C:/Users/ShiyingLi/Documents/SUE/global-streetscapes/data/instances.csv"
)  # update the location of the desired csv file

print(df_instances_all.shape)

df_instances_human = df_instances_all[(df_instances_all["Person"]> 0) | (df_instances_all["Bicyclist"]> 0) | (df_instances_all["Motorcyclist"]> 0) | (df_instances_all["Other-Rider"]> 0)]
print(df_instances_human.shape)


(10004457, 68)
(5256997, 68)


In [22]:
df_instances_human.reset_index(drop=True, inplace=True)
df_instances_human["uuid"].iloc[:5]

0    14da5502-276a-4b0e-ab1b-9969bef7cf3a
1    2fa06463-49f6-4d0c-8fdc-4ebf1754b996
2    4fd8d540-1ca2-4cad-a140-3b86cc1bc140
3    affed3dc-a9b7-4f5b-8b88-99fad7ee581e
4    a22726a8-8128-459e-8fce-192ec58924a3
Name: uuid, dtype: object

In [10]:
df_instances_human["uuid"].iloc[:5]

1     14da5502-276a-4b0e-ab1b-9969bef7cf3a
2     2fa06463-49f6-4d0c-8fdc-4ebf1754b996
5     4fd8d540-1ca2-4cad-a140-3b86cc1bc140
6     affed3dc-a9b7-4f5b-8b88-99fad7ee581e
12    a22726a8-8128-459e-8fce-192ec58924a3
Name: uuid, dtype: object

In [5]:
# simplemaps.csv: uuid, source, orig_id, Person, Bicyclist, Motorcyclist, Other-Rider
df_simplemaps_all = pd.read_csv(
    "C:/Users/ShiyingLi/Documents/SUE/global-streetscapes/data/simplemaps.csv"
)  # update the location of the desired csv file

  df_simplemaps_all = pd.read_csv(


In [23]:
print("Simplemaps: ", df_simplemaps_all.shape)

df_country = df_simplemaps_all[df_simplemaps_all["uuid"].isin(df_instances_human["uuid"])]
print(df_country.shape)

Simplemaps:  (10004551, 15)
(5256997, 15)


## Statistical Analysis of human related image

In [27]:
df_seg_person_sorted = df_seg_person[["uuid", "source", "orig_id", "Person"]].sort_values(by = "Person")
nMapillary = df_seg_person_sorted[df_seg_person_sorted["source"] == "Mapillary"].shape[0]
nKartaView = df_seg_person_sorted[df_seg_person_sorted["source"] == "KartaView"].shape[0]

print(df_seg_person_sorted.shape[0] == nMapillary + nKartaView)
print("Percentage of Mapillary = ", nMapillary /df_seg_person_sorted.shape[0])
print("Percentage of KartaView = ", nKartaView /df_seg_person_sorted.shape[0])

True
Percentage of Mapillary =  0.8882672105706775
Percentage of KartaView =  0.11173278942932253


In [28]:
df_instances_human = df_instances_human[["uuid", "source", "orig_id", "Person", "Bicyclist", "Motorcyclist", "Other-Rider"]]
df_instances_human ["PersonCount"] = df_instances_human[["Person", "Bicyclist", "Motorcyclist", "Other-Rider"]].sum(axis=1)
df_instances_human_sorted = df_instances_human [["uuid", "source", "orig_id","PersonCount"]].sort_values(by="PersonCount")

nMapillary = df_instances_human_sorted[df_instances_human_sorted["source"] == "Mapillary"].shape[0]
nKartaView = df_instances_human_sorted[df_instances_human_sorted["source"] == "KartaView"].shape[0]

print(df_instances_human_sorted.shape[0] == nMapillary + nKartaView)
print("Percentage of Mapillary = ", nMapillary /df_instances_human_sorted.shape[0])
print("Percentage of KartaView = ", nKartaView /df_instances_human_sorted.shape[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_instances_human ["PersonCount"] = df_instances_human[["Person", "Bicyclist", "Motorcyclist", "Other-Rider"]].sum(axis=1)


True
Percentage of Mapillary =  0.8816297593473993
Percentage of KartaView =  0.11837024065260071


## Merge country to instance_human dataframe

In [None]:
# reset index
df_instances_human_sorted.reset_index(drop=True, inplace=True)

# Add country to df_instances_human_sorted
df_human_country = pd.merge(df_instances_human_sorted, df_simplemaps_all[["uuid", "country"]], on='uuid')

In [38]:
df_human_country.head

<bound method NDFrame.head of                                          uuid     source       orig_id  \
0        0b72517c-110b-4b7f-8c54-67c2f12e1d28  KartaView  1.248407e+09   
1        f3f25720-5048-4add-a0d4-89335e742726  Mapillary  5.159699e+14   
2        45997a61-beee-4c4a-90b3-429a17c0a9d6  Mapillary  3.697119e+14   
3        ca9b6088-0126-4ef2-a272-38ffa62853d3  Mapillary  1.087089e+15   
4        5399338b-288e-4958-85ca-83db2973fcca  Mapillary  2.766436e+14   
...                                       ...        ...           ...   
5256992  e90cf30c-0a29-44c5-948c-36d495fd9463  Mapillary  5.209638e+14   
5256993  7b41c248-d7a8-4fd2-9993-a7596607b54e  Mapillary  6.535839e+14   
5256994  32e5d81a-ef82-4e51-8985-1f890659537d  Mapillary  7.966561e+14   
5256995  13022174-6cfe-4020-af19-d004aeeb0a81  Mapillary  3.308299e+14   
5256996  c99bfaf7-2a52-407b-9f09-4a55ab3f8744  Mapillary  5.674937e+14   

         PersonCount      country  
0                1.0        Japan  
1        

## Count the number of images per country

In [47]:
print("Number of unique country occured : ", df_human_country["country"].nunique())


df_image_per_country = df_human_country["country"].value_counts()

df_image_per_country.sort_values(ascending=False).head

Number of unique country occured :  210


<bound method NDFrame.head of country
Brazil                               398987
United States                        361536
Japan                                357264
Germany                              343252
Colombia                             296028
                                      ...  
Samoa                                    27
Aruba                                    24
Iraq                                     18
Swaziland                                16
Falkland Islands (Islas Malvinas)         3
Name: count, Length: 210, dtype: int64>

In [53]:
df_image_per_country["Singapore"]

6932

## Download Image from dataframe

In [None]:
# Mapillary

# KartaView


## Save to csv

In [None]:
# keep the three required columns
df_to_download = df_city_merged[["uuid", "source", "orig_id"]]
# save the file
df_to_download.to_csv("download_imgs/sgp_day.csv")