# Append Simplemaps data and continent

In [94]:
import geopandas as gp
import pandas as pd
import os

In [95]:
# Load the sample points table; its `city_id` column is the key used to
# attach city attributes further down.
points = pd.read_csv(
    '../raw_download/sample_output/points.csv'
)
# Print the column / dtype / non-null summary of the loaded frame.
points.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2319 entries, 0 to 2318
Data columns (total 68 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   mly_captured_at             1892 non-null   float64
 1   heading                     2319 non-null   float64
 2   mly_creator_id              1892 non-null   float64
 3   orig_id                     2319 non-null   int64  
 4   mly_is_pano                 1892 non-null   object 
 5   mly_sequence_id             1892 non-null   object 
 6   mly_organization_id         0 non-null      float64
 7   city_id                     2319 non-null   int64  
 8   lat                         2319 non-null   float64
 9   lon                         2319 non-null   float64
 10  source                      2319 non-null   object 
 11  kv_autoImgProcessingResult  427 non-null    object 
 12  kv_autoImgProcessingStatus  427 non-null    object 
 13  kv_dateAdded                427 n

In [108]:
# Load the world-cities table. `id`, `lat` and `lng` are renamed so the key
# matches `points.city_id` and the city coordinates do not collide with the
# per-point `lat` / `lon` columns.
wc = (
    pd.read_csv('../raw_download/data/worldcities.csv')
    .rename(columns={'id': 'city_id', 'lat': 'city_lat', 'lng': 'city_lon'})
)
# Keep only the cities that are actually referenced by the points table.
cities = wc[wc.city_id.isin(points['city_id'].unique())]
cities

Unnamed: 0,city,city_ascii,city_lat,city_lon,country,iso2,iso3,admin_name,capital,population,city_id
135,Singapore,Singapore,1.3,103.8,Singapore,SG,SGP,Central Singapore,primary,5271000.0,1702341327
1171,Stuttgart,Stuttgart,48.7761,9.1775,Germany,DE,DEU,Baden-Württemberg,admin,630305.0,1276171358


In [97]:
# Turn the city table into a GeoDataFrame with one WGS84 point per city,
# built from the city_lon / city_lat columns.
gdf_cities = gp.GeoDataFrame(
    cities,
    geometry=gp.points_from_xy(x=cities['city_lon'], y=cities['city_lat']),
    crs="EPSG:4326",
)

# Country polygons carrying a `continent` attribute.
# NOTE(review): `gp.datasets` is only available in older geopandas releases
# (believed deprecated in 0.14 and removed in 1.0) - confirm the pinned version.
world = gp.read_file(gp.datasets.get_path('naturalearth_lowres'))

# Left spatial join: every city is kept, and gains the continent of the
# polygon it falls in (null when no polygon matches). The join's helper
# column `index_right` is not needed afterwards.
gdf_cities = gp.sjoin(
    gdf_cities,
    world[['continent', 'geometry']],
    how="left",
).drop(columns=['index_right'])
gdf_cities.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 2 entries, 135 to 1171
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   city        2 non-null      object  
 1   city_ascii  2 non-null      object  
 2   city_lat    2 non-null      float64 
 3   city_lon    2 non-null      float64 
 4   country     2 non-null      object  
 5   iso2        2 non-null      object  
 6   iso3        2 non-null      object  
 7   admin_name  2 non-null      object  
 8   capital     2 non-null      object  
 9   population  2 non-null      float64 
 10  city_id     2 non-null      int64   
 11  geometry    2 non-null      geometry
 12  continent   2 non-null      object  
dtypes: float64(3), geometry(1), int64(1), object(8)
memory usage: 224.0+ bytes


In [98]:
# Number of cities assigned to each continent by the spatial join.
gdf_cities['continent'].value_counts()

Asia      1
Europe    1
Name: continent, dtype: int64

In [99]:
# If any city has a null continent, look up its continent manually, add it to dict_city_continent, and run the code below to fill in the missing values
# dict_city_continent = {
#     # city_id : continent
#     # for example,
#     # 1360503809: 'Asia'
# } 
# def insert_missing_continent(row):
#     if pd.isna(row['continent']):
#         city_id = row['city_id']
#         return dict_city_continent[city_id]
#     else:
#         return row['continent']

# gdf_cities['continent'] = gdf_cities.apply(lambda x: insert_missing_continent(x), axis=1)
# gdf_cities.info()

In [100]:
# Attach the city attributes (including continent) to every point via
# `city_id`. The geometry column is dropped first so only tabular city
# attributes are merged in; the left join keeps all point rows.
sm = pd.merge(
    points,
    gdf_cities.drop('geometry', axis='columns'),
    on='city_id',
    how='left',
)
sm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2319 entries, 0 to 2318
Data columns (total 79 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   mly_captured_at             1892 non-null   float64
 1   heading                     2319 non-null   float64
 2   mly_creator_id              1892 non-null   float64
 3   orig_id                     2319 non-null   int64  
 4   mly_is_pano                 1892 non-null   object 
 5   mly_sequence_id             1892 non-null   object 
 6   mly_organization_id         0 non-null      float64
 7   city_id                     2319 non-null   int64  
 8   lat                         2319 non-null   float64
 9   lon                         2319 non-null   float64
 10  source                      2319 non-null   object 
 11  kv_autoImgProcessingResult  427 non-null    object 
 12  kv_autoImgProcessingStatus  427 non-null    object 
 13  kv_dateAdded                427 n

In [101]:
# Number of points per continent after the merge.
sm['continent'].value_counts()

Europe    1725
Asia       594
Name: continent, dtype: int64

In [102]:
# Columns kept in the exported table: the point identifiers first, then the
# city attributes, then the continent from the spatial join.
cols = (
    ['uuid', 'source', 'orig_id']
    + ['city', 'city_ascii', 'city_id', 'city_lat', 'city_lon']
    + ['country', 'iso2', 'iso3', 'admin_name', 'capital', 'population']
    + ['continent']
)
# Restrict (and order) the merged frame to exactly these columns.
sm = sm[cols]
sm.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2319 entries, 0 to 2318
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   uuid        2319 non-null   object 
 1   source      2319 non-null   object 
 2   orig_id     2319 non-null   int64  
 3   city        2319 non-null   object 
 4   city_ascii  2319 non-null   object 
 5   city_id     2319 non-null   int64  
 6   city_lat    2319 non-null   float64
 7   city_lon    2319 non-null   float64
 8   country     2319 non-null   object 
 9   iso2        2319 non-null   object 
 10  iso3        2319 non-null   object 
 11  admin_name  2319 non-null   object 
 12  capital     2319 non-null   object 
 13  population  2319 non-null   float64
 14  continent   2319 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 289.9+ KB


In [103]:
# Make sure the output folder exists first: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs('./sample_data', exist_ok=True)
# Write the per-point table with city attributes; the row index is omitted.
sm.to_csv('./sample_data/01_simplemaps.csv', index=False)

# Calculate image count per city

In [104]:
# Number of distinct cities present in the merged points table.
sm['city_id'].nunique()

2

In [109]:
# Count the images (non-null `uuid` values) per city.
imgcnt = (
    sm.groupby('city_id')['uuid']
    .count()
    .rename('img_count')
    .reset_index()
)
# `cities` is reassigned from itself, so drop any `img_count` left over from
# a previous run of this cell before merging; otherwise a re-run would
# produce `img_count_x` / `img_count_y` instead of a single column.
cities = (
    cities
    .drop(columns=['img_count'], errors='ignore')
    .merge(imgcnt, on='city_id', how='left')
)
cities

Unnamed: 0,city,city_ascii,city_lat,city_lon,country,iso2,iso3,admin_name,capital,population,city_id,img_count
0,Singapore,Singapore,1.3,103.8,Singapore,SG,SGP,Central Singapore,primary,5271000.0,1702341327,594
1,Stuttgart,Stuttgart,48.7761,9.1775,Germany,DE,DEU,Baden-Württemberg,admin,630305.0,1276171358,1725


In [111]:
# Make sure the output folder exists first: to_csv does not create missing
# parent directories.
os.makedirs('./sample_data', exist_ok=True)
# Export the per-city table without any geometry column. `errors='ignore'`
# keeps this cell working when `cities` has no `geometry` column (as in the
# table displayed above), where a plain drop would raise KeyError.
cities.drop(columns=['geometry'], errors='ignore').to_csv('./sample_data/cities.csv', index=False)