In [1]:
import pandas as pd 
import os
import pycountry
import reverse_geocoder

In [2]:
# Directory that holds both the input CSV and every artifact this notebook writes.
RESULTS_DIR = "results"
# Input CSV of measurements (already carries an `elevation` column, per its name).
SAFECAST_DATA_PATH = os.path.join(RESULTS_DIR, "10_million_with_elevation.csv")

In [3]:
# Load the measurements into memory. No `index_col` is passed, so the CSV's
# leading unnamed column is kept as a regular `Unnamed: 0` column.
df = pd.read_csv(SAFECAST_DATA_PATH)

In [4]:
# Preview the first rows to confirm the expected columns were loaded.
df.head()

Unnamed: 0,capture_date,latitude,longitude,value,elevation
0,2018-10-21 01:00:26.000000,36.04108,140.226816,23.0,31
1,2018-10-21 01:00:22.000000,37.796306,140.514413,19.0,72
2,2018-10-21 01:00:16.000000,37.72333,140.476797,15.0,141
3,2018-10-21 00:59:16.000000,52.4449,13.315,16.0,47
4,2018-10-21 01:00:16.000000,37.7875,140.5524,18.0,107


In [5]:
# One (latitude, longitude) tuple per row, in row order, ready for the geocoder.
coords = list(map(tuple, df[["latitude", "longitude"]].values))

In [6]:
# Reverse-geocode every coordinate in one batch call. Later cells index `res`
# by row position, so it is expected to hold one record per entry of `coords`,
# in the same order.
res = reverse_geocoder.search(coords)

Loading formatted geocoded file...


In [7]:
# Inspect a few geocoder records to see which fields are available.
res[:3]

[OrderedDict([('lat', '36.03333'),
              ('lon', '140.2'),
              ('name', 'Ami'),
              ('admin1', 'Ibaraki'),
              ('admin2', ''),
              ('cc', 'JP')]),
 OrderedDict([('lat', '37.81667'),
              ('lon', '140.55'),
              ('name', 'Hobaramachi'),
              ('admin1', 'Fukushima'),
              ('admin2', ''),
              ('cc', 'JP')]),
 OrderedDict([('lat', '37.75'),
              ('lon', '140.46778'),
              ('name', 'Fukushima-shi'),
              ('admin1', 'Fukushima'),
              ('admin2', ''),
              ('cc', 'JP')])]

In [8]:
def get_country_code(row):
    """Return the country code the geocoder produced for a single row.

    `row` must expose an ``"index"`` entry holding the row's position in
    `res` (for example a row of ``df.reset_index()``). Kept for single-row
    lookups; the bulk assignment below no longer goes through it.
    """
    return res[row["index"]]["cc"]

# `res` is consumed in row order (one geocoder record per row of `df`), so the
# codes can be assigned positionally in a single pass. This replaces a row-wise
# `apply(axis=1)`, which builds a Series object per row and is extremely slow on
# a frame of this size. pandas raises if len(res) != len(df), so a misaligned
# geocoder result cannot be written silently.
df["country_code"] = [hit["cc"] for hit in res]

In [9]:
def _lookup_country_name(country_code):
    """Return pycountry's name for an alpha-2 code, or the code itself if pycountry has no entry."""
    try:
        country = pycountry.countries.get(alpha_2=country_code)
    except KeyError:
        # NOTE(review): some pycountry releases appear to raise KeyError for an
        # unknown code instead of returning None - confirm for the pinned version.
        country = None
    # The geocoder's code list is not guaranteed to match pycountry's; falling
    # back to the raw code keeps the notebook running instead of failing on
    # `None.name`.
    return country.name if country is not None else country_code


# Resolve each distinct code once; `unique()` avoids pushing every row through a Python set.
countries = {country_code: _lookup_country_name(country_code) for country_code in df["country_code"].unique()}

In [10]:
def get_country_name(row):
    """Return the country name for a single row, looked up via its `country_code`.

    Kept for single-row lookups; the bulk assignment below uses a vectorized map.
    """
    return countries[row.country_code]

# Vectorized dictionary lookup instead of a row-wise `apply(axis=1)` over the whole frame.
df["country"] = df["country_code"].map(countries)
# `map` yields NaN for a code missing from `countries`, where the row-wise
# lookup would have raised KeyError; keep that failure loud.
assert df["country"].notna().all(), "country_code without an entry in `countries`"

In [11]:
# The two-letter code was only needed to derive `country`; drop it before export.
df = df.drop(columns="country_code")

In [12]:
# Confirm the new `country` column is present and `country_code` is gone.
df.head()

Unnamed: 0,capture_date,latitude,longitude,value,elevation,country
0,2018-10-21 01:00:26.000000,36.04108,140.226816,23.0,31,Japan
1,2018-10-21 01:00:22.000000,37.796306,140.514413,19.0,72,Japan
2,2018-10-21 01:00:16.000000,37.72333,140.476797,15.0,141,Japan
3,2018-10-21 00:59:16.000000,52.4449,13.315,16.0,47,Germany
4,2018-10-21 01:00:16.000000,37.7875,140.5524,18.0,107,Japan


In [13]:
# Persist the enriched frame next to the input file.
# NOTE(review): the index is written too (pandas default), and the frame already
# carries an `Unnamed: 0` column from the input CSV, so the output gets a second
# index-like column - confirm whether `index=False` is wanted.
# NOTE(review): the file name mentions geohashes, but no geohash column is added
# in the cells visible here - confirm the name is intended.
df.to_csv(os.path.join(RESULTS_DIR, "10_million_with_elevation_geohashes_countries.csv"))

In [14]:
# Horizontal bar chart of the 30 countries with the most rows. The returned
# Axes is kept in `plot` so the figure can be saved afterwards.
plot = (
    df["country"]
    .value_counts()
    .iloc[:30]
    .plot(kind="barh", figsize=(20, 15), title="30 countries with most measurements", color="b")
)

In [15]:
# Write the bar chart to the results directory as a PNG.
plot.get_figure().savefig(os.path.join(RESULTS_DIR, "30_countries_with_most_measurements.png"))

In [16]:
# Print the complete per-country row counts; `to_string()` renders every entry
# instead of the abbreviated Series repr.
print(df.country.value_counts().to_string())

Japan                                    3547180
United States                            2076537
Germany                                   876083
Czechia                                   658820
France                                    406071
Netherlands                               383471
Korea, Republic of                        212393
Italy                                     172296
United Kingdom                            114336
Australia                                  97546
Austria                                    94059
Greece                                     83573
Spain                                      77881
Switzerland                                72705
Ukraine                                    65425
Hungary                                    57911
Slovakia                                   52941
Romania                                    51434
Bulgaria                                   50524
Sweden                                     48345
Taiwan, Province of 