In [1]:
import math
import os

import pandas as pd

# Relative path of the input CSV (inside the 'results' directory).
SAFECAST_DATA_PATH = os.path.join('results', 'first_million.csv')
# Load the whole CSV into memory as a DataFrame.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the CSV was written together with its index -- consider index_col=0 if that
# column is not wanted downstream.
df = pd.read_csv(SAFECAST_DATA_PATH)

In [2]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,captured time,latitude,longitude,value
0,2018-10-21 01:00:26.000000,36.04108,140.226816,23.0
1,2018-10-21 01:00:22.000000,37.796306,140.514413,19.0
2,2018-10-21 01:00:16.000000,37.72333,140.476797,15.0
3,2018-10-21 00:59:16.000000,52.4449,13.315,16.0
4,2018-10-21 01:00:16.000000,37.7875,140.5524,18.0


In [3]:
# Number of rows read from the CSV.
len(df)

806849

In [26]:
from gmalthgtparser import HgtParser
from functools import lru_cache

# Directory in which get_file_name looks for .hgt tile files.
HGTDIR = "hgt_files"


@lru_cache(maxsize=1024)
def _find_tile_file(tile_lat, tile_lon, hgt_dir):
    """Return the path of the tile file for an integer tile origin, or None.

    The cache lives at module level and is keyed on the integer tile origin
    (not on raw coordinates), so every point falling into the same 1x1 degree
    tile reuses a single ``os.path.isfile`` check.

    Args:
        tile_lat: Integer latitude of the tile's south-west corner.
        tile_lon: Integer longitude of the tile's south-west corner.
        hgt_dir: Directory that is searched for the tile file.

    Returns:
        ``os.path.join(hgt_dir, <name>)`` when that file exists, else ``None``.
        The name has the form ``<N|S><2-digit lat><E|W><3-digit lon>.hgt``.
    """
    ns = 'N' if tile_lat >= 0 else 'S'
    ew = 'E' if tile_lon >= 0 else 'W'
    hgt_file = "%(ns)s%(lat)02d%(ew)s%(lon)03d.hgt" % {
        'lat': abs(tile_lat), 'lon': abs(tile_lon), 'ns': ns, 'ew': ew,
    }
    hgt_file_path = os.path.join(hgt_dir, hgt_file)
    if os.path.isfile(hgt_file_path):
        return hgt_file_path
    return None


def get_file_name(lat, lon, hgt_dir=None):
    """Locate the .hgt tile file that covers the given coordinates.

    The tile is identified by the floor of each coordinate, i.e. its
    south-west corner: 36.04, 140.23 -> ``N36E140.hgt``; -33.5, 18.5 ->
    ``S34E018.hgt``. Using ``floor`` keeps exact negative integers in the tile
    they start (-34.0 -> ``S34``), whereas the previous ``lat - 1`` arithmetic
    sent them one tile too far (``S35``).

    Lookups are memoised per (tile, directory). A result of ``None`` is cached
    too, so call ``_find_tile_file.cache_clear()`` after adding tile files to
    the directory within the same session.

    Args:
        lat: Latitude in decimal degrees (negative south of the equator).
        lon: Longitude in decimal degrees (negative west of Greenwich).
        hgt_dir: Directory to search; defaults to the module-level ``HGTDIR``.

    Returns:
        Path of the tile file as a string, or ``None`` if it does not exist.

    Raises:
        ValueError: If a coordinate is NaN.
        OverflowError: If a coordinate is infinite.
    """
    if hgt_dir is None:
        hgt_dir = HGTDIR
    return _find_tile_file(math.floor(lat), math.floor(lon), hgt_dir)

In [5]:
def read_elevation_from_file(hgt_file, lat, lon):
    """Read one value for a coordinate pair from an .hgt file.

    Opens ``hgt_file`` with ``HgtParser`` for the duration of the call and
    queries it with the ``(lat, lon)`` tuple.

    Args:
        hgt_file: Path of the .hgt file to open.
        lat: Latitude of the point to look up.
        lon: Longitude of the point to look up.

    Returns:
        The element at index 2 of what ``HgtParser.get_elevation`` returns
        (presumably the elevation itself -- confirm against the parser docs).
    """
    coordinates = (lat, lon)
    with HgtParser(hgt_file) as hgt:
        sample = hgt.get_elevation(coordinates)
        return sample[2]

In [6]:
def get_elevation(row):
    """Return the elevation for one DataFrame row.

    Args:
        row: Object exposing ``latitude`` and ``longitude`` attributes
            (e.g. a row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        The value read from the matching tile file, or ``-32768`` when no
        tile file is available for the coordinates.
    """
    latitude = row.latitude
    longitude = row.longitude
    tile_path = get_file_name(latitude, longitude)
    if not tile_path:
        # A missing file is treated as a data void, using the void value
        # from the SRTM documentation.
        return -32768
    return read_elevation_from_file(tile_path, latitude, longitude)

In [19]:
# Spot-check the tile lookup with the coordinates of the first row shown by
# df.head() above.
lat, lon = 36.041080, 140.226816
get_file_name(lat, lon)

N36.04E140.227.hgt
N36E140.hgt


In [27]:
# Compute an elevation for every row: axis=1 passes each row to get_elevation.
# This adds the 'elevation' column to df in place.
df["elevation"] = df.apply(get_elevation, axis=1)

In [28]:
# Confirm that the new 'elevation' column is present.
df.head()

Unnamed: 0,captured time,latitude,longitude,value,elevation
0,2018-10-21 01:00:26.000000,36.04108,140.226816,23.0,31.0
1,2018-10-21 01:00:22.000000,37.796306,140.514413,19.0,72.0
2,2018-10-21 01:00:16.000000,37.72333,140.476797,15.0,141.0
3,2018-10-21 00:59:16.000000,52.4449,13.315,16.0,47.0
4,2018-10-21 01:00:16.000000,37.7875,140.5524,18.0,107.0


In [29]:
# Frequency of each elevation value; rows for which no tile file was found
# carry the -32768 value returned by get_elevation.
df.elevation.value_counts()

 47.0      45049
 10.0      40744
 145.0     28078
 8.0       27822
 31.0      26355
 2023.0    21770
 12.0      21629
 3.0       17006
 22.0      14130
 41.0      13870
 7.0       13204
 46.0      13186
 5.0       12906
 17.0      12550
 11.0      12500
 80.0      12493
 23.0      11719
 26.0      11598
 238.0     11518
 71.0      11353
 218.0     11351
 419.0     11347
 585.0     11292
 91.0      11285
 210.0     11271
 107.0     11253
 462.0     11225
 36.0      11032
 131.0     10898
 141.0     10772
           ...  
 988.0         1
 1057.0        1
 1873.0        1
 1224.0        1
 1249.0        1
 1229.0        1
 980.0         1
 1826.0        1
 1440.0        1
-13.0          1
 1704.0        1
 1074.0        1
 1060.0        1
 1171.0        1
 1027.0        1
 1308.0        1
 976.0         1
 1173.0        1
 1177.0        1
 1605.0        1
 1858.0        1
 1496.0        1
 1829.0        1
 1287.0        1
 1284.0        1
 931.0         1
 1421.0        1
 1602.0       

In [30]:
# Write the frame, including the 'elevation' column, to the 'results' directory.
path = os.path.join("results", "first_million_with_elevation.csv")
# index=False keeps the DataFrame index out of the written file.
df.to_csv(path, index=False)

In [1]:
import zipfile
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO

import requests
from bs4 import BeautifulSoup

# Directory into which download_file extracts each downloaded archive.
files_dir = "hgt_files"


def download_file(url, timeout=60):
    """Download a zip archive and extract its contents into ``files_dir``.

    Args:
        url: URL of the zip archive to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block its worker thread
            indefinitely.

    Returns:
        The ``url`` that was downloaded, so callers can report progress.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
    """
    resp = requests.get(url, timeout=timeout)
    # Surface HTTP errors directly; otherwise an error page would only show up
    # as an opaque BadZipFile when trying to open it as an archive.
    resp.raise_for_status()
    with zipfile.ZipFile(BytesIO(resp.content)) as zip_ref:
        zip_ref.extractall(files_dir)
    return url
        

def download_files(timeout=60):
    """Collect every ``hgt.zip`` link from the SRTM3 index pages and download it.

    Each regional index page is fetched and parsed for anchors whose ``href``
    contains ``"hgt.zip"``; the resulting URLs are then passed to
    ``download_file`` on a thread pool. Progress is printed as it goes.

    Args:
        timeout: Seconds to wait for each index page before giving up.

    Raises:
        requests.HTTPError: If an index page answers with an error status.
            Failing early avoids silently ending up with an incomplete set of
            files.
        requests.Timeout: If an index page does not respond within ``timeout``.
    """
    file_urls = []
    urls = [
        "https://dds.cr.usgs.gov/srtm/version2_1/SRTM3/Africa/",
        "https://dds.cr.usgs.gov/srtm/version2_1/SRTM3/Australia/",
        "https://dds.cr.usgs.gov/srtm/version2_1/SRTM3/Eurasia/",
        "https://dds.cr.usgs.gov/srtm/version2_1/SRTM3/Islands/",
        "https://dds.cr.usgs.gov/srtm/version2_1/SRTM3/North_America/",
        "https://dds.cr.usgs.gov/srtm/version2_1/SRTM3/South_America/"
    ]
    for url in urls:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        # href=True skips anchors without an href attribute, which would
        # otherwise raise KeyError on link["href"].
        links = [
            url + link["href"]
            for link in soup.find_all("a", href=True)
            if "hgt.zip" in link["href"]
        ]
        print(len(links), "files in", url)
        file_urls.extend(links)

    print(len(file_urls), "files to download")
    with ThreadPoolExecutor() as pool:
        # pool.map yields results in input order; an exception raised by a
        # download is re-raised here when its result is reached.
        for downloaded_url in pool.map(download_file, file_urls):
            print(downloaded_url)