In [1]:
import math
import os
from functools import lru_cache

import numpy as np
import pandas as pd

In [2]:
# Input CSV. The SAFECAST_DATA_PATH environment variable, when set, overrides
# the default so the notebook is not tied to one machine's drive layout.
SAFECAST_DATA_PATH = os.environ.get("SAFECAST_DATA_PATH", r"F:\safecast\chunks\filtered.csv")
HGTDIR = "hgt_files"  # directory searched for .hgt elevation tiles
RESULTS_DIR = "results"  # directory the output CSV is written to
NUMBER_OF_ROWS = 10000000  # number of rows read from the input CSV

In [3]:
# Read at most NUMBER_OF_ROWS records; column labels are supplied explicitly
# through `names`.
df = pd.read_csv(
    SAFECAST_DATA_PATH,
    names=["capture_date", "latitude", "longitude", "value"],
    nrows=NUMBER_OF_ROWS,
)

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,capture_date,latitude,longitude,value
0,2018-10-21 01:00:26.000000,36.04108,140.226816,23.0
1,2018-10-21 01:00:22.000000,37.796306,140.514413,19.0
2,2018-10-21 01:00:16.000000,37.72333,140.476797,15.0
3,2018-10-21 00:59:16.000000,52.4449,13.315,16.0
4,2018-10-21 01:00:16.000000,37.7875,140.5524,18.0


In [5]:
@lru_cache(maxsize=None)
def _find_hgt_tile(hgt_dir, lat_deg, lon_deg):
    """Return the path of the HGT tile for an integer tile corner, or None.

    The result is cached per (directory, integer latitude, integer longitude),
    so the file-system check runs once per tile instead of once per sample.
    The key space is bounded by the number of one-degree tiles, so the cache
    is left unbounded. Missing tiles are cached as None too: call
    ``_find_hgt_tile.cache_clear()`` after adding files to the directory.

    Args:
        hgt_dir: Directory that holds the ``.hgt`` files.
        lat_deg: Integer latitude of the tile corner (floor of the latitude).
        lon_deg: Integer longitude of the tile corner (floor of the longitude).

    Returns:
        The joined file path if the file exists, otherwise None.
    """
    ns = 'N' if lat_deg >= 0 else 'S'
    ew = 'E' if lon_deg >= 0 else 'W'
    hgt_file = "%s%02d%s%03d.hgt" % (ns, abs(lat_deg), ew, abs(lon_deg))
    hgt_file_path = os.path.join(hgt_dir, hgt_file)
    return hgt_file_path if os.path.isfile(hgt_file_path) else None


def get_file_name(lat, lon):
    """Return the path of the HGT tile covering (lat, lon), or None if absent.

    The tile is identified by the floor of each coordinate. Flooring (rather
    than "subtract one, then truncate") keeps negative coordinates that are
    exact integers in the tile that starts at that integer, e.g. -34.0 maps to
    S34 and not S35.

    Args:
        lat: Latitude in decimal degrees.
        lon: Longitude in decimal degrees.

    Returns:
        Path of the matching file inside HGTDIR, or None when it does not exist.

    Raises:
        ValueError: If a coordinate is NaN (it cannot be floored to an integer).
    """
    # HGTDIR is part of the cache key so changing it never yields stale paths.
    return _find_hgt_tile(HGTDIR, math.floor(lat), math.floor(lon))

In [6]:
d = {}  # cache of loaded tiles: HGT file path -> (SAMPLES, SAMPLES) array
SAMPLES = 1201  # samples per tile edge (the 3-arc-second SRTM tile size)
NO_DATA_VALUE = -32768  # sentinel returned when no elevation is available


def get_elevation_for_coords(lat, lon):
    """Return the elevation stored in the HGT tile for (lat, lon).

    Tiles are read once and kept in the module-level cache ``d``, keyed by
    file path.

    Args:
        lat: Latitude in decimal degrees.
        lon: Longitude in decimal degrees.

    Returns:
        The elevation sample as an integer, or NO_DATA_VALUE (-32768) when no
        tile file exists for the coordinates or the sample cannot be indexed.
    """
    hgt_file = get_file_name(lat, lon)
    if not hgt_file:
        return NO_DATA_VALUE

    # Offset from the tile's south-west corner, as a fraction in [0, 1) scaled
    # to sample indices. Using floor (not int truncation) keeps negative
    # coordinates just below an integer boundary on the correct tile edge.
    lat_row = int(round((lat - math.floor(lat)) * (SAMPLES - 1)))
    lon_row = int(round((lon - math.floor(lon)) * (SAMPLES - 1)))

    if hgt_file not in d:
        with open(hgt_file, 'rb') as hgt_data:
            # Big-endian signed 16-bit samples, reshaped to a square grid.
            d[hgt_file] = np.fromfile(hgt_data, np.dtype('>i2'), SAMPLES*SAMPLES).reshape((SAMPLES, SAMPLES))

    try:
        # The row index is flipped: lat_row counts up from the last array row.
        return d[hgt_file][SAMPLES - 1 - lat_row, lon_row].astype(int)
    except IndexError as e:
        # Best effort: report the offending coordinates and fall back to the sentinel.
        print(lat, lon, repr(e))
        return NO_DATA_VALUE


In [7]:
def get_elevation(row):
    """Look up the elevation for one record.

    Args:
        row: Object exposing ``latitude`` and ``longitude`` attributes.

    Returns:
        Whatever ``get_elevation_for_coords`` returns for those coordinates.
    """
    return get_elevation_for_coords(row.latitude, row.longitude)

In [80]:
# Expensive: one lookup per row. itertuples yields lightweight namedtuples with
# `.latitude` / `.longitude`, avoiding the per-row Series that apply(axis=1) builds.
df["elevation"] = [get_elevation(row) for row in df.itertuples(index=False)]

In [81]:
# Number of rows for which the lookup returned the -32768 sentinel.
int((df["elevation"] == -32768).sum())

267481

In [83]:
# Number of distinct HGT tile files that were loaded into the cache `d`.
len(d)

841

In [84]:
# Keep only the rows whose elevation is not the -32768 sentinel.
filtered_df = df.loc[df["elevation"] != -32768]

In [85]:
# Row count remaining after the sentinel rows were removed.
filtered_df.shape[0]

9732519

In [86]:
# Preview the first rows of the frame with the sentinel-valued rows removed.
filtered_df.head()

Unnamed: 0,capture_date,latitude,longitude,value,elevation
0,2018-10-21 01:00:26.000000,36.04108,140.226816,23.0,31
1,2018-10-21 01:00:22.000000,37.796306,140.514413,19.0,72
2,2018-10-21 01:00:16.000000,37.72333,140.476797,15.0,141
3,2018-10-21 00:59:16.000000,52.4449,13.315,16.0,47
4,2018-10-21 01:00:16.000000,37.7875,140.5524,18.0,107


In [87]:
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(RESULTS_DIR, exist_ok=True)
filtered_df.to_csv(os.path.join(RESULTS_DIR, "10_million_with_elevation.csv"), index=False)