In [1]:
import pandas as pd 
import folium
import os
import geohash
from folium.plugins import HeatMap
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
import numpy as np

In [2]:
RESULTS_DIR = "results"

In [3]:
df = pd.read_csv(os.path.join(RESULTS_DIR, "10_million_with_elevation_and_geohashes.csv"))

In [4]:
df.head()

Unnamed: 0,capture_date,latitude,longitude,value,elevation,geohash
0,2018-10-21 01:00:26.000000,36.04108,140.226816,23.0,31,xn7tncn498k2
1,2018-10-21 01:00:22.000000,37.796306,140.514413,19.0,72,xnezj2x7rnh7
2,2018-10-21 01:00:16.000000,37.72333,140.476797,15.0,141,xneysek46d7w
3,2018-10-21 00:59:16.000000,52.4449,13.315,16.0,47,u336qgr9tvzw
4,2018-10-21 01:00:16.000000,37.7875,140.5524,18.0,107,xneyyr50hk99


In [5]:
len(df)

9732519

In [6]:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-geohashgrid-aggregation.html
# geogash precision 
# 8 - 38.2m x 19m
# 4 - 39.1km x 19.5km
# 5 - 4.89km x 4.89km

In [7]:
def get_geohash4(row):
    return row.geohash[:4]

df["geohash4"] = df.apply(get_geohash4, axis=1)

In [8]:
points_geohashes_prec_4 = df.geohash4.value_counts()
points_geohashes_prec_4[:10]
points_geohashes_prec_4.mean(), points_geohashes_prec_4.median()

(2218.99658002736, 206.0)

In [9]:
def get_geohash5(row):
    return row.geohash[:5]

df["geohash5"] = df.apply(get_geohash5, axis=1)

In [10]:
points_geohashes_prec_5 = df.geohash5.value_counts()
points_geohashes_prec_5.mean(), points_geohashes_prec_5.median()

(310.4768877404536, 34.0)

In [11]:
geohashes_for_regression = list(points_geohashes_prec_5.where(lambda count: count >= 50).dropna().index)
len(geohashes_for_regression)

11920

In [12]:
len(points_geohashes_prec_5.where(lambda count: count >= 100).dropna())

6794

In [13]:
def get_timestamp(row):
    try:
        timestamp = datetime.strptime(row.capture_date, '%Y-%m-%d %H:%M:%S.%f').timestamp()
        return timestamp
    except:
        timestamp = datetime.strptime(row.capture_date, '%Y-%m-%d %H:%M:%S').timestamp()
        return timestamp

In [14]:
%%time
df["timestamp"] = df.apply(get_timestamp, axis=1)

Wall time: 6min 22s


In [15]:
df.head()

Unnamed: 0,capture_date,latitude,longitude,value,elevation,geohash,geohash4,geohash5,timestamp
0,2018-10-21 01:00:26.000000,36.04108,140.226816,23.0,31,xn7tncn498k2,xn7t,xn7tn,1540076000.0
1,2018-10-21 01:00:22.000000,37.796306,140.514413,19.0,72,xnezj2x7rnh7,xnez,xnezj,1540076000.0
2,2018-10-21 01:00:16.000000,37.72333,140.476797,15.0,141,xneysek46d7w,xney,xneys,1540076000.0
3,2018-10-21 00:59:16.000000,52.4449,13.315,16.0,47,u336qgr9tvzw,u336,u336q,1540076000.0
4,2018-10-21 01:00:16.000000,37.7875,140.5524,18.0,107,xneyyr50hk99,xney,xneyy,1540076000.0


In [16]:
df = df.drop(columns=["capture_date", "latitude", "longitude", "elevation", "geohash4"])
df.to_csv(os.path.join(RESULTS_DIR, "10_million_with_elevation_geohashes_timestamps.csv"))

In [17]:
groupped_df = df[df.geohash5.isin(geohashes_for_regression)][["value", "geohash5", "timestamp"]].groupby("geohash5")
len(groupped_df)

11920

In [18]:
def prepare_regresion_model(one_group_df):
    result = pd.Series()
    X = np.array(one_group_df.timestamp).reshape(-1, 1)
    y = np.array(one_group_df.value).reshape(-1, 1)
    # sort by timestamp
    train_size = int(0.8 * len(X))
    X_train = X[:train_size]
    X_test = X[train_size:]
    y_train = y[:train_size]
    y_test = y[train_size:]
    
    regr = linear_model.LinearRegression()
    regr.fit(X_train, y_train)
    
    y_pred = regr.predict(X_test)
    error = r2_score(y_test, y_pred)
    return pd.Series({"regression_error": error})

In [19]:
df_after_training = groupped_df.apply(prepare_regresion_model).reset_index()

In [20]:
df_after_training.head()

Unnamed: 0,geohash5,regression_error
0,6mc5m,0.405935
1,6mc5n,0.013665
2,6mc5p,0.698166
3,6mc5t,0.00664
4,6msem,-2.723922


In [21]:
df_after_training = df_after_training[df_after_training.regression_error <= 100]

In [22]:
def get_coordinates(geo):
    bbox = geohash.bbox(geo)
    return [[bbox["w"], bbox["n"]], [bbox["e"], bbox["n"]], [bbox["e"], bbox["s"]], [bbox["w"], bbox["s"]]]

def get_geo_data(ghash_df):
    features = []

    for geo, *_ in ghash_df.values:
        features.append({"type": "Feature", "id": geo, 
                         "geometry": {"type": "Polygon", "coordinates": [get_coordinates(geo)]}})

    return {"type": "FeatureCollection", "features": features}

def visualize_with_folium(df):
    m = folium.Map(location=[37.760806, 140.474722], zoom_start=10)

    m.choropleth(
        geo_data=get_geo_data(df),
        name='choropleth',
        data=df,
        columns=['geohash5', 'regression_error'],
        key_on='feature.id',
        fill_color='YlOrRd',
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name='regression mean square error'
    )
    folium.LayerControl().add_to(m)
    return m

In [23]:
m = visualize_with_folium(df_after_training)

In [24]:
m.save(os.path.join(RESULTS_DIR, "regression_errors.html"))