In [1]:
import pandas as pd 
import folium
import os
import geohash
from folium.plugins import HeatMap
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
import numpy as np

In [2]:
# Load the measurement export; later cells read its capture_date, value and geohash columns.
df = pd.read_csv("10_million_with_elevation_and_geohashes.csv")  

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,capture_date,latitude,longitude,value,elevation,geohash
0,2018-10-21 01:00:26.000000,36.04108,140.226816,23.0,31,xn7tncn498k2
1,2018-10-21 01:00:22.000000,37.796306,140.514413,19.0,72,xnezj2x7rnh7
2,2018-10-21 01:00:16.000000,37.72333,140.476797,15.0,141,xneysek46d7w
3,2018-10-21 00:59:16.000000,52.4449,13.315,16.0,47,u336qgr9tvzw
4,2018-10-21 01:00:16.000000,37.7875,140.5524,18.0,107,xneyyr50hk99


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(9732519, 6)

In [5]:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-geohashgrid-aggregation.html
# geohash precision 8: cells of about 38.2m x 19m
# geohash precision 4: cells of about 39.1km x 19.5km

In [6]:
def get_geohash8(row, precision=4):
    """Return the leading characters of a row's geohash, i.e. a coarser grid cell.

    Despite the function name, the default keeps the first 4 characters, not 8;
    the name is kept because the rest of the notebook refers to it.

    Parameters
    ----------
    row : object
        Anything exposing a ``geohash`` string attribute (e.g. a DataFrame row).
    precision : int, default 4
        Number of leading geohash characters to keep.

    Returns
    -------
    str
        The geohash truncated to ``precision`` characters.
    """
    return row.geohash[:precision]

# Vectorised prefix extraction: for string geohashes this yields the same values
# as calling get_geohash8 on every row, without a Python-level call per row.
# Missing geohashes stay NaN instead of raising.
df["geohash8"] = df["geohash"].str[:4]

In [7]:
# Count the measurements in each geohash8 cell and show the ten most populated cells.
points_in_smallest_geohashes = df["geohash8"].value_counts()
points_in_smallest_geohashes.head(10)

xn76    351804
xney    309497
xnsm    304731
c20g    219053
xneu    193008
xn7t    188532
9q5f    184063
drtd    148120
xn77    146484
xnsn    130833
Name: geohash8, dtype: int64

In [8]:
# Average number of measurements per geohash8 cell.
points_in_smallest_geohashes.mean()

2218.99658002736

In [9]:
def get_timestamp(row):
    """Convert a row's ``capture_date`` string to a POSIX timestamp in seconds.

    Two layouts are accepted: ``YYYY-MM-DD HH:MM:SS.ffffff`` and
    ``YYYY-MM-DD HH:MM:SS``. The parsed datetime is naive, so
    ``datetime.timestamp()`` interprets it in the local time zone of the
    machine running the notebook.

    Parameters
    ----------
    row : object
        Anything exposing a ``capture_date`` string attribute.

    Returns
    -------
    float
        Seconds since the epoch.

    Raises
    ------
    ValueError
        If ``capture_date`` matches neither layout.
    """
    try:
        parsed = datetime.strptime(row.capture_date, '%Y-%m-%d %H:%M:%S.%f')
    except ValueError:
        # Only a format mismatch triggers the fallback; anything else
        # (interrupts, wrong types) propagates instead of being masked.
        parsed = datetime.strptime(row.capture_date, '%Y-%m-%d %H:%M:%S')
    return parsed.timestamp()

In [10]:
%%time
# Row-wise apply calls get_timestamp once per row; the recorded run below took about 11 minutes of wall time.
df["timestamp"] = df.apply(get_timestamp, axis=1)

CPU times: user 10min 6s, sys: 12.4 s, total: 10min 18s
Wall time: 11min 10s


In [11]:
# Re-check the shape after adding the geohash8 and timestamp columns.
df.shape

(9732519, 8)

In [12]:
# Preview the frame with the new geohash8 and timestamp columns.
df.head()

Unnamed: 0,capture_date,latitude,longitude,value,elevation,geohash,geohash8,timestamp
0,2018-10-21 01:00:26.000000,36.04108,140.226816,23.0,31,xn7tncn498k2,xn7t,1540076000.0
1,2018-10-21 01:00:22.000000,37.796306,140.514413,19.0,72,xnezj2x7rnh7,xnez,1540076000.0
2,2018-10-21 01:00:16.000000,37.72333,140.476797,15.0,141,xneysek46d7w,xney,1540076000.0
3,2018-10-21 00:59:16.000000,52.4449,13.315,16.0,47,u336qgr9tvzw,u336,1540076000.0
4,2018-10-21 01:00:16.000000,37.7875,140.5524,18.0,107,xneyyr50hk99,xney,1540076000.0


In [13]:
def prepare_regresion_model(data, train_fraction=0.8):
    """Fit a linear model of ``value`` against ``timestamp`` for one group of rows.

    The rows are split in their existing order (no shuffling or sorting): the
    leading ``train_fraction`` of them train the model and the remaining rows
    are used to compute its mean squared error.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of a single group; must contain ``timestamp`` and ``value`` columns.
    train_fraction : float, default 0.8
        Share of the rows (taken from the start) used for fitting.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns ``count`` (number of input rows),
        ``regresion`` (the fitted ``LinearRegression``, or ``None``) and
        ``reg_error`` (mean squared error on the held-out rows, or ``None``).
        The last two are ``None`` when the group is too small to be split into
        a non-empty training part and a non-empty held-out part.
    """
    result_columns = ['count', 'regresion', 'reg_error']
    size = data.shape[0]
    train_size = int(train_fraction * size)

    def _one_row(model, error):
        # Build the one-row result in a single step so column order is explicit.
        return pd.DataFrame(
            {'count': [size], 'regresion': [model], 'reg_error': [error]},
            columns=result_columns,
        )

    # Fitting needs at least one training row and scoring at least one held-out row.
    if size <= 1 or train_size < 1 or train_size >= size:
        return _one_row(None, None)

    # scikit-learn expects 2-D inputs, hence the single-column reshape.
    timestamps = np.asarray(data['timestamp']).reshape(-1, 1)
    values = np.asarray(data['value']).reshape(-1, 1)

    X_train, X_test = timestamps[:train_size], timestamps[train_size:]
    Y_train, Y_test = values[:train_size], values[train_size:]

    regr = linear_model.LinearRegression()
    regr.fit(X_train, Y_train)

    Y_pred = regr.predict(X_test)
    return _one_row(regr, mean_squared_error(Y_test, Y_pred))

In [26]:
# TODO: reject geohashes that contain only a single value.
# Work on the first million rows of the full frame.
tmp = df.iloc[:1000000]
tmp.head()

Unnamed: 0,capture_date,latitude,longitude,value,elevation,geohash,geohash8,timestamp
0,2018-10-21 01:00:26.000000,36.04108,140.226816,23.0,31,xn7tncn498k2,xn7t,1540076000.0
1,2018-10-21 01:00:22.000000,37.796306,140.514413,19.0,72,xnezj2x7rnh7,xnez,1540076000.0
2,2018-10-21 01:00:16.000000,37.72333,140.476797,15.0,141,xneysek46d7w,xney,1540076000.0
3,2018-10-21 00:59:16.000000,52.4449,13.315,16.0,47,u336qgr9tvzw,u336,1540076000.0
4,2018-10-21 01:00:16.000000,37.7875,140.5524,18.0,107,xneyyr50hk99,xney,1540076000.0


In [27]:
%%time
# Call prepare_regresion_model once per geohash8 group; each call returns a one-row summary frame.
res = tmp.groupby('geohash8').apply(prepare_regresion_model)

CPU times: user 5.84 s, sys: 288 ms, total: 6.13 s
Wall time: 4.07 s


In [28]:
%%time
# NOTE(review): this repeats the computation of the previous cell and binds it to a
# throwaway name that no later visible cell reads; looks like a leftover timing
# experiment and a candidate for deletion. Confirm before removing.
sdfsdafsfs = tmp.groupby('geohash8').apply(prepare_regresion_model)

CPU times: user 4.78 s, sys: 46.9 ms, total: 4.83 s
Wall time: 2.77 s


In [29]:
# Turn the group key back into a regular column and keep only the columns used below.
res = res.reset_index()[['geohash8', 'count', 'regresion', 'reg_error']]

In [30]:
# Keep only the geohashes for which a model was actually fitted.
res_2 = res.loc[res["regresion"].notna()]
res_2.head()

Unnamed: 0,geohash8,count,regresion,reg_error
0,8e8y,4812,"LinearRegression(copy_X=True, fit_intercept=Tr...",5.644255
1,9exz,21770,"LinearRegression(copy_X=True, fit_intercept=Tr...",43013960.0
2,9mum,1292,"LinearRegression(copy_X=True, fit_intercept=Tr...",50.38864
3,9muq,3305,"LinearRegression(copy_X=True, fit_intercept=Tr...",11.6489
4,9ppb,2,"LinearRegression(copy_X=True, fit_intercept=Tr...",0.0


In [31]:
# Keep only what the map needs: the cell id and its regression error.
# (The bare `res_2.shape` that used to precede this line was never displayed,
# because only the last expression of a cell is shown.)
res_3 = res_2[['geohash8', 'reg_error']]

In [32]:
# Number of geohash cells that will be drawn on the map.
res_3.shape

(918, 2)

In [33]:
def get_coordinates(geo):
    """Return the bounding box of a geohash as a closed GeoJSON linear ring.

    Each position is an ``[x, y]`` pair built from the ``w``/``e`` and
    ``n``/``s`` entries of ``geohash.bbox``. The ring starts at the north-west
    corner, runs counterclockwise and repeats its first position at the end,
    as GeoJSON requires for the exterior ring of a polygon.

    Parameters
    ----------
    geo : str
        Geohash string.

    Returns
    -------
    list of [float, float]
        Five positions: NW, SW, SE, NE and NW again.
    """
    bbox = geohash.bbox(geo)
    west, south, east, north = bbox["w"], bbox["s"], bbox["e"], bbox["n"]
    return [[west, north], [west, south], [east, south], [east, north], [west, north]]

def get_geo_data(ghash_df):
    """Build a GeoJSON FeatureCollection with one polygon feature per geohash.

    Parameters
    ----------
    ghash_df : pandas.DataFrame
        Frame whose first column holds the geohash strings. Any further
        columns are ignored, so the frame no longer has to have exactly two.

    Returns
    -------
    dict
        A ``FeatureCollection`` whose features use the geohash as ``id`` and
        the geohash bounding box as polygon geometry.
    """
    features = [
        {"type": "Feature", "id": geo,
         "geometry": {"type": "Polygon", "coordinates": [get_coordinates(geo)]}}
        for geo in ghash_df.iloc[:, 0]
    ]

    return {"type": "FeatureCollection", "features": features}

In [34]:
def visualize_with_folium(df, location=(37.760806, 140.474722), zoom_start=10):
    """Draw the per-geohash regression error as a choropleth on a folium map.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``geohash8`` (cell id) and ``reg_error``
        (value used for colouring).
    location : sequence of two floats, default (37.760806, 140.474722)
        Initial map centre passed to ``folium.Map``.
    zoom_start : int, default 10
        Initial zoom level passed to ``folium.Map``.

    Returns
    -------
    folium.Map
        The map with the choropleth layer and a layer control added.
    """
    m = folium.Map(location=list(location), zoom_start=zoom_start)

    # Select the two needed columns explicitly so extra columns or a different
    # column order in `df` cannot affect the generated polygons.
    layer_data = df[['geohash8', 'reg_error']]

    choropleth_options = dict(
        geo_data=get_geo_data(layer_data),
        name='choropleth',
        data=layer_data,
        columns=['geohash8', 'reg_error'],
        key_on='feature.id',
        fill_color='YlOrRd',
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name='regression mean square error',
    )

    # Newer folium releases provide the Choropleth class in place of the
    # Map.choropleth method; fall back to the method on old installations.
    if hasattr(folium, 'Choropleth'):
        folium.Choropleth(**choropleth_options).add_to(m)
    else:
        m.choropleth(**choropleth_options)

    folium.LayerControl().add_to(m)
    return m

In [35]:
# Build the error map and display it as the cell output.
m = visualize_with_folium(res_3)
m

In [23]:
# Write the map to an HTML file in the working directory.
m.save("1million_geohash_error.html")