In [1]:
import pandas as pd 
import folium
import os
import geohash
from folium.plugins import HeatMap
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
import numpy as np

In [2]:
# Load the measurements export into a DataFrame.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which looks like a
# saved index that is being read back as data — confirm whether `index_col=0` is wanted.
df = pd.read_csv("10_million_with_elevation_and_geohashes.csv")  

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,capture_date,latitude,longitude,value,elevation,geohash
0,2018-10-21 01:00:26.000000,36.04108,140.226816,23.0,31,xn7tncn498k2
1,2018-10-21 01:00:22.000000,37.796306,140.514413,19.0,72,xnezj2x7rnh7
2,2018-10-21 01:00:16.000000,37.72333,140.476797,15.0,141,xneysek46d7w
3,2018-10-21 00:59:16.000000,52.4449,13.315,16.0,47,u336qgr9tvzw
4,2018-10-21 01:00:16.000000,37.7875,140.5524,18.0,107,xneyyr50hk99


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(9732519, 6)

In [6]:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-geohashgrid-aggregation.html
# geohash precision 8: cells of roughly 38.2 m x 19 m

In [7]:
def get_geohash8(row):
    """Return the first eight characters of the row's ``geohash`` value."""
    full_hash = row.geohash
    prefix = full_hash[:8]
    return prefix

# Truncate every geohash to its first 8 characters with the vectorized string accessor
# instead of calling a Python function once per row.
df["geohash8"] = df["geohash"].str[:8]

In [8]:
# Count how many rows fall into each 8-character geohash (most populated first)
# and show the ten largest counts.
points_in_smallest_geohashes = df["geohash8"].value_counts()
points_in_smallest_geohashes.head(10)

xn7tncn4    186517
c20g55gv    131654
dr085nmg    104446
9q9jhbm0     94183
u336qgr9     89549
xneutf7s     85402
c22yzgx1     76663
xnsm8qju     76071
drt3jg3b     76057
9qbdmpnm     75900
Name: geohash8, dtype: int64

In [9]:
# Average number of rows per 8-character geohash.
points_in_smallest_geohashes.mean()

4.895803224558599

In [10]:
def get_timestamp(row):
    """Convert the row's ``capture_date`` string to a POSIX timestamp.

    Two layouts are accepted: ``YYYY-MM-DD HH:MM:SS.ffffff`` and the same
    without the fractional-seconds part. The parsed datetime carries no
    timezone information, so ``datetime.timestamp()`` interprets it as
    local time of the machine running the notebook.

    Parameters
    ----------
    row : object
        Anything exposing a ``capture_date`` string attribute (e.g. a
        DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    float
        Seconds since the epoch.

    Raises
    ------
    ValueError
        If ``capture_date`` matches neither layout.
    """
    raw = row.capture_date
    try:
        return datetime.strptime(raw, '%Y-%m-%d %H:%M:%S.%f').timestamp()
    except ValueError:
        # Only a format mismatch triggers the fallback; anything else
        # (interrupts, wrong types, ...) propagates instead of being swallowed.
        return datetime.strptime(raw, '%Y-%m-%d %H:%M:%S').timestamp()

In [11]:
%%time
# Row-wise apply: get_timestamp is called once per row, which is slow on the full frame
# (see the recorded timing for this cell).
df["timestamp"] = df.apply(get_timestamp, axis=1)

CPU times: user 8min, sys: 7.43 s, total: 8min 7s
Wall time: 8min 8s


In [12]:
# Re-check the shape after the `geohash8` and `timestamp` columns were added.
df.shape

(9732519, 8)

In [13]:
# Preview the frame including the derived `geohash8` and `timestamp` columns.
df.head()

Unnamed: 0,capture_date,latitude,longitude,value,elevation,geohash,geohash8,timestamp
0,2018-10-21 01:00:26.000000,36.04108,140.226816,23.0,31,xn7tncn498k2,xn7tncn4,1540076000.0
1,2018-10-21 01:00:22.000000,37.796306,140.514413,19.0,72,xnezj2x7rnh7,xnezj2x7,1540076000.0
2,2018-10-21 01:00:16.000000,37.72333,140.476797,15.0,141,xneysek46d7w,xneysek4,1540076000.0
3,2018-10-21 00:59:16.000000,52.4449,13.315,16.0,47,u336qgr9tvzw,u336qgr9,1540076000.0
4,2018-10-21 01:00:16.000000,37.7875,140.5524,18.0,107,xneyyr50hk99,xneyyr50,1540076000.0


In [14]:
def prepare_regresion_model(data, train_fraction=0.8):
    """Fit a linear ``value ~ timestamp`` model for one group of rows.

    The rows are split by position: the leading ``train_fraction`` of them is
    used to fit the model and the remaining rows to score it. No sorting or
    shuffling happens here, so the split follows the row order of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of a single group; must provide ``timestamp`` and ``value`` columns.
    train_fraction : float, default 0.8
        Share of rows used for fitting; must be strictly between 0 and 1.
        Extra keyword arguments given to ``groupby(...).apply`` are forwarded
        to this function, so it can be set from the call site.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``count`` (number of input rows),
        ``regresion`` (the fitted ``LinearRegression``) and ``reg_error``
        (mean squared error on the held-out rows). When the group has at most
        one row, ``regresion`` and ``reg_error`` are ``None``.

    Raises
    ------
    ValueError
        If ``train_fraction`` is not strictly between 0 and 1.
    """
    if not 0 < train_fraction < 1:
        raise ValueError("train_fraction must be strictly between 0 and 1")

    new_data = pd.DataFrame(data=[[]])
    size = data.shape[0]
    new_data['count'] = size

    # A single point (or none) cannot be split into a train and a test part.
    if size <= 1:
        new_data['regresion'] = None
        new_data['reg_error'] = None
        return new_data

    # Keep at least one row on each side of the split. For the default 0.8
    # this clamp never changes the result when size >= 2.
    train_size = min(max(int(train_fraction * size), 1), size - 1)

    # scikit-learn expects 2-D inputs, hence the (n, 1) reshape.
    X_train = np.array(data.timestamp.iloc[:train_size]).reshape(-1, 1)
    X_test = np.array(data.timestamp.iloc[train_size:]).reshape(-1, 1)

    Y_train = np.array(data.value.iloc[:train_size]).reshape(-1, 1)
    Y_test = np.array(data.value.iloc[train_size:]).reshape(-1, 1)

    regr = linear_model.LinearRegression()
    regr.fit(X_train, Y_train)
    new_data['regresion'] = regr

    Y_pred = regr.predict(X_test)
    new_data['reg_error'] = mean_squared_error(Y_test, Y_pred)
    return new_data

In [28]:
# TODO: reject geohashes that have only one value
# Work on the first 100000 rows only.
tmp = df.iloc[:100000]
tmp.head()

Unnamed: 0,capture_date,latitude,longitude,value,elevation,geohash,geohash8,timestamp
0,2018-10-21 01:00:26.000000,36.04108,140.226816,23.0,31,xn7tncn498k2,xn7tncn4,1540076000.0
1,2018-10-21 01:00:22.000000,37.796306,140.514413,19.0,72,xnezj2x7rnh7,xnezj2x7,1540076000.0
2,2018-10-21 01:00:16.000000,37.72333,140.476797,15.0,141,xneysek46d7w,xneysek4,1540076000.0
3,2018-10-21 00:59:16.000000,52.4449,13.315,16.0,47,u336qgr9tvzw,u336qgr9,1540076000.0
4,2018-10-21 01:00:16.000000,37.7875,140.5524,18.0,107,xneyyr50hk99,xneyyr50,1540076000.0


In [24]:
%%time
# Fit one model per 8-character geohash: prepare_regresion_model is called once per group.
# NOTE(review): this cell's execution count (24) is lower than that of the cell defining
# `tmp` (28), so the recorded timing/result presumably came from an earlier `tmp` —
# confirm with Restart & Run All.
res = tmp.groupby('geohash8').apply(prepare_regresion_model)

CPU times: user 16min 13s, sys: 4.09 s, total: 16min 17s
Wall time: 15min 42s


In [29]:
%%time
# Same groupby/apply as the cell that builds `res`, re-run on the current `tmp` for timing.
# NOTE(review): the result is bound to a throwaway name that no other visible cell reads —
# consider deleting this cell or giving the variable a meaningful name.
sdfsdafsfs = tmp.groupby('geohash8').apply(prepare_regresion_model)

CPU times: user 12.2 s, sys: 86.2 ms, total: 12.3 s
Wall time: 8.77 s


In [25]:
# Turn the group index back into ordinary columns and keep only the columns of interest.
res = res.reset_index().loc[:, ['geohash8', 'count', 'regresion', 'reg_error']]

In [26]:
# Keep only the geohashes for which a model was actually fitted.
res_2 = res.loc[~res["regresion"].isna()]
res_2.head()

Unnamed: 0,geohash8,count,regresion,reg_error
0,8e8ygwdn,4812,"LinearRegression(copy_X=True, fit_intercept=Tr...",5.644255
1,9exzxf6s,21770,"LinearRegression(copy_X=True, fit_intercept=Tr...",43013960.0
2,9mumuerf,159,"LinearRegression(copy_X=True, fit_intercept=Tr...",27.09438
3,9mumuerg,44,"LinearRegression(copy_X=True, fit_intercept=Tr...",9.262565
5,9mumuffr,10,"LinearRegression(copy_X=True, fit_intercept=Tr...",10.90122


In [27]:
# (rows, columns) of the filtered result, i.e. how many geohashes have a fitted model.
res_2.shape

(25139, 4)