In [30]:
import pandas as pd 
import folium
import os
import geohash
from folium.plugins import HeatMap
from sklearn import datasets, linear_model
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
import numpy as np

In [31]:
# Directory (relative to the working directory) that holds the input CSV
# read below and receives the HTML map written at the end of the notebook.
RESULTS_DIR = "results"

In [32]:
# Read the measurements CSV stored under RESULTS_DIR into a DataFrame.
input_csv_path = os.path.join(RESULTS_DIR, "10_million_with_elevation_geohashes_timestamps.csv")
df = pd.read_csv(input_csv_path)

In [33]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,value,geohash,geohash5,timestamp
0,0,23.0,xn7tncn498k2,xn7tn,1540076000.0
1,1,19.0,xnezj2x7rnh7,xnezj,1540076000.0
2,2,15.0,xneysek46d7w,xneys,1540076000.0
3,3,16.0,u336qgr9tvzw,u336q,1540076000.0
4,4,18.0,xneyyr50hk99,xneyy,1540076000.0


In [34]:
# Number of rows that fall into each `geohash5` cell.
points_geohashes_prec_5 = df["geohash5"].value_counts()
# Cell output: (mean, median) number of rows per cell.
(points_geohashes_prec_5.mean(), points_geohashes_prec_5.median())

(310.4768877404536, 34.0)

In [35]:
# Keep only the cells that contain at least 50 rows, so each regression
# has a reasonable amount of data to train and evaluate on.
has_enough_points = points_geohashes_prec_5 >= 50
geohashes_for_regression = list(points_geohashes_prec_5[has_enough_points].index)
len(geohashes_for_regression)

11920

In [36]:
# Restrict the data to the selected cells and to the columns the regression
# needs, then group the rows per cell.
in_selected_cells = df["geohash5"].isin(geohashes_for_regression)
regression_columns = ["value", "geohash5", "timestamp"]
groupped_df = df.loc[in_selected_cells, regression_columns].groupby("geohash5")
len(groupped_df)

11920

In [37]:
def prepare_ridge_regresion_model(data, alpha=0.7, train_fraction=0.8):
    """Fit a Ridge regression of ``value`` on ``timestamp`` for one group of rows.

    The rows are ordered by ``timestamp``; the earliest ``train_fraction`` of
    them are used for fitting and the remaining (latest) rows for evaluation.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of a single group; must contain ``timestamp`` and ``value`` columns.
    alpha : float, default 0.7
        Regularisation strength passed to ``Ridge``.
    train_fraction : float, default 0.8
        Share of the time-ordered rows used for training. The resulting split
        is clamped so that both the train and the test part keep at least one row.

    Returns
    -------
    pandas.DataFrame
        A one-row frame with the columns ``count`` (number of rows in the
        group), ``regresion`` (the fitted ``Ridge`` model) and ``reg_error``
        (mean squared error on the held-out rows). For groups with at most
        one row no model can be evaluated and both ``regresion`` and
        ``reg_error`` are ``None``.
    """
    data = data.sort_values(by=['timestamp'])
    size = data.shape[0]

    # One row and no columns: the scalar assignments below each fill that single row.
    new_data = pd.DataFrame(data=[[]])
    new_data['count'] = size

    if size <= 1:
        new_data['regresion'] = None
        new_data['reg_error'] = None
        return new_data

    # Clamp so that neither part of the split is empty, whatever fraction is requested.
    train_size = min(max(int(train_fraction * size), 1), size - 1)

    # scikit-learn expects 2-D arrays of shape (n_samples, n_features).
    timestamps = np.asarray(data['timestamp']).reshape(-1, 1)
    values = np.asarray(data['value']).reshape(-1, 1)

    X_train, X_test = timestamps[:train_size], timestamps[train_size:]
    Y_train, Y_test = values[:train_size], values[train_size:]

    regr = Ridge(alpha=alpha)
    regr.fit(X_train, Y_train)
    new_data['regresion'] = regr

    Y_pred = regr.predict(X_test)
    new_data['reg_error'] = mean_squared_error(Y_test, Y_pred)
    return new_data

In [None]:
# Fit one model per geohash cell, then turn the group index back into
# regular columns so `geohash5` can be referenced by name later on.
per_geohash_results = groupped_df.apply(prepare_ridge_regresion_model)
df_after_training = per_geohash_results.reset_index()

In [None]:
# Preview the per-cell results (row count, fitted model, held-out error).
df_after_training.head()

In [None]:
# Keep only the cells whose held-out mean squared error is at most 100.
error_is_acceptable = df_after_training["reg_error"] <= 100
df_after_training = df_after_training[error_is_acceptable]

In [None]:
def get_coordinates(geo):
    """Return the bounding box of a geohash as a closed polygon ring.

    Parameters
    ----------
    geo : str
        Geohash string identifying the cell.

    Returns
    -------
    list of [float, float]
        Five ``[west/east, north/south]`` positions taken from
        ``geohash.bbox``: the four corners starting at the north-west one,
        followed by the first corner again. GeoJSON requires a polygon's
        linear ring to end on the position it started from.
    """
    bbox = geohash.bbox(geo)
    west, east = bbox["w"], bbox["e"]
    north, south = bbox["n"], bbox["s"]
    # Repeat the first corner at the end so the ring is explicitly closed.
    return [[west, north], [east, north], [east, south], [west, south], [west, north]]

def get_geo_data(ghash_df):
    """Build a GeoJSON-style FeatureCollection with one polygon per row.

    Parameters
    ----------
    ghash_df : pandas.DataFrame
        Frame whose FIRST column holds the geohash of each row; the remaining
        columns are ignored.

    Returns
    -------
    dict
        ``{"type": "FeatureCollection", "features": [...]}`` where every
        feature uses the geohash as its ``id`` and the ring returned by
        ``get_coordinates`` as its polygon geometry.
    """
    def as_feature(cell_id):
        # A polygon's coordinates are a list of rings; only one ring is used here.
        ring = get_coordinates(cell_id)
        return {"type": "Feature", "id": cell_id,
                "geometry": {"type": "Polygon", "coordinates": [ring]}}

    # Only the first value of each row (the geohash) is needed.
    polygons = [as_feature(cell_id) for cell_id, *_ in ghash_df.values]
    return {"type": "FeatureCollection", "features": polygons}

def visualize_with_folium(df):
    """Render the per-cell regression error as a choropleth on a folium map.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``geohash5`` column (which must also be its first column,
        because ``get_geo_data`` reads the geohash positionally) and a
        ``reg_error`` column used to colour each cell.

    Returns
    -------
    folium.Map
        Map with the choropleth layer and a layer control added.
    """
    # Hard-coded map centre and zoom level of the initial view.
    m = folium.Map(location=[37.760806, 140.474722], zoom_start=10)

    choropleth_options = dict(
        geo_data=get_geo_data(df),
        name='choropleth',
        data=df,
        columns=['geohash5', 'reg_error'],
        key_on='feature.id',
        fill_color='YlOrRd',
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name='regression mean square error'
    )

    # `Map.choropleth` is deprecated and no longer available in newer folium
    # releases; prefer the `folium.Choropleth` class and fall back to the
    # legacy method only on old installations that lack it.
    if hasattr(folium, "Choropleth"):
        folium.Choropleth(**choropleth_options).add_to(m)
    else:
        m.choropleth(**choropleth_options)

    folium.LayerControl().add_to(m)
    return m

In [None]:
# Build the error map for the cells that passed the error filter.
m = visualize_with_folium(df_after_training)

In [None]:
# Write the map as an HTML file into RESULTS_DIR.
output_html_path = os.path.join(RESULTS_DIR, "ridge_regression_errors.html")
m.save(output_html_path)