In [1]:
import pandas as pd 
import folium
import os
import geohash
from folium.plugins import HeatMap
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error, mean_squared_log_error
from datetime import datetime
import numpy as np

In [2]:
# Directory the input CSV is read from and the generated HTML maps are saved to.
RESULTS_DIR = "results"

In [3]:
# Load the measurements table; later cells use its value, geohash5 and timestamp columns.
df = pd.read_csv(os.path.join(RESULTS_DIR, "10_million_with_elevation_geohashes_timestamps.csv"))

In [4]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0.1,Unnamed: 0,value,geohash,geohash5,timestamp
0,0,23.0,xn7tncn498k2,xn7tn,1540076000.0
1,1,19.0,xnezj2x7rnh7,xnezj,1540076000.0
2,2,15.0,xneysek46d7w,xneys,1540076000.0
3,3,16.0,u336qgr9tvzw,u336q,1540076000.0
4,4,18.0,xneyyr50hk99,xneyy,1540076000.0


In [5]:
# Keep only rows whose value is non-negative.
# NOTE: this rebinds df, so the unfiltered table is no longer reachable from later cells.
df = df[df.value >= 0]
df.head()

Unnamed: 0.1,Unnamed: 0,value,geohash,geohash5,timestamp
0,0,23.0,xn7tncn498k2,xn7tn,1540076000.0
1,1,19.0,xnezj2x7rnh7,xnezj,1540076000.0
2,2,15.0,xneysek46d7w,xneys,1540076000.0
3,3,16.0,u336qgr9tvzw,u336q,1540076000.0
4,4,18.0,xneyyr50hk99,xneyy,1540076000.0


In [6]:
# Number of rows per geohash5 cell, followed by the mean and median of those counts.
points_geohashes_prec_5 = df.geohash5.value_counts()
points_geohashes_prec_5.mean(), points_geohashes_prec_5.median()

(310.47663253261874, 34.0)

In [7]:
# Select the cells that have between 50 and 1000 rows (both bounds inclusive).
# A single boolean mask replaces the two where(...).dropna() round trips, which
# masked with NaN and dropped it again; the resulting list is the same.
geohashes_for_regression = list(
    points_geohashes_prec_5[points_geohashes_prec_5.between(50, 1000)].index
)
len(geohashes_for_regression)

10834

In [8]:
# Restrict the table to the selected cells, keep the three columns used by the
# regression step, and group the rows by cell.
groupped_df = df[df.geohash5.isin(geohashes_for_regression)][["value", "geohash5", "timestamp"]].groupby("geohash5")
len(groupped_df)

10834

In [9]:
def prepare_linear_regresion_model(data):
    """Fit ``value ~ timestamp`` for one group and summarise it as a one-row frame.

    The rows are ordered by ``timestamp``; the first 80% of them (rounded down)
    are used for fitting and the remaining rows for scoring.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of a single group; must provide ``timestamp`` and ``value`` columns.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``count`` (number of input rows),
        ``regresion`` (the fitted ``LinearRegression``) and four ``reg_*``
        columns holding the scores computed on the held-out rows. When the
        group has at most one row nothing is fitted and every column except
        ``count`` is ``None``.
    """
    chronological = data.sort_values(by=['timestamp'])
    n_rows = chronological.shape[0]

    # Start from a single row without columns; every assignment below adds a
    # column by broadcasting a scalar into that row.
    summary = pd.DataFrame(data=[[]])
    summary['count'] = n_rows

    if n_rows <= 1:
        # Too few rows to split into a fitting part and a scoring part.
        for column in ('regresion',
                       'reg_mean_squared_error',
                       'reg_r2_score_error',
                       'reg_median_absolute_error',
                       'reg_mean_squared_log_error'):
            summary[column] = None
        return summary

    split_at = int(0.8 * n_rows)

    def as_column(series):
        # Turn a 1-D series into an (n, 1) column array.
        return np.array(series).reshape(-1, 1)

    X_fit = as_column(chronological.timestamp[:split_at])
    X_holdout = as_column(chronological.timestamp[split_at:])
    Y_fit = as_column(chronological.value[:split_at])
    Y_holdout = as_column(chronological.value[split_at:])

    model = linear_model.LinearRegression()
    model.fit(X_fit, Y_fit)
    summary['regresion'] = model

    predicted = model.predict(X_holdout)
    scorers = (
        ('reg_mean_squared_error', mean_squared_error),
        ('reg_r2_score_error', r2_score),
        ('reg_median_absolute_error', median_absolute_error),
        ('reg_mean_squared_log_error', mean_squared_log_error),
    )
    for column, scorer in scorers:
        # The wrapper substitutes a sentinel when a scorer raises ValueError.
        summary[column] = count_function_of_none(scorer, Y_holdout, predicted)

    return summary

def count_function_of_none(fun, Y_test, Y_pred):
    """Evaluate ``fun(Y_test, Y_pred)``, substituting a sentinel on ``ValueError``.

    Returns whatever ``fun`` returns. If ``fun`` raises ``ValueError`` the
    largest finite float64 is returned instead; any other exception propagates.
    """
    sentinel = np.finfo(np.float64).max
    try:
        outcome = fun(Y_test, Y_pred)
    except ValueError:
        return sentinel
    return outcome

In [10]:
# Fit one model per geohash5 group; reset_index() turns the group key (and the
# inner row index, shown as level_1) back into ordinary columns.
df_after_training = groupped_df.apply(prepare_linear_regresion_model).reset_index()

  return mean_squared_error(np.log(y_true + 1), np.log(y_pred + 1),


In [11]:
# Preview the per-cell models and their scores.
df_after_training.head()

Unnamed: 0,geohash5,level_1,count,regresion,reg_mean_squared_error,reg_r2_score_error,reg_median_absolute_error,reg_mean_squared_log_error
0,6mc5m,0,167,"LinearRegression(copy_X=True, fit_intercept=Tr...",15.734465,0.093991,3.940649,0.016695
1,6mc5n,0,162,"LinearRegression(copy_X=True, fit_intercept=Tr...",184.964721,-7.283024,13.549167,0.176358
2,6mc5p,0,70,"LinearRegression(copy_X=True, fit_intercept=Tr...",133.037319,-9.413464,12.284661,0.050694
3,6mc5t,0,96,"LinearRegression(copy_X=True, fit_intercept=Tr...",18.379484,-0.759223,3.521412,0.027526
4,6msem,0,119,"LinearRegression(copy_X=True, fit_intercept=Tr...",25.458074,-0.055333,3.927389,0.009508


In [12]:
# One subset per metric, each limited to the value range that is drawn on the
# corresponding map.
df_after_training_mean_squared = df_after_training[df_after_training.reg_mean_squared_error <= 100]
# A single inclusive range mask replaces the chained df[mask1][mask2]: the
# second mask was built from the unfiltered frame, so pandas had to reindex it
# (with a warning). The selected rows are the same.
df_after_training_reg_r2_score = df_after_training[df_after_training.reg_r2_score_error.between(-5, 1000)]
df_after_training_median_absolute = df_after_training[df_after_training.reg_median_absolute_error <= 10]
df_after_training_mean_log = df_after_training[df_after_training.reg_mean_squared_log_error <= 10]

In [13]:
def get_coordinates(geo):
    """Return the bounding box of a geohash as a closed GeoJSON linear ring.

    Parameters
    ----------
    geo : str
        Geohash passed to ``geohash.bbox``.

    Returns
    -------
    list of [x, y] pairs
        The corners in the order (w, n), (e, n), (e, s), (w, s), followed by
        the first corner again. GeoJSON requires a polygon ring to end on the
        position it starts with.
    """
    bbox = geohash.bbox(geo)
    west, east = bbox["w"], bbox["e"]
    south, north = bbox["s"], bbox["n"]
    return [
        [west, north],
        [east, north],
        [east, south],
        [west, south],
        [west, north],  # repeat the first corner to close the ring
    ]

def get_geo_data(ghash_df):
    """Build a GeoJSON ``FeatureCollection`` with one polygon per row.

    The first value of every row of ``ghash_df`` is taken as the geohash; it
    becomes the feature ``id`` and is passed to ``get_coordinates`` to obtain
    the polygon ring. All remaining values of the row are ignored.
    """
    def as_feature(cell):
        outline = {"type": "Polygon", "coordinates": [get_coordinates(cell)]}
        return {"type": "Feature", "id": cell, "geometry": outline}

    return {
        "type": "FeatureCollection",
        "features": [as_feature(cell) for cell, *_ in ghash_df.values],
    }

def visualize_with_folium(df, error):
    """Draw the cells of ``df`` as a choropleth coloured by the ``error`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``geohash5`` column and the column named by ``error``.
        Its rows are also handed to ``get_geo_data`` to build the polygons.
    error : str
        Name of the column whose values drive the fill colour.

    Returns
    -------
    folium.Map
        Map with the choropleth layer and a layer control added.
    """
    m = folium.Map(location=[37.760806, 140.474722], zoom_start=10)

    layer_options = dict(
        geo_data=get_geo_data(df),
        name='choropleth',
        data=df,
        columns=['geohash5', error],
        key_on='feature.id',
        fill_color='YlOrRd',
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name='error',
    )

    # Map.choropleth is deprecated in favour of the Choropleth class. Use the
    # class when this folium version provides it and keep the method as a
    # fallback for older installations.
    if hasattr(folium, 'Choropleth'):
        folium.Choropleth(**layer_options).add_to(m)
    else:
        m.choropleth(**layer_options)

    folium.LayerControl().add_to(m)
    return m

In [14]:
# One map per metric, each drawn from the subset filtered on that same metric.
m_mean_squared = visualize_with_folium(df_after_training_mean_squared, 'reg_mean_squared_error')
m_reg_r2_score = visualize_with_folium(df_after_training_reg_r2_score, 'reg_r2_score_error')
m_median_absolute = visualize_with_folium(df_after_training_median_absolute, 'reg_median_absolute_error')
# Fixed: this map previously coloured the cells by reg_median_absolute_error,
# although its frame is filtered on (and its file is named after) the mean
# squared log error.
m_mean_log = visualize_with_folium(df_after_training_mean_log, 'reg_mean_squared_log_error')

In [15]:
# Write every map as an HTML file into the results directory.
for folium_map, file_name in (
    (m_mean_squared, "linear_regression_mean_squared_errors.html"),
    (m_reg_r2_score, "linear_regression_reg_r2_score_errors.html"),
    (m_median_absolute, "linear_regression_median_absolute_errors.html"),
    (m_mean_log, "linear_regression_mean_log_errors.html"),
):
    folium_map.save(os.path.join(RESULTS_DIR, file_name))

In [16]:
# Show the per-cell results again before looking for the best-scoring cells.
df_after_training.head()

Unnamed: 0,geohash5,level_1,count,regresion,reg_mean_squared_error,reg_r2_score_error,reg_median_absolute_error,reg_mean_squared_log_error
0,6mc5m,0,167,"LinearRegression(copy_X=True, fit_intercept=Tr...",15.734465,0.093991,3.940649,0.016695
1,6mc5n,0,162,"LinearRegression(copy_X=True, fit_intercept=Tr...",184.964721,-7.283024,13.549167,0.176358
2,6mc5p,0,70,"LinearRegression(copy_X=True, fit_intercept=Tr...",133.037319,-9.413464,12.284661,0.050694
3,6mc5t,0,96,"LinearRegression(copy_X=True, fit_intercept=Tr...",18.379484,-0.759223,3.521412,0.027526
4,6msem,0,119,"LinearRegression(copy_X=True, fit_intercept=Tr...",25.458074,-0.055333,3.927389,0.009508


In [17]:
# Smallest mean squared error over all fitted cells.
df_after_training.reg_mean_squared_error.min()

1.1300318991792446

In [18]:
# List the cells whose mean squared error is at most 2.
df_after_training[df_after_training.reg_mean_squared_error <= 2]

Unnamed: 0,geohash5,level_1,count,regresion,reg_mean_squared_error,reg_r2_score_error,reg_median_absolute_error,reg_mean_squared_log_error
167,9qcqx,0,60,"LinearRegression(copy_X=True, fit_intercept=Tr...",1.287155,0.479353,0.838472,0.002626
647,c21gt,0,52,"LinearRegression(copy_X=True, fit_intercept=Tr...",1.510112,-0.003976,1.287055,0.00233
1007,dnqv4,0,84,"LinearRegression(copy_X=True, fit_intercept=Tr...",1.778616,-0.184378,1.232094,0.001182
1042,dp6fc,0,63,"LinearRegression(copy_X=True, fit_intercept=Tr...",1.709276,-0.21373,0.779557,0.001969
1101,dphum,0,78,"LinearRegression(copy_X=True, fit_intercept=Tr...",1.130032,0.178159,0.470386,0.001197
1312,drt32,0,70,"LinearRegression(copy_X=True, fit_intercept=Tr...",1.438759,0.322123,1.028736,0.001127
1463,gcjxr,0,58,"LinearRegression(copy_X=True, fit_intercept=Tr...",1.921474,-1.305769,1.396881,0.003407
1476,gcmuf,0,59,"LinearRegression(copy_X=True, fit_intercept=Tr...",1.375759,-0.015945,0.764091,0.001495
2325,spsty,0,66,"LinearRegression(copy_X=True, fit_intercept=Tr...",1.632031,-0.021975,0.771216,0.00156
2610,srt3d,0,72,"LinearRegression(copy_X=True, fit_intercept=Tr...",1.305611,-0.156545,0.768893,0.002423


In [19]:
# Pick the result row of one cell ('drt32') for a closer look.
single_regression = df_after_training[df_after_training.geohash5 == 'drt32']
single_regression

Unnamed: 0,geohash5,level_1,count,regresion,reg_mean_squared_error,reg_r2_score_error,reg_median_absolute_error,reg_mean_squared_log_error
1312,drt32,0,70,"LinearRegression(copy_X=True, fit_intercept=Tr...",1.438759,0.322123,1.028736,0.001127


In [20]:
# Extract the fitted model object stored in the 'regresion' column of that row.
# NOTE: single_regression is rebound here from a one-row frame to the model itself.
single_regression = df_after_training[df_after_training.geohash5 == 'drt32'].iloc[0]['regresion']
type(single_regression)

sklearn.linear_model.base.LinearRegression

In [21]:
def get_timestamp(date, tz=None):
    """Convert a ``'%Y-%m-%d %H:%M:%S'`` string to a POSIX timestamp.

    Parameters
    ----------
    date : str
        Date and time in the format ``'%Y-%m-%d %H:%M:%S'``.
    tz : datetime.tzinfo, optional
        Time zone the string is expressed in. When omitted the string is
        interpreted in the local time zone of the machine running the code
        (the original behaviour), so the result differs between machines;
        pass e.g. ``datetime.timezone.utc`` for a machine-independent result.

    Returns
    -------
    float
        Seconds since the epoch.

    Raises
    ------
    ValueError
        If ``date`` does not match the expected format.
    """
    parsed = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    if tz is not None:
        # Attach the zone without shifting the wall-clock fields.
        parsed = parsed.replace(tzinfo=tz)
    return parsed.timestamp()

In [22]:
# Example conversion; the naive string is interpreted in the local time zone.
get_timestamp('2019-01-15 00:00:00')

1547506800.0

In [23]:
# Ask the 'drt32' model for a value on 2019-01-15.
# NOTE(review): this timestamp is later than the timestamps visible in the data
# preview, so this looks like an extrapolation beyond the fitted range; confirm
# before relying on the (strongly negative) result.
single_regression.predict([[get_timestamp('2019-01-15 00:00:00')]])

array([[-1406944.1275862]])