In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [289]:
import pandas as pd
import numpy as np
from numpy import inf, mean, median
import math
import datetime
import json
import os
import glob
import pickle
import requests
# import requests_cache
import plotly.graph_objects as go
import plotly.express as px
import folium
from folium.plugins import MarkerCluster, HeatMap
import geopandas as gpd
import osm2geojson
import geojson
import shapely
from shapely import wkt
from shapely.geometry import Point, Polygon, LineString
from geopy.distance import geodesic, great_circle

In [3]:
import cudf # pandas на GPU
import cupy # numpy на GPU
import dask_cudf # multiple GPUs

import rmm

In [4]:
rmm.reinitialize(managed_memory=True)
assert(rmm.is_initialized())

In [5]:
pd.options.display.max_rows = 100

# Roads

In [221]:
with open('data_osm_roads_gdf_2.pickle', 'rb') as _f:
    data_osm_roads_gdf = pickle.load(_f)

In [222]:
data_osm_roads_gdf.shape

(57131, 9)

In [223]:
data_osm_roads_gdf.head()

Unnamed: 0,geometry,type,id,tags,highway,lanes,oneway,surface,representative_point
0,"LINESTRING (34.82303 57.38299, 34.82630 57.380...",way,4412349,"{'highway': 'trunk', 'int_ref': 'E 105;AH8', '...",trunk,2,yes,asphalt,POINT (34.82630 57.38061)
1,"LINESTRING (34.74326 57.44017, 34.74535 57.438...",way,4412351,"{'highway': 'trunk', 'int_ref': 'E 105;AH8', '...",trunk,1,yes,asphalt,POINT (34.74535 57.43867)
2,"LINESTRING (34.66128 57.49754, 34.66678 57.493...",way,4412353,"{'highway': 'trunk', 'int_ref': 'E 105;AH8', '...",trunk,1,yes,asphalt,POINT (34.67413 57.48855)
3,"LINESTRING (31.35035 56.34085, 31.35280 56.340...",way,4420591,"{'highway': 'trunk', 'int_ref': 'E 22', 'lanes...",trunk,2,,asphalt,POINT (31.47337 56.32730)
4,"LINESTRING (31.58938 56.32759, 31.59669 56.328...",way,4420592,"{'highway': 'trunk', 'int_ref': 'E 22', 'lanes...",trunk,2,,asphalt,POINT (31.63790 56.33436)


In [224]:
data_osm_roads_gdf['road_lon'] = data_osm_roads_gdf['representative_point'].x
data_osm_roads_gdf['road_lat'] = data_osm_roads_gdf['representative_point'].y

In [225]:
data_osm_roads_gdf.head()

Unnamed: 0,geometry,type,id,tags,highway,lanes,oneway,surface,representative_point,road_lon,road_lat
0,"LINESTRING (34.82303 57.38299, 34.82630 57.380...",way,4412349,"{'highway': 'trunk', 'int_ref': 'E 105;AH8', '...",trunk,2,yes,asphalt,POINT (34.82630 57.38061),34.826305,57.380612
1,"LINESTRING (34.74326 57.44017, 34.74535 57.438...",way,4412351,"{'highway': 'trunk', 'int_ref': 'E 105;AH8', '...",trunk,1,yes,asphalt,POINT (34.74535 57.43867),34.745351,57.438669
2,"LINESTRING (34.66128 57.49754, 34.66678 57.493...",way,4412353,"{'highway': 'trunk', 'int_ref': 'E 105;AH8', '...",trunk,1,yes,asphalt,POINT (34.67413 57.48855),34.674135,57.488555
3,"LINESTRING (31.35035 56.34085, 31.35280 56.340...",way,4420591,"{'highway': 'trunk', 'int_ref': 'E 22', 'lanes...",trunk,2,,asphalt,POINT (31.47337 56.32730),31.473373,56.327296
4,"LINESTRING (31.58938 56.32759, 31.59669 56.328...",way,4420592,"{'highway': 'trunk', 'int_ref': 'E 22', 'lanes...",trunk,2,,asphalt,POINT (31.63790 56.33436),31.637903,56.334365


# Points

In [226]:
data_points = pd.DataFrame({'kladr': [
    '0200000101500',
    '2700000100000',
    '5501900001100',
    '2801800001600',
    '2800000400000',
    '3301000006100',
    '3800000300000',
    '5401700005600',
    '5003200002100',
    '5003200003000',
    '5401800000600',
    '5401800001000',
    '3813300000700',
    '3813300001000',
    '5003200003100',
    '5003200005100',
    '5401800001200',
    '5401900000200',
    '5003200006500',
    '5003200007300',
    '3813300001100',
    '3813300001200',
    '5401900001500',
    '5401900002800',
    '3813300001500',
    '3813300002000',
    '5003200009200',
    '5003200010000',
    '5401900002900',
    '5402000000400',
    '3813300003300',
    '3813300003600',
    '5003200010400',
    '5003200010700',
    '5402000000500',
    '5402000000800',
    '3813300003700',
    '3813300003800',
    '3813300004500',
    '4000000107200',
    '5003200011900',
]})

In [227]:
def kladr_to_address_api(kladr_id):   
  
    data = {
        "query": kladr_id
    }

    api_url = 'https://suggestions.dadata.ru/suggestions/api/4_1/rs/findById/fias'
    headers = {
        'content-type': 'application/json',
        'Authorization': 'Token 79abf89d58871ed1df79b83126f8f8c2362e51db'
    }
    response = requests.post(api_url, json=data, headers=headers)
    adress_json = response.json()
    adress_str = adress_json['suggestions'][0]['value']
    return adress_str

In [228]:
def address_to_geo_coord_api(address):
    r = requests.get(f'http://search.maps.sputnik.ru/search/addr?q={address}')
    response = r.json()
    coordinates = response['result']['address'][0]['features'][0]['geometry']['geometries'][0]['coordinates']
    coordinates = coordinates[::-1]
    return coordinates

In [229]:
addresses = []
coordinates = []
for kladr in data_points['kladr']:
    address = kladr_to_address_api(kladr)
    addresses.append(address)
    coordinate = address_to_geo_coord_api(address)
    coordinates.append(coordinate)
    print(address, coordinate)
data_points['address'] = addresses
data_points['coordinates'] = coordinates

Респ Башкортостан, г Уфа, деревня Жилино [54.656384, 56.06246]
Хабаровский край, г Хабаровск [48.481403, 135.07693]
Омская обл, Одесский р-н, село Желанное [54.166344, 72.575775]
Амурская обл, Тамбовский р-н, село Муравьевка [49.837215, 127.73236]
Амурская обл, г Зея [53.740356, 127.27162]
Владимирская обл, Меленковский р-н, деревня Левенда [55.31677, 41.793613]
Иркутская обл, г Иркутск [52.289597, 104.28059]
Новосибирская обл, Кыштовский р-н, деревня Ядкан [56.733883, 76.515045]
Московская обл, г Серпухов, деревня Арнеево [54.915524, 37.41955]
Московская обл, г Серпухов, деревня Борисово [55.35957, 38.047356]
Новосибирская обл, Маслянинский р-н, село Большой Изырак [54.514095, 84.27022]
Новосибирская обл, Маслянинский р-н, село Дубровка [54.46441, 84.721375]
Иркутская обл, Эхирит-Булагатский р-н, деревня Верхняя Идыга [52.92196, 104.562065]
Иркутская обл, Эхирит-Булагатский р-н, село Захал [52.602913, 104.73313]
Московская обл, г Серпухов, деревня Бутурлино [54.92471, 37.484756]
Моско

In [230]:
data_points['lat'], data_points['lon'] = zip(*data_points['coordinates'])

In [231]:
data_points.shape

(41, 5)

In [232]:
data_points.head()

Unnamed: 0,kladr,address,coordinates,lat,lon
0,200000101500,"Респ Башкортостан, г Уфа, деревня Жилино","[54.656384, 56.06246]",54.656384,56.06246
1,2700000100000,"Хабаровский край, г Хабаровск","[48.481403, 135.07693]",48.481403,135.07693
2,5501900001100,"Омская обл, Одесский р-н, село Желанное","[54.166344, 72.575775]",54.166344,72.575775
3,2801800001600,"Амурская обл, Тамбовский р-н, село Муравьевка","[49.837215, 127.73236]",49.837215,127.73236
4,2800000400000,"Амурская обл, г Зея","[53.740356, 127.27162]",53.740356,127.27162


# Points To Roads

In [233]:
data_points_2 = data_points.loc[(pd.notnull(data_points['lat']))&(pd.notnull(data_points['lon'])), ['kladr', 'lat', 'lon']].reset_index()

In [234]:
data_points_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   index   41 non-null     int64  
 1   kladr   41 non-null     object 
 2   lat     41 non-null     float64
 3   lon     41 non-null     float64
dtypes: float64(2), int64(1), object(1)
memory usage: 1.4+ KB


In [235]:
data_osm_roads_gdf_2 = data_osm_roads_gdf.loc[(pd.notnull(data_osm_roads_gdf['road_lat']))&(pd.notnull(data_osm_roads_gdf['road_lon'])), ['road_lon', 'road_lat']]

In [236]:
data_osm_roads_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 57131 entries, 0 to 57130
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   geometry              57131 non-null  geometry
 1   type                  57131 non-null  object  
 2   id                    57131 non-null  int64   
 3   tags                  57131 non-null  object  
 4   highway               57131 non-null  object  
 5   lanes                 42626 non-null  object  
 6   oneway                35457 non-null  object  
 7   surface               54209 non-null  object  
 8   representative_point  57131 non-null  geometry
 9   road_lon              57131 non-null  float64 
 10  road_lat              57131 non-null  float64 
dtypes: float64(2), geometry(2), int64(1), object(6)
memory usage: 4.8+ MB


In [237]:
%%time
data_points2roads = pd.merge(data_points_2[['index', 'lat', 'lon']],
                             data_osm_roads_gdf_2,
                             how='cross',
                            )

CPU times: user 288 ms, sys: 246 ms, total: 534 ms
Wall time: 531 ms


In [238]:
data_points2roads.shape

(2342371, 5)

In [239]:
data_points2roads.head()

Unnamed: 0,index,lat,lon,road_lon,road_lat
0,0,54.656384,56.06246,34.826305,57.380612
1,0,54.656384,56.06246,34.745351,57.438669
2,0,54.656384,56.06246,34.674135,57.488555
3,0,54.656384,56.06246,31.473373,56.327296
4,0,54.656384,56.06246,31.637903,56.334365


In [240]:
data_points2roads.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2342371 entries, 0 to 2342370
Data columns (total 5 columns):
 #   Column    Dtype  
---  ------    -----  
 0   index     int64  
 1   lat       float64
 2   lon       float64
 3   road_lon  float64
 4   road_lat  float64
dtypes: float64(4), int64(1)
memory usage: 107.2 MB


In [241]:
a = data_points2roads.to_numpy()
a

array([[ 0.      , 54.656384, 56.06246 , 34.826305, 57.380612],
       [ 0.      , 54.656384, 56.06246 , 34.745351, 57.438669],
       [ 0.      , 54.656384, 56.06246 , 34.674135, 57.488555],
       ...,
       [40.      , 54.95071 , 37.388958, 38.946385, 45.028139],
       [40.      , 54.95071 , 37.388958, 38.945421, 45.027111],
       [40.      , 54.95071 , 37.388958, 39.121434, 44.956281]])

In [242]:
# data_points2roads_cudf = cudf.DataFrame(data_points2roads) # почему-то не работает: Did not pass numpy.dtype object

data_points2roads_cudf = cudf.DataFrame(data_points2roads.to_numpy(), columns=['index', 'lat', 'lon', 'road_lon', 'road_lat'])

In [243]:
data_points2roads_cudf.head()

Unnamed: 0,index,lat,lon,road_lon,road_lat
0,0.0,54.656384,56.06246,34.826305,57.380612
1,0.0,54.656384,56.06246,34.745351,57.438669
2,0.0,54.656384,56.06246,34.674135,57.488555
3,0.0,54.656384,56.06246,31.473373,56.327296
4,0.0,54.656384,56.06246,31.637903,56.334365


In [244]:
data_points2roads_cudf.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 2342371 entries, 0 to 2342370
Data columns (total 5 columns):
 #   Column    Dtype
---  ------    -----
 0   index     float64
 1   lat       float64
 2   lon       float64
 3   road_lon  float64
 4   road_lat  float64
dtypes: float64(5)
memory usage: 89.4 MB


In [245]:
def f_dist_simplifier_lat(city_lon_lat_point_lon_lat, max_val=6):
    city_lon = city_lon_lat_point_lon_lat[0]
    city_lat = city_lon_lat_point_lon_lat[1]
    point_lon = city_lon_lat_point_lon_lat[2]
    point_lat = city_lon_lat_point_lon_lat[3]
    if ((city_lat < point_lat + max_val) and
        (city_lat > point_lat - max_val)):
        return 1
    else:
        return 0

In [246]:
%%time
data_points2roads['simplifier_sign'] = data_points2roads[['road_lon', 'road_lat', 'lon', 'lat']].apply(f_dist_simplifier_lat, axis=1, args=(3,))

CPU times: user 2min 3s, sys: 3.53 s, total: 2min 7s
Wall time: 2min 6s


In [247]:
%%time
data_points2roads['simplifier_sign_2'] = 0
data_points2roads.loc[(data_points2roads['road_lat'] < data_points2roads['lat'] + 3) & (data_points2roads['road_lat'] > data_points2roads['lat'] - 3), 'simplifier_sign_2'] = 1

CPU times: user 46 ms, sys: 27.5 ms, total: 73.5 ms
Wall time: 71.1 ms


In [248]:
data_points2roads[['simplifier_sign', 'simplifier_sign_2']].groupby(['simplifier_sign', 'simplifier_sign_2']).size()

simplifier_sign  simplifier_sign_2
0                0                    1227322
1                1                    1115049
dtype: int64

In [249]:
%%time
data_points2roads = data_points2roads.loc[data_points2roads['simplifier_sign']==1]

CPU times: user 38.3 ms, sys: 19.8 ms, total: 58 ms
Wall time: 56.1 ms


In [250]:
data_points2roads.shape

(1115049, 7)

In [196]:
def f_dist_simplifier_lat_cudf(road_lat, lat, simplifier_sign, max_val=6):
    for i, (road_lat_i, lat_i) in enumerate(zip(road_lat, lat)):
        simplifier_sign[i] = 1 if road_lat_i < lat_i + max_val and road_lat_i > lat_i - max_val else 0

In [251]:
%%time
data_points2roads_cudf = data_points2roads_cudf.apply_rows(f_dist_simplifier_lat_cudf,
                                                           incols=['road_lat', 'lat'],
                                                           outcols=dict(simplifier_sign=np.int16,),
                                                           kwargs=dict(max_val=3,)
                                                          )

CPU times: user 19.6 ms, sys: 16.4 ms, total: 35.9 ms
Wall time: 34.4 ms


In [252]:
%%time
data_points2roads_cudf['simplifier_sign_2'] = 0
data_points2roads_cudf.loc[(data_points2roads_cudf['road_lat'] < data_points2roads_cudf['lat'] + 3) & (data_points2roads_cudf['road_lat'] > data_points2roads_cudf['lat'] - 3), 'simplifier_sign_2'] = 1

CPU times: user 24 ms, sys: 8 ms, total: 32 ms
Wall time: 30.6 ms


In [253]:
data_points2roads_cudf[['simplifier_sign', 'simplifier_sign_2']].groupby(['simplifier_sign', 'simplifier_sign_2']).size()

simplifier_sign  simplifier_sign_2
0                0                    1227322
1                1                    1115049
dtype: int32

In [254]:
%%time
data_points2roads_cudf = data_points2roads_cudf.loc[data_points2roads_cudf['simplifier_sign']==1]

CPU times: user 16.1 ms, sys: 4.03 ms, total: 20.1 ms
Wall time: 19 ms


In [255]:
data_points2roads_cudf.shape

(1115049, 7)

In [256]:
def f_dist_simplifier_lon(city_lon_lat_point_lon_lat, max_val=6):
    city_lon = city_lon_lat_point_lon_lat[0]
    city_lat = city_lon_lat_point_lon_lat[1]
    point_lon = city_lon_lat_point_lon_lat[2]
    point_lat = city_lon_lat_point_lon_lat[3]
    max_val = max_val / math.cos(math.radians(point_lat))
    if ((city_lon < point_lon + max_val) and
        (city_lon > point_lon - max_val)):
        return 1
    else:
        return 0

In [257]:
%%time
data_points2roads['simplifier_sign'] = data_points2roads[['road_lon', 'road_lat', 'lon', 'lat']].apply(f_dist_simplifier_lon, axis=1, args=(3,))

CPU times: user 40.8 s, sys: 327 ms, total: 41.1 s
Wall time: 41.1 s


In [258]:
%%time
data_points2roads['simplifier_sign_2'] = 0
data_points2roads.loc[(data_points2roads['road_lon'] < data_points2roads['lon'] + 3 / np.cos(np.radians(data_points2roads['lat']))) &
                      (data_points2roads['road_lon'] > data_points2roads['lon'] - 3 / np.cos(np.radians(data_points2roads['lat']))),
                      'simplifier_sign_2'] = 1

CPU times: user 88.4 ms, sys: 19.9 ms, total: 108 ms
Wall time: 106 ms


In [259]:
data_points2roads[['simplifier_sign', 'simplifier_sign_2']].groupby(['simplifier_sign', 'simplifier_sign_2']).size()

simplifier_sign  simplifier_sign_2
0                0                    887140
1                1                    227909
dtype: int64

In [260]:
%%time
data_points2roads = data_points2roads.loc[data_points2roads['simplifier_sign']==1]

CPU times: user 15.1 ms, sys: 3.69 ms, total: 18.8 ms
Wall time: 16.3 ms


In [261]:
data_points2roads.shape

(227909, 7)

In [262]:
def f_dist_simplifier_lon_cudf(road_lon, road_lat, lon, lat, simplifier_sign, max_val=6):
    for i, (road_lon_i, road_lat_i, lon_i, lat_i) in enumerate(zip(road_lon, road_lat, lon, lat)):
        max_val = max_val / math.cos(math.radians(lat_i))
        simplifier_sign[i] = 1 if road_lon_i < lon_i + max_val and road_lon_i > lon_i - max_val else 0

In [263]:
%%time
data_points2roads_cudf = data_points2roads_cudf.apply_rows(f_dist_simplifier_lon_cudf,
                                                           incols=['road_lon', 'road_lat', 'lon', 'lat'],
                                                           outcols=dict(simplifier_sign=np.int16,),
                                                           kwargs=dict(max_val=3,)
                                                          )

CPU times: user 645 ms, sys: 59.6 ms, total: 705 ms
Wall time: 703 ms


In [264]:
%%time
data_points2roads_cudf['simplifier_sign_2'] = 0
data_points2roads_cudf.loc[(data_points2roads_cudf['road_lon'] < data_points2roads_cudf['lon'] + 3 / np.cos(np.radians(data_points2roads_cudf['lat']))) &
                           (data_points2roads_cudf['road_lon'] > data_points2roads_cudf['lon'] - 3 / np.cos(np.radians(data_points2roads_cudf['lat']))),
                           'simplifier_sign_2'] = 1

CPU times: user 26.1 ms, sys: 8.02 ms, total: 34.1 ms
Wall time: 32.4 ms


In [265]:
data_points2roads_cudf[['simplifier_sign', 'simplifier_sign_2']].groupby(['simplifier_sign', 'simplifier_sign_2']).size()

simplifier_sign  simplifier_sign_2
0                0                    887140
1                1                    227909
dtype: int32

In [266]:
%%time
data_points2roads_cudf = data_points2roads_cudf.loc[data_points2roads_cudf['simplifier_sign']==1]

CPU times: user 7.03 ms, sys: 4.19 ms, total: 11.2 ms
Wall time: 9.8 ms


In [267]:
data_points2roads_cudf.shape

(227909, 7)

In [268]:
def f_dist(lon_lat_1_2):
    lon_1 = lon_lat_1_2[0]
    lat_1 = lon_lat_1_2[1]
    lon_2 = lon_lat_1_2[2]
    lat_2 = lon_lat_1_2[3]
    if pd.isnull(lon_1):
        return None
    if pd.isnull(lat_1):
        return None
    if pd.isnull(lon_2):
        return None
    if pd.isnull(lat_2):
        return None
    return geodesic((lat_1, lon_1), (lat_2, lon_2)).km

In [290]:
def f_dist_2(lon_lat_1_2):
    lon_1 = lon_lat_1_2[0]
    lat_1 = lon_lat_1_2[1]
    lon_2 = lon_lat_1_2[2]
    lat_2 = lon_lat_1_2[3]
    if pd.isnull(lon_1):
        return None
    if pd.isnull(lat_1):
        return None
    if pd.isnull(lon_2):
        return None
    if pd.isnull(lat_2):
        return None
    return great_circle((lat_1, lon_1), (lat_2, lon_2)).km

In [269]:
%%time
data_points2roads['dist'] = data_points2roads[['road_lon', 'road_lat', 'lon', 'lat']].apply(f_dist, axis=1)

CPU times: user 2min 9s, sys: 417 ms, total: 2min 10s
Wall time: 2min 9s


In [291]:
%%time
data_points2roads['dist_2'] = data_points2roads[['road_lon', 'road_lat', 'lon', 'lat']].apply(f_dist_2, axis=1)

CPU times: user 24.4 s, sys: 315 ms, total: 24.8 s
Wall time: 24.6 s


In [270]:
print(f"dist_min = {min(data_points2roads['dist'])} km, dist_max = {max(data_points2roads['dist'])} km")

dist_min = 0.7629011068564221 km, dist_max = 473.7697927006493 km


In [292]:
print(f"dist_min_2 = {min(data_points2roads['dist_2'])} km, dist_max_2 = {max(data_points2roads['dist_2'])} km")

dist_min_2 = 0.7620252340092496 km, dist_max_2 = 472.787098318829 km


In [293]:
data_points2roads.head()

Unnamed: 0,index,lat,lon,road_lon,road_lat,simplifier_sign,simplifier_sign_2,dist,dist_2
11986,0,54.656384,56.06246,51.208425,52.849901,1,1,377.955123,376.961471
11990,0,54.656384,56.06246,51.120501,52.864984,1,1,381.946933,380.933121
12026,0,54.656384,56.06246,51.702642,52.765198,1,1,356.514605,355.633692
12151,0,54.656384,56.06246,51.220868,52.847263,1,1,377.425755,376.434978
12152,0,54.656384,56.06246,51.227666,52.845825,1,1,377.136842,376.147636


In [294]:
def f_dist_cudf(road_lon, road_lat, lon, lat, dist):
    for i, (lon_1, lat_1, lon_2, lat_2) in enumerate(zip(road_lon, road_lat, lon, lat)):
        lon_1 = math.radians(lon_1)
        lat_1 = math.radians(lat_1)
        lon_2 = math.radians(lon_2)
        lat_2 = math.radians(lat_2)
        dlon = lon_2 - lon_1
        dlat = lat_2 - lat_1
        a = math.sin(dlat/2)**2 + math.cos(lat_1) * math.cos(lat_2) * math.sin(dlon/2)**2        
        c = 2 * math.asin(math.sqrt(a)) 
        r = 6371.009 # Radius of earth in kilometers
        dist[i] = c * r

In [295]:
%%time
data_points2roads_cudf = data_points2roads_cudf.apply_rows(f_dist_cudf,
                                                           incols=['road_lon', 'road_lat', 'lon', 'lat'],
                                                           outcols=dict(dist=np.float64,),
                                                           kwargs=dict()
                                                          )

CPU times: user 6.51 s, sys: 445 ms, total: 6.96 s
Wall time: 6.88 s


In [296]:
print(f"dist_min = {min(data_points2roads_cudf['dist'].to_pandas())} km, dist_max = {max(data_points2roads_cudf['dist'].to_pandas())} km")

dist_min = 0.7620252340090102 km, dist_max = 472.7870983188284 km


In [297]:
data_points2roads_cudf.head()

Unnamed: 0,index,lat,lon,road_lon,road_lat,simplifier_sign,simplifier_sign_2,dist
11986,0.0,54.656384,56.06246,51.208425,52.849901,1,1,376.961471
11990,0.0,54.656384,56.06246,51.120501,52.864984,1,1,380.933121
12026,0.0,54.656384,56.06246,51.702642,52.765198,1,1,355.633692
12151,0.0,54.656384,56.06246,51.220868,52.847263,1,1,376.434978
12152,0.0,54.656384,56.06246,51.227666,52.845825,1,1,376.147636


# Update Points

## The Nearest Road

In [298]:
data_points2roads = data_points2roads_cudf.to_pandas()

In [299]:
%%time
data_points2roads_gr = data_points2roads.loc[:, ['index', 'dist']].groupby('index').min().reset_index()

CPU times: user 17.6 ms, sys: 7.36 ms, total: 24.9 ms
Wall time: 22.5 ms


In [300]:
data_points2roads_gr.rename(columns={'dist':'dist_to_road'}, inplace=True)

In [301]:
data_points2roads_gr.shape

(41, 2)

In [302]:
data_points2roads_gr.loc[pd.isnull(data_points2roads_gr['dist_to_road'])]

Unnamed: 0,index,dist_to_road


In [303]:
data_points2roads_gr.loc[pd.isnull(data_points2roads_gr['dist_to_road']), 'dist_to_road'] = 500

In [304]:
data_points2roads_gr.head()

Unnamed: 0,index,dist_to_road
0,0.0,0.762025
1,1.0,2.080641
2,2.0,13.342491
3,3.0,40.813837
4,4.0,79.005054


## Update

In [306]:
%%time
data_points = data_points.merge(data_points_2, right_on='kladr', left_on='kladr', how='left')

CPU times: user 7.86 ms, sys: 0 ns, total: 7.86 ms
Wall time: 7.3 ms


In [307]:
%%time
data_points = data_points.merge(data_points2roads_gr, right_on='index', left_on='index', how='left')

CPU times: user 12.7 ms, sys: 0 ns, total: 12.7 ms
Wall time: 11.4 ms


In [308]:
data_points.shape

(41, 9)

In [130]:
with open('data_points.pickle', 'wb') as _f:
    pickle.dump(data_points, _f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
with open('data_points.pickle', 'rb') as _f:
    data_points = pickle.load(_f)