In [48]:
import pandas as pd
import numpy as np
from numpy import inf, mean, median
import math
import datetime
import json
import os
import glob
import pickle
import requests
# import requests_cache
import plotly.graph_objects as go
import plotly.express as px
import folium
from folium.plugins import MarkerCluster, HeatMap
import geopandas as gpd
import osm2geojson
import geojson
import shapely
from shapely import wkt
from shapely.geometry import Point, Polygon, LineString

# Load Points

In [2]:
# Read the points table from a semicolon-delimited CSV file.
data_points = pd.read_csv('data_post_offices.csv', sep=';')

In [3]:
data_points.head()

Unnamed: 0,post_index,post_address,post_coordinates,post_lat,post_lon
0,101000,"г Москва , Мясницкая ул, 26","[55.763874, 37.637253]",55.763874,37.637253
1,103132,"г Москва , Старая пл, 2/14, стр.1","[55.75586, 37.629196]",55.75586,37.629196
2,103265,"г Москва , Охотный Ряд ул, 1","[55.757416, 37.61613]",55.757416,37.61613
3,103274,"г Москва , Краснопресненская наб, 2, стр.1","[55.755093, 37.572784]",55.755093,37.572784
4,103426,"г Москва , Дмитровка Б. ул, 26","[55.76444, 37.612213]",55.76444,37.612213


In [4]:
# Strip the 'post_' prefix from the column names and discard the postal index,
# which is not used by the rest of the analysis.
data_points = (
    data_points
    .rename(columns={
        'post_address': 'address',
        'post_coordinates': 'coordinates',
        'post_lat': 'lat',
        'post_lon': 'lon',
    })
    .drop(columns='post_index')
)

In [5]:
data_points.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   address      496 non-null    object 
 1   coordinates  496 non-null    object 
 2   lat          496 non-null    float64
 3   lon          496 non-null    float64
dtypes: float64(2), object(2)
memory usage: 15.6+ KB


# Load Metro Stations

In [6]:
# Restore the previously pickled metro stations table.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source.
with open('data_mos_metro_stations_gdf.pickle', 'rb') as _f:
    data_mos_metro_stations_gdf = pickle.load(_f)

In [7]:
data_mos_metro_stations_gdf.head()

Unnamed: 0,station_name,line_name,geometry,station_point
0,Авиамоторная,Большая кольцевая линия,POINT (37.71948 55.75325),POINT (37.71948 55.75325)
1,Авиамоторная,Калининская линия,"MULTIPOINT (37.71637 55.75168, 37.71640 55.751...",POINT (37.71680 55.75196)
2,Автозаводская,Замоскворецкая линия,"MULTIPOINT (37.65732 55.70643, 37.65737 55.708...",POINT (37.65737 55.70860)
3,Автозаводская,Московское центральное кольцо,"MULTIPOINT (37.66057 55.70599, 37.66290 55.70537)",POINT (37.66057 55.70599)
4,Академическая,Калужско-Рижская линия,"MULTIPOINT (37.57222 55.68702, 37.57237 55.687...",POINT (37.57306 55.68695)


In [8]:
data_mos_metro_stations_gdf.shape

(264, 4)

In [11]:
# Expose each station point as plain numeric columns: the point's x is stored
# as longitude and its y as latitude. The filtering cell further down reads
# these two columns, so this cell has to run before it.
data_mos_metro_stations_gdf['station_lon'] = data_mos_metro_stations_gdf['station_point'].x
data_mos_metro_stations_gdf['station_lat'] = data_mos_metro_stations_gdf['station_point'].y

# Points To Metro Stations

In [9]:
# Keep only the rows where both coordinates are present. reset_index() moves
# the original row label into an 'index' column, which serves as the per-point
# key in the later cross join and aggregation.
data_points_2 = data_points[['lat', 'lon']].dropna().reset_index()

In [10]:
data_points_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   index   496 non-null    int64  
 1   lat     496 non-null    float64
 2   lon     496 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 11.8 KB


In [13]:
# Stations with both coordinates present, reduced to the two numeric columns
# needed for the distance computation.
data_mos_metro_stations_gdf_2 = data_mos_metro_stations_gdf.loc[
    data_mos_metro_stations_gdf['station_lat'].notna()
    & data_mos_metro_stations_gdf['station_lon'].notna(),
    ['station_lon', 'station_lat'],
]

In [14]:
data_mos_metro_stations_gdf_2.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 264 entries, 0 to 263
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   station_lon  264 non-null    float64
 1   station_lat  264 non-null    float64
dtypes: float64(2)
memory usage: 6.2 KB


In [15]:
%%time
# Cartesian product: pair every point with every metro station so that all
# point-to-station distances can be computed in one vectorised pass.
data_points2metro = data_points_2[['index', 'lat', 'lon']].merge(
    data_mos_metro_stations_gdf_2,
    how='cross',
)

CPU times: user 27 ms, sys: 41.5 ms, total: 68.6 ms
Wall time: 65.7 ms


In [16]:
data_points2metro.shape

(130944, 5)

In [17]:
data_points2metro.head()

Unnamed: 0,index,lat,lon,station_lon,station_lat
0,0,55.763874,37.637253,37.719478,55.753246
1,0,55.763874,37.637253,37.7168,55.751956
2,0,55.763874,37.637253,37.657373,55.708597
3,0,55.763874,37.637253,37.660571,55.705993
4,0,55.763874,37.637253,37.573061,55.686951


In [18]:
data_points2metro.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130944 entries, 0 to 130943
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   index        130944 non-null  int64  
 1   lat          130944 non-null  float64
 2   lon          130944 non-null  float64
 3   station_lon  130944 non-null  float64
 4   station_lat  130944 non-null  float64
dtypes: float64(4), int64(1)
memory usage: 6.0 MB


In [19]:
def f_dist(df, col_lat_1, col_lon_1, col_lat_2, col_lon_2, col_dist, radius=6371.009):
    """Add a column with the great-circle (haversine) distance between two points.

    Each row of ``df`` holds both points of a pair; the distance is computed
    for all rows at once with numpy.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the coordinate columns. It is modified in place: the
        ``col_dist`` column is added (or overwritten).
    col_lat_1, col_lon_1 : str
        Names of the columns with the latitude / longitude of the first
        point, in degrees.
    col_lat_2, col_lon_2 : str
        Names of the columns with the latitude / longitude of the second
        point, in degrees.
    col_dist : str
        Name of the column that receives the computed distance.
    radius : float, default 6371.009
        Radius of the sphere. The result is expressed in the same unit as
        this value; the default is the mean Earth radius in kilometres.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with ``col_dist`` filled.
    """
    lat_1 = np.radians(df[col_lat_1].to_numpy())
    lon_1 = np.radians(df[col_lon_1].to_numpy())
    lat_2 = np.radians(df[col_lat_2].to_numpy())
    lon_2 = np.radians(df[col_lon_2].to_numpy())

    dlat = lat_1 - lat_2
    dlon = lon_1 - lon_2

    # Haversine term. Mathematically it lies in [0, 1], but floating-point
    # rounding can push it marginally above 1 for (near-)antipodal pairs,
    # which would make arcsin return NaN. Clamp it to the valid domain.
    a = np.sin(dlat / 2) ** 2 + np.cos(lat_1) * np.cos(lat_2) * np.sin(dlon / 2) ** 2
    a = np.clip(a, 0.0, 1.0)

    # Central angle between the two points, in radians.
    c = 2 * np.arcsin(np.sqrt(a))

    df[col_dist] = c * radius

    return df

In [20]:
%%time
# Compute the haversine distance for every point/station pair and store it in
# the new 'dist' column.
data_points2metro = f_dist(data_points2metro, 'lat', 'lon', 'station_lat', 'station_lon', 'dist')

CPU times: user 73.8 ms, sys: 46 ms, total: 120 ms
Wall time: 81.5 ms


In [21]:
data_points2metro.head()

Unnamed: 0,index,lat,lon,station_lon,station_lat,dist
0,0,55.763874,37.637253,37.719478,55.753246,5.278595
1,0,55.763874,37.637253,37.7168,55.751956,5.150539
2,0,55.763874,37.637253,37.657373,55.708597,6.274263
3,0,55.763874,37.637253,37.660571,55.705993,6.599565
4,0,55.763874,37.637253,37.573061,55.686951,9.450924


# Update Points

In [23]:
%%time
# For each point (identified by 'index') keep the smallest distance over all
# stations, i.e. the distance to the nearest station.
data_points2metro_min = data_points2metro.groupby('index', as_index=False)['dist'].min()

CPU times: user 269 ms, sys: 0 ns, total: 269 ms
Wall time: 270 ms


In [24]:
data_points2metro_min.head()

Unnamed: 0,index,dist
0,0,0.133073
1,1,0.209655
2,2,0.074898
3,3,0.645962
4,4,0.308304


In [26]:
data_points2metro_min.shape

(496, 2)

In [27]:
# Use the original point label as the row index so the result can be aligned
# with data_points by index.
data_points2metro_min = data_points2metro_min.set_index('index')

In [28]:
data_points2metro_min.head()

Unnamed: 0_level_0,dist
index,Unnamed: 1_level_1
0,0.133073
1,0.209655
2,0.074898
3,0.645962
4,0.308304


In [29]:
%%time
# Attach the nearest-station distance to every point by matching row indexes;
# a left join keeps all points, including any without a computed distance.
data_points = pd.merge(
    data_points,
    data_points2metro_min,
    how='left',
    left_index=True,
    right_index=True,
)

CPU times: user 5.42 ms, sys: 0 ns, total: 5.42 ms
Wall time: 4.45 ms


In [31]:
# Give the merged distance column a self-explanatory name.
data_points = data_points.rename(columns={'dist': 'dist_to_metro'})

In [32]:
data_points.head()

Unnamed: 0,address,coordinates,lat,lon,dist_to_metro
0,"г Москва , Мясницкая ул, 26","[55.763874, 37.637253]",55.763874,37.637253,0.133073
1,"г Москва , Старая пл, 2/14, стр.1","[55.75586, 37.629196]",55.75586,37.629196,0.209655
2,"г Москва , Охотный Ряд ул, 1","[55.757416, 37.61613]",55.757416,37.61613,0.074898
3,"г Москва , Краснопресненская наб, 2, стр.1","[55.755093, 37.572784]",55.755093,37.572784,0.645962
4,"г Москва , Дмитровка Б. ул, 26","[55.76444, 37.612213]",55.76444,37.612213,0.308304


In [33]:
def f_gr(val, gr):
    """Round ``val`` toward zero to a multiple of the bin width ``gr``.

    Parameters
    ----------
    val : float
        Value to bin.
    gr : float
        Bin width (must be non-zero).

    Returns
    -------
    float
        ``int(val / gr) * gr`` for finite ``val``. NaN and +/-inf are returned
        unchanged, because ``int()`` cannot convert them and would raise
        (ValueError for NaN, OverflowError for infinity).
    """
    # Missing distances (NaN) can occur after a left merge; pass them through
    # instead of crashing the whole .apply().
    if not math.isfinite(val):
        return val
    return int(val / gr) * gr

In [38]:
# Bin the distance into 0.25-wide steps (lower bin edge) and fold everything
# above 3 into a single top bin.
data_points['dist_to_metro_gr'] = (
    data_points['dist_to_metro']
    .apply(f_gr, args=(0.25,))
    .clip(upper=3)
)

In [39]:
data_points.head()

Unnamed: 0,address,coordinates,lat,lon,dist_to_metro,dist_to_metro_gr
0,"г Москва , Мясницкая ул, 26","[55.763874, 37.637253]",55.763874,37.637253,0.133073,0.0
1,"г Москва , Старая пл, 2/14, стр.1","[55.75586, 37.629196]",55.75586,37.629196,0.209655,0.0
2,"г Москва , Охотный Ряд ул, 1","[55.757416, 37.61613]",55.757416,37.61613,0.074898,0.0
3,"г Москва , Краснопресненская наб, 2, стр.1","[55.755093, 37.572784]",55.755093,37.572784,0.645962,0.5
4,"г Москва , Дмитровка Б. ул, 26","[55.76444, 37.612213]",55.76444,37.612213,0.308304,0.25


In [43]:
# Number of points in each distance bin, as a two-column table
# ('dist_to_metro_gr', 'count') ready for plotting.
data_points_gr = (
    data_points
    .groupby('dist_to_metro_gr')
    .size()
    .reset_index(name='count')
)
data_points_gr

Unnamed: 0,dist_to_metro_gr,count
0,0.0,54
1,0.25,96
2,0.5,91
3,0.75,92
4,1.0,50
5,1.25,31
6,1.5,16
7,1.75,15
8,2.0,7
9,2.25,6


In [None]:
# Bar chart of the number of points per distance bin. The title and axis
# labels make the exported figure readable on its own.
# NOTE(review): 'km' assumes the 6371.009 radius used in f_dist is in
# kilometres — confirm.
fig_dist_to_metro = px.bar(
    data_points_gr,
    x='dist_to_metro_gr',
    y='count',
    title='Points by distance to the nearest metro station',
    labels={
        'dist_to_metro_gr': 'Distance to nearest metro station, km (bin lower edge)',
        'count': 'Number of points',
    },
)

In [47]:
# Save the bar chart to an HTML file next to the notebook.
fig_dist_to_metro.write_html('fig_dist_to_metro.html')

In [None]:
# Persist the enriched points table (with the distance columns added above)
# using the highest pickle protocol available.
with open('data_points.pickle', 'wb') as _f:
    pickle.dump(data_points, _f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the saved points table from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load files produced by this notebook or another trusted source.
with open('data_points.pickle', 'rb') as _f:
    data_points = pickle.load(_f)