In [1]:
import pandas as pd
import numpy as np
from numpy import inf, mean, median
import math
import datetime
import json
import os
import glob
import pickle
import requests
# import requests_cache
import plotly.graph_objects as go
import plotly.express as px
import folium
from folium.plugins import MarkerCluster, HeatMap
import geopandas as gpd
import osm2geojson
import geojson
import shapely
from shapely import wkt
from shapely.geometry import Point, Polygon, LineString

# Load Points

In [2]:
# Read the point dataset from a semicolon-delimited CSV file; the path is
# relative, so it is resolved against the current working directory.
data_points = pd.read_csv('data_post_offices.csv', sep=';')

In [3]:
data_points.head()

Unnamed: 0,post_index,post_address,post_coordinates,post_lat,post_lon
0,101000,"г Москва , Мясницкая ул, 26","[55.763874, 37.637253]",55.763874,37.637253
1,103132,"г Москва , Старая пл, 2/14, стр.1","[55.75586, 37.629196]",55.75586,37.629196
2,103265,"г Москва , Охотный Ряд ул, 1","[55.757416, 37.61613]",55.757416,37.61613
3,103274,"г Москва , Краснопресненская наб, 2, стр.1","[55.755093, 37.572784]",55.755093,37.572784
4,103426,"г Москва , Дмитровка Б. ул, 26","[55.76444, 37.612213]",55.76444,37.612213


In [4]:
# Strip the `post_` prefix from the column names and discard `post_index`.
# The frame is rebound rather than mutated in place, and `errors='ignore'`
# makes the cell safe to re-run: once `post_index` is gone, a plain drop
# would raise KeyError on the second execution.
data_points = (
    data_points
    .rename(columns={'post_address':'address', 'post_coordinates':'coordinates', 'post_lat':'lat', 'post_lon':'lon'})
    .drop(columns='post_index', errors='ignore')
)

In [5]:
data_points.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   address      496 non-null    object 
 1   coordinates  496 non-null    object 
 2   lat          496 non-null    float64
 3   lon          496 non-null    float64
dtypes: float64(2), object(2)
memory usage: 15.6+ KB


# Load Malls

In [6]:
# Read the mall dataset from a semicolon-delimited CSV file; the path is
# relative, so it is resolved against the current working directory.
data_malls = pd.read_csv('data_malls.csv', sep=';')

In [7]:
data_malls.head()

Unnamed: 0,mall_name,mall_address,mall_coordinates,mall_lat,mall_lon
0,Columbus,"ул. Красного Маяка, 2Б, Москва","[55.611088, 37.60684]",55.611088,37.60684
1,Crocus City Mall,"66-й км МКАД, Красногорск, Московская обл.","[55.76421, 37.84499]",55.76421,37.84499
2,FORT,"Новоясеневский просп., 11, Москва","[55.606438, 37.53098]",55.606438,37.53098
3,XL,"Дмитровское ш., 89, Москва","[55.86367, 37.545452]",55.86367,37.545452
4,XL-2,"Коммунистическая ул., 10, корп. 1, Мытищи, Мос...","[55.891644, 37.745007]",55.891644,37.745007


In [8]:
data_malls.shape

(94, 5)

# Points To Malls

In [9]:
# Keep only the coordinates of points that have both `lat` and `lon`.
# reset_index() turns the original row label into an `index` column, which
# serves as the point identifier in the later cross join and group-by.
data_points_2 = (
    data_points[['lat', 'lon']]
    .dropna(how='any')
    .reset_index()
)

In [10]:
data_points_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   index   496 non-null    int64  
 1   lat     496 non-null    float64
 2   lon     496 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 11.8 KB


In [11]:
# Mall coordinates only (longitude first, then latitude), restricted to rows
# where both values are present. The original row labels are preserved.
data_malls_2 = data_malls[['mall_lon', 'mall_lat']].dropna(how='any')

In [12]:
data_malls_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94 entries, 0 to 93
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   mall_lon  94 non-null     float64
 1   mall_lat  94 non-null     float64
dtypes: float64(2)
memory usage: 2.2 KB


In [13]:
%%time
# Cartesian product: one row for every (point, mall) pair, so that the
# distance between each pair can be computed in a single vectorised pass.
point_columns = data_points_2[['index', 'lat', 'lon']]
data_points2malls = point_columns.merge(data_malls_2, how='cross')

CPU times: user 73 ms, sys: 22.5 ms, total: 95.5 ms
Wall time: 80.6 ms


In [14]:
data_points2malls.shape

(46624, 5)

In [15]:
data_points2malls.head()

Unnamed: 0,index,lat,lon,mall_lon,mall_lat
0,0,55.763874,37.637253,37.60684,55.611088
1,0,55.763874,37.637253,37.84499,55.76421
2,0,55.763874,37.637253,37.53098,55.606438
3,0,55.763874,37.637253,37.545452,55.86367
4,0,55.763874,37.637253,37.745007,55.891644


In [16]:
data_points2malls.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46624 entries, 0 to 46623
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   index     46624 non-null  int64  
 1   lat       46624 non-null  float64
 2   lon       46624 non-null  float64
 3   mall_lon  46624 non-null  float64
 4   mall_lat  46624 non-null  float64
dtypes: float64(4), int64(1)
memory usage: 2.1 MB


In [17]:
def f_dist(df, col_lat_1, col_lon_1, col_lat_2, col_lon_2, col_dist, radius_km=6371.009):
    """Add a great-circle (haversine) distance column to ``df``.

    Each row holds two positions given in decimal degrees; the distance
    between them is written to ``df[col_dist]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the coordinate columns. It is modified in place.
    col_lat_1, col_lon_1 : str
        Column names of the latitude / longitude of the first position.
    col_lat_2, col_lon_2 : str
        Column names of the latitude / longitude of the second position.
    col_dist : str
        Name of the column that receives the distance (created or overwritten).
    radius_km : float, optional
        Sphere radius the central angle is multiplied by; the result is
        expressed in the same unit. Defaults to 6371.009, the value that was
        previously hard-coded (Earth's mean radius in kilometres).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    # Force a float array so object-dtype columns do not break the numpy ufuncs.
    lat_1 = np.radians(df[col_lat_1].to_numpy(dtype=float))
    lon_1 = np.radians(df[col_lon_1].to_numpy(dtype=float))
    lat_2 = np.radians(df[col_lat_2].to_numpy(dtype=float))
    lon_2 = np.radians(df[col_lon_2].to_numpy(dtype=float))

    dlon = lon_1 - lon_2
    dlat = lat_1 - lat_2

    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2).
    hav = np.sin(dlat / 2) ** 2 + np.cos(lat_1) * np.cos(lat_2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push the term marginally outside [0, 1] for
    # (nearly) antipodal positions, which would turn arcsin(sqrt(.)) into NaN.
    hav = np.clip(hav, 0.0, 1.0)

    central_angle = 2 * np.arcsin(np.sqrt(hav))
    df[col_dist] = central_angle * radius_km

    return df

In [18]:
%%time
# Compute the point-to-mall distance for every pair and store it in `dist`.
data_points2malls = f_dist(
    df=data_points2malls,
    col_lat_1='lat',
    col_lon_1='lon',
    col_lat_2='mall_lat',
    col_lon_2='mall_lon',
    col_dist='dist',
)

CPU times: user 10.1 ms, sys: 4.51 ms, total: 14.6 ms
Wall time: 11.5 ms


In [19]:
data_points2malls.head()

Unnamed: 0,index,lat,lon,mall_lon,mall_lat,dist
0,0,55.763874,37.637253,37.60684,55.611088,17.095671
1,0,55.763874,37.637253,37.84499,55.76421,12.995787
2,0,55.763874,37.637253,37.53098,55.606438,18.730787
3,0,55.763874,37.637253,37.545452,55.86367,12.491467
4,0,55.763874,37.637253,37.745007,55.891644,15.720744


# Update Points

In [20]:
%%time
# Smallest distance per point identifier, i.e. the distance to the nearest
# mall. `as_index=False` keeps `index` as an ordinary column in the result.
data_points2malls_min = (
    data_points2malls
    .groupby('index', as_index=False)['dist']
    .min()
)

CPU times: user 225 ms, sys: 45.6 ms, total: 270 ms
Wall time: 258 ms


In [21]:
data_points2malls_min.head()

Unnamed: 0,index,dist
0,0,0.96146
1,1,0.496921
2,2,0.042458
3,3,0.691111
4,4,0.620061


In [22]:
data_points2malls_min.shape

(496, 2)

In [23]:
# Key the per-point minimum by the point identifier so it can be joined back
# onto `data_points` by index. The guard makes the cell re-runnable: once
# `index` has become the frame's index it is no longer a column, and an
# unconditional set_index('index') would raise KeyError.
if 'index' in data_points2malls_min.columns:
    data_points2malls_min = data_points2malls_min.set_index('index')

In [24]:
data_points2malls_min.head()

Unnamed: 0_level_0,dist
index,Unnamed: 1_level_1
0,0.96146
1,0.496921
2,0.042458
3,0.691111
4,0.620061


In [25]:
%%time
# Left join on the row index: every point keeps its row and gains the `dist`
# column; points with no match in `data_points2malls_min` get NaN.
data_points = pd.merge(
    data_points,
    data_points2malls_min,
    how='left',
    left_index=True,
    right_index=True,
)

CPU times: user 3.01 ms, sys: 0 ns, total: 3.01 ms
Wall time: 2.7 ms


In [26]:
# Give the joined column a self-describing name. The frame is rebound
# instead of renamed in place, so the cell reads as "new value from old value".
data_points = data_points.rename(columns={'dist': 'dist_to_mall'})

In [27]:
data_points.head()

Unnamed: 0,address,coordinates,lat,lon,dist_to_mall
0,"г Москва , Мясницкая ул, 26","[55.763874, 37.637253]",55.763874,37.637253,0.96146
1,"г Москва , Старая пл, 2/14, стр.1","[55.75586, 37.629196]",55.75586,37.629196,0.496921
2,"г Москва , Охотный Ряд ул, 1","[55.757416, 37.61613]",55.757416,37.61613,0.042458
3,"г Москва , Краснопресненская наб, 2, стр.1","[55.755093, 37.572784]",55.755093,37.572784,0.691111
4,"г Москва , Дмитровка Б. ул, 26","[55.76444, 37.612213]",55.76444,37.612213,0.620061


In [28]:
def f_gr(val, gr):
    """Snap ``val`` to a multiple of ``gr``, truncating toward zero.

    For non-negative input this is the lower edge of the ``gr``-wide bin
    that contains ``val`` (e.g. ``f_gr(0.96, 0.5) == 0.5``).

    Parameters
    ----------
    val : number
        Value to bin.
    gr : number
        Bin width; must be non-zero.

    Returns
    -------
    number
        ``int(val / gr) * gr``. Missing values (NaN / None / pd.NA) and
        infinite floats are returned unchanged instead of raising, because
        ``int()`` cannot convert them.
    """
    # A left merge upstream can leave NaN in the distance column.
    if pd.isna(val):
        return val
    # int(inf) raises OverflowError; keep the value so later capping can handle it.
    if isinstance(val, float) and math.isinf(val):
        return val
    return int(val / gr) * gr

In [29]:
# Bin the distance to the nearest mall into 0.5-wide groups (lower bin edge),
# then cap the groups so that everything above 15 falls into the 15 group.
data_points['dist_to_mall_gr'] = (
    data_points['dist_to_mall']
    .apply(lambda dist: f_gr(dist, 0.5))
    .clip(upper=15)
)

In [30]:
data_points.head()

Unnamed: 0,address,coordinates,lat,lon,dist_to_mall,dist_to_mall_gr
0,"г Москва , Мясницкая ул, 26","[55.763874, 37.637253]",55.763874,37.637253,0.96146,0.5
1,"г Москва , Старая пл, 2/14, стр.1","[55.75586, 37.629196]",55.75586,37.629196,0.496921,0.0
2,"г Москва , Охотный Ряд ул, 1","[55.757416, 37.61613]",55.757416,37.61613,0.042458,0.0
3,"г Москва , Краснопресненская наб, 2, стр.1","[55.755093, 37.572784]",55.755093,37.572784,0.691111,0.5
4,"г Москва , Дмитровка Б. ул, 26","[55.76444, 37.612213]",55.76444,37.612213,0.620061,0.5


In [31]:
# Number of points in each distance group, as a two-column frame
# (`dist_to_mall_gr`, `count`) ordered by group.
data_points_gr = (
    data_points
    .groupby('dist_to_mall_gr')
    .size()
    .reset_index(name='count')
)
data_points_gr

Unnamed: 0,dist_to_mall_gr,count
0,0.0,61
1,0.5,90
2,1.0,93
3,1.5,61
4,2.0,63
5,2.5,35
6,3.0,28
7,3.5,18
8,4.0,8
9,4.5,9


In [None]:
# Bar chart of how many points fall into each distance group. A title and
# readable axis labels are set so the exported figure stands on its own
# instead of showing raw column names.
fig_dist_to_mall = px.bar(
    data_points_gr,
    x='dist_to_mall_gr',
    y='count',
    title='Points by distance to the nearest mall',
    labels={
        'dist_to_mall_gr': 'Distance to nearest mall, km (0.5 km bins)',
        'count': 'Number of points',
    },
)

In [33]:
# Export the figure to an HTML file; the path is relative, so the file is
# written to the current working directory.
fig_dist_to_mall.write_html('fig_dist_to_mall.html')

In [None]:
# Persist the enriched `data_points` frame to `data_points.pickle`, using the
# highest pickle protocol supported by the running interpreter. The `with`
# block closes the file handle even if serialization fails.
with open('data_points.pickle', 'wb') as _f:
    pickle.dump(data_points, _f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Read `data_points` back from `data_points.pickle`, replacing the in-memory
# frame with the deserialized one.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that come from a trusted source.
with open('data_points.pickle', 'rb') as _f:
    data_points = pickle.load(_f)