# Imports

In [1]:
import pandas as pd
import numpy as np
import pickle
import math
import datetime as dt
import json
import requests
from tqdm import tqdm
import os

In [2]:
import geopandas as gpd
import geojson
import h3
import osmnx as ox
import osm2geojson
import shapely
from shapely import wkt
from shapely.geometry import Point, Polygon, LineString
from geopy.distance import geodesic, great_circle

In [3]:
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score

In [4]:
import plotly.graph_objects as go
import plotly.express as px

In [5]:
import folium
from folium.plugins import MarkerCluster
from folium.plugins import HeatMap

In [6]:
# Let wide/long frames render in full in cell outputs (up to 300 rows and 300 columns).
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 300)

In [7]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings emitted by the libraries used
# below, so API removals surface only as hard errors after an upgrade.
warnings.simplefilter('ignore')

# Region

In [8]:
# Two-character region codes whose hex files are loaded below, in processing order.
# Each trailing comment names a city (presumably the region's administrative
# centre - TODO confirm). '83' is disabled here and listed in `regions_bad` instead.
regions = [
    '50', # Moscow
    '23', # Krasnodar
    '66', # Yekaterinburg
    '61', # Rostov-on-Don
    '02', # Ufa
    '16', # Kazan
    '72', # Tyumen
    '74', # Chelyabinsk
    '52', # Nizhny Novgorod
    '63', # Samara
    '24', # Krasnoyarsk
    '26', # Stavropol
    '54', # Novosibirsk
    '42', # Kemerovo
    '59', # Perm
    '34', # Volgograd
    '64', # Saratov
    '38', # Irkutsk
    '22', # Barnaul
    '36', # Voronezh
    '56', # Orenburg
    '55', # Omsk
    '25', # Vladivostok
    '47', # Saint Petersburg
    '86', # Khanty-Mansiysk
    '31', # Belgorod
    '18', # Izhevsk
    '71', # Tula
    '33', # Vladimir
    '58', # Penza
    '27', # Khabarovsk
    '69', # Tver
    '43', # Kirov
    '76', # Yaroslavl
    '73', # Ulyanovsk
    '21', # Cheboksary
    '32', # Bryansk
    '35', # Vologda
    '29', # Arkhangelsk
    '48', # Lipetsk
    '62', # Ryazan
    '46', # Kursk
    '75', # Chita
    '70', # Tomsk
    '68', # Tambov
    '37', # Ivanovo
    '30', # Astrakhan
    '40', # Kaluga
    '03', # Ulan-Ude
    '39', # Kaliningrad
    '67', # Smolensk
    '14', # Yakutsk
    '45', # Kurgan
    '11', # Syktyvkar
    '28', # Blagoveshchensk
    '13', # Saransk
    '51', # Murmansk
    '57', # Oryol
    '12', # Yoshkar-Ola
    '44', # Kostroma
    '60', # Pskov
    '10', # Petrozavodsk
    '53', # Veliky Novgorod
    '89', # Salekhard
    '19', # Abakan
    '65', # Yuzhno-Sakhalinsk
    '41', # Petropavlovsk-Kamchatsky
    '17', # Kyzyl
    '08', # Elista
    '04', # Gorno-Altaysk
    '79', # Birobidzhan
    '49', # Magadan
    '87', # Anadyr
    # '83', # Naryan-Mar
]

In [9]:
# Number of active region codes (the commented-out '83' entry is not counted).
len(regions)

73

In [10]:
# Region codes kept out of `regions`. The reason for exclusion is not recorded in
# this notebook (TODO document); '83' is the entry commented out of `regions`.
# The list is not referenced by the cells shown here.
regions_bad = '77 78 05 91 20 07 15 09 06 01 92 83'.split()

# Hexes

In [11]:
# H3 resolution setting. Not referenced by the cells shown here (the hex files are
# loaded pre-built) - presumably it matches the resolution they were generated at;
# TODO confirm.
h3_resolution = 5

In [12]:
# Root directory of the EGRUL data set. The per-region hex files are read from
# its `egrul_hexes` sub-directory.
# The default is the original author's local path; set the EGRUL_DATA_DIR
# environment variable to run the notebook on another machine without editing it.
path = os.environ.get('EGRUL_DATA_DIR', '/home/tochka/Documents/Other/Work/Geo/EGRUL/')

In [13]:
# Load the pre-built hex table of every region in `regions`.
#   hexes : region code -> GeoDataFrame read from that region's parquet file,
#           with a `region` column added
#   shape : running total of rows over all regions that loaded successfully
hexes = {}
shape = 0

for reg in regions:
    # os.path.join copes with `path` being given with or without a trailing slash
    hexes_file = os.path.join(path, 'egrul_hexes', f'hexes_{reg}_ul_3.parquet.gzip')

    try:
        hexes_gdf_reg = gpd.read_parquet(hexes_file)
    except Exception as err:
        # Best effort: a region whose file is missing or unreadable is skipped, but
        # the cause is reported. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt / SystemExit still stop the cell.
        print(f'{reg}: ERROR load hexes file ({type(err).__name__}: {err})')
        continue

    hexes_gdf_reg['region'] = reg
    hexes[reg] = hexes_gdf_reg
    shape += hexes_gdf_reg.shape[0]
    print(f'{reg}: OK, shape: {hexes_gdf_reg.shape}')

print(f'result shape: {shape}')


50: OK, shape: (191, 23)
23: OK, shape: (399, 23)
66: OK, shape: (690, 23)
61: OK, shape: (406, 23)
02: OK, shape: (503, 23)
16: OK, shape: (247, 23)
72: OK, shape: (551, 23)
74: OK, shape: (305, 23)
52: OK, shape: (301, 23)
63: OK, shape: (200, 23)
24: OK, shape: (9259, 23)
26: OK, shape: (259, 23)
54: OK, shape: (608, 23)
42: OK, shape: (344, 23)
59: OK, shape: (585, 23)
34: OK, shape: (436, 23)
64: OK, shape: (377, 23)
38: OK, shape: (3112, 23)
22: OK, shape: (575, 23)
36: OK, shape: (209, 23)
56: OK, shape: (430, 23)
55: OK, shape: (482, 23)
25: OK, shape: (938, 23)
47: OK, shape: (452, 23)
86: OK, shape: (1978, 23)
31: OK, shape: (114, 23)
18: OK, shape: (157, 23)
71: OK, shape: (110, 23)
33: OK, shape: (114, 23)
58: OK, shape: (170, 23)
27: OK, shape: (3489, 23)
69: OK, shape: (370, 23)
43: OK, shape: (465, 23)
76: OK, shape: (153, 23)
73: OK, shape: (136, 23)
21: OK, shape: (70, 23)
32: OK, shape: (152, 23)
35: OK, shape: (609, 23)
29: OK, shape: (1481, 23)
48: OK, shape: (99, 2

In [14]:
# Stack the per-region frames into one GeoDataFrame with a fresh 0..n-1 index.
# A single concat over all frames replaces the original grow-in-a-loop pattern,
# which re-copied the accumulated frame on every iteration.
if hexes:
    hexes_gdf = pd.concat(
        list(hexes.values()),
        axis=0,
        join='outer',
        ignore_index=True
    )
else:
    # Nothing was loaded: pd.concat rejects an empty list, so fall back to the
    # empty frame the original loop would have produced.
    hexes_gdf = gpd.GeoDataFrame()

In [15]:
# (rows, columns) of the combined frame.
hexes_gdf.shape

(66116, 23)

In [16]:
# Preview the first rows of the combined frame.
hexes_gdf.head()

Unnamed: 0,geometry,hex_id,assets_balance,short_borrowed_funds,long_borrowed_funds,revenue,net_profit_loss,count_2020,count_2021,begin_2021,end_2021,begin_2021_rel,end_2021_rel,count_2021_2020_rel,net_profit_loss_to_revenue_2020,net_profit_loss_to_assets_2020,short_borrowed_funds_to_revenue_2020,long_borrowed_funds_to_revenue_2020,revenue_to_assets_2020,assets_to_count_2020,net_profit_loss_to_count_2020,revenue_to_count_2020,region
0,"POLYGON ((35.71650 55.75433, 35.73023 55.84282...",8511ae47fffffff,19165020.0,691000.0,1.0,63688000.0,64989.0,15.0,16.0,1.0,4.0,0.0625,0.25,1.066667,0.00102,0.003391,0.01085,1.570155e-08,3.323139,1277668.0,4332.6,4245867.0,50
1,"POLYGON ((39.51699 55.12472, 39.53917 55.21416...",85118ebbfffffff,2949380000.0,391380200.0,434224300.0,4922620000.0,222245900.0,81.0,81.0,2.0,3.0,0.024691,0.037037,1.0,0.045148,0.075353,0.079506,0.08820999,1.669035,36412100.0,2743777.0,60773090.0,50
2,"POLYGON ((35.53238 55.36132, 35.40295 55.32229...",85118583fffffff,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,1.0,-3.0,-3.0,-3.0,-3.0,-3.0,0.0,0.0,0.0,50
3,"POLYGON ((38.36203 56.24641, 38.24364 56.29859...",85118c97fffffff,1571703000.0,35902040.0,19464710.0,1060409000.0,-301576900.0,157.0,146.0,4.0,14.0,0.027397,0.09589,0.929936,-0.284397,-0.191879,0.033857,0.01835586,0.674688,10010850.0,-1920872.0,6754195.0,50
4,"POLYGON ((38.24346 55.71194, 38.37996 55.74853...",85118cd7fffffff,58386880000.0,1862619000.0,10534720000.0,67735590000.0,4400965000.0,1276.0,1266.0,113.0,93.0,0.089258,0.07346,0.992163,0.064973,0.075376,0.027498,0.1555271,1.160117,45757740.0,3449032.0,53084320.0,50


In [17]:
# Column dtypes and non-null counts of the combined frame.
hexes_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 66116 entries, 0 to 66115
Data columns (total 23 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   geometry                              66116 non-null  geometry
 1   hex_id                                66116 non-null  object  
 2   assets_balance                        66116 non-null  float64 
 3   short_borrowed_funds                  66116 non-null  float64 
 4   long_borrowed_funds                   66116 non-null  float64 
 5   revenue                               66116 non-null  float64 
 6   net_profit_loss                       66116 non-null  float64 
 7   count_2020                            66116 non-null  float64 
 8   count_2021                            66116 non-null  float64 
 9   begin_2021                            66116 non-null  float64 
 10  end_2021                              66116 non-null  float64 

In [18]:
# Rows whose revenue-to-assets ratio is missing.
hexes_gdf.loc[hexes_gdf['revenue_to_assets_2020'].isna()]

Unnamed: 0,geometry,hex_id,assets_balance,short_borrowed_funds,long_borrowed_funds,revenue,net_profit_loss,count_2020,count_2021,begin_2021,end_2021,begin_2021_rel,end_2021_rel,count_2021_2020_rel,net_profit_loss_to_revenue_2020,net_profit_loss_to_assets_2020,short_borrowed_funds_to_revenue_2020,long_borrowed_funds_to_revenue_2020,revenue_to_assets_2020,assets_to_count_2020,net_profit_loss_to_count_2020,revenue_to_count_2020,region
19915,"POLYGON ((57.37499 51.68984, 57.22418 51.67614...",8521204ffffffff,-2000.0,0.0,0.0,0.0,-259000.0,4.0,5.0,1.0,0.0,0.2,0.0,1.25,-3.0,,-3.0,-3.0,,-500.0,-64750.0,0.0,56


In [19]:
# Rows reporting a negative asset balance.
hexes_gdf.loc[hexes_gdf['assets_balance'].lt(0)]

Unnamed: 0,geometry,hex_id,assets_balance,short_borrowed_funds,long_borrowed_funds,revenue,net_profit_loss,count_2020,count_2021,begin_2021,end_2021,begin_2021_rel,end_2021_rel,count_2021_2020_rel,net_profit_loss_to_revenue_2020,net_profit_loss_to_assets_2020,short_borrowed_funds_to_revenue_2020,long_borrowed_funds_to_revenue_2020,revenue_to_assets_2020,assets_to_count_2020,net_profit_loss_to_count_2020,revenue_to_count_2020,region
19915,"POLYGON ((57.37499 51.68984, 57.22418 51.67614...",8521204ffffffff,-2000.0,0.0,0.0,0.0,-259000.0,4.0,5.0,1.0,0.0,0.2,0.0,1.25,-3.0,,-3.0,-3.0,,-500.0,-64750.0,0.0,56


In [20]:
# Zero out the asset balance and the asset-derived columns for hexes that report
# a negative asset balance (two of those columns are NaN for such a row).
# Rows are selected by condition instead of a hard-coded index label, so the fix
# keeps targeting the right rows if the region list or input files change; with
# the data shown above the mask matches exactly one row (index 19915).
# NOTE(review): other zero-denominator ratios in this table appear as -3.0 rather
# than 0 - confirm whether 0 is the intended fill value here.
negative_assets_mask = hexes_gdf['assets_balance'] < 0

hexes_gdf.loc[
    negative_assets_mask,
    [
        'assets_balance',
        'net_profit_loss_to_assets_2020',
        'revenue_to_assets_2020',
        'assets_to_count_2020',
    ]
] = 0

In [21]:
# Columns fed to the clustering models: everything except the identifier/geometry
# columns. Columns ending in `_labels` are excluded as well, because later cells
# attach cluster labels to `hexes_gdf`; without this, re-running the cell after a
# clustering step would leak earlier labels into the feature set. On a fresh
# top-to-bottom run the result is unchanged.
non_feature_columns = {'geometry', 'hex_id', 'region'}
feature_columns = [
    col for col in hexes_gdf.columns
    if col not in non_feature_columns and not col.endswith('_labels')
]
feature_columns

['assets_balance',
 'short_borrowed_funds',
 'long_borrowed_funds',
 'revenue',
 'net_profit_loss',
 'count_2020',
 'count_2021',
 'begin_2021',
 'end_2021',
 'begin_2021_rel',
 'end_2021_rel',
 'count_2021_2020_rel',
 'net_profit_loss_to_revenue_2020',
 'net_profit_loss_to_assets_2020',
 'short_borrowed_funds_to_revenue_2020',
 'long_borrowed_funds_to_revenue_2020',
 'revenue_to_assets_2020',
 'assets_to_count_2020',
 'net_profit_loss_to_count_2020',
 'revenue_to_count_2020']

In [22]:
# Independent copy of the feature columns, so model inputs are detached from later
# edits to `hexes_gdf`.
# NOTE(review): the features are passed to the distance-based models unscaled -
# confirm this is intended.
X = hexes_gdf.loc[:, feature_columns].copy()

# KMeans

In [62]:
%%time
kmeans = KMeans(
    n_clusters=5,
    random_state=0,
    max_iter=1000
).fit(X)

CPU times: user 11.5 s, sys: 5.46 s, total: 16.9 s
Wall time: 1.52 s


In [63]:
# Cluster assignment produced by the fit above, kept for scoring below.
kmeans_labels = kmeans.labels_

In [64]:
# Attach the KMeans label to each hex so it can be grouped and mapped.
hexes_gdf['kmeans_labels'] = kmeans.labels_

In [65]:
%%time
# Silhouette score of the KMeans partition, using euclidean distance on X.
# The recorded run took about 1 min 16 s wall time; silhouette_score accepts a
# `sample_size` argument if an estimate on a subsample is sufficient.
silhouette_score(X, kmeans_labels, metric='euclidean')

CPU times: user 2min, sys: 6min 4s, total: 8min 5s
Wall time: 1min 16s


0.9968287740692953

In [66]:
# Number of hexes per KMeans cluster.
# NOTE(review): in the recorded output one cluster holds 66031 of 66116 rows; with
# unscaled features the largest-magnitude columns likely dominate the distances -
# confirm before interpreting the clusters.
hexes_gdf.groupby('kmeans_labels').size()

kmeans_labels
0    66031
1        1
2        1
3       12
4       71
dtype: int64

In [67]:
# Choropleth of the KMeans labels for a single region.
reg = '66'

map_hexes_kmeans = folium.Map(location=[58.5, 63], zoom_start=5, width=900, height=700, control_scale=True)

# Rows of the selected region only; the same subset feeds geometry and values.
hexes_reg_kmeans = hexes_gdf.loc[hexes_gdf['region'] == reg, ['hex_id', 'geometry', 'kmeans_labels']]

# `Map.choropleth` was deprecated in favour of the `folium.Choropleth` class and
# is gone from later folium releases, so the layer is built with the class and
# added to the map explicitly.
# `line_color` is the documented outline-colour argument; the original passed
# `color`, which is not a listed Choropleth parameter.
folium.Choropleth(
    geo_data=hexes_reg_kmeans[['hex_id', 'geometry']].to_json(),
    name='choropleth',
    data=hexes_reg_kmeans[['hex_id', 'kmeans_labels']],
    columns=['hex_id', 'kmeans_labels'],
    key_on='feature.properties.hex_id',  # join geometry to values on the hex id
    fill_color='YlGnBu',
    line_color='blue',
    line_weight=1,
    fill_opacity=0.8,
    line_opacity=0.2,
    legend_name='type',
    highlight=True,
).add_to(map_hexes_kmeans)

map_hexes_kmeans

# DBSCAN

In [None]:
%%time
dbscan = DBSCAN(
    eps=3,
    min_samples=3,
    metric='euclidean',
    algorithm='brute',
    leaf_size=30,
    n_jobs=-1
).fit(X)

In [63]:
# Cluster assignment produced by the DBSCAN fit, kept for scoring below.
dbscan_labels = dbscan.labels_

In [64]:
# Attach the DBSCAN label to each hex so it can be grouped and mapped.
hexes_gdf['dbscan_labels'] = dbscan.labels_

In [65]:
%%time
silhouette_score(X, dbscan_labels, metric='euclidean')

CPU times: user 2min, sys: 6min 4s, total: 8min 5s
Wall time: 1min 16s


0.9968287740692953

In [66]:
# Number of hexes per DBSCAN label.
hexes_gdf.groupby('dbscan_labels').size()

kmeans_labels
0    66031
1        1
2        1
3       12
4       71
dtype: int64

In [67]:
# Choropleth of the DBSCAN labels for a single region.
reg = '66'

map_hexes_dbscan = folium.Map(location=[58.5, 63], zoom_start=5, width=900, height=700, control_scale=True)

# Rows of the selected region only; the same subset feeds geometry and values.
hexes_reg_dbscan = hexes_gdf.loc[hexes_gdf['region'] == reg, ['hex_id', 'geometry', 'dbscan_labels']]

# `Map.choropleth` was deprecated in favour of the `folium.Choropleth` class and
# is gone from later folium releases, so the layer is built with the class and
# added to the map explicitly.
# `line_color` is the documented outline-colour argument; the original passed
# `color`, which is not a listed Choropleth parameter.
folium.Choropleth(
    geo_data=hexes_reg_dbscan[['hex_id', 'geometry']].to_json(),
    name='choropleth',
    data=hexes_reg_dbscan[['hex_id', 'dbscan_labels']],
    columns=['hex_id', 'dbscan_labels'],
    key_on='feature.properties.hex_id',  # join geometry to values on the hex id
    fill_color='YlGnBu',
    line_color='blue',
    line_weight=1,
    fill_opacity=0.8,
    line_opacity=0.2,
    legend_name='type',
    highlight=True,
).add_to(map_hexes_dbscan)

map_hexes_dbscan

# Agglomerative

In [None]:
%%time
agglomerative = AgglomerativeClustering(
    n_clusters=5,
    affinity='euclidean',
    linkage='ward'
).fit(X)

In [63]:
# Cluster assignment produced by the agglomerative fit, kept for scoring below.
agglomerative_labels = agglomerative.labels_

In [64]:
# Attach the agglomerative label to each hex so it can be grouped and mapped.
hexes_gdf['agglomerative_labels'] = agglomerative.labels_

In [65]:
%%time
# Silhouette score of the agglomerative partition, using euclidean distance on X.
silhouette_score(X, agglomerative_labels, metric='euclidean')

CPU times: user 2min, sys: 6min 4s, total: 8min 5s
Wall time: 1min 16s


0.9968287740692953

In [66]:
# Number of hexes per agglomerative cluster.
hexes_gdf.groupby('agglomerative_labels').size()

kmeans_labels
0    66031
1        1
2        1
3       12
4       71
dtype: int64

In [67]:
# Choropleth of the agglomerative-clustering labels for a single region.
reg = '66'

map_hexes_agglomerative = folium.Map(location=[58.5, 63], zoom_start=5, width=900, height=700, control_scale=True)

# Rows of the selected region only; the same subset feeds geometry and values.
hexes_reg_agglomerative = hexes_gdf.loc[
    hexes_gdf['region'] == reg,
    ['hex_id', 'geometry', 'agglomerative_labels']
]

# `Map.choropleth` was deprecated in favour of the `folium.Choropleth` class and
# is gone from later folium releases, so the layer is built with the class and
# added to the map explicitly.
# `line_color` is the documented outline-colour argument; the original passed
# `color`, which is not a listed Choropleth parameter.
folium.Choropleth(
    geo_data=hexes_reg_agglomerative[['hex_id', 'geometry']].to_json(),
    name='choropleth',
    data=hexes_reg_agglomerative[['hex_id', 'agglomerative_labels']],
    columns=['hex_id', 'agglomerative_labels'],
    key_on='feature.properties.hex_id',  # join geometry to values on the hex id
    fill_color='YlGnBu',
    line_color='blue',
    line_weight=1,
    fill_opacity=0.8,
    line_opacity=0.2,
    legend_name='type',
    highlight=True,
).add_to(map_hexes_agglomerative)

map_hexes_agglomerative