# Imports

In [1]:
import datetime as dt
import getpass
import json
import math
import os
import pickle

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

In [2]:
import geopandas as gpd
import geojson
import h3
import osm2geojson
import shapely
from shapely import wkt
from shapely.geometry import Point, Polygon, LineString
from geopy.distance import geodesic, great_circle

In [3]:
import plotly.graph_objects as go
import plotly.express as px

In [4]:
import folium
from folium.plugins import MarkerCluster
from folium.plugins import HeatMap

In [5]:
# Raise the pandas display limits so long and wide frames are rendered in full.
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 300)

# Login to DWH

In [6]:
from tochkaml.db.dwh import connect_to_dwh

In [7]:
# DWH account name. Taken from the DWH_LOGIN environment variable when it is set,
# so the notebook can be run under another account without editing it; otherwise
# the previously hard-coded login is used.
dwh_login = os.environ.get('DWH_LOGIN', 'BOCHKAREV_DV')

In [8]:
# The DWH password must not be stored in the notebook. It is read from the
# DWH_PASS environment variable, or requested interactively without echo.
# NOTE(review): the password that used to be written here should be treated as
# leaked and rotated.
dwh_pass = os.environ.get('DWH_PASS') or getpass.getpass('DWH password: ')

In [9]:
# Call the project helper with the login and password and keep whatever it returns.
# NOTE(review): `dwh_connection` is not referenced in the cells visible here — confirm it is still needed.
dwh_connection = connect_to_dwh(dwh_login, dwh_pass)

Using 10.103.40.226


In [10]:
from tochkaml import storage

In [51]:
# Local directory that the CSV, Excel and parquet inputs are read from.
# NOTE(review): absolute path, and this cell was executed as In [51], i.e. after
# cells that already read from `path`; it must run before them on a fresh kernel.
path = '/home/jovyan/work'

# Region

In [11]:
reg = '66' # Yekaterinburg (region code; also used to build the input file names)

# Data

In [23]:
# Load the KLADR and FIAS coordinate tables and the INN table for region `reg`.
# Every file is first requested through `storage.personal.load` and then read
# from `path`. A failure is reported and re-raised: the earlier bare `except:`
# also caught KeyboardInterrupt and let the cell carry on with frames that were
# either undefined (fresh kernel) or left over from a previous run.
try:
    storage.personal.load(f'df_kladr_{reg}_upd.csv.gz')
    df_kladr_upd = pd.read_csv(
        f'{path}/df_kladr_{reg}_upd.csv.gz',
        sep=';',
        compression='gzip',
        dtype={'kladr_11':str, 'kladr':str}
    )
except Exception:
    print('no kladr file')
    raise

try:
    storage.personal.load(f'df_fias_{reg}_upd.csv.gz')
    df_fias_upd = pd.read_csv(
        f'{path}/df_fias_{reg}_upd.csv.gz',
        sep=';',
        compression='gzip'
    )
except Exception:
    print('no fias file')
    raise

try:
    storage.personal.load(f'df_inn_{reg}_2.csv.gz')
    df_inn = pd.read_csv(
        f'{path}/df_inn_{reg}_2.csv.gz',
        parse_dates=['begin_date','end_date','start_reg_date','end_reg_date'],
        # Identifier-like columns are kept as strings.
        dtype={
            'inn':str,
            'tax_agency_code':str,
            'end_code':str,
            'kladr_code':str,
            'kladr_11':str,
            'inn_eruz':str
        },
        sep=';',
        compression='gzip'
    )
except Exception:
    print('no inn file')
    raise

# The first 11 characters of the KLADR code are the join key into the KLADR table.
df_inn['kladr_11'] = df_inn['kladr_code'].str[:11]
df_inn = df_inn.merge(
    df_kladr_upd[['kladr_11', 'latitude', 'longitude']],
    how='left',
    on='kladr_11'
)

# FIAS coordinates are attached under separate names so they can act as a fallback.
df_inn = df_inn.merge(
    df_fias_upd[['fias_code', 'latitude', 'longitude']].rename(
        columns={'latitude':'latitude_fias', 'longitude':'longitude_fias'}
    ),
    how='left',
    on='fias_code'
)

# Keep the KLADR coordinates where present; fill the gaps from FIAS.
df_inn['latitude'] = df_inn['latitude'].fillna(df_inn['latitude_fias'])
df_inn['longitude'] = df_inn['longitude'].fillna(df_inn['longitude_fias'])
df_inn = df_inn.drop(columns=['latitude_fias', 'longitude_fias'])


2022-11-22 14:24.24 [info     ] check for directory            fs=s3fs path=df_kladr_66_upd.csv.gz
2022-11-22 14:24.24 [info     ] load file                      path=df_kladr_66_upd.csv.gz
2022-11-22 14:24.24 [info     ] open file for read             fs=s3fs path=df_kladr_66_upd.csv.gz
2022-11-22 14:24.24 [info     ] open file for write            fs=localfs path=df_kladr_66_upd.csv.gz
2022-11-22 14:24.24 [info     ] check for directory            fs=s3fs path=df_fias_66_upd.csv.gz
2022-11-22 14:24.24 [info     ] load file                      path=df_fias_66_upd.csv.gz
2022-11-22 14:24.24 [info     ] open file for read             fs=s3fs path=df_fias_66_upd.csv.gz
2022-11-22 14:24.24 [info     ] open file for write            fs=localfs path=df_fias_66_upd.csv.gz
2022-11-22 14:24.24 [info     ] check for directory            fs=s3fs path=df_inn_66_2.csv.gz
2022-11-22 14:24.24 [info     ] load file                      path=df_inn_66_2.csv.gz
2022-11-22 14:24.24 [info     ] open fil

In [24]:
df_inn.head()

Unnamed: 0,inn,tax_agency_code,begin_date,end_date,end_code,full_address,kladr_code,main_okved,inn_eruz,start_reg_date,end_reg_date,year,correction_num,assets_balance,short_borrowed_funds,long_borrowed_funds,revenue,net_profit_loss,count_2020,count_2021,begin_2021,end_2021,fias_code,kladr_11,latitude,longitude
0,6670103683,6670,2005-12-02,2022-03-11,415.0,"620049, ОБЛАСТЬ СВЕРДЛОВСКАЯ, Г. ЕКАТЕРИНБУРГ,...",660000010000805,72.19,,NaT,NaT,,-1,0,0,0,0,0,1,1,0,0,,66000001000,56.838607,60.605514
1,6670469092,6670,2018-06-13,2021-03-12,407.0,"620137, ОБЛАСТЬ СВЕРДЛОВСКАЯ, ГОРОД ЕКАТЕРИНБУ...",660000010001077,74.10,,NaT,NaT,,-1,0,0,0,0,0,1,1,0,1,,66000001000,56.838607,60.605514
2,6634002707,6676,2002-11-25,NaT,,"623950, ОБЛАСТЬ СВЕРДЛОВСКАЯ, Г. ТАВДА, УЛ. РА...",660000420000030,87.90,,NaT,NaT,,-1,0,0,0,0,0,1,1,0,0,,66000042000,58.04342,65.274283
3,6672199934,6685,2005-12-01,2020-06-03,407.0,"620138, ОБЛАСТЬ СВЕРДЛОВСКАЯ, ГОРОД ЕКАТЕРИНБУ...",660000010001221,68.32.1,,NaT,NaT,,-1,0,0,0,0,0,1,0,0,0,,66000001000,56.838607,60.605514
4,6666007465,6612,2002-12-20,NaT,,"623401, ОБЛАСТЬ СВЕРДЛОВСКАЯ, Г. КАМЕНСК-УРАЛЬ...",660000220000227,87.90,6666007465.0,2021-06-05,2024-06-05,,-1,0,0,0,0,0,1,1,0,0,,66000022000,56.414897,61.918905


In [25]:
df_inn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 139883 entries, 0 to 139882
Data columns (total 26 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   inn                   139883 non-null  object        
 1   tax_agency_code       139883 non-null  object        
 2   begin_date            139883 non-null  datetime64[ns]
 3   end_date              33363 non-null   datetime64[ns]
 4   end_code              33363 non-null   object        
 5   full_address          139883 non-null  object        
 6   kladr_code            122700 non-null  object        
 7   main_okved            137998 non-null  object        
 8   inn_eruz              16291 non-null   object        
 9   start_reg_date        16218 non-null   datetime64[ns]
 10  end_reg_date          11236 non-null   datetime64[ns]
 11  year                  83693 non-null   float64       
 12  correction_num        139883 non-null  int64         
 13 

# Data With Neighbours

In [52]:
# Request the region-neighbours lookup workbook through the project storage helper.
storage.personal.load('spr_regions_neighbours.xlsx')

2022-11-22 14:34.43 [info     ] check for directory            fs=s3fs path=spr_regions_neighbours.xlsx
2022-11-22 14:34.43 [info     ] load file                      path=spr_regions_neighbours.xlsx
2022-11-22 14:34.43 [info     ] open file for read             fs=s3fs path=spr_regions_neighbours.xlsx
2022-11-22 14:34.43 [info     ] open file for write            fs=localfs path=spr_regions_neighbours.xlsx


In [57]:
# Read the region -> neighbouring-region lookup from the workbook in `path`.
# Both code columns are read as strings so they can be compared with `reg`,
# which is itself a string.
spr_neighbours = pd.read_excel(
    f'{path}/spr_regions_neighbours.xlsx',
    sheet_name='regions_neighbours',
    dtype={'region_code':str, 'region_neighbour_code':str}
)

In [58]:
spr_neighbours

Unnamed: 0,region,region_code,region_neighbour,region_neighbour_code
0,Свердловская,66,Пермский,59
1,Свердловская,66,Коми,11
2,Свердловская,66,Ханты-Мансийский - Югра,86
3,Свердловская,66,Тюменская,72
4,Свердловская,66,Курганская,45
5,Свердловская,66,Челябинская,74
6,Свердловская,66,Башкортостан,2
7,Челябинская,74,Свердловская,66
8,Челябинская,74,Курганская,45
9,Челябинская,74,Оренбургская,56


In [61]:
# Codes of all regions listed in the lookup as neighbours of `reg`.
is_current_region = spr_neighbours['region_code'] == reg
reg_neighbours = spr_neighbours.loc[is_current_region, 'region_neighbour_code'].tolist()

In [62]:
reg_neighbours

['59', '11', '86', '72', '45', '74', '02']

In [63]:
def _load_region_inn(region_code):
    """Load one region's INN table and attach coordinates to it.

    The KLADR, FIAS and INN files of ``region_code`` are requested through
    ``storage.personal.load`` and read from ``path``. KLADR coordinates are
    joined on the first 11 characters of ``kladr_code``; FIAS coordinates are
    joined on ``fias_code`` and used only where KLADR gave none.

    All frames are local to this function, so the home region's
    ``df_kladr_upd`` / ``df_fias_upd`` are no longer overwritten, and a failed
    load can no longer fall through to a table left over from another region.

    Parameters
    ----------
    region_code : str
        Region code as it appears in the file names.

    Returns
    -------
    pandas.DataFrame
        The INN table with ``kladr_11``, ``latitude`` and ``longitude`` columns.

    Raises
    ------
    Exception
        Whatever the storage helper or ``pd.read_csv`` raised; the short
        message is printed first and the original error is re-raised.
    """
    try:
        storage.personal.load(f'df_kladr_{region_code}_upd.csv.gz')
        kladr = pd.read_csv(
            f'{path}/df_kladr_{region_code}_upd.csv.gz',
            sep=';',
            compression='gzip',
            dtype={'kladr_11':str, 'kladr':str}
        )
    except Exception:
        print('no kladr file')
        raise

    try:
        storage.personal.load(f'df_fias_{region_code}_upd.csv.gz')
        fias = pd.read_csv(
            f'{path}/df_fias_{region_code}_upd.csv.gz',
            sep=';',
            compression='gzip'
        )
    except Exception:
        print('no fias file')
        raise

    try:
        storage.personal.load(f'df_inn_{region_code}_2.csv.gz')
        inn = pd.read_csv(
            f'{path}/df_inn_{region_code}_2.csv.gz',
            parse_dates=['begin_date','end_date','start_reg_date','end_reg_date'],
            dtype={
                'inn':str,
                'tax_agency_code':str,
                'end_code':str,
                'kladr_code':str,
                'kladr_11':str,
                'inn_eruz':str
            },
            sep=';',
            compression='gzip'
        )
    except Exception:
        print('no inn file')
        raise

    inn['kladr_11'] = inn['kladr_code'].str[:11]
    inn = inn.merge(
        kladr[['kladr_11', 'latitude', 'longitude']],
        how='left',
        on='kladr_11'
    )
    inn = inn.merge(
        fias[['fias_code', 'latitude', 'longitude']].rename(
            columns={'latitude':'latitude_fias', 'longitude':'longitude_fias'}
        ),
        how='left',
        on='fias_code'
    )

    # Keep the KLADR coordinates where present; fill the gaps from FIAS.
    inn['latitude'] = inn['latitude'].fillna(inn['latitude_fias'])
    inn['longitude'] = inn['longitude'].fillna(inn['longitude_fias'])
    return inn.drop(columns=['latitude_fias', 'longitude_fias'])


# One prepared INN frame per neighbouring region, keyed by region code.
df_inn_neighbour = {}

for reg_n in reg_neighbours:
    df_inn_neighbour[reg_n] = _load_region_inn(reg_n)


2022-11-22 14:45.18 [info     ] check for directory            fs=s3fs path=df_kladr_59_upd.csv.gz
2022-11-22 14:45.18 [info     ] load file                      path=df_kladr_59_upd.csv.gz
2022-11-22 14:45.18 [info     ] open file for read             fs=s3fs path=df_kladr_59_upd.csv.gz
2022-11-22 14:45.18 [info     ] open file for write            fs=localfs path=df_kladr_59_upd.csv.gz
2022-11-22 14:45.18 [info     ] check for directory            fs=s3fs path=df_fias_59_upd.csv.gz
2022-11-22 14:45.18 [info     ] load file                      path=df_fias_59_upd.csv.gz
2022-11-22 14:45.18 [info     ] open file for read             fs=s3fs path=df_fias_59_upd.csv.gz
2022-11-22 14:45.18 [info     ] open file for write            fs=localfs path=df_fias_59_upd.csv.gz
2022-11-22 14:45.19 [info     ] check for directory            fs=s3fs path=df_inn_59_2.csv.gz
2022-11-22 14:45.19 [info     ] load file                      path=df_inn_59_2.csv.gz
2022-11-22 14:45.19 [info     ] open fil

In [72]:
# Report the shape of the home-region frame and of every neighbour frame,
# then the total number of rows across all of them.
divider = '------------'

print(divider)
print(reg)
print(df_inn.shape)

for reg_n in reg_neighbours:
    print(divider, reg_n, df_inn_neighbour[reg_n].shape, sep='\n')
print(divider)

shape_sum = len(df_inn) + sum(len(df_inn_neighbour[code]) for code in reg_neighbours)
print(f'shape_sum: {shape_sum}')

------------
66
(139883, 27)
------------
59
(65902, 26)
------------
11
(17138, 26)
------------
86
(33393, 26)
------------
72
(42951, 26)
------------
45
(12971, 26)
------------
74
(87101, 26)
------------
02
(89742, 26)
------------
shape_sum: 489081


In [73]:
# Stack the home region and all neighbouring regions into one frame with a
# fresh 0..n-1 index. A single concat replaces the former concat-per-iteration
# loop, which copied the accumulated frame again for every region. The result
# is also always a new object, so columns added to it later cannot end up in
# `df_inn` (previously possible when `reg_neighbours` was empty).
df_inn_neighbours = pd.concat(
    [df_inn] + [df_inn_neighbour[reg_n] for reg_n in reg_neighbours],
    axis=0,
    join='outer',
    ignore_index=True
)

In [74]:
df_inn_neighbours.shape

(489081, 27)

# Hexes

In [29]:
# H3 resolution passed to h3.geo_to_h3; it is also part of the `h3_id_<resolution>` column name.
h3_resolution = 5

In [30]:
# Request the region's hexagon file through the project storage helper.
# int(reg) drops any leading zero from the region code when building the file name.
storage.personal.load(f'{int(reg)}.parquet.gzip')

2022-11-22 14:25.21 [info     ] check for directory            fs=s3fs path=66.parquet.gzip
2022-11-22 14:25.21 [info     ] load file                      path=66.parquet.gzip
2022-11-22 14:25.21 [info     ] open file for read             fs=s3fs path=66.parquet.gzip
2022-11-22 14:25.21 [info     ] open file for write            fs=localfs path=66.parquet.gzip


In [31]:
# Read the region's hexagons from `path`; the file name uses int(reg), i.e. the code without a leading zero.
hexes_gdf = gpd.read_parquet(f'{path}/{int(reg)}.parquet.gzip')

In [32]:
hexes_gdf.head()

Unnamed: 0,hex_id,geometry
0,8510ce6bfffffff,"POLYGON ((63.88577 57.68141, 63.96742 57.76498..."
1,8510c033fffffff,"POLYGON ((59.87681 58.28886, 59.80328 58.20345..."
2,8510c18bfffffff,"POLYGON ((60.58101 58.89537, 60.50473 58.81061..."
3,8510dd47fffffff,"POLYGON ((63.18246 56.76335, 63.27692 56.68569..."
4,8510ee67fffffff,"POLYGON ((59.82744 59.96634, 60.01239 59.97814..."


In [33]:
hexes_gdf.shape

(690, 2)

In [34]:
hexes_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   hex_id    690 non-null    object  
 1   geometry  690 non-null    geometry
dtypes: geometry(1), object(1)
memory usage: 10.9+ KB


In [35]:
hexes_gdf['hex_id'].nunique()

690

In [36]:
def _hex_style(feature):
    """Return the fixed style used for every hexagon layer (the feature is ignored)."""
    return {
        "fillColor": "red",
        "color": "blue",
        "weight": 1,
        "fillOpacity": 0.5,
    }


# Base map, then one GeoJson layer per hexagon geometry.
map_h3_hexes = folium.Map(
    location=[58.5, 63],
    zoom_start=5,
    width=900,
    height=700,
    control_scale=True,
)

for hex_polygon in hexes_gdf['geometry']:
    hex_layer = folium.GeoJson(hex_polygon.__geo_interface__, style_function=_hex_style)
    hex_layer.add_to(map_h3_hexes)

map_h3_hexes

In [37]:
# Give every row the id of the H3 cell (at `h3_resolution`) that contains its
# latitude/longitude. The ids are built in one pass over the two coordinate
# columns and assigned positionally. The former `iterrows()` loop did one
# `.loc` write per row, which is very slow on a frame of this size and would
# write to several rows at once if index labels were ever duplicated.
df_inn[f'h3_id_{h3_resolution}'] = [
    h3.geo_to_h3(lat=lat, lng=lng, resolution=h3_resolution)
    for lat, lng in zip(df_inn['latitude'], df_inn['longitude'])
]

In [38]:
df_inn[f'h3_id_{h3_resolution}'].nunique()

331

In [75]:
# Give every row of the combined frame the id of the H3 cell (at
# `h3_resolution`) that contains its latitude/longitude. The ids are built in
# one pass over the two coordinate columns and assigned positionally, instead
# of an `iterrows()` loop with one `.loc` write per row.
df_inn_neighbours[f'h3_id_{h3_resolution}'] = [
    h3.geo_to_h3(lat=lat, lng=lng, resolution=h3_resolution)
    for lat, lng in zip(df_inn_neighbours['latitude'], df_inn_neighbours['longitude'])
]

In [76]:
df_inn_neighbours[f'h3_id_{h3_resolution}'].nunique()

2221

In [39]:
# Sum the balance-sheet and count indicators of the home region per H3 cell.
h3_column = f'h3_id_{h3_resolution}'
indicator_columns = [
    'assets_balance',
    'short_borrowed_funds',
    'long_borrowed_funds',
    'revenue',
    'net_profit_loss',
    'count_2020',
    'count_2021',
    'begin_2021',
    'end_2021',
]

df_inn_gr = (
    df_inn[indicator_columns + [h3_column]]
    .groupby(h3_column, as_index=False)
    .sum()
)

In [40]:
df_inn_gr.shape

(331, 10)

In [41]:
df_inn_gr.head()

Unnamed: 0,h3_id_5,assets_balance,short_borrowed_funds,long_borrowed_funds,revenue,net_profit_loss,count_2020,count_2021,begin_2021,end_2021
0,0,206732423803,35051377043,79361323964,97683833618,9247847057,771,1071,309,17
1,8510c007fffffff,226449715,11265000,15451027,379768448,54758964,67,66,4,9
2,8510c00bfffffff,16726000,113000,0,44986000,807000,6,6,0,1
3,8510c00ffffffff,1864000,0,0,7432000,-183000,3,4,1,0
4,8510c017fffffff,0,0,0,0,0,2,2,0,0


In [77]:
# Sum the same indicators per H3 cell, this time over the home region plus its neighbours.
h3_column = f'h3_id_{h3_resolution}'
indicator_columns = [
    'assets_balance',
    'short_borrowed_funds',
    'long_borrowed_funds',
    'revenue',
    'net_profit_loss',
    'count_2020',
    'count_2021',
    'begin_2021',
    'end_2021',
]

df_inn_neighbours_gr = (
    df_inn_neighbours[indicator_columns + [h3_column]]
    .groupby(h3_column, as_index=False)
    .sum()
)

In [78]:
df_inn_neighbours_gr.shape

(2221, 10)

In [79]:
df_inn_neighbours_gr.head()

Unnamed: 0,h3_id_5,assets_balance,short_borrowed_funds,long_borrowed_funds,revenue,net_profit_loss,count_2020,count_2021,begin_2021,end_2021
0,0,566007643084,58467381132,232115174794,293623534783,31816791117,3051,4187,1257,148
1,850aa537fffffff,139468000,0,193000,38139000,289000,1,1,0,0
2,850b000ffffffff,88420564,620575,0,158490579,7980684,52,44,0,18
3,850b0053fffffff,2159372602,80195507,118205000,3168630340,218241211,122,110,1,5
4,850b005bfffffff,15835,0,0,8667,-439,4,7,3,0


In [43]:
hexes_gdf.shape

(690, 2)

In [44]:
# Attach the home-region aggregates to every hexagon; hexagons without a
# matching H3 id keep their row and get NaN in the aggregate columns.
hex_shapes = hexes_gdf[['geometry', 'hex_id']]
hexes_gdf_reg = hex_shapes.merge(
    df_inn_gr,
    left_on='hex_id',
    right_on=f'h3_id_{h3_resolution}',
    how='left',
)

In [45]:
hexes_gdf_reg.shape

(690, 12)

In [80]:
hexes_gdf_reg.head()

Unnamed: 0,geometry,hex_id,h3_id_5,assets_balance,short_borrowed_funds,long_borrowed_funds,revenue,net_profit_loss,count_2020,count_2021,begin_2021,end_2021
0,"POLYGON ((63.88577 57.68141, 63.96742 57.76498...",8510ce6bfffffff,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"POLYGON ((59.87681 58.28886, 59.80328 58.20345...",8510c033fffffff,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"POLYGON ((60.58101 58.89537, 60.50473 58.81061...",8510c18bfffffff,8510c18bfffffff,276173305.0,6902000.0,40420000.0,498560428.0,31825240.0,100.0,91.0,2.0,11.0
3,"POLYGON ((63.18246 56.76335, 63.27692 56.68569...",8510dd47fffffff,8510dd47fffffff,100.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
4,"POLYGON ((59.82744 59.96634, 60.01239 59.97814...",8510ee67fffffff,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
hexes_gdf_reg.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   geometry              690 non-null    geometry
 1   hex_id                690 non-null    object  
 2   h3_id_5               317 non-null    object  
 3   assets_balance        317 non-null    float64 
 4   short_borrowed_funds  317 non-null    float64 
 5   long_borrowed_funds   317 non-null    float64 
 6   revenue               317 non-null    float64 
 7   net_profit_loss       317 non-null    float64 
 8   count_2020            317 non-null    float64 
 9   count_2021            317 non-null    float64 
 10  begin_2021            317 non-null    float64 
 11  end_2021              317 non-null    float64 
dtypes: float64(9), geometry(1), object(2)
memory usage: 70.1+ KB


In [48]:
# Replace the NaN left by the left merge with 0, modifying the frame in place.
# NOTE(review): fillna(0) applies to every column, so for hexes without a match
# the string id column from the merge also receives the integer 0.
hexes_gdf_reg.fillna(0, inplace=True)

In [49]:
hexes_gdf_reg.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   geometry              690 non-null    geometry
 1   hex_id                690 non-null    object  
 2   h3_id_5               690 non-null    object  
 3   assets_balance        690 non-null    float64 
 4   short_borrowed_funds  690 non-null    float64 
 5   long_borrowed_funds   690 non-null    float64 
 6   revenue               690 non-null    float64 
 7   net_profit_loss       690 non-null    float64 
 8   count_2020            690 non-null    float64 
 9   count_2021            690 non-null    float64 
 10  begin_2021            690 non-null    float64 
 11  end_2021              690 non-null    float64 
dtypes: float64(9), geometry(1), object(2)
memory usage: 70.1+ KB


In [81]:
hexes_gdf.shape

(690, 2)

In [82]:
# Attach the region-plus-neighbours aggregates to every hexagon; hexagons
# without a matching H3 id keep their row and get NaN in the aggregate columns.
hex_shapes = hexes_gdf[['geometry', 'hex_id']]
hexes_gdf_reg_neighbours = hex_shapes.merge(
    df_inn_neighbours_gr,
    left_on='hex_id',
    right_on=f'h3_id_{h3_resolution}',
    how='left',
)

In [83]:
hexes_gdf_reg_neighbours.shape

(690, 12)

In [84]:
hexes_gdf_reg_neighbours.head()

Unnamed: 0,geometry,hex_id,h3_id_5,assets_balance,short_borrowed_funds,long_borrowed_funds,revenue,net_profit_loss,count_2020,count_2021,begin_2021,end_2021
0,"POLYGON ((63.88577 57.68141, 63.96742 57.76498...",8510ce6bfffffff,,,,,,,,,,
1,"POLYGON ((59.87681 58.28886, 59.80328 58.20345...",8510c033fffffff,,,,,,,,,,
2,"POLYGON ((60.58101 58.89537, 60.50473 58.81061...",8510c18bfffffff,8510c18bfffffff,276173305.0,6902000.0,40420000.0,498560428.0,31825240.0,100.0,91.0,2.0,11.0
3,"POLYGON ((63.18246 56.76335, 63.27692 56.68569...",8510dd47fffffff,8510dd47fffffff,100.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
4,"POLYGON ((59.82744 59.96634, 60.01239 59.97814...",8510ee67fffffff,,,,,,,,,,


In [85]:
hexes_gdf_reg_neighbours.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   geometry              690 non-null    geometry
 1   hex_id                690 non-null    object  
 2   h3_id_5               324 non-null    object  
 3   assets_balance        324 non-null    float64 
 4   short_borrowed_funds  324 non-null    float64 
 5   long_borrowed_funds   324 non-null    float64 
 6   revenue               324 non-null    float64 
 7   net_profit_loss       324 non-null    float64 
 8   count_2020            324 non-null    float64 
 9   count_2021            324 non-null    float64 
 10  begin_2021            324 non-null    float64 
 11  end_2021              324 non-null    float64 
dtypes: float64(9), geometry(1), object(2)
memory usage: 70.1+ KB


In [86]:
# Replace the NaN left by the left merge with 0, modifying the frame in place.
# NOTE(review): fillna(0) applies to every column, so for hexes without a match
# the string id column from the merge also receives the integer 0.
hexes_gdf_reg_neighbours.fillna(0, inplace=True)

In [87]:
hexes_gdf_reg_neighbours.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   geometry              690 non-null    geometry
 1   hex_id                690 non-null    object  
 2   h3_id_5               690 non-null    object  
 3   assets_balance        690 non-null    float64 
 4   short_borrowed_funds  690 non-null    float64 
 5   long_borrowed_funds   690 non-null    float64 
 6   revenue               690 non-null    float64 
 7   net_profit_loss       690 non-null    float64 
 8   count_2020            690 non-null    float64 
 9   count_2021            690 non-null    float64 
 10  begin_2021            690 non-null    float64 
 11  end_2021              690 non-null    float64 
dtypes: float64(9), geometry(1), object(2)
memory usage: 70.1+ KB


In [89]:
# Put the home-region aggregates (left) and the region-plus-neighbours
# aggregates (right) side by side for each hexagon. Apart from the key, both
# frames have the same column names, so pandas adds its default suffixes:
# `_x` for the left frame and `_y` for the right one (geometry included).
hexes_gdf_compare = hexes_gdf_reg.merge(
    hexes_gdf_reg_neighbours,
    how = 'left',
    on = 'hex_id'
)

In [90]:
hexes_gdf_compare.shape

(690, 23)

In [92]:
hexes_gdf_compare.head()

Unnamed: 0,geometry_x,hex_id,h3_id_5_x,assets_balance_x,short_borrowed_funds_x,long_borrowed_funds_x,revenue_x,net_profit_loss_x,count_2020_x,count_2021_x,begin_2021_x,end_2021_x,geometry_y,h3_id_5_y,assets_balance_y,short_borrowed_funds_y,long_borrowed_funds_y,revenue_y,net_profit_loss_y,count_2020_y,count_2021_y,begin_2021_y,end_2021_y
0,"POLYGON ((63.88577 57.68141, 63.96742 57.76498...",8510ce6bfffffff,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"POLYGON ((63.88577 57.68141, 63.96742 57.76498...",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"POLYGON ((59.87681 58.28886, 59.80328 58.20345...",8510c033fffffff,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"POLYGON ((59.87681 58.28886, 59.80328 58.20345...",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"POLYGON ((60.58101 58.89537, 60.50473 58.81061...",8510c18bfffffff,8510c18bfffffff,276173305.0,6902000.0,40420000.0,498560428.0,31825240.0,100.0,91.0,2.0,11.0,"POLYGON ((60.58101 58.89537, 60.50473 58.81061...",8510c18bfffffff,276173305.0,6902000.0,40420000.0,498560428.0,31825240.0,100.0,91.0,2.0,11.0
3,"POLYGON ((63.18246 56.76335, 63.27692 56.68569...",8510dd47fffffff,8510dd47fffffff,100.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,"POLYGON ((63.18246 56.76335, 63.27692 56.68569...",8510dd47fffffff,100.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
4,"POLYGON ((59.82744 59.96634, 60.01239 59.97814...",8510ee67fffffff,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"POLYGON ((59.82744 59.96634, 60.01239 59.97814...",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
# Number of hexagons whose 2021 count changes once neighbouring regions are included.
count_differs = hexes_gdf_compare['count_2021_x'] != hexes_gdf_compare['count_2021_y']
int(count_differs.sum())

13

In [96]:
# Hexagons with matched data: 324 with neighbours included minus 317 for the home
# region alone (the non-null counts shown by the two .info() outputs before fillna).
324 - 317

7

In [95]:
hexes_gdf_compare.loc[hexes_gdf_compare['count_2021_x'] != hexes_gdf_compare['count_2021_y']]

Unnamed: 0,geometry_x,hex_id,h3_id_5_x,assets_balance_x,short_borrowed_funds_x,long_borrowed_funds_x,revenue_x,net_profit_loss_x,count_2020_x,count_2021_x,begin_2021_x,end_2021_x,geometry_y,h3_id_5_y,assets_balance_y,short_borrowed_funds_y,long_borrowed_funds_y,revenue_y,net_profit_loss_y,count_2020_y,count_2021_y,begin_2021_y,end_2021_y
30,"POLYGON ((58.81363 57.81098, 58.88419 57.89703...",8510c0c3fffffff,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"POLYGON ((58.81363 57.81098, 58.88419 57.89703...",8510c0c3fffffff,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0
149,"POLYGON ((58.02189 57.01961, 57.91866 57.09240...",8510c647fffffff,8510c647fffffff,190656000.0,0.0,440000.0,106139000.0,6338000.0,3.0,3.0,0.0,0.0,"POLYGON ((58.02189 57.01961, 57.91866 57.09240...",8510c647fffffff,190656000.0,0.0,440000.0,106139000.0,6338000.0,4.0,4.0,0.0,0.0
158,"POLYGON ((57.35288 56.55595, 57.52089 56.57055...",8510d597fffffff,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"POLYGON ((57.35288 56.55595, 57.52089 56.57055...",8510d597fffffff,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
217,"POLYGON ((61.40886 56.43579, 61.23839 56.42643...",8510dc1bfffffff,8510dc1bfffffff,122921000.0,22178000.0,0.0,29636000.0,-14179000.0,3.0,3.0,0.0,0.0,"POLYGON ((61.40886 56.43579, 61.23839 56.42643...",8510dc1bfffffff,222247000.0,29234000.0,1050000.0,61326000.0,-11515000.0,8.0,8.0,0.0,1.0
264,"POLYGON ((60.92254 56.24530, 61.01940 56.16927...",8510dccffffffff,8510dccffffffff,17476000.0,500000.0,124000.0,61849000.0,-7105000.0,11.0,11.0,0.0,0.0,"POLYGON ((60.92254 56.24530, 61.01940 56.16927...",8510dccffffffff,17476000.0,500000.0,124000.0,61849000.0,-7105000.0,12.0,12.0,0.0,0.0
277,"POLYGON ((65.25551 56.99093, 65.08088 56.98679...",8510d9bbfffffff,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"POLYGON ((65.25551 56.99093, 65.08088 56.98679...",8510d9bbfffffff,16815000.0,0.0,167000.0,8177000.0,2521000.0,9.0,9.0,0.0,0.0
348,"POLYGON ((64.81394 57.06122, 64.98878 57.06574...",8510ca4bfffffff,8510ca4bfffffff,484827000.0,76810000.0,206975000.0,143974000.0,-646000.0,19.0,19.0,0.0,1.0,"POLYGON ((64.81394 57.06122, 64.98878 57.06574...",8510ca4bfffffff,495075023.0,76810000.0,207033000.0,157199493.0,-1361003.0,25.0,25.0,1.0,1.0
349,"POLYGON ((62.18771 56.39401, 62.01694 56.38572...",8510dc77fffffff,8510dc77fffffff,139402313.0,6837000.0,5590000.0,137891989.0,-8938534.0,30.0,28.0,1.0,4.0,"POLYGON ((62.18771 56.39401, 62.01694 56.38572...",8510dc77fffffff,521958314.0,8071000.0,5590000.0,619200989.0,-4297534.0,34.0,32.0,2.0,5.0
431,"POLYGON ((57.44587 56.88984, 57.54926 56.81737...",8510c643fffffff,8510c643fffffff,271006136.0,0.0,15265000.0,155448000.0,51237000.0,11.0,11.0,0.0,1.0,"POLYGON ((57.44587 56.88984, 57.54926 56.81737...",8510c643fffffff,271016136.0,0.0,15265000.0,155448000.0,51237000.0,14.0,13.0,0.0,1.0
452,"POLYGON ((57.36341 56.14873, 57.46513 56.07590...",8510d5c7fffffff,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"POLYGON ((57.36341 56.14873, 57.46513 56.07590...",8510d5c7fffffff,50.0,0.0,0.0,310.0,80.0,7.0,7.0,0.0,3.0


In [99]:
# Extra 2021 count contributed by neighbouring regions, for the hexagons where the counts differ.
count_differs = hexes_gdf_compare['count_2021_x'] != hexes_gdf_compare['count_2021_y']
changed_hexes = hexes_gdf_compare.loc[count_differs]
changed_hexes['count_2021_y'] - changed_hexes['count_2021_x']

30      2.0
149     1.0
158     1.0
217     5.0
264     1.0
277     9.0
348     6.0
349     4.0
431     2.0
452     7.0
490     2.0
526     5.0
535    14.0
dtype: float64

In [100]:
# Extra revenue contributed by neighbouring regions, for the hexagons where the 2021 counts differ.
count_differs = hexes_gdf_compare['count_2021_x'] != hexes_gdf_compare['count_2021_y']
changed_hexes = hexes_gdf_compare.loc[count_differs]
changed_hexes['revenue_y'] - changed_hexes['revenue_x']

30             0.0
149            0.0
158            0.0
217     31690000.0
264            0.0
277      8177000.0
348     13225493.0
349    481309000.0
431            0.0
452          310.0
490            0.0
526        79000.0
535      8821776.0
dtype: float64

In [101]:
# Extra assets balance contributed by neighbouring regions, for the hexagons where the 2021 counts differ.
count_differs = hexes_gdf_compare['count_2021_x'] != hexes_gdf_compare['count_2021_y']
changed_hexes = hexes_gdf_compare.loc[count_differs]
changed_hexes['assets_balance_y'] - changed_hexes['assets_balance_x']

30             0.0
149            0.0
158            0.0
217     99326000.0
264            0.0
277     16815000.0
348     10248023.0
349    382556001.0
431        10000.0
452           50.0
490        18010.0
526        27000.0
535      2053000.0
dtype: float64