# Imports

In [1]:
import pandas as pd
import numpy as np
import pickle
import math
import datetime as dt
import json
import requests
from tqdm import tqdm

In [2]:
import geopandas as gpd
import geojson
import h3
import osm2geojson
import shapely
from shapely import wkt
from shapely.geometry import Point, Polygon, LineString
from geopy.distance import geodesic, great_circle

In [3]:
import plotly.graph_objects as go
import plotly.express as px

In [4]:
import folium
from folium.plugins import MarkerCluster
from folium.plugins import HeatMap

In [5]:
pd.options.display.max_rows = 300
pd.options.display.max_columns = 300

# Login to DWH

In [2]:
from tochka.datalab.datasources import connect_to_dwh

In [3]:
dwh_login = 'BOCHKAREV_DV'

In [4]:
dwh_pass = 'r5eN3Y6z'

In [5]:
dwh_connection = connect_to_dwh(dwh_login, dwh_pass)

# Region

In [6]:
# reg = 72 # Тюмень
# reg = 45 # Курган
reg = 50 # Московская

# EGRUL

In [7]:
df_inn = pd.read_sql_query(f"""
    SELECT
          inn
        , tax_agency_code
        , begin_date
        , end_date
        , end_code
        , full_address
        , kladr_code
    FROM DATAMART.EGRUL
    where
        SUBSTR(inn, 0, 2) = '{reg}'
    """,
    dwh_connection)

  % ((self.server_version_info,))


In [None]:
df_inn.shape

In [None]:
df_inn.info()

In [None]:
df_inn.head()

In [None]:
df_inn.to_csv(f'df_inn_{reg}.csv.gz', index=False, sep=';', compression='gzip')

In [7]:
df_inn = pd.read_csv(
    f'df_inn_{reg}.csv.gz',
    parse_dates=['begin_date','end_date'],
    dtype={
        'inn':str,
        'tax_agency_code':str,
        'end_code':str,
        'kladr_code':str
    },
    sep=';',
    compression='gzip'
)

In [8]:
df_inn.shape

(1216616, 7)

In [9]:
df_inn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1216616 entries, 0 to 1216615
Data columns (total 7 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   inn              1216616 non-null  object        
 1   tax_agency_code  1179424 non-null  object        
 2   begin_date       1216616 non-null  datetime64[ns]
 3   end_date         815783 non-null   datetime64[ns]
 4   end_code         815783 non-null   object        
 5   full_address     1080060 non-null  object        
 6   kladr_code       445236 non-null   object        
dtypes: datetime64[ns](2), object(5)
memory usage: 65.0+ MB


In [10]:
df_inn.head()

Unnamed: 0,inn,tax_agency_code,begin_date,end_date,end_code,full_address,kladr_code
0,5007115962,5007,2022-06-02,NaT,,"141865, МОСКОВСКАЯ ОБЛАСТЬ, РП. НЕКРАСОВСКИЙ, ...",
1,5027306482,5027,2022-06-02,NaT,,"140000, МОСКОВСКАЯ ОБЛАСТЬ, Г. ЛЮБЕРЦЫ, ПРОЕЗД...",
2,505396196740,5031,2022-06-02,NaT,,,
3,504408634580,5044,2022-06-02,NaT,,,
4,507404399235,7751,2022-06-02,NaT,,,


In [11]:
df_inn.groupby(df_inn.inn.map(len)).size()

inn
10    486995
12    729621
dtype: int64

In [12]:
df_inn.groupby(df_inn.kladr_code.apply(lambda x: len(str(x)))).size()

kladr_code
3     771380
11     62191
15    383045
dtype: int64

In [13]:
df_inn.groupby([df_inn.inn.map(len), df_inn.full_address.notnull(), df_inn.kladr_code.notnull()]).size()

inn  full_address  kladr_code
10   True          False          41759
                   True          445236
12   False         False         136556
     True          False         593065
dtype: int64

# KLADR_Code

In [14]:
len(df_inn.loc[
    (df_inn.inn.map(len)==10) &
    (df_inn.kladr_code.notnull())
])

445236

In [15]:
df_inn.loc[
    (df_inn.inn.map(len)==10) &
    (df_inn.kladr_code.notnull()),
    'kladr_code'
].nunique()

39356

In [16]:
df_inn.loc[
    (df_inn.inn.map(len)==10) &
    (df_inn.kladr_code.notnull())
].groupby(df_inn.kladr_code.apply(lambda x: len(str(x)))).size()

kladr_code
11     62191
15    383045
dtype: int64

In [17]:
df_inn['kladr_11'] = df_inn['kladr_code'].str[:11]

In [18]:
df_inn.loc[
    (df_inn.inn.map(len)==10) &
    (df_inn.kladr_code.notnull())
].head()

Unnamed: 0,inn,tax_agency_code,begin_date,end_date,end_code,full_address,kladr_code,kladr_11
1055,5029269902,5029,2022-05-30,NaT,,"141031, ОБЛАСТЬ МОСКОВСКАЯ, Д БОРОДИНО, Г. МЫТ...",50000044086,50000044086
5804,5029271700,5029,2022-08-18,NaT,,"141008, ОБЛАСТЬ МОСКОВСКАЯ, Г. МЫТИЩИ, УЛ. МИР...",500000440000188,50000044000
8602,5052004598,5050,2002-11-29,2016-12-09,201.0,"141191, ОБЛАСТЬ МОСКОВСКАЯ, ГОРОД ФРЯЗИНО, УЛИ...",500000190000005,50000019000
8603,5043013437,5043,2002-12-20,NaT,,"142207, ОБЛАСТЬ МОСКОВСКАЯ, Г. СЕРПУХОВ, УЛ. Ц...",500000280000276,50000028000
8604,5045044776,5045,2009-06-19,2015-08-28,407.0,"142800, ОБЛАСТЬ МОСКОВСКАЯ, ГОРОД СТУПИНО, УЛИ...",500340010000179,50034001000


In [19]:
for i in df_inn.loc[
    (df_inn.inn.map(len)==10) &
    (df_inn.kladr_code.notnull()),
    'full_address'
][:15]:
    print(i)

141031, ОБЛАСТЬ МОСКОВСКАЯ, Д БОРОДИНО, Г. МЫТИЩИ, СТР. 26 ЭТАЖ/ПОМЕЩЕНИЕ 1/1
141008, ОБЛАСТЬ МОСКОВСКАЯ, Г. МЫТИЩИ, УЛ. МИРА, Д. 30 ПОМЕЩ. XVII
141191, ОБЛАСТЬ МОСКОВСКАЯ, ГОРОД ФРЯЗИНО, УЛИЦА ГОРЬКОГО, 12 1, 4
142207, ОБЛАСТЬ МОСКОВСКАЯ, Г. СЕРПУХОВ, УЛ. ЦЕНТРАЛЬНАЯ, Д.160 К.8
142800, ОБЛАСТЬ МОСКОВСКАЯ, ГОРОД СТУПИНО, УЛИЦА БОЛЬШЕ-ОБРАЗЦОВСКАЯ, 10
141220, ОБЛАСТЬ МОСКОВСКАЯ, РАЙОН ПУШКИНСКИЙ, ПОСЕЛОК ЧЕЛЮСКИНСКИЙ, УЛИЦА БОЛЬШАЯ ТАРАСОВСКАЯ, 117
141220, ОБЛАСТЬ МОСКОВСКАЯ, Г ПУШКИНО, П ЧЕЛЮСКИНСКИЙ, УЛ. БОЛЬШАЯ ТАРАСОВСКАЯ, Д. 2/112 КВ. 24
140400, ОБЛАСТЬ МОСКОВСКАЯ, Г КОЛОМНА, УЛ. МОСКВОРЕЦКАЯ, Д. 14
141240, ОБЛАСТЬ МОСКОВСКАЯ, Г ПУШКИНО, УЛ ПИОНЕРСКАЯ (МАМОНТОВКА МКР.), Д. 15
142542, ОБЛАСТЬ МОСКОВСКАЯ, Д ВАСЮТИНО, Г. ПАВЛОВСКИЙ ПОСАД, ТЕР. СНТ СТРОЙБАНКОВЕЦ
142155, ОБЛАСТЬ МОСКОВСКАЯ, Г. ПОДОЛЬСК, УЛ ВОСТОЧНАЯ (ЛЬВОВСКИЙ МКР.), Д. 7Б СТР. 1, ОФИС 5
140032, ОБЛАСТЬ МОСКОВСКАЯ, РАЙОН ЛЮБЕРЕЦКИЙ, ПОСЕЛОК МАЛАХОВКА, УЛИЦА ШОССЕЙНАЯ, 40 -, -
443029, ОБЛАСТЬ САМАРСКАЯ, ГОРОД САМАРА, УЛИ

  for i in df_inn.loc[


In [20]:
df_inn.loc[
    (df_inn.inn.map(len)==10) &
    (df_inn.kladr_code.notnull()),
    'kladr_11'
].nunique()

10302

In [21]:
df_inn.loc[
    (df_inn.inn.map(len)==10) &
    (df_inn.kladr_code.notnull()) &
    (df_inn.kladr_code.str[:2]==f'{reg}'),
    'kladr_11'
].nunique()

9014

In [22]:
df_kladr = pd.DataFrame(df_inn.loc[
    (df_inn.inn.map(len)==10) &
    (df_inn.kladr_code.notnull()) &
    (df_inn.kladr_code.str[:2]==f'{reg}'),
    'kladr_11'
].unique())

In [23]:
df_kladr.head()

Unnamed: 0,0
0,50000044086
1,50000044000
2,50000019000
3,50000028000
4,50034001000


In [24]:
df_kladr.rename(columns={0:'kladr_11'}, inplace=True)

In [25]:
df_kladr.head()

Unnamed: 0,kladr_11
0,50000044086
1,50000044000
2,50000019000
3,50000028000
4,50034001000


In [26]:
df_kladr.to_csv(f'df_kladr_{reg}.csv.gz', index=False, compression='gzip')

In [27]:
df_kladr_upd = pd.read_csv(
    f'df_kladr_{reg}_upd.csv.gz',
    sep=';',
    compression='gzip',
    dtype={'kladr_11':str, 'kladr':str}
)

In [28]:
df_kladr_upd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9014 entries, 0 to 9013
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   kladr_11   9014 non-null   object 
 1   kladr      9014 non-null   object 
 2   address    8337 non-null   object 
 3   latitude   8335 non-null   float64
 4   longitude  8335 non-null   float64
dtypes: float64(2), object(3)
memory usage: 352.2+ KB


In [29]:
df_kladr_upd.head()

Unnamed: 0,kladr_11,kladr,address,latitude,longitude
0,50000044086,5000004408600,"141031, Московская обл, г Мытищи, деревня Боро...",55.925117,37.661419
1,50000044000,5000004400000,"141000, Московская обл, г Мытищи",55.910503,37.73633
2,50000019000,5000001900000,"141190, Московская обл, г Фрязино",55.958973,38.040996
3,50000028000,5000002800000,"142200, Московская обл, г Серпухов",54.922597,37.40334
4,50034001000,5003400100000,"142800, Московская обл, г Ступино",54.886274,38.078228


# Analysis

In [30]:
df_inn.groupby([df_inn.inn.map(len), df_inn.full_address.notnull(), df_inn.kladr_code.notnull()]).size()

inn  full_address  kladr_code
10   True          False          41759
                   True          445236
12   False         False         136556
     True          False         593065
dtype: int64

In [31]:
df_inn_ul = df_inn.loc[
    (df_inn.inn.map(len)==10) &
    (df_inn.kladr_code.notnull()) &
    (df_inn.kladr_code.str[:2]==f'{reg}')
].copy()

In [32]:
df_inn_ul.shape

(419162, 8)

In [33]:
df_inn_ul.head()

Unnamed: 0,inn,tax_agency_code,begin_date,end_date,end_code,full_address,kladr_code,kladr_11
1055,5029269902,5029,2022-05-30,NaT,,"141031, ОБЛАСТЬ МОСКОВСКАЯ, Д БОРОДИНО, Г. МЫТ...",50000044086,50000044086
5804,5029271700,5029,2022-08-18,NaT,,"141008, ОБЛАСТЬ МОСКОВСКАЯ, Г. МЫТИЩИ, УЛ. МИР...",500000440000188,50000044000
8602,5052004598,5050,2002-11-29,2016-12-09,201.0,"141191, ОБЛАСТЬ МОСКОВСКАЯ, ГОРОД ФРЯЗИНО, УЛИ...",500000190000005,50000019000
8603,5043013437,5043,2002-12-20,NaT,,"142207, ОБЛАСТЬ МОСКОВСКАЯ, Г. СЕРПУХОВ, УЛ. Ц...",500000280000276,50000028000
8604,5045044776,5045,2009-06-19,2015-08-28,407.0,"142800, ОБЛАСТЬ МОСКОВСКАЯ, ГОРОД СТУПИНО, УЛИ...",500340010000179,50034001000


In [34]:
df_inn_ul['count_2020'] = 0
df_inn_ul.loc[
    (df_inn_ul.begin_date.dt.year <= 2020) &
    ((df_inn_ul.end_date.isnull()) | (df_inn_ul.end_date.dt.year >= 2020)),
    'count_2020'
] = 1

In [35]:
df_inn_ul.groupby('count_2020').size()

count_2020
0    233908
1    185254
dtype: int64

In [36]:
df_inn_ul['count_2021'] = 0
df_inn_ul.loc[
    (df_inn_ul.begin_date.dt.year <= 2021) &
    ((df_inn_ul.end_date.isnull()) | (df_inn_ul.end_date.dt.year >= 2021)),
    'count_2021'
] = 1

In [37]:
df_inn_ul.groupby('count_2021').size()

count_2021
0    257001
1    162161
dtype: int64

In [38]:
df_inn_ul['begin_2021'] = 0
df_inn_ul.loc[
    (df_inn_ul.begin_date.dt.year == 2021),
    'begin_2021'
] = 1

In [39]:
df_inn_ul.groupby('begin_2021').size()

begin_2021
0    416355
1      2807
dtype: int64

In [40]:
df_inn_ul['end_2021'] = 0
df_inn_ul.loc[
    (df_inn_ul.end_date.dt.year == 2021),
    'end_2021'
] = 1

In [41]:
df_inn_ul.groupby('end_2021').size()

end_2021
0    401666
1     17496
dtype: int64

In [42]:
df_inn_ul_grouped = df_inn_ul[
    ['kladr_11', 'count_2020', 'count_2021', 'begin_2021', 'end_2021']
].groupby('kladr_11', as_index=False).sum()

In [43]:
df_inn_ul_grouped.shape

(9014, 5)

In [44]:
df_inn_ul_grouped.head()

Unnamed: 0,kladr_11,count_2020,count_2021,begin_2021,end_2021
0,50000000000,4,4,0,1
1,50000000002,0,0,0,0
2,50000000003,9,9,0,1
3,50000000006,0,0,0,0
4,50000000008,131,122,5,14


In [45]:
df_inn_ul_grouped['begin_2021_rel'] = None
df_inn_ul_grouped.loc[
    df_inn_ul_grouped.count_2021 > 0,
    'begin_2021_rel'
] = df_inn_ul_grouped['begin_2021'] / df_inn_ul_grouped['count_2021']

In [46]:
df_inn_ul_grouped['end_2021_rel'] = None
df_inn_ul_grouped.loc[
    df_inn_ul_grouped.count_2021 > 0,
    'end_2021_rel'
] = df_inn_ul_grouped['end_2021'] / df_inn_ul_grouped['count_2021']

In [47]:
df_inn_ul_grouped['count_2021_2020_diff'] = df_inn_ul_grouped['count_2021'] - df_inn_ul_grouped['count_2020']

In [48]:
df_inn_ul_grouped['count_2021_2020_rel'] = None
df_inn_ul_grouped.loc[
    df_inn_ul_grouped.count_2020 > 0,
    'count_2021_2020_rel'
] = df_inn_ul_grouped['count_2021'] / df_inn_ul_grouped['count_2020']

In [49]:
df_inn_ul_grouped.head()

Unnamed: 0,kladr_11,count_2020,count_2021,begin_2021,end_2021,begin_2021_rel,end_2021_rel,count_2021_2020_diff,count_2021_2020_rel
0,50000000000,4,4,0,1,0.0,0.25,0,1.0
1,50000000002,0,0,0,0,,,0,
2,50000000003,9,9,0,1,0.0,0.111111,0,1.0
3,50000000006,0,0,0,0,,,0,
4,50000000008,131,122,5,14,0.040984,0.114754,-9,0.931298


In [50]:
df_inn_ul_grouped.shape

(9014, 9)

In [51]:
df_inn_ul.kladr_11.nunique()

9014

In [52]:
df_kladr_upd.shape

(9014, 5)

In [53]:
df_kladr_upd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9014 entries, 0 to 9013
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   kladr_11   9014 non-null   object 
 1   kladr      9014 non-null   object 
 2   address    8337 non-null   object 
 3   latitude   8335 non-null   float64
 4   longitude  8335 non-null   float64
dtypes: float64(2), object(3)
memory usage: 352.2+ KB


In [54]:
df_kladr_upd.head()

Unnamed: 0,kladr_11,kladr,address,latitude,longitude
0,50000044086,5000004408600,"141031, Московская обл, г Мытищи, деревня Боро...",55.925117,37.661419
1,50000044000,5000004400000,"141000, Московская обл, г Мытищи",55.910503,37.73633
2,50000019000,5000001900000,"141190, Московская обл, г Фрязино",55.958973,38.040996
3,50000028000,5000002800000,"142200, Московская обл, г Серпухов",54.922597,37.40334
4,50034001000,5003400100000,"142800, Московская обл, г Ступино",54.886274,38.078228


In [55]:
df_kladr_ul = df_kladr_upd.merge(
    df_inn_ul_grouped,
    how='left',
    on='kladr_11'
)

In [56]:
df_kladr_ul.shape

(9014, 13)

In [57]:
df_kladr_ul.head().T

Unnamed: 0,0,1,2,3,4
kladr_11,50000044086,50000044000,50000019000,50000028000,50034001000
kladr,5000004408600,5000004400000,5000001900000,5000002800000,5003400100000
address,"141031, Московская обл, г Мытищи, деревня Боро...","141000, Московская обл, г Мытищи","141190, Московская обл, г Фрязино","142200, Московская обл, г Серпухов","142800, Московская обл, г Ступино"
latitude,55.925117,55.910503,55.958973,54.922597,54.886274
longitude,37.661419,37.73633,38.040996,37.40334,38.078228
count_2020,32,5613,1082,2451,15
count_2021,24,4881,945,2219,11
begin_2021,0,128,9,27,0
end_2021,2,462,100,215,1
begin_2021_rel,0.0,0.026224,0.009524,0.012168,0.0


In [58]:
df_kladr_ul[df_kladr_ul.count_2020 > 500].sort_values('count_2021_2020_rel', ascending=True)

Unnamed: 0,kladr_11,kladr,address,latitude,longitude,count_2020,count_2021,begin_2021,end_2021,begin_2021_rel,end_2021_rel,count_2021_2020_diff,count_2021_2020_rel
77,50000007000,5000000700000,"141280, Московская обл, г Пушкино, г Ивантеевка",55.974177,37.920811,713,415,34,327,0.081928,0.787952,-298,0.582048
205,50000067000,5000006700000,"141200, Московская обл, г Пушкино",56.010428,37.847155,1215,726,64,595,0.088154,0.819559,-489,0.597531
29,50000050022,5000005002200,"140050, Московская обл, г Люберцы, дп Красково",55.658886,37.98846,578,449,10,61,0.022272,0.135857,-129,0.776817
78,50000023000,5000002300000,"140090, Московская обл, г Дзержинский",55.624118,37.844083,1406,1103,24,101,0.021759,0.091568,-303,0.784495
574,50000050019,5000005001900,"140070, Московская обл, г Люберцы, рп Томилино",55.661399,37.950838,790,622,9,65,0.014469,0.104502,-168,0.787342
37,50019001000,5001900100000,"141000, Московская обл, г Мытищи",55.910503,37.73633,3345,2665,0,315,0.0,0.118199,-680,0.796712
17,50000050000,5000005000000,"140000, Московская обл, г Люберцы",55.676499,37.898125,5397,4340,106,520,0.024424,0.119816,-1057,0.80415
200,50000013000,5000001300000,"140080, Московская обл, г Лыткарино",55.577851,37.903443,919,741,18,80,0.024291,0.107962,-178,0.806311
156,50047000000,5004700000000,"141601, Московская обл, г Клин",56.342514,36.72406,2328,1902,25,218,0.013144,0.114616,-426,0.81701
16,50000049000,5000004900000,"143401, Московская обл, г Красногорск",55.831812,37.329429,6545,5370,98,504,0.01825,0.093855,-1175,0.820474


In [59]:
df_kladr_ul.to_csv(f'df_kladr_{reg}_ul.csv.gz', sep=';', index=False, compression='gzip')

# Nalog
### https://www.nalogia.ru/useful/inspektsii/regions/66.php

In [61]:
df_inn.head()

Unnamed: 0,inn,tax_agency_code,begin_date,end_date,end_code,full_address,kladr_code,kladr_11
0,450108541400,4500,2022-06-02,NaT,,,,
1,451104273046,4500,2022-05-26,NaT,,,,
2,450105604549,5032,2022-05-27,NaT,,,,
3,450126180298,4500,2022-05-27,NaT,,,,
4,451000897378,4500,2022-05-30,NaT,,,,


In [62]:
df_inn[df_inn.tax_agency_code.str[:2]==f'{reg}'].groupby('tax_agency_code').size()

tax_agency_code
4500    26797
4501    46007
4502    11296
4503       61
4504       77
4505       76
4506     2834
4507       73
4508     5439
4509       34
4510     9015
4511      525
4512     3894
4513      230
4514      171
4515      157
4516      600
4517      126
4518       93
4519       55
4520      256
4521      162
4522       48
4523       54
4524     6654
4525       69
4526     3181
dtype: int64

In [63]:
with open(f'/home/tochka/Documents/Other/Work/Geo/Nalog/nalog_urls_{reg}.pkl', 'rb') as f:
    urls = pickle.load(f)
        
with open(f'/home/tochka/Documents/Other/Work/Geo/Nalog/nalog_addrs_{reg}.pkl', 'rb') as f:
    addrs = pickle.load(f)

In [64]:
len(urls)

8

In [65]:
len(addrs)

8

In [66]:
df_nalog = pd.DataFrame(list(zip(urls, addrs)), columns=['url', 'addr'])

In [67]:
df_nalog

Unnamed: 0,url,addr
0,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641018, Курганская область, г.Курган, ул.М.Гор..."
1,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641802, Курганская область, г.Шадринск, ул.Орд..."
2,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641730, Курганская область, Далматовский район..."
3,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641910, Курганская область, Каргапольский райо..."
4,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641310, Курганская область, Кетовский район, с..."
5,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641500, Курганская область, Лебяжьевский район..."
6,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641100, Курганская область, Шумихинский район,..."
7,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641200, Курганская область, Юргамышский район,..."


In [68]:
df_nalog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   url     8 non-null      object
 1   addr    8 non-null      object
dtypes: object(2)
memory usage: 256.0+ bytes


In [69]:
df_nalog['url'][0]

'/useful/inspektsii/html/nalogovaya_inspektsiya_4501.php'

In [70]:
df_nalog['nalog_code'] = df_nalog['url'].str[-8:-4]

In [71]:
df_nalog.head()

Unnamed: 0,url,addr,nalog_code
0,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641018, Курганская область, г.Курган, ул.М.Гор...",4501
1,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641802, Курганская область, г.Шадринск, ул.Орд...",4502
2,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641730, Курганская область, Далматовский район...",4506
3,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641910, Курганская область, Каргапольский райо...",4508
4,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641310, Курганская область, Кетовский район, с...",4510


In [72]:
df_nalog['addr'][0]

'641018, Курганская область, г.Курган, ул.М.Горького, 132'

In [73]:
api_key = '97965ebc-e14d-4585-85bf-ad2c2d64cf82'

In [74]:
url = 'https://geocode-maps.yandex.ru/1.x'

In [75]:
import warnings
warnings.filterwarnings("ignore")

In [76]:
coords = []

In [79]:
for addr in tqdm(df_nalog['addr']):
    try:
        r = requests.get(f'{url}/?apikey={api_key}&geocode={addr}&format=json', verify=False)
        coord = r.json()['response']['GeoObjectCollection']['featureMember'][0]['GeoObject']['Point']['pos']
        coords.append(coord)
    except:
        coords.append(None)

100%|█████████████████████████████████████████████| 8/8 [00:06<00:00,  1.28it/s]


In [80]:
coords

['65.353784 55.43986',
 '63.606328 56.081552',
 '62.937155 56.258348',
 '64.435383 55.952293',
 '65.329099 55.333799',
 '66.496208 55.273062',
 '63.28597 55.228038',
 '64.455721 55.375398']

In [81]:
df_nalog['coord'] = coords

In [82]:
df_nalog.head()

Unnamed: 0,url,addr,nalog_code,coord
0,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641018, Курганская область, г.Курган, ул.М.Гор...",4501,65.353784 55.43986
1,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641802, Курганская область, г.Шадринск, ул.Орд...",4502,63.606328 56.081552
2,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641730, Курганская область, Далматовский район...",4506,62.937155 56.258348
3,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641910, Курганская область, Каргапольский райо...",4508,64.435383 55.952293
4,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641310, Курганская область, Кетовский район, с...",4510,65.329099 55.333799


In [83]:
df_nalog['coords'] = df_nalog['coord'].str.split()

In [84]:
df_nalog.head()

Unnamed: 0,url,addr,nalog_code,coord,coords
0,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641018, Курганская область, г.Курган, ул.М.Гор...",4501,65.353784 55.43986,"[65.353784, 55.43986]"
1,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641802, Курганская область, г.Шадринск, ул.Орд...",4502,63.606328 56.081552,"[63.606328, 56.081552]"
2,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641730, Курганская область, Далматовский район...",4506,62.937155 56.258348,"[62.937155, 56.258348]"
3,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641910, Курганская область, Каргапольский райо...",4508,64.435383 55.952293,"[64.435383, 55.952293]"
4,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641310, Курганская область, Кетовский район, с...",4510,65.329099 55.333799,"[65.329099, 55.333799]"


In [85]:
df_nalog['lat'] = df_nalog['coords'].apply(lambda x: x[1])
df_nalog['lon'] = df_nalog['coords'].apply(lambda x: x[0])

In [86]:
df_nalog.head()

Unnamed: 0,url,addr,nalog_code,coord,coords,lat,lon
0,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641018, Курганская область, г.Курган, ул.М.Гор...",4501,65.353784 55.43986,"[65.353784, 55.43986]",55.43986,65.353784
1,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641802, Курганская область, г.Шадринск, ул.Орд...",4502,63.606328 56.081552,"[63.606328, 56.081552]",56.081552,63.606328
2,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641730, Курганская область, Далматовский район...",4506,62.937155 56.258348,"[62.937155, 56.258348]",56.258348,62.937155
3,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641910, Курганская область, Каргапольский райо...",4508,64.435383 55.952293,"[64.435383, 55.952293]",55.952293,64.435383
4,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641310, Курганская область, Кетовский район, с...",4510,65.329099 55.333799,"[65.329099, 55.333799]",55.333799,65.329099


In [87]:
with open(f'/home/tochka/Documents/Other/Work/Geo/Nalog/df_nalog_{reg}.pkl', 'wb') as f:
    pickle.dump(df_nalog, f, pickle.HIGHEST_PROTOCOL)

In [88]:
df_nalog.to_csv(f'/home/tochka/Documents/Other/Work/Geo/Nalog/df_nalog_{reg}.csv', sep=';', index=False)

In [89]:
map_nalog = folium.Map(location=[58.5, 63], zoom_start=6, width=900, height=700, control_scale=True)

for index, row in df_nalog.iterrows():
    folium.CircleMarker(
        location=(row['lat'], row['lon']),
        radius = 4,
        popup=row['nalog_code'],
        fill_color='red',
        color=None,
        fill_opacity = 1
    ).add_to(map_nalog)

map_nalog

In [90]:
df_inn_ul.head()

Unnamed: 0,inn,tax_agency_code,begin_date,end_date,end_code,full_address,kladr_code,kladr_11,count_2020,count_2021,begin_2021,end_2021
674,4501142934,4501,2008-07-25,2015-01-13,304,"640003, ОБЛАСТЬ КУРГАНСКАЯ, Г. КУРГАН, УЛ. КОЛ...",450000010000187,45000001000,0,0,0,0
675,4501110570,4501,2004-09-28,2016-07-26,407,"640000, ОБЛАСТЬ КУРГАНСКАЯ, Г. КУРГАН, УЛ. ГОГ...",450000010000059,45000001000,0,0,0,0
705,4511001298,4500,2002-12-02,2022-04-07,201,"641431, ОБЛАСТЬ КУРГАНСКАЯ, Р-Н КУРТАМЫШСКИЙ, ...",450100010000090,45010001000,1,1,0,0
712,4501002454,4501,2011-08-03,2011-08-03,407,"640023, ОБЛАСТЬ КУРГАНСКАЯ, П ЗАОЗЕРНЫЙ, Г. КУ...",450000010100002,45000001010,0,0,0,0
720,4501108700,4501,2004-05-28,2018-11-30,201,"640011, ОБЛАСТЬ КУРГАНСКАЯ, ГОРОД КУРГАН, УЛИЦ...",450000010000209,45000001000,0,0,0,0


In [91]:
df_kladr_upd.head()

Unnamed: 0,kladr_11,kladr,address,latitude,longitude
0,45000001000,4500000100000,"640002, Курганская обл, г Курган",55.444415,65.316199
1,45010001000,4501000100000,"641430, Курганская обл, Куртамышский р-н, г Ку...",54.93693,64.42033
2,45000001010,4500000101000,,,
3,45000001023,4500000102300,"640016, Курганская обл, г Курган, тер Увал пос...",55.382314,65.382459
4,45009000007,4500900000700,"641322, Курганская обл, Кетовский р-н, село Вв...",55.471449,65.087452


In [92]:
df_nalog.head()

Unnamed: 0,url,addr,nalog_code,coord,coords,lat,lon
0,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641018, Курганская область, г.Курган, ул.М.Гор...",4501,65.353784 55.43986,"[65.353784, 55.43986]",55.43986,65.353784
1,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641802, Курганская область, г.Шадринск, ул.Орд...",4502,63.606328 56.081552,"[63.606328, 56.081552]",56.081552,63.606328
2,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641730, Курганская область, Далматовский район...",4506,62.937155 56.258348,"[62.937155, 56.258348]",56.258348,62.937155
3,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641910, Курганская область, Каргапольский райо...",4508,64.435383 55.952293,"[64.435383, 55.952293]",55.952293,64.435383
4,/useful/inspektsii/html/nalogovaya_inspektsiya...,"641310, Курганская область, Кетовский район, с...",4510,65.329099 55.333799,"[65.329099, 55.333799]",55.333799,65.329099


In [93]:
df_nalog_zone = df_inn_ul.loc[
    df_inn_ul.tax_agency_code.str[:2]==f'{reg}',
    ['tax_agency_code', 'kladr_11', 'count_2021']
].groupby(['tax_agency_code', 'kladr_11'], as_index=False).count().merge(
    df_kladr_upd[['kladr_11', 'latitude', 'longitude']],
    how='left',
    on='kladr_11'
).merge(
    df_nalog[['nalog_code', 'lat', 'lon']],
    how='left',
    left_on='tax_agency_code',
    right_on='nalog_code'
)

In [94]:
df_nalog_zone.head(10)

Unnamed: 0,tax_agency_code,kladr_11,count_2021,latitude,longitude,nalog_code,lat,lon
0,4500,45000001000,5183,55.444415,65.316199,,,
1,4500,45000001002,1,55.379224,65.212425,,,
2,4500,45000001010,2,,,,,
3,4500,45000002000,773,56.087042,63.629747,,,
4,4500,45002000001,29,54.944409,63.580402,,,
5,4500,45002000005,3,54.903516,62.92721,,,
6,4500,45002000009,3,55.040905,63.42918,,,
7,4500,45002000012,4,54.800315,63.242591,,,
8,4500,45002000016,1,54.995447,63.02739,,,
9,4500,45002000017,3,54.93031,63.41955,,,


In [95]:
df_nalog_zone.info(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1917 entries, 0 to 1916
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tax_agency_code  1917 non-null   object 
 1   kladr_11         1917 non-null   object 
 2   count_2021       1917 non-null   int64  
 3   latitude         1878 non-null   float64
 4   longitude        1878 non-null   float64
 5   nalog_code       1028 non-null   object 
 6   lat              1028 non-null   object 
 7   lon              1028 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 134.8+ KB


In [96]:
df_nalog_gr = df_nalog_zone.loc[
    df_nalog_zone.nalog_code.notnull(),
    ['tax_agency_code', 'count_2021', 'lat', 'lon']
].groupby(['tax_agency_code', 'lat', 'lon'], as_index=False).sum()

In [97]:
df_nalog_gr

Unnamed: 0,tax_agency_code,lat,lon,count_2021
0,4501,55.43986,65.353784,12711
1,4502,56.081552,63.606328,1976
2,4506,56.258348,62.937155,470
3,4508,55.952293,64.435383,1312
4,4510,55.333799,65.329099,2175
5,4512,55.273062,66.496208,1225
6,4524,55.228038,63.28597,1428
7,4526,55.375398,64.455721,678


In [98]:
df_inn_ul[df_inn_ul.tax_agency_code.str[:2]==f'{reg}'].groupby('tax_agency_code').size()

tax_agency_code
4500     9061
4501    12711
4502     1976
4503       25
4504       46
4505       53
4506      470
4507       36
4508     1312
4509       29
4510     2175
4511      130
4512     1225
4513       59
4514       95
4515       29
4516      187
4517       77
4518       81
4519       35
4520      131
4521       76
4522       40
4523       32
4524     1428
4525       47
4526      678
dtype: int64