In [1]:
import pandas as pd
import numpy as np
from numpy import inf, mean, median
import math
import datetime
import json
import os
import glob
import pickle
import requests
# import requests_cache
import plotly.graph_objects as go
import plotly.express as px
import folium
from folium.plugins import MarkerCluster, HeatMap
import geopandas as gpd
import osm2geojson
import geojson
import shapely
from shapely import wkt
from shapely.geometry import Point, Polygon, LineString

In [2]:
# Let wide/long frames render in full (up to 200 rows and 200 columns).
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

# Cities

## OSM Cities

In [3]:
# NOTE(review): unpickling executes arbitrary code — only load pickle files
# that were produced by a trusted pipeline.
data_osm_cities = pd.read_pickle('data_osm_cities.pickle')

In [4]:
data_osm_cities.shape

(2652, 5)

In [5]:
data_osm_cities.head()

Unnamed: 0,region,city_name,city_lon,city_lat,city_population
0,Адыгея,Адыгейск,39.190593,44.883197,12689.0
1,Адыгея,Гиагинская,40.060501,44.873023,13954.0
2,Адыгея,Кошехабль,40.499439,44.897871,7183.0
3,Адыгея,Красногвардейское,39.581746,45.134793,9419.0
4,Адыгея,Майкоп,40.104261,44.605519,144055.0


In [6]:
data_osm_cities.region.unique()

array(['Адыгея', 'Алтайский край', 'Амурская область',
       'Архангельская область', 'Астраханская область', 'Башкортостан',
       'Белгородская область', 'Брянская область', 'Владимирская область',
       'Волгоградская область', 'Вологодская область',
       'Воронежская область', 'Дагестан', 'Еврейская автономная область',
       'Забайкальский край', 'Ивановская область', 'Ингушетия',
       'Иркутская область', 'Кабардино-Балкария',
       'Калининградская область', 'Калужская область', 'Камчатский край',
       'Карачаево-Черкесия', 'Кемеровская область', 'Кировская область',
       'Костромская область', 'Краснодарский край', 'Красноярский край',
       'Курганская область', 'Курская область', 'Ленинградская область',
       'Липецкая область', 'Магаданская область', 'Марий Эл', 'Мордовия',
       'Москва', 'Московская область', 'Мурманская область',
       'Ненецкий автономный округ', 'Нижегородская область',
       'Новгородская область', 'Новосибирская область', 'Омская об

## Regions

In [7]:
# NOTE(review): unpickling executes arbitrary code — only load pickle files
# that were produced by a trusted pipeline.
data_regions = pd.read_pickle('data_regions.pickle')

In [8]:
data_regions.shape

(85, 12)

In [9]:
data_regions.head()

Unnamed: 0,region_name,region_type,center_name,region_area,region_population,region_code,region_fo,region_density,region,center_lon,center_lat,center_population
0,Адыгея,республика,Майкоп,7792,449171,1,Южный,57.645149,Адыгея,40.104261,44.605519,144055.0
1,Алтай,республика,Горно-Алтайск,92903,213703,4,Сибирский,2.300281,Республика Алтай,85.968646,51.944865,63845.0
2,Алтайский,край,Барнаул,167996,2384812,22,Сибирский,14.195648,Алтайский край,83.749388,53.340879,632723.0
3,Амурская,область,Благовещенск,361913,809873,28,Дальневосточный,2.237756,Амурская область,127.544173,50.320583,225091.0
4,Архангельская,область,Архангельск,589913,1183323,29,Северо-Западный,2.005928,Архангельская область,40.548922,64.564142,350982.0


In [10]:
data_regions.region.unique()

array(['Адыгея', 'Республика Алтай', 'Алтайский край', 'Амурская область',
       'Архангельская область', 'Астраханская область', 'Башкортостан',
       'Белгородская область', 'Брянская область', 'Республика Бурятия',
       'Владимирская область', 'Волгоградская область',
       'Вологодская область', 'Воронежская область', 'Дагестан',
       'Еврейская автономная область', 'Забайкальский край',
       'Ивановская область', 'Ингушетия', 'Иркутская область',
       'Кабардино-Балкария', 'Калининградская область',
       'Республика Калмыкия', 'Калужская область', 'Камчатский край',
       'Карачаево-Черкесия', 'Республика Карелия', 'Кемеровская область',
       'Кировская область', 'Республика Коми', 'Костромская область',
       'Краснодарский край', 'Красноярский край', 'Республика Крым',
       'Курганская область', 'Курская область', 'Санкт-Петербург',
       'Липецкая область', 'Магаданская область', 'Марий Эл', 'Мордовия',
       'Москва', 'Мурманская область', 'Ненецкий автоно

# Points

In [11]:
# NOTE(review): unpickling executes arbitrary code — only load pickle files
# that were produced by a trusted pipeline.
data_points = pd.read_pickle('data_points.pickle')

In [12]:
data_points.shape

(19467, 34)

In [14]:
data_points.head().T

Unnamed: 0,0,1,2,3,4
index,0,1,2,3,4
date,2021-07-07 11:32:45,2021-07-07 11:32:45,2021-07-07 11:32:45,2021-07-07 11:32:45,2021-07-07 11:32:45
OwnerKLADRCode,0100200001300,0100500000900,0100500003200,0100600002500,0200000102700
count_pol,1,5,1,1,4
premium,2944.37,34148.81,7494.76,5353.4,39791.98
count_pol_with_claim,0,0,0,0,0
claim_count,0,0,0,0,0
claim_sum,0.0,0.0,0.0,0.0,0.0
paid_sum,0.0,0.0,0.0,0.0,0.0
claim_sum_infl,0.0,0.0,0.0,0.0,0.0


# Points To Cities

In [17]:
%%time
# Pair every point with every OSM city (cross join) so that all point-to-city
# distances can be computed in a single vectorised pass. The result has
# len(data_points) * len(data_osm_cities) rows, so it is memory-hungry.
data_points2cities = data_points[['index', 'latitude', 'longitude']].merge(
    data_osm_cities[['city_lon', 'city_lat', 'city_population']],
    how='cross',
)

CPU times: user 6.53 s, sys: 8.06 s, total: 14.6 s
Wall time: 14.6 s


In [18]:
data_points2cities.shape

(51626484, 6)

## Point-to-City Distance

In [19]:
def f_dist(df, col_lat_1, col_lon_1, col_lat_2, col_lon_2, col_dist, radius_km=6371.009):
    """Add a great-circle (haversine) distance column to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds both coordinate pairs. It is modified in place.
    col_lat_1, col_lon_1 : str
        Names of the columns with the latitude / longitude of the first
        point, in decimal degrees.
    col_lat_2, col_lon_2 : str
        Names of the columns with the latitude / longitude of the second
        point, in decimal degrees.
    col_dist : str
        Name of the column that receives the distance (created or
        overwritten).
    radius_km : float, default 6371.009
        Radius of the sphere; the distance is expressed in the same unit.

    Returns
    -------
    pandas.DataFrame
        The very same ``df`` object, returned for call-site convenience.
        Rows with a missing coordinate get NaN in ``col_dist``.
    """
    lat_1 = np.radians(df[col_lat_1].to_numpy())
    lon_1 = np.radians(df[col_lon_1].to_numpy())
    lat_2 = np.radians(df[col_lat_2].to_numpy())
    lon_2 = np.radians(df[col_lon_2].to_numpy())

    dlon = lon_1 - lon_2
    dlat = lat_1 - lat_2

    # Haversine term: squared half-chord length between the two points.
    a = np.sin(dlat / 2) ** 2 + np.cos(lat_1) * np.cos(lat_2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal pairs, which would make arcsin return NaN. NaN inputs
    # pass through clip unchanged, so missing coordinates still yield NaN.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))  # central angle in radians
    df[col_dist] = c * radius_km

    return df

In [20]:
%%time
# Distance from each point to each city of the cross join, stored in 'dist'.
data_points2cities = f_dist(
    df=data_points2cities,
    col_lat_1='city_lat',
    col_lon_1='city_lon',
    col_lat_2='latitude',
    col_lon_2='longitude',
    col_dist='dist',
)

CPU times: user 7.22 s, sys: 3.11 s, total: 10.3 s
Wall time: 10.3 s


In [21]:
data_points2cities.head()

Unnamed: 0,index,latitude,longitude,city_lon,city_lat,city_population,dist
0,0,44.780445,40.54908,39.190593,44.883197,12689.0,107.732841
1,0,44.780445,40.54908,40.060501,44.873023,13954.0,39.882739
2,0,44.780445,40.54908,40.499439,44.897871,7183.0,13.631258
3,0,44.780445,40.54908,39.581746,45.134793,9419.0,85.707568
4,0,44.780445,40.54908,40.104261,44.605519,144055.0,40.182897


In [24]:
# Sanity-check the distance range. Series.min()/.max() run vectorised and skip
# NaN, whereas the built-in min()/max() walk the whole Series element by
# element in Python and give order-dependent results when NaN is present.
print(f"dist_min = {data_points2cities['dist'].min()} km")
print(f"dist_max = {data_points2cities['dist'].max()} km")

dist_min = 0.0 km
dist_max = 8724.322141021768 km


## Circle 50 km

### Population In The Circle

In [23]:
# Radius of the circle around each point; compared against the 'dist' column,
# which is reported in km.
dist_max_50 = 50

In [25]:
%%time
# Total population of all cities closer than dist_max_50 to each point.
# Points without any such city do not appear in the result.
data_points2cities_pop_50 = (
    data_points2cities
    .loc[
        data_points2cities['dist'].lt(dist_max_50)
        & data_points2cities['city_population'].notna(),
        ['index', 'city_population'],
    ]
    .groupby('index')
    .sum()
    .reset_index()
)

CPU times: user 5.72 s, sys: 3.89 s, total: 9.6 s
Wall time: 9.59 s


In [26]:
# Give the aggregated column a name that states the radius it refers to.
data_points2cities_pop_50 = data_points2cities_pop_50.rename(
    columns={'city_population': 'population_50'}
)

In [27]:
data_points2cities_pop_50.shape

(17892, 2)

In [28]:
data_points2cities_pop_50.head()

Unnamed: 0,index,population_50
0,0,358584.0
1,1,1797154.0
2,2,1806573.0
3,3,1771284.0
4,4,1156872.0


### The Biggest City In The Circle

In [32]:
%%time
# For each point, the row label (in data_points2cities) of the most populous
# city closer than dist_max_50. idxmax returns labels, not population values.
data_points2cities_max_50 = (
    data_points2cities
    .loc[
        data_points2cities['dist'].lt(dist_max_50)
        & data_points2cities['city_population'].notna(),
        ['index', 'city_population'],
    ]
    .groupby('index')
    .idxmax()
    .reset_index()
)

CPU times: user 19.1 s, sys: 358 ms, total: 19.5 s
Wall time: 19.5 s


In [35]:
data_points2cities_max_50.shape

(17892, 2)

In [33]:
data_points2cities_max_50.head()

Unnamed: 0,index,city_population
0,0,4
1,1,3456
2,2,6108
3,3,8760
4,4,10803


In [34]:
# The column holds row labels produced by idxmax, so name it accordingly.
data_points2cities_max_50 = data_points2cities_max_50.rename(
    columns={'city_population': 'city_population_idxmax_50'}
)

In [37]:
%%time
# Look up the full point/city row behind each idxmax label. Both frames carry
# an 'index' column, so the result gets 'index_x' / 'index_y'.
data_points2cities_max_50 = pd.merge(
    data_points2cities_max_50,
    data_points2cities,
    how='left',
    left_on='city_population_idxmax_50',
    right_index=True,
)

CPU times: user 18 s, sys: 4.23 s, total: 22.2 s
Wall time: 22.2 s


In [38]:
data_points2cities_max_50.shape

(17892, 9)

In [39]:
data_points2cities_max_50.head()

Unnamed: 0,index_x,city_population_idxmax_50,index_y,latitude,longitude,city_lon,city_lat,city_population,dist
0,0,4,0,44.780445,40.54908,40.104261,44.605519,144055.0,40.182897
1,1,3456,1,45.024727,38.934097,38.986224,45.047522,1405000.0,4.81683
2,2,6108,2,45.006897,38.979748,38.986224,45.047522,1405000.0,4.545896
3,3,8760,3,44.98083,39.096226,38.986224,45.047522,1405000.0,11.391434
4,4,10803,4,54.564808,55.924335,55.993042,54.728227,1110976.0,18.701382


In [40]:
data_points2cities_max_50.loc[data_points2cities_max_50.index_x != data_points2cities_max_50.index_y]

Unnamed: 0,index_x,city_population_idxmax_50,index_y,latitude,longitude,city_lon,city_lat,city_population,dist


In [47]:
# Restore the point key name and tag the city attributes with the '_max_50'
# suffix before they are joined onto data_points.
data_points2cities_max_50 = data_points2cities_max_50.rename(
    columns={
        'index_x': 'index',
        'city_lon': 'city_lon_max_50',
        'city_lat': 'city_lat_max_50',
        'city_population': 'city_population_max_50',
        'dist': 'city_dist_max_50',
    }
)

In [43]:
# 'index_y' duplicates the point key (the check above shows no mismatching rows).
data_points2cities_max_50 = data_points2cities_max_50.drop(columns='index_y')

## Update Points with the 50 km Features

In [44]:
%%time
# Attach population_50; points with no city inside the circle get NaN
# because this is a left join.
data_points = pd.merge(
    data_points,
    data_points2cities_pop_50,
    how='left',
    on='index',
)

CPU times: user 39.1 ms, sys: 6.03 ms, total: 45.1 ms
Wall time: 42.8 ms


In [45]:
data_points.shape

(19467, 35)

In [48]:
%%time
# Attach coordinates, population and distance of the biggest city within the
# circle; points with no such city get NaN (left join).
data_points = pd.merge(
    data_points,
    data_points2cities_max_50[
        [
            'index',
            'city_lon_max_50',
            'city_lat_max_50',
            'city_population_max_50',
            'city_dist_max_50',
        ]
    ],
    how='left',
    on='index',
)

CPU times: user 26.9 ms, sys: 3.54 ms, total: 30.4 ms
Wall time: 27.9 ms


In [49]:
data_points.shape

(19467, 39)

In [51]:
data_points.head().T

Unnamed: 0,0,1,2,3,4
index,0,1,2,3,4
date,2021-07-07 11:32:45,2021-07-07 11:32:45,2021-07-07 11:32:45,2021-07-07 11:32:45,2021-07-07 11:32:45
OwnerKLADRCode,0100200001300,0100500000900,0100500003200,0100600002500,0200000102700
count_pol,1,5,1,1,4
premium,2944.37,34148.81,7494.76,5353.4,39791.98
count_pol_with_claim,0,0,0,0,0
claim_count,0,0,0,0,0
claim_sum,0.0,0.0,0.0,0.0,0.0
paid_sum,0.0,0.0,0.0,0.0,0.0
claim_sum_infl,0.0,0.0,0.0,0.0,0.0


# Update Regions

In [52]:
# The first two characters of OwnerKLADRCode are used as the numeric region code.
data_points['region_code'] = (
    data_points['OwnerKLADRCode']
    .str.slice(stop=2)
    .astype('int')
)

In [53]:
data_points[['region_code']].groupby('region_code').size()

region_code
1       27
2      301
3      272
4      184
5     1085
6       32
7       46
8       66
9       82
10       6
11      10
12      23
13      81
14     450
15      29
16      59
17     150
18      19
19     256
20      82
21      18
22    1236
23     883
24    1378
25     594
26     580
27     364
28     581
29       6
30      13
31     118
32      43
33      34
34     231
35      12
36      89
37      58
38    1358
39      15
40      35
41      33
42     468
43      16
44      15
45      31
46      34
47      31
48      67
49      59
50    1266
51      11
52      82
53       8
54    1125
55     369
56      50
57      13
58      46
59      18
60      26
61    1410
62      16
63      41
64    1089
65     159
66      75
67      24
68      25
69      29
70      33
71      43
72     585
73      48
74      53
75     685
76      16
77      96
78       9
79     107
83       1
86      89
87      20
89      71
91      63
92       3
99       3
dtype: int64

In [54]:
%%time
# Fold five region codes into other codes in one pass.
# NOTE(review): presumably these are codes of former regions merged into
# their successors — confirm. None of the source codes occurs in the counts
# shown above, so on this data the step changes nothing.
data_points['region_code'] = data_points['region_code'].replace(
    {80: 75, 81: 59, 82: 41, 85: 38, 88: 24}
)

CPU times: user 6.94 ms, sys: 1.11 ms, total: 8.05 ms
Wall time: 6.36 ms


In [55]:
data_points[['region_code']].groupby('region_code').size()

region_code
1       27
2      301
3      272
4      184
5     1085
6       32
7       46
8       66
9       82
10       6
11      10
12      23
13      81
14     450
15      29
16      59
17     150
18      19
19     256
20      82
21      18
22    1236
23     883
24    1378
25     594
26     580
27     364
28     581
29       6
30      13
31     118
32      43
33      34
34     231
35      12
36      89
37      58
38    1358
39      15
40      35
41      33
42     468
43      16
44      15
45      31
46      34
47      31
48      67
49      59
50    1266
51      11
52      82
53       8
54    1125
55     369
56      50
57      13
58      46
59      18
60      26
61    1410
62      16
63      41
64    1089
65     159
66      75
67      24
68      25
69      29
70      33
71      43
72     585
73      48
74      53
75     685
76      16
77      96
78       9
79     107
83       1
86      89
87      20
89      71
91      63
92       3
99       3
dtype: int64

In [56]:
len(data_points['region_code'].unique())

86

In [57]:
%%time
# Attach region attributes by region code. This is a left join, so a code that
# is absent from data_regions leaves all region columns NaN for that point;
# with 86 distinct codes in the points and 85 region rows, at least one code
# cannot match.
data_points = pd.merge(
    data_points,
    data_regions,
    how='left',
    on='region_code',
)

CPU times: user 50.9 ms, sys: 3.51 ms, total: 54.4 ms
Wall time: 51.3 ms


In [58]:
data_points.shape

(19467, 51)

## Distance from Points to Regional Centers

In [59]:
%%time
# Distance from each point to the centre of its region, stored in 'center_dist'.
data_points = f_dist(
    df=data_points,
    col_lat_1='center_lat',
    col_lon_1='center_lon',
    col_lat_2='latitude',
    col_lon_2='longitude',
    col_dist='center_dist',
)

CPU times: user 11.8 ms, sys: 0 ns, total: 11.8 ms
Wall time: 8.38 ms


In [60]:
data_points.head().T

Unnamed: 0,0,1,2,3,4
index,0,1,2,3,4
date,2021-07-07 11:32:45,2021-07-07 11:32:45,2021-07-07 11:32:45,2021-07-07 11:32:45,2021-07-07 11:32:45
OwnerKLADRCode,0100200001300,0100500000900,0100500003200,0100600002500,0200000102700
count_pol,1,5,1,1,4
premium,2944.37,34148.81,7494.76,5353.4,39791.98
count_pol_with_claim,0,0,0,0,0
claim_count,0,0,0,0,0
claim_sum,0.0,0.0,0.0,0.0,0.0
paid_sum,0.0,0.0,0.0,0.0,0.0
claim_sum_infl,0.0,0.0,0.0,0.0,0.0


In [62]:
# Plain-text snapshot of the enriched points (semicolon-separated, no row index).
data_points.to_csv(
    'data_points_1.csv',
    index=False,
    sep=';',
)

In [None]:
# Reload the CSV snapshot. OwnerKLADRCode must stay a string: with default type
# inference it is parsed as an integer, which drops the leading zeros
# (e.g. '0100200001300') and breaks the `.str[:2]` region-code extraction.
# NOTE(review): other columns (e.g. 'date') also come back with inferred
# dtypes — prefer the pickle snapshot when exact dtypes matter.
data_points = pd.read_csv(
    'data_points_1.csv',
    sep=';',
    dtype={'OwnerKLADRCode': str},
)

In [61]:
# Binary snapshot of the enriched points; unlike the CSV it keeps dtypes.
data_points.to_pickle('data_points_1.pickle', protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the pickle snapshot written by this notebook.
# NOTE(review): unpickling executes arbitrary code — only load trusted files.
data_points = pd.read_pickle('data_points_1.pickle')