In [1]:
import pandas as pd
import numpy as np
from numpy import inf, mean, median
import math
import datetime
import json
import os
import glob
import pickle
import requests
# import requests_cache
import plotly.graph_objects as go
import plotly.express as px
import folium
from folium.plugins import MarkerCluster, HeatMap
import geopandas as gpd
import osm2geojson
import geojson
import shapely
from shapely import wkt
from shapely.geometry import Point, Polygon, LineString

In [2]:
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200

# Points

In [3]:
data_points = pd.read_excel(io='data_kladr_20210706.xlsx', sheet_name='Лист1')

In [4]:
data_points.shape

(19701, 39)

In [5]:
data_points = data_points.loc[pd.notnull(data_points.longitude)]

In [6]:
data_points.shape

(19467, 39)

In [7]:
data_points.columns

Index(['date', 'OwnerKLADRCode', 'count_pol', 'premium',
       'count_pol_with_claim', 'claim_count', 'claim_sum', 'paid_sum',
       'claim_sum_infl', 'paid_sum_infl', 'address', 'FullAddress', 'Region',
       'Rayon', 'City', 'latitude', 'longitude', 'count_pol_200',
       'premium_200', 'count_pol_with_claim_200', 'claim_count_200',
       'claim_sum_200', 'paid_sum_200', 'claim_sum_infl_200',
       'paid_sum_infl_200', 'count_pol_500', 'premium_500',
       'count_pol_with_claim_500', 'claim_count_500', 'claim_sum_500',
       'paid_sum_500', 'claim_sum_infl_500', 'paid_sum_infl_500',
       'population_200', 'District_city', 'Area_city', 'District_city_dist',
       'Area_city_dist', 'Population_density'],
      dtype='object')

In [8]:
data_points.drop(['count_pol_200',
       'premium_200', 'count_pol_with_claim_200', 'claim_count_200',
       'claim_sum_200', 'paid_sum_200', 'claim_sum_infl_200',
       'paid_sum_infl_200', 'count_pol_500', 'premium_500',
       'count_pol_with_claim_500', 'claim_count_500', 'claim_sum_500',
       'paid_sum_500', 'claim_sum_infl_500', 'paid_sum_infl_500',
       'population_200', 'District_city', 'Area_city', 'District_city_dist',
       'Area_city_dist', 'Population_density'], axis=1, inplace=True)

In [9]:
data_points.reset_index(inplace=True)

In [10]:
data_points.head().T

Unnamed: 0,0,1,2,3,4
index,0,1,2,3,4
date,2021-07-07 11:32:45,2021-07-07 11:32:45,2021-07-07 11:32:45,2021-07-07 11:32:45,2021-07-07 11:32:45
OwnerKLADRCode,0100200001300,0100500000900,0100500003200,0100600002500,0200000102700
count_pol,1,5,1,1,4
premium,2944.37,34148.81,7494.76,5353.4,39791.98
count_pol_with_claim,0,0,0,0,0
claim_count,0,0,0,0,0
claim_sum,0.0,0.0,0.0,0.0,0.0
paid_sum,0.0,0.0,0.0,0.0,0.0
claim_sum_infl,0.0,0.0,0.0,0.0,0.0


# Points To Points

In [19]:
%%time
data_points2points = pd.merge(data_points[['index','latitude','longitude']], data_points[['index','latitude','longitude']], how='cross')

CPU times: user 56.2 s, sys: 1min 5s, total: 2min 1s
Wall time: 1min 59s


In [20]:
data_points2points.shape

(378964089, 6)

In [21]:
data_points2points.head().T

Unnamed: 0,0,1,2,3,4
index_x,0.0,0.0,0.0,0.0,0.0
latitude_x,44.780445,44.780445,44.780445,44.780445,44.780445
longitude_x,40.54908,40.54908,40.54908,40.54908,40.54908
index_y,0.0,1.0,2.0,3.0,4.0
latitude_y,44.780445,45.024727,45.006897,44.98083,54.564808
longitude_y,40.54908,38.934097,38.979748,39.096226,55.924335


## P2P Dist

In [22]:
def f_dist(df, col_lat_1, col_lon_1, col_lat_2, col_lon_2, col_dist):
    
    lat_1 = math.pi/180 * df[col_lat_1].to_numpy()
    lon_1 = math.pi/180 * df[col_lon_1].to_numpy()
    lat_2 = math.pi/180 * df[col_lat_2].to_numpy()
    lon_2 = math.pi/180 * df[col_lon_2].to_numpy()
    
    dlon = lon_1 - lon_2
    dlat = lat_1 - lat_2
    a = np.sin(dlat/2)**2 + np.cos(lat_1) * np.cos(lat_2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    dist = c * 6371.009
    
    df[col_dist] = dist
    
    return df

In [23]:
%%time
data_points2points = f_dist(data_points2points, 'latitude_x', 'longitude_x', 'latitude_y', 'longitude_y', 'dist')

CPU times: user 53 s, sys: 22.8 s, total: 1min 15s
Wall time: 1min 15s


In [24]:
data_points2points.head().T

Unnamed: 0,0,1,2,3,4
index_x,0.0,0.0,0.0,0.0,0.0
latitude_x,44.780445,44.780445,44.780445,44.780445,44.780445
longitude_x,40.54908,40.54908,40.54908,40.54908,40.54908
index_y,0.0,1.0,2.0,3.0,4.0
latitude_y,44.780445,45.024727,45.006897,44.98083,54.564808
longitude_y,40.54908,38.934097,38.979748,39.096226,55.924335
dist,0.0,130.062227,126.156569,116.617684,1545.373569


In [17]:
%%time
print(f"dist_min = {min(data_points2points['dist'])} km")
print(f"dist_max = {max(data_points2points['dist'])} km")

dist_min = 0.0 km
dist_max = 8769.602693526094 km
CPU times: user 2min 2s, sys: 3.06 s, total: 2min 6s
Wall time: 2min 1s


## Circle 200 km

In [25]:
dist_max_200 = 200

In [28]:
%%time
data_points2points_200 = data_points2points.loc[data_points2points.dist < dist_max_200].merge(data_points[['index',
                                                                                                           'count_pol',
                                                                                                           'premium',
                                                                                                           'count_pol_with_claim',
                                                                                                           'claim_count',
                                                                                                           'claim_sum',
                                                                                                           'paid_sum',
                                                                                                           'claim_sum_infl',
                                                                                                           'paid_sum_infl',
                                                                                                          ]], how='left', left_on='index_y', right_on='index')

CPU times: user 5.31 s, sys: 3.72 s, total: 9.03 s
Wall time: 9.03 s


In [29]:
data_points2points_200.shape

(12973247, 16)

In [30]:
%%time
data_points2points_200_gr = data_points2points_200[['index_x',
                                                    'count_pol',
                                                    'premium',
                                                    'count_pol_with_claim',
                                                    'claim_count',
                                                    'claim_sum',
                                                    'paid_sum',
                                                    'claim_sum_infl',
                                                    'paid_sum_infl',
                                                   ]].groupby('index_x').sum().reset_index()

CPU times: user 2.26 s, sys: 2.65 s, total: 4.91 s
Wall time: 4.91 s


In [31]:
data_points2points_200_gr.shape

(19467, 9)

In [32]:
data_points2points_200_gr.rename(columns={'count_pol':'count_pol_200',
                                          'premium':'premium_200',
                                          'count_pol_with_claim':'count_pol_with_claim_200',
                                          'claim_count':'claim_count_200',
                                          'claim_sum':'claim_sum_200',
                                          'paid_sum':'paid_sum_200',
                                          'claim_sum_infl':'claim_sum_infl_200',
                                          'paid_sum_infl':'paid_sum_infl_200',
                                         }, inplace=True)

In [33]:
data_points2points_200_gr.head().T

Unnamed: 0,0,1,2,3,4
index_x,0.0,1.0,2.0,3.0,4.0
count_pol_200,209872.0,143745.0,145076.0,147590.0,3128.0
premium_200,693432300.0,546575100.0,549810300.0,554673400.0,21107590.0
count_pol_with_claim_200,3657.0,2729.0,2753.0,2784.0,178.0
claim_count_200,3927.0,2926.0,2952.0,2984.0,225.0
claim_sum_200,313330100.0,229520100.0,231724500.0,234822000.0,12121110.0
paid_sum_200,304587600.0,223282700.0,225424600.0,228744200.0,11879490.0
claim_sum_infl_200,339392300.0,248399300.0,250756500.0,254101000.0,13371810.0
paid_sum_infl_200,330539800.0,242097400.0,244391400.0,247961200.0,13126990.0


## Circle 500 km

In [34]:
dist_max_500 = 500

In [35]:
%%time
data_points2points_500 = data_points2points.loc[data_points2points.dist < dist_max_500].merge(data_points[['index',
                                                                                                           'count_pol',
                                                                                                           'premium',
                                                                                                           'count_pol_with_claim',
                                                                                                           'claim_count',
                                                                                                           'claim_sum',
                                                                                                           'paid_sum',
                                                                                                           'claim_sum_infl',
                                                                                                           'paid_sum_infl',
                                                                                                          ]], how='left', left_on='index_y', right_on='index')

CPU times: user 15.8 s, sys: 11.3 s, total: 27.2 s
Wall time: 27.1 s


In [36]:
data_points2points_500.shape

(40319661, 16)

In [37]:
%%time
data_points2points_500_gr = data_points2points_500[['index_x',
                                                    'count_pol',
                                                    'premium',
                                                    'count_pol_with_claim',
                                                    'claim_count',
                                                    'claim_sum',
                                                    'paid_sum',
                                                    'claim_sum_infl',
                                                    'paid_sum_infl',
                                                   ]].groupby('index_x').sum().reset_index()

CPU times: user 6.88 s, sys: 8.15 s, total: 15 s
Wall time: 15 s


In [38]:
data_points2points_500_gr.shape

(19467, 9)

In [39]:
data_points2points_500_gr.rename(columns={'count_pol':'count_pol_500',
                                          'premium':'premium_500',
                                          'count_pol_with_claim':'count_pol_with_claim_500',
                                          'claim_count':'claim_count_500',
                                          'claim_sum':'claim_sum_500',
                                          'paid_sum':'paid_sum_500',
                                          'claim_sum_infl':'claim_sum_infl_500',
                                          'paid_sum_infl':'paid_sum_infl_500',
                                         }, inplace=True)

In [40]:
data_points2points_500_gr.head().T

Unnamed: 0,0,1,2,3,4
index_x,0.0,1.0,2.0,3.0,4.0
count_pol_500,464661.0,465784.0,465753.0,466291.0,7798.0
premium_500,1472574000.0,1475477000.0,1475352000.0,1476951000.0,40860800.0
count_pol_with_claim_500,7842.0,7789.0,7791.0,7821.0,301.0
claim_count_500,8400.0,8328.0,8331.0,8376.0,380.0
claim_sum_500,666560200.0,658648600.0,659018700.0,665079200.0,37528370.0
paid_sum_500,646454200.0,638651700.0,639021700.0,645009000.0,37125630.0
claim_sum_infl_500,720537000.0,711981200.0,712380200.0,718837800.0,40961000.0
paid_sum_infl_500,700191500.0,691752000.0,692151000.0,698529200.0,40553510.0


## Update

In [41]:
data_points.shape

(19467, 18)

In [42]:
%%time
data_points = data_points.merge(data_points2points_200_gr, how='left', left_on='index', right_on='index_x')

CPU times: user 29.8 ms, sys: 4.78 ms, total: 34.5 ms
Wall time: 32.6 ms


In [43]:
data_points.drop('index_x', axis=1, inplace=True)

In [44]:
data_points.shape

(19467, 26)

In [45]:
%%time
data_points = data_points.merge(data_points2points_500_gr, how='left', left_on='index', right_on='index_x')

CPU times: user 768 ms, sys: 58.4 ms, total: 826 ms
Wall time: 757 ms


In [46]:
data_points.drop('index_x', axis=1, inplace=True)

In [47]:
data_points.shape

(19467, 34)

In [48]:
data_points.head().T

Unnamed: 0,0,1,2,3,4
index,0,1,2,3,4
date,2021-07-07 11:32:45,2021-07-07 11:32:45,2021-07-07 11:32:45,2021-07-07 11:32:45,2021-07-07 11:32:45
OwnerKLADRCode,0100200001300,0100500000900,0100500003200,0100600002500,0200000102700
count_pol,1,5,1,1,4
premium,2944.37,34148.81,7494.76,5353.4,39791.98
count_pol_with_claim,0,0,0,0,0
claim_count,0,0,0,0,0
claim_sum,0.0,0.0,0.0,0.0,0.0
paid_sum,0.0,0.0,0.0,0.0,0.0
claim_sum_infl,0.0,0.0,0.0,0.0,0.0


In [49]:
data_points.to_csv('data_points.csv', sep=';', index=False)

In [None]:
data_points = pd.read_csv('data_points.csv', sep=';')

In [50]:
with open('data_points.pickle', 'wb') as _f:
    pickle.dump(data_points, _f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
with open('data_points.pickle', 'rb') as _f:
    data_points = pickle.load(_f)