In [2]:
import json
import os

import requests
# import requests_cache

import pandas as pd
import numpy as np

In [3]:
# Let pandas render up to 100 rows before it truncates a displayed frame.
pd.set_option('display.max_rows', 100)

# Load Data

In [4]:
# Public Overpass API endpoint used by every request in this notebook.
# HTTPS is used so that queries and responses cannot be read or altered in transit.
overpass_url = "https://overpass-api.de/api/interpreter"

## Regions

In [5]:
# Overpass QL query for the regions request: it picks the area tagged
# ISO3166-1=RU with admin_level=2, then every node / way / relation tagged
# place=state inside that area. The result is requested as JSON; `out center`
# is what supplies the 'center' coordinates read for non-node elements below.
overpass_query_regions = """
[out:json];
area["ISO3166-1"="RU"][admin_level=2];
(
    node["place"="state"](area);
    way["place"="state"](area);
    relation["place"="state"](area);
);
out center;
"""

In [6]:
# Fetch all regions in a single request.
# - timeout: without it `requests` waits indefinitely if the server stalls,
#   which would freeze the notebook.
# - raise_for_status(): surfaces an HTTP error right here instead of as a
#   confusing JSON decode error when the body is parsed in the next cell.
response_regions = requests.get(
    overpass_url,
    params={'data': overpass_query_regions},
    timeout=300,  # seconds; generous because the public server can be slow
)
response_regions.raise_for_status()

In [7]:
# Decode the response body as JSON; the matched objects are under 'elements'.
data_regions = response_regions.json()

In [8]:
# Peek at the first returned element to see which keys and tags are available.
data_regions['elements'][0]

{'type': 'node',
 'id': 502842538,
 'lat': 45.0819456,
 'lon': 134.726645,
 'tags': {'name': 'Приморский край',
  'name:be': 'Прыморскі край',
  'name:da': 'Primorskij kraj',
  'name:de': 'Region Primorje',
  'name:en': 'Primorsky Krai',
  'name:eo': 'Ĉemara Regiono',
  'name:es': 'Primorie',
  'name:fr': 'Primorie',
  'name:he': 'מחוז פרימוריה',
  'name:hr': 'Primorski kraj',
  'name:hu': 'Tengermelléki határterület',
  'name:ja': '沿海地方',
  'name:ko': '프리모르스키 지방',
  'name:lt': 'Primorės kraštas',
  'name:pl': 'Kraj Nadmorski',
  'name:ru': 'Приморский край',
  'name:sk': 'Prímorský kraj',
  'name:tt': 'Диңгез буе крае',
  'name:uk': 'Приморський край',
  'name:zh': '滨海边疆区',
  'official_name:es': 'Krai de Primorie',
  'official_name:fr': 'Kraï du Primorie',
  'place': 'state',
  'ref': 'ПРИ',
  'ref:en': 'PRI',
  'wikidata': 'Q4341',
  'wikipedia': 'ru:Приморский край'}}

In [9]:
# Number of elements returned by the regions query.
len(data_regions['elements'])

85

In [10]:
def _region_record(element):
    """Flatten one Overpass element into a row dict.

    Nodes carry 'lon'/'lat' directly; other element types are only usable when
    they carry a 'center' entry holding the coordinates.

    Returns:
        dict with keys 'type', 'lon', 'lat', 'name', 'population', or None when
        the element is neither a node nor has a 'center' (such elements are
        skipped, as before).

    'name' and 'population' are read from the element's 'tags'; a missing tag
    yields None instead of raising.
    """
    if element['type'] == 'node':
        point = element
    elif 'center' in element:
        point = element['center']
    else:
        return None

    tags = element.get('tags', {})
    return {
        'type': element['type'],
        'lon': point['lon'],
        'lat': point['lat'],
        'name': tags.get('name'),
        # Population is a tag for every element type. The previous code looked
        # it up on the element itself for ways/relations, so it was always None.
        'population': tags.get('population'),
    }


records_regions = [
    record
    for record in map(_region_record, data_regions['elements'])
    if record is not None
]

# Per-column lists are kept under their original names.
type_regions = [record['type'] for record in records_regions]
lon_regions = [record['lon'] for record in records_regions]
lat_regions = [record['lat'] for record in records_regions]
name_regions = [record['name'] for record in records_regions]
population_regions = [record['population'] for record in records_regions]

df_regions = pd.DataFrame(data={
    'type': type_regions,
    'lon': lon_regions,
    'lat': lat_regions,
    'name': name_regions,
    'population': population_regions,
})

In [11]:
df_regions
# 'Дагестан' (Dagestan) is missing from the result

Unnamed: 0,type,lon,lat,name,population
0,node,134.726645,45.081946,Приморский край,
1,node,128.437295,52.803237,Амурская область,
2,node,82.047532,58.612428,Томская область,
3,node,74.341549,67.147163,Ямало-Ненецкий автономный округ,
4,node,68.641816,61.879343,Ханты-Мансийский автономный округ — Югра,
5,node,70.365884,58.820649,Тюменская область,
6,node,73.509994,56.093526,Омская область,
7,node,79.481392,54.972017,Новосибирская область,
8,node,87.342861,54.533578,Кемеровская область,
9,node,82.693142,52.693224,Алтайский край,


In [12]:
# Which element types (node / way / relation) ended up in the regions table.
df_regions['type'].unique()

array(['node'], dtype=object)

## Cities

In [24]:
# Folder (relative to the working directory) that holds one downloaded JSON file per region.
directory = './data_cities'

In [13]:
# Scratch check of the per-region query text: a placeholder region name is
# substituted into the template and the resulting string is displayed.
r = 'abc'
a = f'''
    [out:json];
    area
        ["boundary"="administrative"]
        ["name"="{r}"]
        ->.a;
    (
        node["place"~"town|city"](area.a);
        way["place"~"town|city"](area.a);
        relation["place"~"town|city"](area.a);
    );
    out center;
    '''

a

'\n    [out:json];\n    area\n        ["boundary"="administrative"]\n        ["name"="abc"]\n        ->.a;\n    (\n        node["place"~"town|city"](area.a);\n        way["place"~"town|city"](area.a);\n        relation["place"~"town|city"](area.a);\n    );\n    out center;\n    '

In [19]:
# Region names to request in the download loop below.
regions_to_load = ['Республика Бурятия', 'Республика Коми'] # 'Дагестан' (Dagestan) is not in the list

In [20]:
# Download the towns/cities of every requested region into one JSON file per
# region. A region whose file already exists is skipped, so the cell can be
# re-run to resume an interrupted download without repeating finished requests.
# A failed region is reported and skipped; the remaining regions still run.

# On a first run the download folder may not exist yet.
os.makedirs(directory, exist_ok=True)

for r in regions_to_load:  # alternative input: df_regions['name']

    path_ = os.path.join(directory, f'data_cities_{r}.json')

    # Already downloaded on a previous run: nothing to do.
    if os.path.exists(path_):
        continue

    try:
        response_ = requests.get(
            overpass_url,
            params={'data':
                                        '''
                                        [out:json][timeout:1500];
                                        area
                                            ["boundary"="administrative"]
                                            ["name"="''' + r + '''"]
                                            ->.a;
                                        (
                                            node["place"~"town|city"](area.a);
                                            way["place"~"town|city"](area.a);
                                            relation["place"~"town|city"](area.a);
                                        );
                                        out center;
                                        '''
                    },
            # Client-side limit, slightly above the 1500 s the query itself
            # allows, so a stalled connection cannot block the loop forever.
            timeout=1600,
        )
        response_.raise_for_status()
        data_ = response_.json()
    except (requests.RequestException, ValueError) as error_:
        # Network problem, HTTP error status, or a body that is not JSON.
        # Say so, so a missing file can be traced back to its cause.
        print(f'{r}: download failed ({error_!r}); no file written')
        continue

    if not data_:
        print(f'{r}: empty response; no file written')
        continue

    with open(path_, "w", encoding='utf-8') as f:
        json.dump(data_, f)

# Combine Data

In [25]:
# Everything currently present in the download folder.
files = os.listdir(directory)

In [26]:
# Total number of entries in the folder.
len(files)

86

In [27]:
# Keep only the JSON files (the folder is listed again here rather than reusing `files`).
files_json = [f for f in os.listdir(directory) if f.endswith('.json')]

In [28]:
# Number of JSON files that will be combined.
len(files_json)

86

In [29]:
def _city_row(element, region):
    """Flatten one Overpass element from a region file into a row dict.

    Args:
        element: one entry of the file's 'elements' list.
        region: region name derived from the file name.

    Returns:
        dict with keys 'type', 'lon', 'lat', 'region', 'region2', 'name',
        'population', or None when the element is neither a node nor has a
        'center' entry (such elements are skipped, as before).

    'region2', 'name' and 'population' come from the element's 'tags'; a
    missing tag is recorded as the placeholder 'n/d'.
    """
    if element['type'] == 'node':
        point = element
    elif 'center' in element:
        point = element['center']
    else:
        return None

    tags = element.get('tags', {})
    return {
        'type': element['type'],
        'lon': point['lon'],
        'lat': point['lat'],
        'region': region,
        'region2': tags.get('addr:region', 'n/d'),
        'name': tags.get('name', 'n/d'),
        # Population is a tag for every element type. The previous code looked
        # it up on the element itself for ways/relations, so it was always 'n/d'.
        'population': tags.get('population', 'n/d'),
    }


type_ = []
lon_ = []
lat_ = []
region_ = []
region2_ = []
name_ = []
population_ = []

for f in files_json:

    # File names look like 'data_cities_<region>.json'; strip the fixed prefix
    # and the extension to recover the region name.
    region_name_ = f[len('data_cities_'):-len('.json')]

    with open(os.path.join(directory, f), "r", encoding='utf-8') as file_json_i:
        data_ = json.load(file_json_i)

    for element in data_['elements']:
        row_ = _city_row(element, region_name_)
        if row_ is None:
            continue

        type_.append(row_['type'])
        lon_.append(row_['lon'])
        lat_.append(row_['lat'])
        region_.append(row_['region'])
        region2_.append(row_['region2'])
        name_.append(row_['name'])
        population_.append(row_['population'])

In [None]:
# Assemble the per-column lists collected above into one table of places.
df_cities = pd.DataFrame(
    {
        'type': type_,
        'lon': lon_,
        'lat': lat_,
        'region': region_,
        'region2': region2_,
        'name': name_,
        'population': population_,
    }
)

In [30]:
# Column dtypes and non-null counts of the combined table.
df_cities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5386 entries, 0 to 5385
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   type        5386 non-null   object 
 1   lon         5386 non-null   float64
 2   lat         5386 non-null   float64
 3   region      5386 non-null   object 
 4   region2     5386 non-null   object 
 5   name        5386 non-null   object 
 6   population  5386 non-null   object 
dtypes: float64(2), object(5)
memory usage: 294.7+ KB


In [31]:
# Number of rows per region, where 'region' is the name taken from each file name.
df_cities.groupby('region').size()

region
Автономна Республіка Крим                   142
Адыгея                                       24
Алтайский край                              122
Амурская область                             46
Архангельская область                        50
Астраханская область                         25
Башкортостан                                117
Белгородская область                         58
Брянская область                             56
Владимирская область                         56
Волгоградская область                        80
Вологодская область                          54
Воронежская область                          74
Дагестан                                     98
Еврейская автономная область                 14
Забайкальский край                           62
Ивановская область                           46
Ингушетия                                    26
Иркутская область                            95
Кабардино-Балкария                           32
Калининградская область          

In [32]:
def f_space(text):
    """Remove all whitespace from a population string.

    Args:
        text: raw population value; a string, or a missing value.

    Returns:
        None when `text` is null or the placeholder 'n/d'; otherwise `text`
        with every run of whitespace removed (e.g. '1 234 567' -> '1234567').
    """
    # Missing values and the 'n/d' placeholder both mean "no population known".
    if pd.isnull(text) or text == 'n/d':
        return None
    # split() with no argument breaks on any whitespace run; joining the
    # pieces back together drops the separators.
    return ''.join(text.split())

In [33]:
# Normalise the raw population strings: drop whitespace, turn 'n/d' into None.
# NOTE(review): this overwrites the column in place, so the cell is not
# re-runnable once the numeric conversion below has run; f_space calls
# .split(), which the resulting float values do not have.
df_cities['population'] = df_cities['population'].apply(f_space)

In [34]:
# Convert the cleaned strings to numbers; downcast='float' asks pandas for the
# smallest float dtype that fits.
# NOTE(review): errors are left at the default, so a value that is still not
# numeric after cleaning raises here. Confirm that is the intended behaviour.
df_cities['population'] = pd.to_numeric(df_cities['population'], downcast='float')

In [35]:
# Save the combined table next to the notebook as a ';'-separated CSV
# (the DataFrame index is written too, since index=False is not passed).
df_cities.to_csv('data_cities.csv', sep=';')

# Prepare Data