In [83]:
import os
from collections import defaultdict
from datetime import date

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from geopy.geocoders import GoogleV3
from sklearn.cluster import KMeans
from IPython.display import Image

In [86]:
# Load variables via python-dotenv, then read the API credentials from the
# process environment. A variable that is not set yields None.
load_dotenv()
CLIENT_ID = os.getenv('CLIENT_ID')
CLIENT_SECRET = os.getenv('CLIENT_SECRET')
GOOGLE_GEO_API_KEY = os.getenv('GOOGLE_GEO_API_KEY')

# Load and save the HTML page with zip codes

In [3]:
# URL of the page that is downloaded below, and the local file name under
# which its HTML is saved and later re-read.
page_url = 'http://belpost.by/branch/post/otdeleniya-sviazi/'
html_page_name = 'otdeleniya-sviazi.html'

In [4]:
# Download the page and keep a local copy so that the parsing cells can work
# from the saved file.
# The timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(page_url, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of silently saving an error page.
response.raise_for_status()
source_html = response.text

with open(html_page_name, 'w') as html_file:
    html_file.write(source_html)

print('Save source page to html file - done!')

Save source page to html file - done!


In [5]:
# Read the saved page back from disk and parse it with the built-in
# 'html.parser' backend.
with open(html_page_name) as html_file:
    saved_markup = html_file.read()
soup = BeautifulSoup(saved_markup, 'html.parser')

# Printing the page title is a quick check that parsing worked.
print('soup is ready:', soup.title.text)

soup is ready: Адреса и режим работы отделений почтовой связи (ОПС) и пунктов почтовой связи производства “Минская почта” - Белпочта


# Zip Code Parser 

In [6]:
# Collect every <table> whose `border` attribute matches 1; the cell's last
# expression displays how many such tables were found.
table_body = soup.find_all('table', attrs={'border': 1})
len(table_body)

4

In [7]:
# All <tr> rows of the second matching table (index 1), which the parser
# cell below iterates over.
# NOTE(review): the index is hard-coded — confirm it still selects the
# address table if the page layout changes.
all_table_rows = table_body[1].find_all('tr')

In [44]:
def _clean_text(text):
    """Replace non-breaking spaces with plain spaces and trim both ends."""
    return text.replace('\xa0', ' ').strip()


def _parse_address_cell(cell):
    """Extract ``(street, zip_code)`` strings from a row's address cell.

    Two layouts are handled:

    * the cell contains two or more ``<div>`` tags: the first holds the
      street and the second holds the zip code;
    * otherwise the street and zip code are the first two children with a
      non-empty ``.string``, taken from the cell itself or, when the cell
      has exactly one child, from the children of its ``<div>``.

    The zip code is cleaned *before* it is cut to six characters, so that a
    leading space or non-breaking space cannot shift the slice and produce
    a truncated code.

    Raises:
        AttributeError: the cell has a single child and no ``<div>``.
        IndexError: fewer than two text fragments were found.
    """
    divs = cell.find_all('div')
    if len(divs) > 1:
        # `.string` is None when a tag has several children; fall back to
        # the concatenated text in that case.
        street = divs[0].string or divs[0].text
        zip_code = divs[1].string or divs[1].text
    else:
        content = cell.contents
        if len(content) == 1:
            content = cell.find('div').contents
        fragments = [_clean_text(node.string) for node in content if node.string]
        street, zip_code = fragments[0], fragments[1]

    return _clean_text(street), _clean_text(zip_code)[:6]


# Build one [street, zip code, lat, lng] record per table row.
# lat/lng start as 0.0 placeholders.
# Rows that cannot be parsed are counted and reported with their row number.
error = 0
zip_codes_minsk_list = []
for i, row in enumerate(all_table_rows):
    cols_all = row.find_all('td')
    try:
        # The address lives in the second <td> of the row.
        street, zip_code = _parse_address_cell(cols_all[1])
        zip_codes_minsk_list.append([street, int(zip_code), 0.0, 0.0])
    except (AttributeError, IndexError, ValueError) as ex:
        error += 1
        print('Exception in address_full: %s' % ex, i)

print('errors:', error)
print('zip_codes_minsk_list:', len(zip_codes_minsk_list))

Exception in address_full: 'NoneType' object has no attribute 'contents' 0
Exception in address_full: list index out of range 116
errors: 2
zip_codes_minsk_list: 125


# Create a pandas DataFrame

In [36]:
# Tabulate the parsed records under explicit column names.
pd_columns = ['address', 'zip-code', 'lat', 'lng']
df_minsk = pd.DataFrame(zip_codes_minsk_list, columns=pd_columns)

In [37]:
# Display every row whose zip code occurs more than once
# (keep=False flags all occurrences, not only the repeats).
zip_codes = df_minsk['zip-code']
df_minsk[zip_codes.duplicated(keep=False)]

Unnamed: 0,address,zip-code,lat,lng
6,"ул. Жуковского, 6, к.2",220007,0.0,0.0
7,"ул. Жуковского, 6, к.2",220007,0.0,0.0
47,"ул. Нестерова, 51",220047,0.0,0.0
48,"ул. Нестерова, 51",220047,0.0,0.0
119,"ул. Одинцова, 36,к.1",220136,0.0,0.0
120,"ул. Одинцова, 36,к.1",220136,0.0,0.0


# Delete repeated zip codes

In [38]:
# Order by zip code, keep the first row for each zip code, and renumber.
# reset_index() without drop=True keeps the previous labels in an 'index'
# column, which shows up in the preview below.
df_minsk = (
    df_minsk
    .sort_values('zip-code')
    .drop_duplicates(subset='zip-code')
    .reset_index()
)
df_minsk.head()

Unnamed: 0,index,address,zip-code,lat,lng
0,0,"ул. Московская,16",220001,0.0,0.0
1,1,"ул. Сторожевская, 8",220002,0.0,0.0
2,2,"ул. Одинцова, 113",220003,0.0,0.0
3,3,"ул. М.Танка,36, к.2,",220004,0.0,0.0
4,4,"пр. Независимости, 46",220005,0.0,0.0


In [39]:
# Remove the leftover 'index' column produced by the earlier reset_index().
df_minsk = df_minsk.drop(columns=['index'])

In [40]:
# Sanity check: the number of distinct zip codes should equal the row count.
unique_zip_count = len(df_minsk['zip-code'].unique())
print(unique_zip_count)
print(df_minsk.shape)

122
(122, 4)


In [41]:
# df_minsk.to_csv('zip_codes_minsk_list.csv')

In [42]:
# Preview the first rows of the frame.
df_minsk.head()

Unnamed: 0,address,zip-code,lat,lng
0,"ул. Московская,16",220001,0.0,0.0
1,"ул. Сторожевская, 8",220002,0.0,0.0
2,"ул. Одинцова, 113",220003,0.0,0.0
3,"ул. М.Танка,36, к.2,",220004,0.0,0.0
4,"пр. Независимости, 46",220005,0.0,0.0


# Find coordinates for all zip codes with the Google Geocoding API

In [100]:
# Build the geocoder once: it does not depend on the row being processed.
# Any error raised while constructing it now surfaces here, once, instead of
# being repeated for every row.
google_location = GoogleV3(api_key=GOOGLE_GEO_API_KEY)

# Fill in lat/lng row by row. A failed lookup is reported and the row keeps
# its previous values, so one bad address does not abort the whole run.
for i in range(df_minsk.shape[0]):
    try:
        # Column 0 holds the street address; add the city to disambiguate.
        address = df_minsk.iloc[i, 0] + ', Minsk'
        coords = google_location.geocode(address)
    except Exception as ex:
        print('Exception in google_location: %s' % ex)
        continue

    # geocode() returns None when the service finds no match.
    if coords is None:
        print('Exception in google_location: no result for %s' % address)
        continue

    # Columns 2 and 3 are the lat / lng placeholders.
    df_minsk.iloc[i, 2:4] = float(coords.latitude), float(coords.longitude)

    print(i, coords.latitude, coords.longitude)

0 53.887919 27.5381191
1 53.9129489 27.5554527
2 53.9006514 27.4275318
3 53.9086282 27.5294632
4 53.91281499999999 27.5807098
5 53.8835694 27.5664649
6 53.88244659999999 27.5497952
7 53.8999924 27.5586274
8 53.8899568 27.6112189
9 53.896201 27.5448597
10 53.8906326 27.550621
11 53.9238362 27.6102492
12 53.9242816 27.5911224
13 53.8670165 27.5112945
14 53.8970339 27.4914757
15 53.9003685 27.5644425
16 53.9144154 27.4348301
17 53.8971674 27.4521196
18 53.8822002 27.4324538
19 53.9311816 27.490671
20 53.8693841 27.650115
21 53.8829154 27.4395681
22 53.9280942 27.626541
23 53.8509136 27.5456794
24 53.8495964 27.4652115
25 53.8696759 27.6269446
26 53.8606217 27.4979263
27 53.8633581 27.5708799
28 53.91166639999999 27.5651746
29 53.8989762 27.5635574
30 53.9364251 27.7200152
31 53.922778 27.636838
32 53.8849653 27.5943195
33 53.90838249999999 27.5817171
34 53.9180661 27.529327
35 53.8950403 27.5203662
36 53.9023605 27.6171665
37 53.90108129999999 27.6015507
38 53.8767896 27.5316228
39 53.932

In [101]:
# Count missing values per column.
df_minsk.isnull().sum()

address     0
zip-code    0
lat         0
lng         0
dtype: int64

In [102]:
# True if any cell in the frame equals 0.0 — for example a lat/lng value that
# was never replaced by a geocoding result.
0.0 in df_minsk.values

False

In [103]:
# Save the table to CSV. With pandas defaults the DataFrame index is written
# as the first, unnamed column.
df_minsk.to_csv('zip_codes_minsk_list.csv')