In [83]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from geopy.geocoders import GoogleV3

In [86]:
# Let python-dotenv populate the process environment first, then read the
# geocoding key from it. The lookup yields None when the variable is not set.
load_dotenv()
GOOGLE_GEO_API_KEY = os.getenv('GOOGLE_GEO_API_KEY')

# Load and save the HTML page with zip codes

In [3]:
# Local file name under which the downloaded page is stored.
html_page_name = 'otdeleniya-sviazi.html'
# Address of the page that is downloaded and parsed by the cells below.
page_url = 'http://belpost.by/branch/post/otdeleniya-sviazi/'

In [4]:
# Download the listing page and keep a local copy of its text.
# - timeout: without it requests waits indefinitely on an unresponsive server.
# - raise_for_status(): stops here on a 4xx/5xx answer instead of silently
#   saving an error page that the parsing cells would then treat as data.
response = requests.get(page_url, timeout=30)
response.raise_for_status()
source_html = response.text

with open(html_page_name, 'w') as html_file:
    html_file.write(source_html)

print('Save source page to html file - done!')

Save source page to html file - done!


In [5]:
# Build the parse tree from the saved copy on disk (not from the live
# response), using the standard-library 'html.parser' backend.
with open(html_page_name) as saved_page:
    soup = BeautifulSoup(saved_page, 'html.parser')

# Showing the page title is a quick check that the expected document was parsed.
page_title = soup.title.text
print('soup is ready:', page_title)

soup is ready: Адреса и режим работы отделений почтовой связи (ОПС) и пунктов почтовой связи производства “Минская почта” - Белпочта


# Zip Code Parser 

In [6]:
# Collect every <table> whose border attribute is 1. Extra keyword arguments
# to find_all() are treated as attribute filters, same as an attrs dict.
table_body = soup.find_all('table', border=1)
len(table_body)

4

In [7]:
# The table at position 1 of the matches is the row source for the parser
# below (NOTE(review): the index is hard-coded - confirm it still points at
# the intended table if the page layout changes).
selected_table = table_body[1]
all_table_rows = selected_table.find_all('tr')

In [44]:
# Walk every row of the selected table and collect one
# [street, zip_code, lat, lng] record per parsable row.
# lat/lng start as 0.0 placeholders; they are overwritten by the geocoding cell.
# Rows that match neither known cell layout are reported with their row number
# and counted in `error` instead of aborting the whole run.
error = 0
zip_codes_minsk_list = []
for i, row in enumerate(all_table_rows):
    cols_all = row.find_all('td')
    try:
        address_full = cols_all[1].find_all('div')

        if len(address_full) > 1:
            # Layout A: street and zip code live in separate <div> elements.
            # .string is None when a div has nested markup, so fall back to .text.
            street = address_full[0].string or address_full[0].text
            zip_code = address_full[1].string or address_full[1].text
        else:
            # Layout B: zero or one <div>. Street and zip code are separate
            # child nodes of the cell, or of its single wrapping <div>.
            some_content = cols_all[1].contents
            if len(some_content) == 1:
                some_content = cols_all[1].find('div').contents

            address = [q.string.replace('\xa0', ' ').strip()
                       for q in some_content if q.string]
            street, zip_code = address[0], address[1]

        street = street.replace('\xa0', ' ').strip()
        # Normalise BEFORE slicing: with leading whitespace or a non-breaking
        # space, [:6] would cut off the last digit and int() would still
        # accept the shortened value, silently producing a wrong zip code.
        zip_code = zip_code.replace('\xa0', ' ').strip()[:6]

        zip_codes_minsk_list.append([street, int(zip_code), 0.0, 0.0])

    except (AttributeError, IndexError, TypeError, ValueError) as ex:
        # Expected failure modes only: missing cell/div (IndexError,
        # AttributeError on None), non-text content (TypeError) or a zip
        # string that is not numeric (ValueError).
        error += 1
        print('Exception in address_full: %s' % ex, i)

print('errors:', error)
print('zip_codes_minsk_list:', len(zip_codes_minsk_list))

Exception in address_full: 'NoneType' object has no attribute 'contents' 0
Exception in address_full: list index out of range 116
errors: 2
zip_codes_minsk_list: 125


# Create Pandas data frame

In [36]:
# Column labels, in the same order as the fields of each parsed record.
pd_columns = ['address', 'zip-code', 'lat', 'lng']
# One row per parsed record.
df_minsk = pd.DataFrame(zip_codes_minsk_list, columns=pd_columns)

In [37]:
# Show every row whose zip code occurs more than once.
# keep=False marks all members of a duplicate group, not just the repeats.
zip_codes = df_minsk['zip-code']
df_minsk[zip_codes.duplicated(keep=False)]

Unnamed: 0,address,zip-code,lat,lng
6,"ул. Жуковского, 6, к.2",220007,0.0,0.0
7,"ул. Жуковского, 6, к.2",220007,0.0,0.0
47,"ул. Нестерова, 51",220047,0.0,0.0
48,"ул. Нестерова, 51",220047,0.0,0.0
119,"ул. Одинцова, 36,к.1",220136,0.0,0.0
120,"ул. Одинцова, 36,к.1",220136,0.0,0.0


# Delete repeated zip codes

In [38]:
# Order by zip code, keep the first row of each zip-code group, then renumber.
# reset_index() is called without drop=True, so the previous row labels are
# kept as an extra 'index' column (removed by a later cell).
df_minsk = (
    df_minsk
    .sort_values('zip-code')
    .drop_duplicates(subset='zip-code')
    .reset_index()
)
df_minsk.head()

Unnamed: 0,index,address,zip-code,lat,lng
0,0,"ул. Московская,16",220001,0.0,0.0
1,1,"ул. Сторожевская, 8",220002,0.0,0.0
2,2,"ул. Одинцова, 113",220003,0.0,0.0
3,3,"ул. М.Танка,36, к.2,",220004,0.0,0.0
4,4,"пр. Независимости, 46",220005,0.0,0.0


In [39]:
# Remove the helper 'index' column left behind by reset_index().
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because the column is already gone.
df_minsk = df_minsk.drop(columns=['index'], errors='ignore')

In [40]:
# Number of distinct zip codes, followed by the frame's (rows, columns) shape;
# after de-duplication the first number should match the row count.
unique_zip_total = df_minsk['zip-code'].nunique(dropna=False)
print(unique_zip_total)
print(df_minsk.shape)

122
(122, 4)


In [41]:
# df_minsk.to_csv('zip_codes_minsk_list.csv')

# Find coordinates for all zip codes with the Google Geocoding API

In [None]:
# Fill the lat/lng placeholders by geocoding each street address.
# The client is created once instead of once per row; any error while
# constructing it therefore surfaces a single time here, before the loop.
google_geo = GoogleV3(api_key=GOOGLE_GEO_API_KEY)

# Resolve target columns by name so the writes do not depend on column order.
lat_col = df_minsk.columns.get_loc('lat')
lng_col = df_minsk.columns.get_loc('lng')

for i, street in enumerate(df_minsk['address']):
    try:
        address = street + ', Minsk'
        coords = google_geo.geocode(address)

        if coords is None:
            # geocode() found no match; the row keeps its 0.0 placeholders.
            print('No geocoding result for row %d: %s' % (i, address))
            continue

        lat = float(coords.latitude)
        lng = float(coords.longitude)

        df_minsk.iat[i, lat_col] = lat
        df_minsk.iat[i, lng_col] = lng
    except Exception as ex:
        # Best effort: report which row failed and carry on with the rest.
        print('Exception in google_location: %s' % ex, i, street)

In [101]:
# Per-column count of missing values (isna is the same check as isnull).
df_minsk.isna().sum()

address     0
zip-code    0
lat         0
lng         0
dtype: int64

In [102]:
# True if any 0.0 placeholder coordinate is still present after geocoding.
# Only lat/lng are inspected: scanning the whole frame would also match a zero
# in an unrelated column. bool() keeps the displayed result a plain True/False.
bool((df_minsk[['lat', 'lng']] == 0.0).any().any())

False

In [103]:
# Persist the geocoded table; with default options the row index is written
# as the first, unnamed CSV column.
df_minsk.to_csv(path_or_buf='zip_codes_minsk_list.csv')

In [104]:
# Preview the first five rows of the final table.
df_minsk.head(n=5)

Unnamed: 0,address,zip-code,lat,lng
0,"ул. Московская,16",220001,53.887919,27.538119
1,"ул. Сторожевская, 8",220002,53.912949,27.555453
2,"ул. Одинцова, 113",220003,53.900651,27.427532
3,"ул. М.Танка,36, к.2,",220004,53.908628,27.529463
4,"пр. Независимости, 46",220005,53.912815,27.58071


# Done