In [1]:
# import numpy and pandas (dataframe)
import pandas as pd
import numpy as np

# import packages for web scraping: beautifulsoup & requests
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen

# import map rendering libraries
%matplotlib inline
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

# import libraries for Geocoding
import geopy
import geopandas
from geopy.extra.rate_limiter import RateLimiter # throttle calls to the geocoding service

# import libraries for handling JSON
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe
import json # library to handle JSON files

In [2]:
# Travel China Guide page listing the area code and zip code of each Beijing district
url = 'https://www.travelchinaguide.com/essential/area_zip/beijing.htm'

# Download the page inside a context manager so the HTTP response is always closed,
# and with a timeout so an unresponsive server cannot hang the notebook forever.
# The parser is named explicitly: without it bs4 picks whichever parser happens to be
# installed (and warns about it), which makes the parsed tree environment-dependent.
with urlopen(url, timeout=30) as response:
    html = BeautifulSoup(response.read(), 'html.parser')  # parsed document used by the cells below

In [3]:
# Render the parsed document with one tag per line and nested indentation,
# which makes the HTML structure easier to inspect by eye.
pretty_html = html.prettify()
print(pretty_html)

<!DOCTYPE html>
<html>
 <head class="function-videoCity">
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <title>
   Beijing Area/Telephone Code: 10, Zip/Postal Code of Chaoyang, Haidian…
  </title>
  <meta content="Here lists Beijing area code and zip code of the 16 districts, including Dongcheng, Xicheng, Chaoyang, Fengtai, Shijingshan, Haidian, Mentougou, Fangshan, Tongzhou, Shunyi, Changping, Daxing, Huairou, Pinggu, Miyun, and Yanqing. Besides, you can input the address to search the zip code." name="description"/>
  <link href="/essential/area_zip/beijing.htm" rel="canonical"/>
  <script type="text/javascript">
   var ptype=1;var pid=2834;var ptitle="Beijing Area & Zip Code";var purl="/essential/area_zip/beijing.htm";
  </script>
  <!--cityguide-new.html-->
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
  <!--\\server05\wwwroot\_01_tcgwww\inc\bus-->
  <style type="text/css">
   * {padding:0;m

In [4]:
# The district data lives in the <table> whose class is "c_tableX":
#   - the <th> cells of its first row name the columns (district, area code, zip code)
#   - every following <tr> holds the values for one district
tbl = html.find('table', attrs={'class': 'c_tableX'})
print(tbl)

<table border="1" cellpadding="1" cellspacing="1" class="c_tableX"> <tbody> <tr> <th>District</th> <th>Beijing Area Code</th> <th>Beijing Zip Code</th> </tr> <tr> <td>Dongcheng</td> <td>10</td> <td>100000</td> </tr> <tr> <td>Xicheng</td> <td>10</td> <td>100000</td> </tr> <tr> <td>Chaoyang</td> <td>10</td> <td>100000</td> </tr> <tr> <td>Fengtai</td> <td>10</td> <td>100000</td> </tr> <tr> <td>Shijingshan</td> <td>10</td> <td>100000</td> </tr> <tr> <td>Haidian</td> <td>10</td> <td>100000</td> </tr> <tr> <td>Mentougou</td> <td>10</td> <td>102300</td> </tr> <tr> <td>Fangshan</td> <td>10</td> <td>102400</td> </tr> <tr> <td>Tongzhou</td> <td>10</td> <td>101100</td> </tr> <tr> <td>Shunyi</td> <td>10</td> <td>101300</td> </tr> <tr> <td>Changping</td> <td>10</td> <td>102200</td> </tr> <tr> <td>Daxing</td> <td>10</td> <td>102600</td> </tr> <tr> <td>Huairou</td> <td>10</td> <td>101400</td> </tr> <tr> <td>Pinggu</td> <td>10</td> <td>101200</td> </tr> <tr> <td>Miyun</td> <td>10</td> <td>101500</td> 

<H2>Extract data from each row</H2>

In [5]:
# Gather every <td> cell of the table into a single flat list
tbl_data = tbl.find_all(name='td')
print(tbl_data)

[<td>Dongcheng</td>, <td>10</td>, <td>100000</td>, <td>Xicheng</td>, <td>10</td>, <td>100000</td>, <td>Chaoyang</td>, <td>10</td>, <td>100000</td>, <td>Fengtai</td>, <td>10</td>, <td>100000</td>, <td>Shijingshan</td>, <td>10</td>, <td>100000</td>, <td>Haidian</td>, <td>10</td>, <td>100000</td>, <td>Mentougou</td>, <td>10</td>, <td>102300</td>, <td>Fangshan</td>, <td>10</td>, <td>102400</td>, <td>Tongzhou</td>, <td>10</td>, <td>101100</td>, <td>Shunyi</td>, <td>10</td>, <td>101300</td>, <td>Changping</td>, <td>10</td>, <td>102200</td>, <td>Daxing</td>, <td>10</td>, <td>102600</td>, <td>Huairou</td>, <td>10</td>, <td>101400</td>, <td>Pinggu</td>, <td>10</td>, <td>101200</td>, <td>Miyun</td>, <td>10</td>, <td>101500</td>, <td>Yanqing</td>, <td>10</td>, <td>102100</td>]


In [6]:
# The flat cell list repeats in groups of three (district, area code, zip code),
# so a stride of 3 picks out one column.
district = [cell.string for cell in tbl_data[0::3]]  # 1st cell of each group
zip_code = [cell.string for cell in tbl_data[2::3]]  # 3rd cell of each group

In [7]:
# Show the scraped district names
district_msg = "Districts in Beijing city are:\n\n {}".format(district)
print(district_msg)

Districts in Beijing city are:

 ['Dongcheng', 'Xicheng', 'Chaoyang', 'Fengtai', 'Shijingshan', 'Haidian', 'Mentougou', 'Fangshan', 'Tongzhou', 'Shunyi', 'Changping', 'Daxing', 'Huairou', 'Pinggu', 'Miyun', 'Yanqing']


In [8]:
# Pair each district with its zip code and load the pairs into a two-column data frame

district_zip_pairs = list(zip(district, zip_code))
df = pd.DataFrame(district_zip_pairs, columns=['District', 'Zip_Code'])
df.head()

Unnamed: 0,District,Zip_Code
0,Dongcheng,100000
1,Xicheng,100000
2,Chaoyang,100000
3,Fengtai,100000
4,Shijingshan,100000


In [16]:
# Zip code 100000 is shared by several districts, so it cannot identify a single one;
# keep only the rows whose zip code differs from it and renumber them from 0.

has_own_zip = df['Zip_Code'] != '100000'
new = df.loc[has_own_zip].reset_index(drop=True)
new

Unnamed: 0,District,Zip_Code
0,Mentougou,102300
1,Fangshan,102400
2,Tongzhou,101100
3,Shunyi,101300
4,Changping,102200
5,Daxing,102600
6,Huairou,101400
7,Pinggu,101200
8,Miyun,101500
9,Yanqing,102100


<H2>Retrieve Latitude/Longitude based on Zip_Code</H2>

In [40]:
# use geopy (Nominatim) to obtain latitude & longitude for each zip code in Beijing

geo = []  # one (district, latitude, longitude) tuple per row of `new`

location = geopy.Nominatim(user_agent="Detector", timeout=20)

# The public Nominatim service allows at most one request per second.
# RateLimiter spaces the calls out and retries transient service errors;
# if the retries are exhausted it returns None rather than raising.
geocode = RateLimiter(location.geocode, min_delay_seconds=1)

for a, b in zip(new['Zip_Code'], new['District']):

    loc = geocode("{} {}, Beijing".format(a, b))

    if loc is None:
        # No match (or the service kept failing): report it and keep the district
        # with missing coordinates instead of crashing on `None.latitude`.
        print("Could not geocode '{} {}, Beijing'; coordinates left as NaN".format(a, b))
        geo.append((b, np.nan, np.nan))
        continue

    lat, lng = loc.latitude, loc.longitude # extract latitude & longitude info
    geo.append((b, lat, lng))

In [42]:
# Turn the (district, latitude, longitude) tuples collected in 'geo' into a data frame

geo_columns = ['District', 'latitude', 'longitude']
geo_tbl = pd.DataFrame(data=geo, columns=geo_columns)
geo_tbl

Unnamed: 0,District,latitude,longitude
0,Mentougou,39.938998,116.094757
1,Fangshan,39.731258,116.163715
2,Tongzhou,39.906345,116.628416
3,Shunyi,40.103521,116.650037
4,Changping,40.199867,116.245821
5,Daxing,39.713371,116.315456
6,Huairou,40.46474,116.51215
7,Pinggu,40.417925,117.149612
8,Miyun,40.384026,116.830116
9,Yanqing,40.510521,115.788877


In [50]:
# Re-index the coordinate table by district name, then attach its latitude/longitude
# columns to the district / zip code table by matching on 'District'.
coords_by_district = geo_tbl.set_index('District')
city_df = new.join(coords_by_district, on='District')
city_df

Unnamed: 0,District,Zip_Code,latitude,longitude
0,Mentougou,102300,39.938998,116.094757
1,Fangshan,102400,39.731258,116.163715
2,Tongzhou,101100,39.906345,116.628416
3,Shunyi,101300,40.103521,116.650037
4,Changping,102200,40.199867,116.245821
5,Daxing,102600,39.713371,116.315456
6,Huairou,101400,40.46474,116.51215
7,Pinggu,101200,40.417925,117.149612
8,Miyun,101500,40.384026,116.830116
9,Yanqing,102100,40.510521,115.788877


In [51]:
# Get latitude & longitude of Beijing

address = 'Beijing, China'

geolocator = geopy.Nominatim(user_agent="CN_explorer", timeout=30)
explore = geolocator.geocode(address)

# geocode() returns None when the service finds no match; stop here with a readable
# message instead of an AttributeError on `None.latitude` on the next line.
if explore is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))

cn_lat, cn_lng = explore.latitude, explore.longitude
print('The geograpical coordinate of Beijing is {}, {}.'.format(cn_lat, cn_lng))

The geograpical coordinate of Beijing is 39.906217, 116.3912757.


In [54]:
# Base map centred on the coordinates obtained for Beijing
map_CN = folium.Map(location=[cn_lat, cn_lng], zoom_start=8)

# Place one circle marker per district at its geocoded position;
# each marker carries a popup holding the district name.
for lat, lng, district in zip(city_df['latitude'], city_df['longitude'], city_df['District']):
    popup = folium.Popup(f'{district}', parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_CN)

map_CN