# Data sourcing

Using pandas' `read_html` HTML-table scraper to collect the postal-code data needed for the Foursquare API.

In [2]:
import requests #library to handle requests
import random # library for random number generation
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # to tranform JSON file into a pandas dataframe

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

#!pip install geopy # Uncomment if not installed
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

#! pip install folium==0.5.0  #Uncomment to install folium library
import folium # map rendering library


print("Libraries imported!")

Libraries imported!


In [None]:
# https://www.postoffice.co.za/Tools/postalcodes.html
#temp_df= pd.read_excel("postalcodes.xls")
#temp_df.head(25)

In [None]:

#url = "http://www.codes-sa.co.za/postal-gauteng.html" 

#johannesburg_postal_codes = requests.get(url)

# Checking to see if the request was successful 
#response = requests.get(url)

#if response.status_code == 200:
   # print('Success!')
#elif response.status_code == 404:
   # print('Not Found.')

## Data Scraping
Finding the postal codes by scraping them from a web page.

In [3]:
# Scrape the HTML page; pd.read_html returns a list of DataFrames, and the
# one at index 6 of that list holds the postal codes used below.
gauteng_tables = pd.read_html('http://www.codes-sa.co.za/postal-gauteng.html', header=0)
johannesburg_codes = gauteng_tables[6]
johannesburg_codes

Unnamed: 0,Town / City,Street Code,PO Box Code
0,JOHANNESBURG,2000,2001
1,Bedford Gardens,n.a,2007
2,Bedfordpark,n.a,2007
3,Bedfordview,2008,2007
4,Belgravia,2043,2094
5,Bergvlei,2012,2090
6,Boskruin,2154,2188
7,Braamfontein,2017,2001
8,Bramley View,n.a,2090
9,Bromhof,2154,2188


In [4]:
# Remove every row that contains a missing value, then renumber the rows from 0
johannesburg_codes = johannesburg_codes.dropna(axis=0).reset_index(drop=True)
johannesburg_codes.head(10)

Unnamed: 0,Town / City,Street Code,PO Box Code
0,JOHANNESBURG,2000,2001
1,Bedford Gardens,n.a,2007
2,Bedfordpark,n.a,2007
3,Bedfordview,2008,2007
4,Belgravia,2043,2094
5,Bergvlei,2012,2090
6,Boskruin,2154,2188
7,Braamfontein,2017,2001
8,Bramley View,n.a,2090
9,Bromhof,2154,2188


In [5]:
# Turn each textual "not available" marker in the Street Code column into NaN
missing_markers = {marker: np.nan for marker in ["N/a", "n/a", "n.a"]}
street_codes = johannesburg_codes[["Street Code"]]
johannesburg_codes[["Street Code"]] = street_codes.replace(missing_markers)
johannesburg_codes.head(10)

Unnamed: 0,Town / City,Street Code,PO Box Code
0,JOHANNESBURG,2000.0,2001
1,Bedford Gardens,,2007
2,Bedfordpark,,2007
3,Bedfordview,2008.0,2007
4,Belgravia,2043.0,2094
5,Bergvlei,2012.0,2090
6,Boskruin,2154.0,2188
7,Braamfontein,2017.0,2001
8,Bramley View,,2090
9,Bromhof,2154.0,2188


In [6]:
# Discard the rows that now contain NaN (the markers converted above) and rebuild the index
johannesburg_codes = johannesburg_codes.dropna(axis=0).reset_index(drop=True)

johannesburg_codes.head(10)

Unnamed: 0,Town / City,Street Code,PO Box Code
0,JOHANNESBURG,2000,2001
1,Bedfordview,2008,2007
2,Belgravia,2043,2094
3,Bergvlei,2012,2090
4,Boskruin,2154,2188
5,Braamfontein,2017,2001
6,Bromhof,2154,2188
7,Bryanston,2021,2191
8,Bryanston East,2152,2191
9,Cresta,2118,2194


In [7]:
# Left-pad every street code with zeros so it is at least 4 characters wide
johannesburg_codes['Street Code'] = johannesburg_codes['Street Code'].map(lambda code: format(code, '0>4'))
johannesburg_codes

Unnamed: 0,Town / City,Street Code,PO Box Code
0,JOHANNESBURG,2000,2001
1,Bedfordview,2008,2007
2,Belgravia,2043,2094
3,Bergvlei,2012,2090
4,Boskruin,2154,2188
5,Braamfontein,2017,2001
6,Bromhof,2154,2188
7,Bryanston,2021,2191
8,Bryanston East,2152,2191
9,Cresta,2118,2194


In [8]:
# Rename the scraped headers: "Town / City" -> "Suburb", "Street Code" -> "Postal Code"
column_names = {"Town / City": "Suburb", "Street Code": "Postal Code"}
johannesburg_codes = johannesburg_codes.rename(columns=column_names)
johannesburg_codes

Unnamed: 0,Suburb,Postal Code,PO Box Code
0,JOHANNESBURG,2000,2001
1,Bedfordview,2008,2007
2,Belgravia,2043,2094
3,Bergvlei,2012,2090
4,Boskruin,2154,2188
5,Braamfontein,2017,2001
6,Bromhof,2154,2188
7,Bryanston,2021,2191
8,Bryanston East,2152,2191
9,Cresta,2118,2194


In [9]:
# Keep only the first row for each distinct Suburb value

johannesburg_codes = johannesburg_codes.drop_duplicates(subset=['Suburb'])
johannesburg_codes

Unnamed: 0,Suburb,Postal Code,PO Box Code
0,JOHANNESBURG,2000,2001
1,Bedfordview,2008,2007
2,Belgravia,2043,2094
3,Bergvlei,2012,2090
4,Boskruin,2154,2188
5,Braamfontein,2017,2001
6,Bromhof,2154,2188
7,Bryanston,2021,2191
8,Bryanston East,2152,2191
9,Cresta,2118,2194


In [10]:
# Collapse suburbs that share a Postal Code into one comma-separated string per code
suburbs_by_code = johannesburg_codes.groupby('Postal Code')['Suburb']
joburg_data = suburbs_by_code.apply(', '.join).reset_index()
joburg_data

Unnamed: 0,Postal Code,Suburb
0,18,Danville
1,41,Die Wilgers
2,44,"Morletapark, Moreletapark"
3,54,Silver Lakes
4,78,Pretoria South
5,116,Pretoria North
6,117,Pretoria West
7,150,Magalieskruin
8,151,Montana
9,188,Dorandia


In [11]:
# (rows, columns) of the frame grouped by postal code
joburg_data.shape

(42, 2)

In [12]:
# Show the first five rows of the cleaned scrape
johannesburg_codes.head()

Unnamed: 0,Suburb,Postal Code,PO Box Code
0,JOHANNESBURG,2000,2001
1,Bedfordview,2008,2007
2,Belgravia,2043,2094
3,Bergvlei,2012,2090
4,Boskruin,2154,2188


In [13]:
# Source: http://download.geonames.org/export/zip/
# The file is read as tab-separated with no header row; its 12 columns get the names "0" .. "11".
geonames_columns = [str(position) for position in range(12)]
za = pd.read_csv("za_postal_codes.txt", sep="\t", header=None, names=geonames_columns)
za.head()



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,ZA,2,Pretoria,,,,,,,-25.7069,28.2294,
1,ZA,3,Pretoria,,,,,,,-25.7069,28.2294,4.0
2,ZA,4,Pretoria,,,,,,,-25.7069,28.2294,4.0
3,ZA,5,Pretoria,,,,,,,-25.7069,28.2294,4.0
4,ZA,6,Pretoria,,,,,,,-25.7069,28.2294,4.0


In [14]:
# Left-pad column "1" with zeros to a minimum width of 4 characters
za['1'] = za['1'].map(lambda code: format(code, '0>4'))
za.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,ZA,2,Pretoria,,,,,,,-25.7069,28.2294,
1,ZA,3,Pretoria,,,,,,,-25.7069,28.2294,4.0
2,ZA,4,Pretoria,,,,,,,-25.7069,28.2294,4.0
3,ZA,5,Pretoria,,,,,,,-25.7069,28.2294,4.0
4,ZA,6,Pretoria,,,,,,,-25.7069,28.2294,4.0


In [15]:
# Drop the country-code column (all rows are for the same country) together
# with the other columns that are not used below
unused_columns = ['0', '3', '4', '5', '6', '7', '8', '11']
za = za.drop(columns=unused_columns)
za.head()

Unnamed: 0,1,2,9,10
0,2,Pretoria,-25.7069,28.2294
1,3,Pretoria,-25.7069,28.2294
2,4,Pretoria,-25.7069,28.2294
3,5,Pretoria,-25.7069,28.2294
4,6,Pretoria,-25.7069,28.2294


In [16]:
# Replace the positional column names with descriptive ones
za_column_names = {"1": "Postal Code", "2": "Suburb", "9": "Latitude", "10": "Longitude"}
za = za.rename(columns=za_column_names)
za.head()

Unnamed: 0,Postal Code,Suburb,Latitude,Longitude
0,2,Pretoria,-25.7069,28.2294
1,3,Pretoria,-25.7069,28.2294
2,4,Pretoria,-25.7069,28.2294
3,5,Pretoria,-25.7069,28.2294
4,6,Pretoria,-25.7069,28.2294


In [17]:
# Join the grouped suburbs with the geonames rows that have the same Postal Code

joburg_data = joburg_data.merge(za, on="Postal Code")
joburg_data.head()

Unnamed: 0,Postal Code,Suburb_x,Suburb_y,Latitude,Longitude
0,18,Danville,Pretoria,-25.7069,28.2294
1,41,Die Wilgers,Pretoria,-25.7069,28.2294
2,44,"Morletapark, Moreletapark",Moreletapark,-25.7069,28.2294
3,44,"Morletapark, Moreletapark",Pretoria,-25.7069,28.2294
4,54,Silver Lakes,Pretoria,-25.7069,28.2294
5,78,Pretoria South,Pretoria,-25.7069,28.2294
6,116,Pretoria North,Pretoria,-25.7069,28.2294
7,117,Pretoria West,Pretoria,-25.7069,28.2294
8,150,Magalieskruin,Magalieskruin,-25.7069,28.2294
9,150,Magalieskruin,Pretoria,-25.7069,28.2294


In [18]:
# The merge can yield several rows per postal code; keep the first row for each Suburb_x value
joburg_data = joburg_data.drop_duplicates(subset=['Suburb_x'])
joburg_data

Unnamed: 0,Postal Code,Suburb_x,Suburb_y,Latitude,Longitude
0,18,Danville,Pretoria,-25.7069,28.2294
1,41,Die Wilgers,Pretoria,-25.7069,28.2294
2,44,"Morletapark, Moreletapark",Moreletapark,-25.7069,28.2294
4,54,Silver Lakes,Pretoria,-25.7069,28.2294
5,78,Pretoria South,Pretoria,-25.7069,28.2294
6,116,Pretoria North,Pretoria,-25.7069,28.2294
7,117,Pretoria West,Pretoria,-25.7069,28.2294
8,150,Magalieskruin,Magalieskruin,-25.7069,28.2294
10,151,Montana,Pretoria,-25.7069,28.2294
11,188,Dorandia,Pretoria,-25.7069,28.2294


In [20]:
# Remove the second suburb column (Suburb_y) that the merge brought in

joburg_data = joburg_data.drop(columns=["Suburb_y"])

In [21]:
# Show the first five rows of the merged frame
joburg_data.head()

Unnamed: 0,Postal Code,Suburb_x,Latitude,Longitude
0,18,Danville,-25.7069,28.2294
1,41,Die Wilgers,-25.7069,28.2294
2,44,"Morletapark, Moreletapark",-25.7069,28.2294
4,54,Silver Lakes,-25.7069,28.2294
5,78,Pretoria South,-25.7069,28.2294


In [30]:
# Rename the merge-suffixed Suburb_x column back to "Suburb", then display the first 100 rows of za
joburg_data = joburg_data.rename(columns={"Suburb_x": "Suburb"})
za.head(100)

Unnamed: 0,Postal Code,Suburb,Latitude,Longitude
0,2,Pretoria,-25.7069,28.2294
1,3,Pretoria,-25.7069,28.2294
2,4,Pretoria,-25.7069,28.2294
3,5,Pretoria,-25.7069,28.2294
4,6,Pretoria,-25.7069,28.2294
5,7,Pretoria,-25.7069,28.2294
6,8,Atteridgeville,-25.7728,28.0678
7,8,Pretoria,-25.7069,28.2294
8,9,Pretoria,-25.7069,28.2294
9,10,Glenstantia,-25.7069,28.2294


In [33]:
# Show the last 20 rows of the merged frame
joburg_data.tail(20)

Unnamed: 0,Postal Code,Suburb,Latitude,Longitude
26,2062,Lonehill,-26.2,28.0833
27,2063,Marlboro,-26.2,28.0833
29,2064,Naturena,-26.2,28.0833
30,2103,Eastgate,-26.2,28.0833
31,2104,Linden,-26.2,28.0833
32,2110,Mondeor,-26.2,28.0833
33,2115,Northcliff,-26.2,28.0833
34,2118,Cresta,-26.2,28.0833
35,2128,Rivonia,-26.05,28.05
38,2148,Eastgate Ext,-26.2,28.0833


In [23]:
# Map of Johannesburg
address = 'Johannesburg'

# Look the address up through the Nominatim geocoder to get the map centre
geolocator = Nominatim(user_agent="Johannesburg_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for address {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Johannesburg are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Johannesburg are -26.205, 28.049722.


In [28]:
# create map of joburg_data using latitude and longitude values
map_johannesburg = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row, with a popup reading "<suburb>, <postal code>"
for lat, lng, suburb, postal_code in zip(joburg_data["Latitude"], joburg_data["Longitude"], joburg_data["Suburb"], joburg_data["Postal Code"]):
    label = '{}, {}'.format(suburb, postal_code)
    # parse_html is a Popup argument, so it is passed here only; it is not
    # a CircleMarker option and is therefore not repeated on the marker.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_johannesburg)

map_johannesburg
