<h1> Shopping Mall Web Scraper </h1>

<h2> Scraping Malls from Wikipedia </h2>

In [None]:
#Import libraries
from bs4 import BeautifulSoup
import requests
import re

In [None]:
# Download the Wikipedia list page and parse it into a BeautifulSoup tree.
url = "https://en.wikipedia.org/wiki/List_of_shopping_malls_in_Singapore"
response = requests.get(url, timeout = 5)
# Fail fast on an HTTP error status instead of silently parsing an error page.
response.raise_for_status()
content = BeautifulSoup(response.content, "html.parser")

In [None]:
# Display the parsed page to inspect its structure (outputs the whole document).
content

In [None]:
# Collect every <a> tag whose href attribute matches the pattern "wiki".
wiki_href_pattern = re.compile("wiki")
original_content = content.find_all("a", href=wiki_href_pattern)

In [None]:
# Collect every <a> tag carrying the CSS class "new"
# (presumably Wikipedia's links to not-yet-written articles — verify).
new_content = content.find_all("a", class_="new")

In [None]:
# Gather the text of every candidate link, keeping only texts longer than one
# character. Entries from new_content come first, then those from original_content.
malls = []

for link_group in (new_content, original_content):
    for link in link_group:
        link_text = "".join(link.find_all(text = True))
        if len(link_text) > 1:
            malls.append(link_text)

In [None]:
# Show the collected link texts.
malls

<h2> Data Cleaning </h2>

In [None]:
# Remove non-mall data
# NOTE(review): 286 is a hard-coded cutoff into the scraped list; presumably the
# entries after it are non-mall links — re-check whenever the page changes.
MALL_ENTRY_CUTOFF = 286
malls_sliced = malls[:MALL_ENTRY_CUTOFF]

In [None]:
# Sort the names (default string ordering), then display them.
malls_sliced = sorted(malls_sliced)
malls_sliced

In [None]:
# Make all names uppercase
def uppercase(dataset):
    """Return a new list with every string in `dataset` converted to upper case.

    The input is not modified; element order is preserved.
    """
    return [name.upper() for name in dataset]

In [None]:
# Upper-case every name so later comparisons are not affected by letter case.
malls_sliced = uppercase(malls_sliced)

In [None]:
# Remove "the" in mall names (to better clear duplicates)
def remove_the(dataset):
    """Return a new list with every lowercase "the" substring deleted from each string.

    The match is case-sensitive and is not limited to whole words, so "the"
    inside a longer word is deleted as well.

    NOTE(review): strings that are already upper case pass through unchanged,
    because "THE" does not match the lowercase pattern.
    """
    return [name.replace("the", "") for name in dataset]

In [None]:
# Apply remove_the to every name (case-sensitive match on lowercase "the").
malls_sliced = remove_the(malls_sliced)

In [None]:
# Remove extra spaces in mall names
def remove_whitespace(dataset):
    """Return a new list with leading and trailing whitespace stripped from each string.

    Whitespace inside a name is left untouched.
    """
    return [name.strip() for name in dataset]

In [None]:
# Trim leading/trailing whitespace from every name.
malls_sliced = remove_whitespace(malls_sliced)

In [None]:
# Remove duplicates
def remove_duplicates(dataset):
    """Return a new list holding the first occurrence of each item, in original order."""
    unique_items = []
    for item in dataset:
        if item in unique_items:
            # Already kept an equal item earlier; skip this repeat.
            continue
        unique_items.append(item)
    return unique_items

In [None]:
# Drop repeated names, keeping the first occurrence of each.
malls_sliced = remove_duplicates(malls_sliced)

In [None]:
# Inspect the cleaned list before the manual removals below.
malls_sliced

In [None]:
# Scraped link texts that are not mall names
NON_MALL_ENTRIES = [
    "REMOVED",
    "SOURCES",
    "IMPROVE THIS ARTICLE",
    "ADDING CITATIONS TO RELIABLE SOURCES",
    "DEMOLISHED",
    "LEARN HOW AND WHEN TO REMOVE THIS TEMPLATE MESSAGE",
    "SINGAPORE",
    "CITE",
]

# Remove demolished malls
DEMOLISHED_MALLS = [
    "SPECIALISTS' SHOPPING CENTRE",
    "FITZPATRICK'S",
    "MULTIPLEXES",
    "THE VERGE",
]

# Remove unopened malls
UNOPENED_MALLS = [
    "TEKKA PLACE",
]

# Remove identical malls recorded in different names
DUPLICATE_NAMES = [
    "WESTGATE MALL",
    "CAPITOL CENTRE",
    "DJITSUN MALL BEDOK",
    "HDB",
    "HOLLAND V SHOPPING MALL",
    "MUSTAFA CENTRE",
    "RAFFLES CITY",
    "SCOTTS SHOPPING CENTRE",
    "SUNTEC CITY MALL",
]

names_to_remove = set(NON_MALL_ENTRIES + DEMOLISHED_MALLS + UNOPENED_MALLS + DUPLICATE_NAMES)

# list.remove() raises ValueError as soon as one name is absent (for example
# after the page changes, or when this cell is run a second time). Filtering
# keeps the cell re-runnable; names that were not found are reported instead.
not_found = sorted(names_to_remove.difference(malls_sliced))
if not_found:
    print("Not in malls_sliced (nothing to remove):", not_found)

malls_sliced = [mall for mall in malls_sliced if mall not in names_to_remove]

<h2> Retrieve Coordinates for Shopping Malls </h2>
<h3> Retrieve Postal Codes for Shopping Malls </h3>
Postal codes of malls are retrieved from the 'buildings.json' file of the singapore-postal-codes repository (see https://github.com/xkjyeah/singapore-postal-codes/blob/master/download_postal_codes.py).

In [None]:
# Import libraries
import json
import csv
from googlesearch import search

In [None]:
# Load the building records. The encoding is stated explicitly so decoding does
# not depend on the platform's default text encoding.
with open("buildings.json", "r", encoding="utf-8") as f:
    datastore = json.load(f)

In [None]:
# View datastore (displays every loaded record)
datastore

In [None]:
# Retrieve mall postal codes from datastore
MISSING = 1  # sentinel stored for malls with no matching building record
mall_codes = {}

for mall in malls_sliced:
    # Substring match against the upper-cased building name. Every matching
    # record overwrites the previous one, so the last match in datastore wins.
    for building in datastore:
        if mall in building['BUILDING'].upper():
            mall_codes[mall] = building['POSTAL']
    # No record matched: mark the mall so it can be looked up separately.
    mall_codes.setdefault(mall, MISSING)

In [None]:
# Show the mall -> postal code mapping (the MISSING sentinel marks malls without a match).
mall_codes

In [None]:
# Retrieve the list of malls whose postal code was not found in the datastore
missing_mall_codes = [
    mall_name for mall_name, code in mall_codes.items() if code == MISSING
]

In [None]:
# Show the malls that still need a postal code.
missing_mall_codes

In [None]:
# Find postal codes for missing malls
# Runs a web search for "<mall name> postal code" and prints each returned URL
# next to the mall name (stop=1 presumably limits this to the first hit — verify).
for mall in missing_mall_codes:
    query = f"{mall.lower()} postal code"
    for result_url in search(query, tld="co.in", num=10, stop=1, pause=2):
        print(f"{mall}: {result_url}")

In [None]:
# Add postal codes found through the searches above
mall_codes.update({
    'CITY GATE MALL': '199597',
    'CITY VIBE': '129581',
    'CLARKE QUAY CENTRAL': '059815',
    'FERNVALE POINT': '791436',
    'GV YISHUN': '768794',
    'HOLLAND VILLAGE SHOPPING MALL': '278967',
    'JURONG ENTERTAINMENT CENTRE': '609731',
    'MARINA BAY FINANCIAL CENTRE TOWER 3': '018982',
    'MUSTAFA SHOPPING CENTRE': '207704',
    'ORCHARD MIDPOINT': '238852',
    'SHAW HOUSE AND CENTRE': '238868',
    'TEKKA CENTRE': '210665',
    "UNITED SQUARE SHOPPING MALL": '307591',
    "YUE HWA BUILDING": '059805',
    "MYVILLAGE @ SERANGOON": '556679',
})

<h3> Get Coordinates from Postal Codes </h3>
Use the Selenium WebDriver to automatically fetch the coordinates of malls in Singapore, then export the resulting list of dictionaries to CSV.

In [None]:
# Import webdriver
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.alert import Alert

# Import pandas
import pandas as pd

# Import time
import time

In [None]:
# Create mall list with latitude and longitude
GECKODRIVER_PATH = "C:\\path\\to\\geckodriver.exe"
GEOCODER_URL = "https://www.latlong.net/convert-address-to-lat-long.html"

# Build the Firefox capabilities once. Copy the dict so the shared
# DesiredCapabilities.FIREFOX template is not modified.
cap = DesiredCapabilities.FIREFOX.copy()
cap["marionette"] = False

malls_lat_long = []

for mall, postal_code in mall_codes.items():
    # A mall that still holds a non-string placeholder has no usable postal
    # code; skip it with a message instead of failing on the concatenation.
    if not isinstance(postal_code, str):
        print("Skipping " + mall + ": no postal code available")
        continue

    search_term = mall + " Singapore " + postal_code

    # Create browser
    browser = webdriver.Firefox(capabilities=cap, executable_path=GECKODRIVER_PATH)
    try:
        browser.get(GEOCODER_URL)

        # input search text into search bar
        search_bar = browser.find_element_by_xpath("//input[@placeholder='Type address here to get lat long']")
        search_bar.send_keys(search_term)
        search_bar.submit()

        # give the page time to resolve the address
        time.sleep(8)

        # get latitude and longitude from text of the form "(lat,long)"
        lat_long = browser.find_element_by_id('latlngspan')
        lat_long_strip = lat_long.text.strip('() ').split(',')
        lat_long_clean = [float(n) for n in lat_long_strip]
    finally:
        # Always end the WebDriver session, even when a lookup fails, so no
        # browser or driver process is left running.
        browser.quit()

    # add dictionary of mall, latitude and longitude to malls_lat_long list
    malls_lat_long.append({
        'name': mall,
        'latitude': lat_long_clean[0],
        'longitude': lat_long_clean[1],
    })

    # pause between lookups
    time.sleep(10)

In [None]:
# View malls_lat_long (one dict per mall with 'name', 'latitude' and 'longitude')
malls_lat_long

In [None]:
# Convert malls_lat_long (a list of dicts) to a pandas DataFrame
df_malls = pd.DataFrame(malls_lat_long)

In [None]:
# Export to csv
# NOTE(review): to_csv is called without index=False, so the DataFrame's row
# index is written as an extra first column — confirm this is intended.
df_malls.to_csv('mall_coordinates.csv')