# Website Scraping
This is a simple program that scrapes carpark information from a website using Pandas and BeautifulSoup and saves it to CSV files.

In [None]:
import collections
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Set up variables

In [None]:
# Column headers of the output table, in the order they are written to the csv
keys = [
    'Carpark Name',
    'Carpark Address',
    'Mon-Fri before 5/6pm',
    'Mon-Fri after 5/6pm',
    'Sat',
    'Sun/Public Holiday',
]
# Ordered mapping of column header -> list of values (a separate empty list per column)
d = collections.OrderedDict([(column, []) for column in keys])

# Placeholders for the module-level state that the scraping functions overwrite
location = address = url = 'na'
full_table = 0

### Define functions: web scraping, saving to dataframe, saving to csv

In [None]:
def readurl(url):
    """Return the parking-fee column of the carpark page at ``url``.

    Every HTML table on the page is parsed with ``pd.read_html``; the table
    at index 6 is the one holding the required information, and its first
    column is what gets returned. The same Series is also stored in the
    module-level ``full_table`` variable.

    Raises ``IndexError`` when the page yields fewer than seven tables;
    any exception raised by ``pd.read_html`` propagates unchanged.
    """
    global full_table
    tables = pd.read_html(url)
    fee_table = tables[6]  # table 6 contains the required info
    full_table = fee_table.iloc[:, 0]  # required info is in the first column
    return full_table


def add_and_loc(url):
    """Return ``(location, address)`` of the carpark page at ``url``.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup.
    The text of the first element with class ``grayboxborder`` is stripped
    and split on newlines: the first line is the location and the second
    line (right-stripped) is the address. Both values are also stored in
    the module-level ``location`` and ``address`` variables.

    Raises ``AttributeError`` when no ``grayboxborder`` element is found and
    ``IndexError`` when its text has only one line.
    """
    global location, address
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')

    # this class contains the table of information
    info_box = soup.find(class_="grayboxborder")
    text_lines = info_box.text.strip().split("\n")
    location = text_lines[0]
    address = text_lines[1].rstrip()
    return location, address


def append(full_table, location, address):
    """Append one carpark's details as a new row of the module-level dict ``d``.

    Parameters:
        full_table: indexable object whose entries 1, 3, 5 and 7 hold the fee
            text for 'Mon-Fri before 5/6pm', 'Mon-Fri after 5/6pm', 'Sat'
            and 'Sun/Public Holiday' respectively.
        location: carpark name.
        address: carpark address.

    Raises:
        Whatever ``full_table[...]`` raises when one of those entries is
        missing (e.g. ``IndexError`` for a short list). In that case ``d``
        is left untouched.
    """
    global d
    # Look up every fee value BEFORE touching d: if the table is too short the
    # lookup raises here, and no column ends up longer than the others (which
    # would shift every later row out of alignment).
    weekday_before = full_table[1]
    weekday_after = full_table[3]
    saturday = full_table[5]
    sunday_holiday = full_table[7]

    d['Carpark Name'].append(location)
    d['Carpark Address'].append(address)
    d['Mon-Fri before 5/6pm'].append(weekday_before)
    d['Mon-Fri after 5/6pm'].append(weekday_after)
    d['Sat'].append(saturday)
    d['Sun/Public Holiday'].append(sunday_holiday)
    

def convert(dictionary, n):
    """Write ``dictionary`` to the csv file ``Parking_<n>.csv``.

    Each key of ``dictionary`` becomes a column header and its list of
    values becomes that column. The frame is built with the keys as rows
    and then transposed, and the csv is written without the index column.
    """
    filename = 'Parking_' + str(n) + '.csv'
    by_key = pd.DataFrame.from_dict(dictionary, orient='index')
    by_key.T.to_csv(filename, index=False)

### Run functions

In [None]:
# Scrape the carpark pages in 10 batches of 100 page IDs (1-100, 101-200, ...).
# The URL pieces never change, so they are built once outside the loops.
front_url = 'http://www.sgcarmart.com/news/carpark_index.php?ID='
back_url = '&LOC=all&TYP=carpark&SRH=#carparkrates'

for n in range(0, 10):
    try:
        for i in range(1 + (100 * n), 101 + (100 * n)):  # there are 926 webpages
            url = front_url + ("%03d" % i) + back_url
            try:
                full_table = readurl(url)
                location, address = add_and_loc(url)
                append(full_table, location, address)
            # skip blank/error pages and print the url for reference
            except AttributeError:
                print(url + ' attr')
            except ValueError:
                print(url + ' val')
            except IndexError:
                print(url + ' index')
    finally:
        # Checkpoint everything scraped so far once per batch of 100.
        # Doing it in `finally` means the csv is still written if an
        # unexpected error aborts the batch midway.
        convert(d, n)

### Create new column (Info)

In [None]:
# Load the csv written for the last batch (n = 9); the dictionary is never
# reset between batches, so this file holds every carpark scraped
checkpoint_csv = 'Parking_9.csv'
full_df = pd.read_csv(checkpoint_csv)

In [None]:
# Create new column ['Info']: one HTML-labelled line per field, built by
# concatenating each (label, source column) pair in order
info_parts = (
    ('<b>Carpark: </b>', 'Carpark Name'),
    ('\n<b>Address: </b>', 'Carpark Address'),
    ('\n<b>Mon-Fri before 5/6pm: </b>', 'Mon-Fri before 5/6pm'),
    ('\n<b>Mon-Fri after 5/6pm: </b>', 'Mon-Fri after 5/6pm'),
    ('\n<b>Sat: </b>', 'Sat'),
    ('\n<b>Sun/ Public Holiday: </b>', 'Sun/Public Holiday'),
)
first_label, first_column = info_parts[0]
full_df['Info'] = first_label + full_df[first_column]
for label, column in info_parts[1:]:
    full_df['Info'] += label + full_df[column]

### Clean data: remove carparks not in use

In [None]:
# Status phrases that mark a carpark as not in use; a row is dropped when its
# 'Mon-Fri before 5/6pm' text contains any of them
searchfor = ['Carpark closed', 'Private Car Park', 'Carpark not in use', 'Season Parking Only','Building Demolished']
# na=False: a missing value in the column counts as "no match" (row is kept).
# Without it the mask contains NaN for such rows and cannot be inverted with ~.
not_in_use = full_df['Mon-Fri before 5/6pm'].str.contains('|'.join(searchfor), na=False)
clean_df = full_df[~not_in_use]

### Create new csv file

In [None]:
# Save the cleaned table as the final csv file, without the index column
output_csv = 'Parking.csv'
clean_df.to_csv(output_csv, index=False)